def main():
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(
            DATA_DIR,
            DATA_LIST_PATH,
            input_size,
            args.random_scale,
            args.random_mirror,
            args.ignore_label,
            IMG_MEAN,
            coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    # with g.as_default():
    net = ICNet_BN({'data': image_batch}, is_training=True,
                   num_classes=args.num_classes, filter_scale=args.filter_scale)

    sub4_out = net.layers['sub4_out']
    sub24_out = net.layers['sub24_out']
    sub124_out = net.layers['conv6_cls']

    restore_var = tf.global_variables()
    all_trainable = [v for v in tf.trainable_variables() if
                     ('beta' not in v.name and 'gamma' not in v.name) or args.train_beta_gamma]

    loss_sub4 = create_loss(sub4_out, label_batch, args.num_classes, args.ignore_label)
    loss_sub24 = create_loss(sub24_out, label_batch, args.num_classes, args.ignore_label)
    loss_sub124 = create_loss(sub124_out, label_batch, args.num_classes, args.ignore_label)
    l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'weights' in v.name]

    reduced_loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124 + tf.add_n(l2_losses)

    # Using Poly learning rate policy
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - step_ph / args.num_steps), args.power))

    # Gets moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS
    if args.update_mean_var == False:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        grads = tf.gradients(reduced_loss, all_trainable)
        train_op = opt_conv.apply_gradients(zip(grads, all_trainable))
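# The "poly" policy used above decays the base rate as
#   lr(step) = base_lr * (1 - step / num_steps) ** power.
# A minimal NumPy sketch of the same schedule; base_lr, num_steps and power
# below are illustrative values, not the script's flags:
import numpy as np

def poly_lr(step, base_lr=1e-3, num_steps=60000, power=0.9):
    """Polynomial decay from base_lr down to 0 at num_steps."""
    return base_lr * (1 - step / float(num_steps)) ** power

assert abs(poly_lr(0) - 1e-3) < 1e-12   # starts at the base rate
assert poly_lr(60000) == 0.0            # fully decayed at the final step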
def run():
    recreate_directory_structure()

    # Create queue coordinator.
    coord = tf.train.Coordinator()

    # Load reader.
    with tf.name_scope("create_inputs"):
        reader = ImageReader("./train.npy", True, coord)
        image_batch, label_list_batch = reader.dequeue(FLAGS.batch_size)

    global_step = tf.Variable(0, dtype=tf.int32, name='global_step', trainable=False)

    net = CatznDogs({'data': image_batch}, global_step)
    net.train(image_batch, label_list_batch, coord)
def test_image_queue(h=321, w=321):
    input_size = (h, w)

    # Create queue coordinator
    coord = tf.train.Coordinator()

    # Load Image Reader
    with tf.name_scope('create_inputs'):
        reader = ImageReader(INDEX_FILE, DATA_DIRECTORY, MASK_DIRECTORY, input_size,
                             True, True, IGNORE_LABEL, IMG_MEAN, coord)
        image_batch, mask_batch = reader.dequeue(BATCH_SIZE)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        for _ in range(10):
            images, masks = sess.run([image_batch, mask_batch])
            # img = sess.run(mask_batch)
            print(np.unique(masks))
        # Shut the input queue down cleanly.
        coord.request_stop()
        coord.join(threads)
def main():
    """Create the model and start the training."""
    args = get_arguments()

    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    # tf.set_random_seed(args.random_seed)

    coord = tf.train.Coordinator()

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Learning rate schedule: staircase exponential decay.
        base_lr = tf.constant(args.learning_rate)
        step_ph = tf.placeholder(dtype=tf.float32, shape=())
        learning_rate = tf.train.exponential_decay(base_lr, step_ph, 20000, 0.5, staircase=True)
        tf.summary.scalar('lr', learning_rate)

        opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        # opt = tf.train.RMSPropOptimizer(learning_rate, 0.9, momentum=0.9, epsilon=1e-10)
        # opt = tf.train.AdamOptimizer(learning_rate)

        losses = []
        train_op = []
        total_batch_size = args.batch_size * args.gpu_nums

        with tf.name_scope('DeepLabResNetModel') as scope:
            with tf.name_scope("create_inputs"):
                reader = ImageReader(
                    args.data_dir,
                    args.data_list,
                    input_size,
                    args.random_blur,
                    args.random_scale,
                    args.random_mirror,
                    args.random_rotate,
                    args.ignore_label,
                    IMG_MEAN,
                    coord)
                image_batch, label_batch = reader.dequeue(total_batch_size)
                images_splits = tf.split(axis=0, num_or_size_splits=args.gpu_nums, value=image_batch)
                labels_splits = tf.split(axis=0, num_or_size_splits=args.gpu_nums, value=label_batch)

            net = DeepLabResNetModel({'data': images_splits}, is_training=True, num_classes=args.num_classes)

            raw_output_list = net.layers['fc_voc12']

            num_valide_pixel = 0
            for i in range(len(raw_output_list)):
                with tf.device('/gpu:%d' % i):
                    raw_output_up = tf.image.resize_bilinear(raw_output_list[i], size=input_size, align_corners=True)

                    tf.summary.image('images_{}'.format(i), images_splits[i] + IMG_MEAN, max_outputs=4)
                    tf.summary.image('labels_{}'.format(i), labels_splits[i], max_outputs=4)
                    tf.summary.image('predict_{}'.format(i),
                                     tf.cast(tf.expand_dims(tf.argmax(raw_output_up, -1), 3), tf.float32),
                                     max_outputs=4)

                    all_trainable = [v for v in tf.trainable_variables()]

                    # Predictions: ignoring all predictions with labels greater or equal than n_classes
                    raw_prediction = tf.reshape(raw_output_up, [-1, args.num_classes])
                    label_proc = prepare_label(labels_splits[i], tf.stack(raw_output_up.get_shape()[1:3]),
                                               num_classes=args.num_classes, one_hot=False)  # [batch_size, h, w]
                    raw_gt = tf.reshape(label_proc, [-1, ])
                    # indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, args.num_classes - 1)), 1)
                    indices = tf.where(tf.logical_and(tf.less(raw_gt, args.num_classes),
                                                      tf.greater_equal(raw_gt, 0)))
                    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
                    prediction = tf.gather(raw_prediction, indices)

                    mIoU, update_op = tf.contrib.metrics.streaming_mean_iou(
                        tf.argmax(tf.nn.softmax(prediction), axis=-1), gt, num_classes=args.num_classes)
                    tf.summary.scalar('mean IoU_{}'.format(i), mIoU)
                    train_op.append(update_op)

                    # Pixel-wise softmax loss.
                    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)
                    num_valide_pixel += tf.shape(gt)[0]
                    losses.append(tf.reduce_sum(loss))

            l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
                         if 'weights' in v.name]
            reduced_loss = tf.truediv(tf.reduce_sum(losses),
                                      tf.cast(num_valide_pixel, tf.float32)) + tf.add_n(l2_losses)
            tf.summary.scalar('average_loss', reduced_loss)

            grads = tf.gradients(reduced_loss, all_trainable, colocate_gradients_with_ops=True)

            variable_averages = tf.train.ExponentialMovingAverage(0.99, step_ph)
            variables_to_average = (tf.trainable_variables() + tf.moving_average_variables())
            variables_averages_op = variable_averages.apply(variables_to_average)

            train_op = tf.group(opt.apply_gradients(zip(grads, all_trainable)), *train_op)
            train_op = tf.group(train_op, variables_averages_op)

        summary_op = tf.summary.merge_all()

        # Set up tf session and initialize variables.
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        sess = tf.Session(config=config)
        init = [tf.global_variables_initializer(), tf.local_variables_initializer()]
        sess.run(init)

        # Saver for storing checkpoints of the model.
        saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=2)

        # Restore from a ResNet ImageNet checkpoint; 'biased' and 'local_step' live in moving_average.
        # restore_var = [v for v in tf.trainable_variables() if 'fc' not in v.name] + \
        #     [v for v in tf.global_variables()
        #      if ('moving_mean' in v.name or 'moving_variance' in v.name)
        #      and ('biased' not in v.name and 'local_step' not in v.name)]
        restore_var = [v for v in tf.trainable_variables() if 'fc' not in v.name]
        ckpt = tf.train.get_checkpoint_state(args.restore_from)
        if ckpt and ckpt.model_checkpoint_path:
            loader = tf.train.Saver(var_list=restore_var)
            load(loader, sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found.')

        """
        # Restore from a training snapshot.
        restore_var = tf.global_variables()
        ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
        if ckpt and ckpt.model_checkpoint_path:
            loader = tf.train.Saver(var_list=restore_var, allow_empty=True)
            load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
            load(loader, sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found.')
            load_step = 0
        """

        # Start queue threads.
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        summary_writer = tf.summary.FileWriter(args.snapshot_dir, graph=sess.graph)

        # Iterate over training steps.
        for step in range(args.num_steps):
            start_time = time.time()
            feed_dict = {step_ph: step}

            if step % args.save_pred_every == 0 and step != 0:
                loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)
                save(saver, sess, args.snapshot_dir, step)
            elif step % 100 == 0:
                summary_str, loss_value, _, IOU = sess.run([summary_op, reduced_loss, train_op, mIoU],
                                                           feed_dict=feed_dict)
                duration = time.time() - start_time
                summary_writer.add_summary(summary_str, step)
                print('step {:d} \t loss = {:.3f}, mean_IoU = {:.3f}, ({:.3f} sec/step)'.format(
                    step, loss_value, IOU, duration))
            else:
                loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)

        coord.request_stop()
        coord.join(threads)
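# A note on the streaming mIoU used above: tf.contrib.metrics.streaming_mean_iou
# accumulates a confusion matrix in *local* variables, which is why this script
# runs tf.local_variables_initializer() alongside the global initializer and
# groups the metric's update_op into train_op. A minimal self-contained sketch
# (toy predictions and labels, illustrative only):
import tensorflow as tf

preds = tf.constant([0, 1, 1, 2])
labels = tf.constant([0, 1, 2, 2])
miou, update_op = tf.contrib.metrics.streaming_mean_iou(preds, labels, num_classes=3)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    sess.run(update_op)    # folds this batch into the running confusion matrix
    print(sess.run(miou))  # mean IoU over everything seen so far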
def main():
    """Create the model and start the training."""
    args = get_arguments()
    print("SAVE TO " + args.snapshot_dir)

    datalists_epoch = {
        1: args.datalist_path_epoch1,
        2: args.datalist_path_epoch2,
        3: args.datalist_path_epoch3,
        4: args.datalist_path_epoch4,
        5: args.datalist_path_epoch5
    }
    if args.cross_val:
        val_epoch = int(args.cross_val)
        train_epochs = [1, 2, 3, 4, 5]
        train_epochs.remove(val_epoch)
        train_lists = [datalists_epoch[i] for i in train_epochs]
        val_lists = datalists_epoch[val_epoch]

    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    max_runtime = args.max_runtime
    max_time_seconds = 3600 * max_runtime
    epochs_until_val = 3

    global dataset_class_weights
    if args.weights_for_dataset is None:
        dataset_class_weights = None
    elif args.weights_for_dataset == 'de_top15':
        dataset_class_weights = weights_detop15
    elif args.weights_for_dataset == 'eu_top25':
        dataset_class_weights = weights_eutop25
    elif args.weights_for_dataset == 'world2k':
        dataset_class_weights = weights_world2k
    elif args.weights_for_dataset == 'kaggle_dstl':
        dataset_class_weights = weights_kaggledstl
    elif args.weights_for_dataset == 'vaihingen':
        dataset_class_weights = weights_vaihingen
    elif args.weights_for_dataset == 'de_top15_nores':
        dataset_class_weights = weights_detop15_nores
    elif args.weights_for_dataset == 'eu_top25_nores':
        dataset_class_weights = weights_eutop25_nores
    elif args.weights_for_dataset == 'world2k_nores':
        dataset_class_weights = weights_world2k_nores

    coord = tf.train.Coordinator()

    if args.cross_val:
        with tf.name_scope("create_inputs"):
            reader = ImageReader(args.datadir, train_lists, input_size, args.random_scale,
                                 args.random_mirror, args.ignore_label, IMG_MEAN, coord)
            image_batch, label_batch = reader.dequeue(args.batch_size)
            # for validation
            reader_val = ImageReader(args.datadir, val_lists, input_size, args.random_scale,
                                     args.random_mirror, args.ignore_label, IMG_MEAN, coord)
            image_batch_val, label_batch_val = reader_val.dequeue(args.batch_size)
    else:
        with tf.name_scope("create_inputs"):
            reader = ImageReader(args.datadir, args.datalist_path, input_size, args.random_scale,
                                 args.random_mirror, args.ignore_label, IMG_MEAN, coord)
            image_batch, label_batch = reader.dequeue(args.batch_size)
            # for validation
            reader_val = ImageReader(args.datadir, args.datalist_path_val, input_size, args.random_scale,
                                     args.random_mirror, args.ignore_label, IMG_MEAN, coord)
            image_batch_val, label_batch_val = reader_val.dequeue(args.batch_size)

    net = ICNet_BN({'data': image_batch}, is_training=True,
                   num_classes=args.num_classes, filter_scale=args.filter_scale)
    with tf.variable_scope("val"):
        net_val = ICNet_BN({'data': image_batch_val}, is_training=True,
                           num_classes=args.num_classes, filter_scale=args.filter_scale)

    sub4_out = net.layers['sub4_out']
    sub24_out = net.layers['sub24_out']
    sub124_out = net.layers['conv6_cls']

    # early stop variables
    last_val_loss_tf = tf.Variable(10000.0, name="last_loss")
    steps_total_tf = tf.Variable(0, name="steps_total")
    val_increased_t_tf = tf.Variable(0, name="loss_increased_t")

    if args.not_restore_last:
        restore_var = [
            v for v in tf.global_variables()
            if 'conv6_cls' not in v.name and 'val' not in v.name
            and 'sub4_out' not in v.name and 'sub24_out' not in v.name
            and 'sub124_out' not in v.name
        ]
    else:
        # To load the last layer as well, line 78 in network.py has to be removed
        # and ignore_missing set to False;
        # see https://github.com/hellochick/ICNet-tensorflow/issues/50 BCJuan
        # Don't restore val vars.
        restore_var = [v for v in tf.trainable_variables() if 'val' not in v.name]
        # tf.global_variables()

    # don't train val variables
    all_trainable = [
        v for v in tf.trainable_variables()
        if (('beta' not in v.name and 'gamma' not in v.name) or args.train_beta_gamma)
        and 'val' not in v.name
    ]
    # all_trainable = [v for v in tf.trainable_variables() if
    #                  ('beta' not in v.name and 'gamma' not in v.name) or args.train_beta_gamma]
    # print([v for v in tf.global_variables() if v.name in ["last_val_loss", "steps_total", "val_increased_t"]])
    # restore_var.extend([v for v in tf.global_variables() if v.name in ["last_val_loss", "steps_total", "val_increased_t"]])
    # assert not np.any(np.isnan(sub4_out))

    loss_sub4 = create_loss(sub4_out, label_batch, args.num_classes, args.ignore_label)
    loss_sub24 = create_loss(sub24_out, label_batch, args.num_classes, args.ignore_label)
    loss_sub124 = create_loss(sub124_out, label_batch, args.num_classes, args.ignore_label)
    # l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'weights' in v.name]
    l2_losses = [
        args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if ('weights' in v.name and 'val' not in v.name)
    ]

    reduced_loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124 + tf.add_n(l2_losses)

    # ---------------------- Loss calculation for validation ----------------------
    sub4_out_val = net_val.layers['sub4_out']
    sub24_out_val = net_val.layers['sub24_out']
    sub124_out_val = net_val.layers['conv6_cls']

    loss_sub4_val = create_loss(sub4_out_val, label_batch_val, args.num_classes, args.ignore_label)
    loss_sub24_val = create_loss(sub24_out_val, label_batch_val, args.num_classes, args.ignore_label)
    loss_sub124_val = create_loss(sub124_out_val, label_batch_val, args.num_classes, args.ignore_label)
    l2_losses_val = [
        args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
        if ('weights' in v.name and 'val' in v.name)
    ]

    reduced_loss_val = LAMBDA1 * loss_sub4_val + LAMBDA2 * loss_sub24_val + \
        LAMBDA3 * loss_sub124_val + tf.add_n(l2_losses_val)
    # ---------------------- End loss calculation for validation ------------------

    # Using Poly learning rate policy
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - step_ph / args.num_steps), args.power))

    # Gets moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS
    if args.update_mean_var == False:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        grads = tf.gradients(reduced_loss, all_trainable)
        train_op = opt_conv.apply_gradients(zip(grads, all_trainable))

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # start time
    glob_start_time = time.time()

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

    if '.npy' not in args.restore_from:
        ckpt = tf.train.get_checkpoint_state(args.restore_from)
    else:
        ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)

    if ckpt and ckpt.model_checkpoint_path:
        vars_to_restore = get_tensors_in_checkpoint_file(file_name=ckpt.model_checkpoint_path)
        # print(vars_to_restore)
        # print([v.name for v in restore_var])
        # thanks to https://stackoverflow.com/a/50216949/8862202
        # v.name[:-2] to transform 'conv1_1_3x3_s2/weights:0' to 'conv1_1_3x3_s2/weights'
        vars_to_restore = [v for v in restore_var
                           if 'val' not in v.name and v.name[:-2] in vars_to_restore]
        # print(vars_to_restore)
        # loader = tf.train.Saver(var_list=restore_var)
        loader = tf.train.Saver(var_list=vars_to_restore)
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('Restore from pre-trained model...')
        net.load(args.restore_from, sess)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    if args.reset_patience:
        z = tf.assign(val_increased_t_tf, 0)
        sess.run(z)

    print(sess.run(last_val_loss_tf))
    print(sess.run(steps_total_tf))
    print(sess.run(val_increased_t_tf))

    if not args.cross_val:
        val_epoch_len = len(reader_val.image_list)
        val_num_steps = val_epoch_len // args.batch_size

        # Iterate over training steps.
        last_val_loss = sess.run(last_val_loss_tf)
        val_increased_t = sess.run(val_increased_t_tf)
        best_model_step = 0
        total_steps = sess.run(steps_total_tf)
        for step in range(total_steps, args.num_steps + total_steps):
            start_time = time.time()
            feed_dict = {step_ph: step}

            if step % args.save_pred_every == 0:
                # validating
                if args.validate:
                    print("validating: ")
                    print_assign_vars(sess)
                    print("Assigned vars for validation.")
                    loss_sum = 0
                    for val_step in trange(val_num_steps, desc='validation', leave=True):
                        loss_value_v, loss1_v, loss2_v, loss3_v = sess.run(
                            [reduced_loss_val, loss_sub4_val, loss_sub24_val, loss_sub124_val],
                            feed_dict=feed_dict)
                        loss_sum = loss_sum + loss_value_v
                    loss_avg = loss_sum / val_num_steps

                    if loss_avg > last_val_loss:
                        val_increased_t = val_increased_t + 1
                        if val_increased_t >= args.patience:
                            print("Terminated Training, Best Model (at step %d) saved 4 validations ago"
                                  % best_model_step)
                            f = open("./FINISHED_ICNET", "w+")
                            f.close()
                            break
                    else:
                        val_increased_t = 0
                        best_model_step = step

                    print('VALIDATION COMPLETE step {:d}\tVal_Loss Increased {:d}/{:d} times\t'
                          'total loss = {:.3f} last loss = {:.3f}'.format(
                              step, val_increased_t, args.patience, loss_avg, last_val_loss))

                    last_val_loss = loss_avg

                    steps_assign = tf.assign(steps_total_tf, step)
                    last_val_assign = tf.assign(last_val_loss_tf, last_val_loss)
                    increased_assign = tf.assign(val_increased_t_tf, val_increased_t)
                    print("loss avg " + str(loss_avg))
                    print(sess.run(steps_assign))
                    print(sess.run(last_val_assign))
                    print(sess.run(increased_assign))

                # Saving
                loss_value, loss1, loss2, loss3, _ = sess.run(
                    [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                    feed_dict=feed_dict)
                save(saver, sess, args.snapshot_dir, step)

                # check if max run time is already over
                elapsed = time.time() - glob_start_time
                if (elapsed + 300) > max_time_seconds:
                    print("Training stopped: max run time elapsed")
                    os.remove("./RUNNING_ICNET")
                    break
            else:
                loss_value, loss1, loss2, loss3, _ = sess.run(
                    [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                    feed_dict=feed_dict)
            duration = time.time() - start_time
            print('step {:d} \t total loss = {:.3f}, sub4 = {:.3f}, sub24 = {:.3f}, sub124 = {:.3f} ({:.3f} sec/step)'
                  .format(step, loss_value, loss1, loss2, loss3, duration))

        train_duration = time.time() - glob_start_time
        print('Total training time: ' + str(train_duration))
    else:
        # Training with cross validation
        print("Training-Mode CROSS VALIDATION")
        val_epoch_len = len(reader_val.image_list)
        val_num_steps = val_epoch_len // args.batch_size
        print("Val epoch length %d, Num steps %d" % (val_epoch_len, val_num_steps))

        last_val_loss = math.inf
        val_not_imp_t = 0
        # train
        for step in range(1000000):
            feed_dict = {step_ph: step}
            train_start = time.time()
            loss_value, loss1, loss2, loss3, _ = sess.run(
                [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                feed_dict=feed_dict)
            duration_t = time.time() - train_start
            if args.print_steps:
                print('step {:d} \t total loss = {:.3f}, sub4 = {:.3f}, sub24 = {:.3f}, sub124 = {:.3f} ({:.3f} sec/step)'
                      .format(step, loss_value, loss1, loss2, loss3, duration_t))

            if step % args.save_pred_every == 0:
                # Save the previously trained model, then validate.
                save(saver, sess, args.snapshot_dir, step)
                # Validate
                print("validating: ")
                start_time = time.time()
                print_assign_vars(sess)
                print("Assigned vars for validation.")
                loss_sum = 0
                for val_step in trange(val_num_steps, desc='validation', leave=True):
                    loss_value_v, loss1_v, loss2_v, loss3_v = sess.run(
                        [reduced_loss_val, loss_sub4_val, loss_sub24_val, loss_sub124_val],
                        feed_dict=feed_dict)
                    loss_sum = loss_sum + loss_value_v
                duration = time.time() - start_time
                loss_avg = loss_sum / val_num_steps
                print('VALIDATION COMPLETE step {:d} \t total loss = {:.3f} \t duration = {:.3f}'
                      .format(step, loss_avg, duration))

                if loss_avg >= last_val_loss:
                    val_not_imp_t = val_not_imp_t + 1
                    if val_not_imp_t >= 4:
                        print("Terminated Training, Best Model saved 5 validations before")
                        f = open("./FINISHED_ICNET", "w+")
                        f.close()
                        break
                else:
                    val_not_imp_t = 0

                last_val_loss = loss_avg

                # check if max run time is already over
                elapsed = time.time() - glob_start_time
                if (elapsed + 300) > max_time_seconds:
                    print("Training stopped: max run time elapsed")
                    os.remove("./RUNNING_ICNET")
                    break

    coord.request_stop()
    coord.join(threads)
def _tf_common_init(self):
    gpu_count = len(self.device_ids)
    src_size = self.config['input_size']
    input_size_wh = (src_size['width'], src_size['height'])
    init_lr = self.config['lr']
    power = self.config['lr_decreasing']['power']
    momentum = self.config['momentum']
    weight_decay = self.config['weight_decay']
    num_classes = len(self.out_classes)
    train_beta_gamma = self.config['train_beta_gamma']
    update_mean_var = self.config['update_mean_var']

    with tf.device('/cpu:0'):
        self.coord = tf.train.Coordinator()

        splitted_images = {}
        splitted_labels = {}
        with tf.name_scope("create_inputs"):
            for name, need_shuffle in [
                ('train', True),
                ('val', False),
            ]:
                reader = ImageReader(
                    ia_descrs=self.samples_dct[name],
                    input_size_wh=input_size_wh,
                    random_scale=False,
                    random_mirror=False,
                    img_mean=IMG_MEAN,
                    coord=self.coord,
                    in_pr_meta=self.helper.in_project_meta,
                    class_to_idx=self.class_title_to_idx,
                    shuffle=need_shuffle
                )
                batch_sz = self.config['batch_size'][name]
                img_batch, lbl_batch = reader.dequeue(batch_sz * gpu_count)
                split_images = tf.split(img_batch, gpu_count, 0)
                split_labels = tf.split(lbl_batch, gpu_count, 0)
                splitted_images[name] = split_images
                splitted_labels[name] = split_labels

        global_step = tf.get_variable(
            'global_step', [],
            initializer=tf.constant_initializer(0), trainable=False, dtype=tf.int32)

        self.tf_label = tf.placeholder(dtype=tf.int32)  # , shape=[None])
        self.tf_prediction = tf.placeholder(dtype=tf.int32)  # , shape=[None])
        self.tf_metric, self.tf_metric_update = tf.metrics.accuracy(
            self.tf_label, self.tf_prediction, name="use_metric_acc"
        )
        running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="use_metric_acc")
        self.running_vars_initializer = tf.variables_initializer(var_list=running_vars)

        base_lr = tf.constant(init_lr)
        self.step_ph = tf.placeholder(dtype=tf.float32, shape=())
        learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - self.step_ph / self.total_train_iters), power))

        opt_conv = tf.train.MomentumOptimizer(learning_rate, momentum)
        opt_fc_w = tf.train.MomentumOptimizer(learning_rate * 10.0, momentum)
        opt_fc_b = tf.train.MomentumOptimizer(learning_rate * 20.0, momentum)

        all_grads_conv = []
        all_grads_fc_w = []
        all_grads_fc_b = []
        losses = []
        with tf.variable_scope(tf.get_variable_scope()):
            for curr_dev_id in self.device_ids:
                with tf.device('/gpu:{}'.format(curr_dev_id)):
                    with tf.name_scope('clone_{}'.format(curr_dev_id)) as scope:
                        spl_img = splitted_images['train'][curr_dev_id]
                        spl_lbl = splitted_labels['train'][curr_dev_id]
                        net = get_model(spl_img, num_classes)
                        prediction, gt, self.v1, self.v2 = forward(net, spl_lbl, num_classes)
                        # print('shapes', tf.shape(prediction), tf.shape(gt), tf.shape(split_labels[i]))
                        reduced_loss = get_loss(prediction, gt, weight_decay)
                        losses.append(reduced_loss)
                        tf.get_variable_scope().reuse_variables()
                        grads_conv, grads_fc_w, grads_fc_b = get_grads(
                            reduced_loss, train_beta_gamma, update_mean_var)
                        all_grads_conv.append(grads_conv)
                        all_grads_fc_w.append(grads_fc_w)
                        all_grads_fc_b.append(grads_fc_b)

        self.total_loss = tf.stack(values=losses)
        self.total_loss = tf.reduce_mean(self.total_loss)

        mean_grads_conv = average_gradients(all_grads_conv)
        mean_grads_fc_w = average_gradients(all_grads_fc_w)
        mean_grads_fc_b = average_gradients(all_grads_fc_b)

        conv_trainable, fc_w_trainable, fc_b_trainable = get_trainable_vars(train_beta_gamma)

        # Apply the gradients to adjust the shared variables.
        apply_gradient_conv_op = opt_conv.apply_gradients(
            zip(mean_grads_conv, conv_trainable), global_step=global_step)
        apply_gradient_fc_w_op = opt_fc_w.apply_gradients(
            zip(mean_grads_fc_w, fc_w_trainable), global_step=global_step)
        apply_gradient_fc_b_op = opt_fc_b.apply_gradients(
            zip(mean_grads_fc_b, fc_b_trainable), global_step=global_step)

        # Group all updates into a single train op.
        self.train_op = tf.group(apply_gradient_conv_op, apply_gradient_fc_w_op, apply_gradient_fc_b_op)

        self.total_val_loss, self.v1_val, self.v2_val = get_val_loss(
            splitted_images['val'], splitted_labels['val'], num_classes, weight_decay, self.device_ids
        )

        # Set up tf session and initialize variables.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.log_device_placement = False
        self.sess = tf.Session(config=config)
        init = tf.global_variables_initializer()
        self.sess.run(init)

        # Saver for storing checkpoints of the model.
        self.saver = tf.train.Saver(var_list=tf.global_variables(), save_relative_paths=True)
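# average_gradients above comes from the project's utilities and is not shown.
# A common implementation (after the TensorFlow CIFAR-10 multi-GPU tutorial) is
# sketched below, assuming tower_grads is a list of per-tower gradient lists
# that are all ordered over the same variables, as get_grads implies:
import tensorflow as tf

def average_gradients(tower_grads):
    """Average a list of per-tower gradient lists, element-wise."""
    average_grads = []
    for grads in zip(*tower_grads):
        # grads collects one tensor per tower for the same variable.
        grads = [tf.expand_dims(g, 0) for g in grads]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        average_grads.append(grad)
    return average_grads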
def main():
    # lr_decay = 0.5
    # decay_every = 100
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    tf.set_random_seed(args.random_seed)

    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(
            args.data_list,
            input_size,
            args.random_scale,
            args.random_mirror,
            args.ignore_label,
            IMG_MEAN,
            coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # config.allow_soft_placement = True
    # config.intra_op_parallelism_threads = 1
    sess = tf.Session(config=config)

    net = unext(image_batch, is_train=True, reuse=False, n_out=NUM_CLASSES)

    # Predictions: ignoring all predictions with labels greater or equal than n_classes
    raw_output = net.outputs
    raw_prediction = tf.reshape(raw_output, [-1, args.num_classes])
    label_proc = prepare_label(label_batch, tf.stack(raw_output.get_shape()[1:3]),
                               num_classes=args.num_classes, one_hot=False)  # [batch_size, h, w]
    raw_gt = tf.reshape(label_proc, [-1, ])
    indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, args.num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), dtype=tf.int32)
    prediction = tf.gather(raw_prediction, indices)

    main_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)

    t_vars = tf.trainable_variables()
    l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in t_vars if 'kernel' in v.name]
    # reduced_loss = 0.5 * tf.reduce_mean(main_loss) + generalised_dice_loss(prediction, gt) + tf.add_n(l2_losses)
    reduced_loss = tf.reduce_mean(main_loss) + tf.add_n(l2_losses)

    # Processed predictions: for visualisation.
    raw_output_up = tf.image.resize_bilinear(raw_output, tf.shape(image_batch)[1:3, ])
    raw_output_up = tf.argmax(raw_output_up, dimension=3)
    pred = tf.expand_dims(raw_output_up, dim=3)

    # Image summary.
    images_summary = tf.py_func(inv_preprocess, [image_batch, args.save_num_images, IMG_MEAN], tf.uint8)
    labels_summary = tf.py_func(decode_labels, [label_batch, args.save_num_images, args.num_classes], tf.uint8)
    preds_summary = tf.py_func(decode_labels, [pred, args.save_num_images, args.num_classes], tf.uint8)

    total_summary = tf.summary.image(
        'images',
        tf.concat(axis=2, values=[images_summary, labels_summary, preds_summary]),
        max_outputs=args.save_num_images)  # Concatenate row-wise.
    loss_summary = tf.summary.scalar('TotalLoss', reduced_loss)
    summary_writer = tf.summary.FileWriter(args.snapshot_dir, graph=tf.get_default_graph())

    # Learning rate schedule: exponential decay (args.power is used as the decay rate).
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.train.exponential_decay(base_lr, step_ph, args.num_steps, args.power)
    lr_summary = tf.summary.scalar('LearningRate', learning_rate)

    # train_op = tf.train.MomentumOptimizer(learning_rate, args.momentum).minimize(reduced_loss, var_list=t_vars)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(reduced_loss, var_list=t_vars)

    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

    ckpt = tf.train.get_checkpoint_state(SNAPSHOT_DIR)
    if ckpt and ckpt.model_checkpoint_path:
        # restore_vars = list([t for t in tf.global_variables() if not 'uconv1' in t.name])
        loader = tf.train.Saver(var_list=tf.global_variables())
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint file found.')
        load_step = 0

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    save_summary_every = 10
    for step in range(args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}

        if not step % args.save_pred_every == 0:
            loss_value, _, l_summary, lr_summ = sess.run(
                [reduced_loss, train_op, loss_summary, lr_summary], feed_dict=feed_dict)
            duration = time.time() - start_time
        elif step % args.save_pred_every == 0:
            loss_value, _, summary, l_summary, lr_summ = sess.run(
                [reduced_loss, train_op, total_summary, loss_summary, lr_summary], feed_dict=feed_dict)
            duration = time.time() - start_time
            save(saver, sess, args.snapshot_dir, step)
            summary_writer.add_summary(summary, step)

        if step % save_summary_every == 0:
            summary_writer.add_summary(l_summary, step)
            summary_writer.add_summary(lr_summ, step)

        print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(step, loss_value, duration))

    coord.request_stop()
    coord.join(threads)
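# For reference, tf.train.exponential_decay computes
#   lr(step) = base_lr * decay_rate ** (step / decay_steps),
# so the call above treats args.num_steps as decay_steps and args.power as the
# decay rate. A NumPy sketch with illustrative numbers (not the script's flags):
def exp_decay_lr(step, base_lr=1e-4, decay_steps=30000, decay_rate=0.9):
    return base_lr * decay_rate ** (step / float(decay_steps))

print(exp_decay_lr(0))      # 1e-4 at the start
print(exp_decay_lr(30000))  # 0.9e-4 after one full decay period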
def main():
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(
            args.data_list,
            input_size,
            args.random_scale,
            args.random_mirror,
            args.ignore_label,
            IMG_MEAN,
            coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    net = ICNet_BN({'data': image_batch}, is_training=True, num_classes=args.num_classes)

    sub4_out = net.layers['sub4_out']
    sub24_out = net.layers['sub24_out']
    sub124_out = net.layers['conv6_cls']

    fc_list = ['conv6_cls']

    all_trainable = [v for v in tf.trainable_variables() if
                     ('beta' not in v.name and 'gamma' not in v.name) or args.train_beta_gamma]
    restore_var = [v for v in tf.global_variables()
                   if not (len([f for f in fc_list if f in v.name])) or not args.not_restore_last]

    for v in restore_var:
        print(v.name)

    loss_sub4 = create_loss(sub4_out, label_batch, args.num_classes, args.ignore_label, args.use_class_weights)
    loss_sub24 = create_loss(sub24_out, label_batch, args.num_classes, args.ignore_label, args.use_class_weights)
    loss_sub124 = create_loss(sub124_out, label_batch, args.num_classes, args.ignore_label, args.use_class_weights)
    l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'weights' in v.name]

    loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124
    reduced_loss = loss + tf.add_n(l2_losses)

    ##############################
    # visualization and summary
    ##############################

    # Processed predictions: for visualisation.
    # Sub 4
    raw_output_up4 = tf.image.resize_bilinear(sub4_out, tf.shape(image_batch)[1:3, ])
    raw_output_up4 = tf.argmax(raw_output_up4, dimension=3)
    pred4 = tf.expand_dims(raw_output_up4, dim=3)
    # Sub 24
    raw_output_up24 = tf.image.resize_bilinear(sub24_out, tf.shape(image_batch)[1:3, ])
    raw_output_up24 = tf.argmax(raw_output_up24, dimension=3)
    pred24 = tf.expand_dims(raw_output_up24, dim=3)
    # Sub 124
    raw_output_up124 = tf.image.resize_bilinear(sub124_out, tf.shape(image_batch)[1:3, ])
    raw_output_up124 = tf.argmax(raw_output_up124, dimension=3)
    pred124 = tf.expand_dims(raw_output_up124, dim=3)

    images_summary = tf.py_func(inv_preprocess, [image_batch, SAVE_NUM_IMAGES, IMG_MEAN], tf.uint8)
    labels_summary = tf.py_func(decode_labels, [label_batch, SAVE_NUM_IMAGES, args.num_classes], tf.uint8)
    preds_summary4 = tf.py_func(decode_labels, [pred4, SAVE_NUM_IMAGES, args.num_classes], tf.uint8)
    preds_summary24 = tf.py_func(decode_labels, [pred24, SAVE_NUM_IMAGES, args.num_classes], tf.uint8)
    preds_summary124 = tf.py_func(decode_labels, [pred124, SAVE_NUM_IMAGES, args.num_classes], tf.uint8)

    total_images_summary = tf.summary.image(
        'images',
        tf.concat(axis=2, values=[images_summary, labels_summary, preds_summary124]),
        max_outputs=SAVE_NUM_IMAGES)  # Concatenate row-wise.
    total_summary = [total_images_summary]

    loss_summary = tf.summary.scalar('Total_loss', reduced_loss)
    total_summary.append(loss_summary)

    summary_writer = tf.summary.FileWriter(args.snapshot_dir, graph=tf.get_default_graph())
    ##############################
    ##############################

    # Using Poly learning rate policy, unless a fixed schedule is given.
    if LR_SHEDULE == {}:
        base_lr = tf.constant(args.learning_rate)
        step_ph = tf.placeholder(dtype=tf.float32, shape=())
        learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - step_ph / args.num_steps), args.power))
    else:
        step_ph = tf.placeholder(dtype=tf.float32, shape=())
        learning_rate = tf.Variable(LR_SHEDULE.popitem()[1], dtype=tf.float32)

    lr_summary = tf.summary.scalar('Learning_rate', learning_rate)
    total_summary.append(lr_summary)

    # Gets moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS
    if args.update_mean_var == False:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        grads = tf.gradients(reduced_loss, all_trainable)
        train_op = opt_conv.apply_gradients(zip(grads, all_trainable))

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

    ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.train.Saver(var_list=restore_var)
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('Restore from pre-trained model...')
        net.load(args.restore_from, sess, ignore_layers=fc_list)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    summ_op = tf.summary.merge(total_summary)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()

        if LR_SHEDULE != {}:
            if step == list(LR_SHEDULE.keys())[0]:
                # popitem() returns a (step, lr) pair; assign the lr value and
                # run the op, otherwise the assignment never executes.
                sess.run(tf.assign(learning_rate, LR_SHEDULE.popitem()[1]))

        feed_dict = {step_ph: step}

        if step % args.save_pred_every == 0:
            loss_value, loss1, loss2, loss3, _, summary = \
                sess.run([reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op, summ_op],
                         feed_dict=feed_dict)
            save(saver, sess, args.snapshot_dir, step)
            summary_writer.add_summary(summary, step)
        else:
            loss_value, loss1, loss2, loss3, _ = sess.run(
                [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op], feed_dict=feed_dict)
        duration = time.time() - start_time
        # print('shape', sess.run(tf.shape(sub124_out)))
        # quit()
        print('step {:d} \t total loss = {:.3f}, sub4 = {:.3f}, sub24 = {:.3f}, sub124 = {:.3f} ({:.3f} sec/step)'
              .format(step, loss_value, loss1, loss2, loss3, duration))

    coord.request_stop()
    coord.join(threads)
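# LR_SHEDULE is a module-level global that is not shown in this snippet. The
# popitem()/keys() usage above implies an ordered step -> learning-rate mapping,
# where popitem() removes the last entry first (that entry seeds the initial
# tf.Variable). An illustrative shape for it, assuming that convention:
from collections import OrderedDict

LR_SHEDULE = OrderedDict([(30000, 1e-4),   # switch to 1e-4 at step 30000
                          (0, 1e-3)])      # popped first: initial learning rate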
def main():
    # Create model and start training
    args = get_arguments()

    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    tf.set_random_seed(args.random_seed)

    # Create queue coordinator
    coord = tf.train.Coordinator()

    # Load Image Reader
    with tf.name_scope('create_inputs'):
        reader = ImageReader(
            args.index_loc,
            args.data_dir,
            args.mask_dir,
            input_size,
            args.random_scale,
            args.random_mirror,
            args.ignore_label,
            IMG_MEAN,
            coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    mode = tf.contrib.learn.ModeKeys.TRAIN
    net = DeepLabResNetModel(image_batch, mode, args.num_classes, args.atrous_blocks)

    raw_output = net.output

    # Trainable Variables
    restore_vars = [v for v in tf.global_variables() if 'fc' not in v.name or not args.not_restore_last]
    all_trainable = [v for v in tf.trainable_variables() if 'beta' not in v.name and 'gamma' not in v.name]
    fc_trainable = [v for v in all_trainable if 'fc' in v.name]
    conv_trainable = [v for v in all_trainable if 'fc' not in v.name]
    fc_w_trainable = [v for v in fc_trainable if 'weights' in v.name]
    fc_b_trainable = [v for v in fc_trainable if 'biases' in v.name]

    # Predictions: ignoring all predictions with labels greater or equal than n_classes
    raw_prediction = tf.reshape(raw_output, [-1, args.num_classes])
    label_proc = prepare_labels(label_batch, tf.stack(raw_output.get_shape()[1:3]),
                                num_classes=args.num_classes, one_hot=False)
    raw_gt = tf.reshape(label_proc, [-1, ])
    indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, args.num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    prediction = tf.gather(raw_prediction, indices)

    # Pixel-wise Softmax Loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)
    l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'weights' in v.name]
    reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)
    variable_summaries(reduced_loss, name='loss')

    # Processed predictions: for visualization
    raw_output_up = tf.image.resize_bilinear(raw_output, tf.shape(image_batch)[1:3, ])
    raw_output_up = tf.argmax(raw_output_up, dimension=3)
    pred = tf.expand_dims(raw_output_up, dim=3)

    # Define loss and optimization parameters
    base_lr = tf.constant(args.learning_rate, tf.float64)
    global_step = tf.Variable(0, trainable=False, name='global_step')
    increment_step = tf.assign(global_step, global_step + 1)
    learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - global_step / args.num_steps), args.power))
    learning_rate = tf.maximum(learning_rate, 8e-7)

    opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
    opt_fc_w = tf.train.MomentumOptimizer(learning_rate * 5.0, args.momentum)
    opt_fc_b = tf.train.MomentumOptimizer(learning_rate * 10.0, args.momentum)

    grads = tf.gradients(reduced_loss, conv_trainable + fc_w_trainable + fc_b_trainable)
    grads_conv = grads[:len(conv_trainable)]
    grads_fc_w = grads[len(conv_trainable):(len(conv_trainable) + len(fc_w_trainable))]
    grads_fc_b = grads[(len(conv_trainable) + len(fc_w_trainable)):]

    train_op_conv = opt_conv.apply_gradients(zip(grads_conv, conv_trainable))
    train_op_fc_w = opt_fc_w.apply_gradients(zip(grads_fc_w, fc_w_trainable))
    train_op_fc_b = opt_fc_b.apply_gradients(zip(grads_fc_b, fc_b_trainable))
    train_op = tf.group(increment_step, train_op_conv, train_op_fc_w, train_op_fc_b)

    # initial_learning_rate = 1e-2
    # learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step, 300, 0.96)
    # adam = tf.train.AdamOptimizer(learning_rate).minimize(reduced_loss, global_step=global_step)

    # Image Summary
    model_dir = args.snapshot_dir + args.model_name
    images_summary = tf.py_func(inv_preprocess, [image_batch, args.save_num_images, IMG_MEAN], tf.uint8)
    preds_summary = tf.py_func(decode_labels, [pred, args.save_num_images, args.num_classes], tf.uint8)
    labels_summary = tf.py_func(decode_labels, [label_batch, args.save_num_images, args.num_classes], tf.uint8)

    image_summaries = [images_summary, preds_summary, labels_summary]
    image_summary = tf.summary.image('images', tf.concat(axis=2, values=image_summaries),
                                     max_outputs=args.save_num_images)

    # Variable Summary
    variable_summaries(fc_w_trainable, 'fc_w')
    variable_summaries(fc_b_trainable, 'fc_b')
    variable_summaries(learning_rate, 'learning_rate')
    # variable_summaries(net.weights, 'aconv_w')
    # variable_summaries(net.biases, 'aconv_b')

    total_summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(model_dir, graph=tf.get_default_graph())

    # Set up session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        saver = tf.train.Saver(max_to_keep=3)
        if args.snapshot_dir is not None and args.model_name is not None and os.path.exists(model_dir):
            loader = tf.train.Saver()
            load_model(loader, sess, model_dir)

        threads = tf.train.start_queue_runners(coord=coord, sess=sess)

        # train_op = adam
        for step in range(args.num_steps):
            start_time = time.time()

            if step % args.save_pred_every == 0:
                feed = [reduced_loss, image_batch, label_batch, pred, total_summary, global_step, train_op]
                loss_value, images, labels, preds, summary, total_steps, _ = sess.run(feed)
                summary_writer.add_summary(summary, total_steps)
                save_model(saver, sess, model_dir, global_step)
            else:
                feed = [reduced_loss, global_step, train_op]
                loss_value, total_steps, _ = sess.run(feed)

            duration = time.time() - start_time
            results = 'global step: {:d}, step: {:d} \t loss = {:.3f}, ({:.3f} secs)'\
                .format(total_steps, step, loss_value, duration)

            if step % WRITE_EVERY == 0:
                with open(WRITE_FILE, 'a') as f:
                    f.write(results + '\n')
            print(results)

        coord.request_stop()
        coord.join(threads)
def evaluate_checkpoint(model_path, args):
    coord = tf.train.Coordinator()
    tf.reset_default_graph()

    reader = ImageReader(
        args.data_list,
        INPUT_SIZE,
        random_scale=False,
        random_mirror=False,
        ignore_label=IGNORE_LABEL,
        img_mean=IMG_MEAN,
        coord=coord,
        train=False)
    image_batch, label_batch = reader.dequeue(batch_size)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Create network.
    net = ICNet_BN({'data': image_batch}, num_classes=num_classes)

    # Which variables to load.
    restore_var = tf.global_variables()

    # Predictions.
    raw_output = net.layers['conv6_cls']
    raw_output_up = tf.image.resize_bilinear(raw_output, size=INPUT_SIZE, align_corners=True)
    raw_output_up = tf.argmax(raw_output_up, dimension=3)
    pred = tf.expand_dims(raw_output_up, dim=3)

    # mIoU
    pred_flatten = tf.reshape(pred, [-1, ])
    raw_gt = tf.reshape(label_batch, [-1, ])
    if args.ignore_zero:
        indices = tf.squeeze(tf.where(
            tf.logical_and(
                tf.less_equal(raw_gt, num_classes - 1),
                tf.greater(raw_gt, 0)
            )), 1)
    else:
        indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, num_classes - 1)), 1)
    # indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    pred = tf.gather(pred_flatten, indices)

    metric, op = tf.contrib.metrics.streaming_mean_iou(pred, gt, num_classes=num_classes)
    mIoU, update_op = metric, op

    # Summaries
    miou_op = tf.summary.scalar('mIOU', mIoU)

    start = time.time()
    logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    saver = tf.train.Saver(var_list=restore_var)
    load(saver, sess, model_path)

    for step in range(num_steps):
        preds, _ = sess.run([pred, update_op])
        if step % 500 == 0:
            print('Finish {0}/{1}'.format(step + 1, num_steps))
    iou, summ = sess.run([mIoU, miou_op])

    sess.close()
    coord.request_stop()
    # coord.join(threads)
    return summ, iou
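# A hedged usage sketch for evaluate_checkpoint: iterating over every snapshot
# in a training directory and keeping the best mIoU. evaluate_all and
# checkpoint_dir are illustrative names, not part of the original script:
import tensorflow as tf

def evaluate_all(checkpoint_dir, args):
    state = tf.train.get_checkpoint_state(checkpoint_dir)
    best_path, best_iou = None, -1.0
    for path in state.all_model_checkpoint_paths:
        summ, iou = evaluate_checkpoint(path, args)
        print('{}: mIoU = {:.4f}'.format(path, iou))
        if iou > best_iou:
            best_path, best_iou = path, iou
    return best_path, best_iou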
def main():
    """Create the model and start the training."""
    args = get_arguments()

    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)
    if args.center_crop_size is None:
        center_crop_size = None
    else:
        hc, wc = map(int, args.center_crop_size.split(','))
        center_crop_size = (hc, wc)

    with tf.name_scope("create_inputs"):
        reader = ImageReader(
            DATA_DIR,
            DATA_LIST_PATH,
            input_size,
            center_crop_size,
            args.random_scale,
            args.random_mirror,
            args.ignore_label,
            IMG_MEAN)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    net = ICNet_BN({'data': image_batch}, is_training=True,
                   num_classes=args.num_classes, filter_scale=args.filter_scale)

    sub4_recls, sub24_recls, sub124_recls = bn_common.extend_reclassifier(net)

    restore_var = tf.global_variables()
    all_trainable = [v for v in tf.trainable_variables() if
                     ('beta' not in v.name and 'gamma' not in v.name) or args.train_beta_gamma]

    loss_sub4 = create_loss(sub4_recls, label_batch, args)
    loss_sub24 = create_loss(sub24_recls, label_batch, args)
    loss_sub124 = create_loss(sub124_recls, label_batch, args)

    l2_losses = [args.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
                 if ('weights' in v.name) or ('kernel' in v.name)]

    reduced_loss = LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 + LAMBDA3 * loss_sub124 + tf.add_n(l2_losses)

    # print(tf.get_variable_scope().name)
    # print(','.join([v.__op.original_name_scope for v in l2_losses]))
    # print(','.join([v for v in tf.trainable_variables() if ('beta' in v.name or 'gamma' in v.name)]))
    # tf.summary.FileWriter('./summary', tf.get_default_graph())
    # exit(0)

    # Using Poly learning rate policy
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - step_ph / args.num_steps), args.power))

    # Gets moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS
    if args.update_mean_var == False:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        grads = tf.gradients(reduced_loss, all_trainable)
        train_op = opt_conv.apply_gradients(zip(grads, all_trainable))

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=99)

    ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.train.Saver(var_list=restore_var)
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('Restore from pre-trained model...')
        net.load(args.restore_from, sess)

    # Start queue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}

        if step % args.save_pred_every == 0:
            loss_value, loss1, loss2, loss3, _ = sess.run(
                [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op], feed_dict=feed_dict)
            save(saver, sess, args.snapshot_dir, step)
        else:
            loss_value, loss1, loss2, loss3, _ = sess.run(
                [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op], feed_dict=feed_dict)
        duration = time.time() - start_time
        print('step {:d} \t total loss = {:.3f}, sub4 = {:.3f}, sub24 = {:.3f}, sub124 = {:.3f} ({:.3f} sec/step)'
              .format(step, loss_value, loss1, loss2, loss3, duration))

    coord.request_stop()
    coord.join(threads)
    sess.close()
def run(self):
    tf.set_random_seed(self.random_seed)

    coord = tf.train.Coordinator()

    # Read the data.
    with tf.name_scope("create_inputs"):
        reader = ImageReader(self.data_dir, self.data_train_list, self.input_size,
                             self.random_scale, self.random_mirror, self.ignore_label,
                             self.img_mean, coord)
        image_batch, label_batch = reader.dequeue(self.batch_size)

    # The network.
    net = PSPNet({'data': image_batch}, is_training=True, num_classes=self.num_classes)
    raw_output = net.layers['conv6']

    # According to the prototxt in the Caffe implementation, the learning rate
    # must be multiplied by 10.0 in the pyramid module.
    fc_list = ['conv5_3_pool1_conv', 'conv5_3_pool2_conv', 'conv5_3_pool3_conv',
               'conv5_3_pool6_conv', 'conv6', 'conv5_4']

    # All variables.
    restore_var = [v for v in tf.global_variables()]
    # All trainable variables.
    all_trainable = [v for v in tf.trainable_variables()
                     if ('beta' not in v.name and 'gamma' not in v.name) or self.train_beta_gamma]
    # Trainable variables in the fc_list layers versus the convolutional backbone.
    fc_trainable = [v for v in all_trainable if v.name.split('/')[0] in fc_list]
    conv_trainable = [v for v in all_trainable if v.name.split('/')[0] not in fc_list]  # lr * 1.0
    fc_w_trainable = [v for v in fc_trainable if 'weights' in v.name]  # lr * 10.0
    fc_b_trainable = [v for v in fc_trainable if 'biases' in v.name]  # lr * 20.0
    # Sanity checks.
    assert (len(all_trainable) == len(fc_trainable) + len(conv_trainable))
    assert (len(fc_trainable) == len(fc_w_trainable) + len(fc_b_trainable))

    # Predictions: ignoring all predictions with labels greater or equal than n_classes
    raw_prediction = tf.reshape(raw_output, [-1, self.num_classes])
    label_process = prepare_label(label_batch, tf.stack(raw_output.get_shape()[1:3]),
                                  num_classes=self.num_classes, one_hot=False)  # [batch_size, h, w]
    raw_gt = tf.reshape(label_process, [-1, ])
    indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, self.num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    prediction = tf.gather(raw_prediction, indices)

    # Pixel-wise softmax loss.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)
    l2_losses = [self.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
                 if 'weights' in v.name]
    reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)

    # Using Poly learning rate policy
    base_lr = tf.constant(self.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow((1 - step_ph / self.num_steps), self.power))

    # Gets moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS
    update_ops = None if not self.update_mean_var else tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Optimize the variable groups with different learning rates:
    # compute the gradients once, then apply them per group.
    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, self.momentum)
        opt_fc_w = tf.train.MomentumOptimizer(learning_rate * 10.0, self.momentum)
        opt_fc_b = tf.train.MomentumOptimizer(learning_rate * 20.0, self.momentum)

        grads = tf.gradients(reduced_loss, conv_trainable + fc_w_trainable + fc_b_trainable)
        grads_conv = grads[:len(conv_trainable)]
        grads_fc_w = grads[len(conv_trainable):(len(conv_trainable) + len(fc_w_trainable))]
        grads_fc_b = grads[(len(conv_trainable) + len(fc_w_trainable)):]

        train_op_conv = opt_conv.apply_gradients(zip(grads_conv, conv_trainable))
        train_op_fc_w = opt_fc_w.apply_gradients(zip(grads_fc_w, fc_w_trainable))
        train_op_fc_b = opt_fc_b.apply_gradients(zip(grads_fc_b, fc_b_trainable))
        train_op = tf.group(train_op_conv, train_op_fc_w, train_op_fc_b)

    sess = tf.Session(config=self.config)
    sess.run(tf.global_variables_initializer())

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)

    # Load the model.
    ckpt = tf.train.get_checkpoint_state(self.log_dir)
    if ckpt and ckpt.model_checkpoint_path:
        tf.train.Saver(var_list=restore_var).restore(sess, ckpt.model_checkpoint_path)
        Tools.print_info("Restored model parameters from {}".format(ckpt.model_checkpoint_path))
    else:
        Tools.print_info('No checkpoint file found.')

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(self.num_steps):
        start_time = time.time()
        if step % self.save_pred_freq == 0:
            loss_value, _ = sess.run([reduced_loss, train_op], feed_dict={step_ph: step})
            saver.save(sess, self.checkpoint_path, global_step=step)
            Tools.print_info('The checkpoint has been created.')
        else:
            loss_value, _ = sess.run([reduced_loss, train_op], feed_dict={step_ph: step})
        duration = time.time() - start_time
        Tools.print_info('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(step, loss_value, duration))

    coord.request_stop()
    coord.join(threads)
def main():
    # Print all flags, sorted by name.
    for params, value in sorted(FLAGS.__flags.items()):
        print('{}: {}'.format(params, value))

    input_size = (FLAGS.train_image_size, FLAGS.train_image_size)

    tf.set_random_seed(1234)

    coord = tf.train.Coordinator()

    reader = ImageReader(FLAGS.data_dir, FLAGS.data_list, input_size, FLAGS.random_scale,
                         FLAGS.random_mirror, FLAGS.ignore_label, IMG_MEAN, coord)
    image_batch, label_batch = reader.dequeue(FLAGS.batch_size)

    raw_output = MobileNet(image_batch, isTraining=True, updateBeta=FLAGS.update_beta)

    psp_list = ['conv_ds_15a', 'conv_ds_15b', 'conv_ds_15c', 'conv_ds_15d', 'conv_ds_16', 'conv_ds_17']

    all_trainable = [v for v in tf.trainable_variables()]
    if FLAGS.update_beta == False:
        all_trainable = [v for v in all_trainable if 'beta' not in v.name]

    psp_trainable = [v for v in all_trainable
                     if v.name.split('/')[1] in psp_list and ('weights' in v.name or 'biases' in v.name)]
    conv_trainable = [v for v in all_trainable if v not in psp_trainable]  # lr * 1.0
    psp_w_trainable = [v for v in psp_trainable if 'weights' in v.name]  # lr * 10.0
    psp_b_trainable = [v for v in psp_trainable if 'biases' in v.name]  # lr * 20.0

    assert (len(all_trainable) == len(psp_trainable) + len(conv_trainable))
    assert (len(psp_trainable) == len(psp_w_trainable) + len(psp_b_trainable))

    # Predictions: ignoring all predictions with labels greater or equal than n_classes
    raw_prediction = tf.reshape(raw_output, [-1, FLAGS.num_classes])
    label_proc = prepare_label(label_batch, tf.stack(raw_output.get_shape()[1:3]),
                               num_classes=FLAGS.num_classes, one_hot=False)  # [batch_size, h, w]
    raw_gt = tf.reshape(label_proc, [-1, ])
    indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, FLAGS.num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    prediction = tf.gather(raw_prediction, indices)

    # Pixel-wise softmax loss.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)
    # Regularisation loss
    l2_losses = [FLAGS.weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
                 if 'weights' in v.name]
    reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)
    # TODO: auxiliary loss

    # Using Poly learning rate policy
    current_epoch = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.train.polynomial_decay(
        FLAGS.start_learning_rate,
        current_epoch,
        FLAGS.decay_steps,
        end_learning_rate=FLAGS.end_learning_rate,
        power=FLAGS.learning_rate_decay_power,
        name="poly_learning_rate")

    if FLAGS.update_mean_var == False:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        if FLAGS.optimizer == 'momentum':
            opt_conv = tf.train.MomentumOptimizer(learning_rate, FLAGS.momentum)
            opt_psp_w = tf.train.MomentumOptimizer(learning_rate * 10.0, FLAGS.momentum)
            opt_psp_b = tf.train.MomentumOptimizer(learning_rate * 20.0, FLAGS.momentum)
        elif FLAGS.optimizer == 'rmsprop':
            opt_conv = tf.train.RMSPropOptimizer(learning_rate, decay=FLAGS.rmsprop_decay,
                                                 momentum=FLAGS.rmsprop_momentum, epsilon=FLAGS.opt_epsilon)
            opt_psp_w = tf.train.RMSPropOptimizer(learning_rate * 10.0, decay=FLAGS.rmsprop_decay,
                                                  momentum=FLAGS.rmsprop_momentum, epsilon=FLAGS.opt_epsilon)
            opt_psp_b = tf.train.RMSPropOptimizer(learning_rate * 20.0, decay=FLAGS.rmsprop_decay,
                                                  momentum=FLAGS.rmsprop_momentum, epsilon=FLAGS.opt_epsilon)

        grads = tf.gradients(reduced_loss, conv_trainable + psp_w_trainable + psp_b_trainable)
        grads_conv = grads[:len(conv_trainable)]
        grads_psp_w = grads[len(conv_trainable):(len(conv_trainable) + len(psp_w_trainable))]
        grads_psp_b = grads[(len(conv_trainable) + len(psp_w_trainable)):]

        train_op_conv = opt_conv.apply_gradients(zip(grads_conv, conv_trainable))
        train_op_psp_w = opt_psp_w.apply_gradients(zip(grads_psp_w, psp_w_trainable))
        train_op_psp_b = opt_psp_b.apply_gradients(zip(grads_psp_b, psp_b_trainable))
        train_op = tf.group(train_op_conv, train_op_psp_w, train_op_psp_b)

    restore_var = tf.global_variables()

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=500)
    load(sess, FLAGS.pretrained_checkpoint, restore_var)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for epoch in range(FLAGS.start_epoch, FLAGS.start_epoch + FLAGS.num_epochs):
        total_loss = 0.0
        for step in range(1, FLAGS.num_steps + 1):
            start_time = time.time()
            feed_dict = {current_epoch: epoch}
            loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)
            duration = time.time() - start_time
            print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(step, loss_value, duration))
            # TODO: ignore NaN loss
            total_loss += loss_value
        save(saver, sess, FLAGS.log_dir, epoch)
        total_loss /= FLAGS.num_steps
        print('Epoch {:d} completed! Total Loss = {:.3f}'.format(epoch, total_loss))

    coord.request_stop()
    coord.join(threads)
def main():
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    tf.set_random_seed(args.random_seed)
    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(args.data_list, input_size, args.random_scale,
                             args.random_mirror, args.ignore_label, IMG_MEAN, coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    net = PSPNet50({'data': image_batch}, is_training=True, num_classes=args.num_classes)
    raw_output = net.layers['conv6']

    # According to the prototxt in the Caffe implementation, the learning rate
    # must be multiplied by 10.0 in the pyramid module.
    fc_list = ['conv5_3_pool1_conv', 'conv5_3_pool2_conv', 'conv5_3_pool3_conv',
               'conv5_3_pool6_conv', 'conv6', 'conv5_4']
    restore_var = [v for v in tf.global_variables()
                   if not any(f in v.name for f in fc_list) or not args.not_restore_last]
    all_trainable = [v for v in tf.trainable_variables()
                     if 'gamma' not in v.name and 'beta' not in v.name]
    fc_trainable = [v for v in all_trainable if v.name.split('/')[0] in fc_list]
    conv_trainable = [v for v in all_trainable if v.name.split('/')[0] not in fc_list]  # lr * 1.0
    fc_w_trainable = [v for v in fc_trainable if 'weights' in v.name]  # lr * 10.0
    fc_b_trainable = [v for v in fc_trainable if 'biases' in v.name]   # lr * 20.0
    assert len(all_trainable) == len(fc_trainable) + len(conv_trainable)
    assert len(fc_trainable) == len(fc_w_trainable) + len(fc_b_trainable)

    # Predictions: ignore all predictions with labels >= num_classes.
    raw_prediction = tf.reshape(raw_output, [-1, args.num_classes])
    label_proc = prepare_label(label_batch,
                               tf.stack(raw_output.get_shape()[1:3]),
                               num_classes=args.num_classes,
                               one_hot=False)  # [batch_size, h, w]
    raw_gt = tf.reshape(label_proc, [-1])
    indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, args.num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    prediction = tf.gather(raw_prediction, indices)

    # Pixel-wise softmax loss.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)

    # Make mistakes on class N more important for the network.
    if USE_CLASS_WEIGHTS:
        if len(CLASS_WEIGHTS) != NUM_CLASSES:
            print('Incorrect class weights; they will not be used.')
        else:
            mask = tf.zeros_like(loss)
            for i, w in enumerate(CLASS_WEIGHTS):
                mask = mask + tf.cast(tf.equal(gt, i), tf.float32) * tf.constant(w)
            loss = loss * mask

    l2_losses = [args.weight_decay * tf.nn.l2_loss(v)
                 for v in tf.trainable_variables() if 'weights' in v.name]
    reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)

    # Processed predictions: for visualisation.
    raw_output_up = tf.image.resize_bilinear(raw_output, tf.shape(image_batch)[1:3])
    raw_output_up = tf.argmax(raw_output_up, axis=3)
    pred = tf.expand_dims(raw_output_up, axis=3)

    # Image summary.
    images_summary = tf.py_func(inv_preprocess,
                                [image_batch, args.save_num_images, IMG_MEAN], tf.uint8)
    labels_summary = tf.py_func(decode_labels,
                                [label_batch, args.save_num_images, args.num_classes],
                                tf.uint8)
    preds_summary = tf.py_func(decode_labels,
                               [pred, args.save_num_images, args.num_classes], tf.uint8)
    total_summary = tf.summary.image(
        'images',
        tf.concat(axis=2, values=[images_summary, labels_summary, preds_summary]),
        max_outputs=args.save_num_images)  # Concatenate row-wise.

    summary_writer = tf.summary.FileWriter(args.snapshot_dir, graph=tf.get_default_graph())

    # Poly learning rate policy.
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow(1 - step_ph / args.num_steps, args.power))

    # Get moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS.
    if not args.update_mean_var:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        opt_fc_w = tf.train.MomentumOptimizer(learning_rate * 10.0, args.momentum)
        opt_fc_b = tf.train.MomentumOptimizer(learning_rate * 20.0, args.momentum)

        grads = tf.gradients(reduced_loss, conv_trainable + fc_w_trainable + fc_b_trainable)
        grads_conv = grads[:len(conv_trainable)]
        grads_fc_w = grads[len(conv_trainable):len(conv_trainable) + len(fc_w_trainable)]
        grads_fc_b = grads[len(conv_trainable) + len(fc_w_trainable):]

        train_op_conv = opt_conv.apply_gradients(zip(grads_conv, conv_trainable))
        train_op_fc_w = opt_fc_w.apply_gradients(zip(grads_fc_w, fc_w_trainable))
        train_op_fc_b = opt_fc_b.apply_gradients(zip(grads_fc_b, fc_b_trainable))
        train_op = tf.group(train_op_conv, train_op_fc_w, train_op_fc_b)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # config.allow_soft_placement = True
    # config.intra_op_parallelism_threads = 1
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)
    ckpt = tf.train.get_checkpoint_state(SNAPSHOT_DIR)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.train.Saver(var_list=restore_var)
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint file found.')
        load_step = 0

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}
        if step % args.save_pred_every == 0:
            loss_value, _, summary = sess.run([reduced_loss, train_op, total_summary],
                                              feed_dict=feed_dict)
            summary_writer.add_summary(summary, step)
            save(saver, sess, args.snapshot_dir, step)
        else:
            z, t, o, p, loss_value, _ = sess.run(
                [raw_gt, raw_output, gt, prediction, reduced_loss, train_op],
                feed_dict=feed_dict)
            print(z.shape, t.shape, o.shape, p.shape)  # debug: tensor shapes
        duration = time.time() - start_time
        print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
            step, loss_value, duration))

    coord.request_stop()
    coord.join(threads)
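

# A minimal sketch of the 1x/10x learning-rate split used above: tf.gradients
# returns gradients in the same order as the variable list passed in, so the
# flat list can be sliced by group length and each slice applied by its own
# optimizer. All variable names and values here are hypothetical.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 4])
w_backbone = tf.Variable(tf.random_normal([4, 8]))  # lr * 1.0
w_head = tf.Variable(tf.random_normal([8, 2]))      # lr * 10.0
loss = tf.reduce_mean(tf.square(tf.matmul(tf.matmul(x, w_backbone), w_head)))

backbone_vars, head_vars = [w_backbone], [w_head]
grads = tf.gradients(loss, backbone_vars + head_vars)
g_backbone = grads[:len(backbone_vars)]
g_head = grads[len(backbone_vars):]

lr = 1e-3
opt_backbone = tf.train.MomentumOptimizer(lr, 0.9)
opt_head = tf.train.MomentumOptimizer(lr * 10.0, 0.9)
train_op = tf.group(
    opt_backbone.apply_gradients(zip(g_backbone, backbone_vars)),
    opt_head.apply_gradients(zip(g_head, head_vars)))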
def evaluate_checkpoint(model_path, args):
    coord = tf.train.Coordinator()
    tf.reset_default_graph()

    reader = ImageReader(args.data_list, INPUT_SIZE,
                         random_scale=False, random_mirror=False,
                         ignore_label=IGNORE_LABEL, img_mean=IMG_MEAN,
                         coord=coord, train=False)
    image_batch, label_batch = reader.dequeue(args.batch_size)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Create network.
    # net = ICNet_BN({'data': image_batch}, is_training=False, num_classes=num_classes)
    net = unext(image_batch, is_train=False, n_out=NUM_CLASSES)

    # Predictions.
    # raw_output = net.layers['conv6']
    raw_output = net.outputs
    raw_output_up = tf.image.resize_bilinear(raw_output, size=INPUT_SIZE, align_corners=True)
    raw_output_up = tf.argmax(raw_output_up, axis=3)
    pred = tf.expand_dims(raw_output_up, axis=3)

    # mIoU
    pred_flatten = tf.reshape(pred, [-1])
    raw_gt = tf.reshape(label_batch, [-1])
    indices = tf.squeeze(tf.where(tf.not_equal(raw_gt, IGNORE_LABEL)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    pred = tf.gather(pred_flatten, indices)
    # tf.metrics expects (labels, predictions) in that order; the original
    # passed the prediction first.
    iou_metric, iou_op = tf.metrics.mean_iou(gt, pred, num_classes=num_classes)
    acc_metric, acc_op = tf.metrics.accuracy(gt, pred)

    # Summaries
    iou_summ_op = tf.summary.scalar('mIOU', iou_metric)
    acc_summ_op = tf.summary.scalar('Accuracy', acc_metric)

    start = time.time()
    logging.info('Starting evaluation at ' +
                 time.strftime('%Y-%m-%d-%H:%M:%S', time.gmtime()))

    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    saver = tf.train.Saver(var_list=tf.global_variables())
    load(saver, sess, model_path)

    for step in range(int(num_steps / batch_size)):
        preds, _, _ = sess.run([raw_output_up, iou_op, acc_op])
        if step % int(100 / batch_size) == 0:
            print('Finish {0}/{1}'.format(step + 1, int(num_steps / batch_size)))

    iou, iou_summ, acc, acc_summ = sess.run([iou_metric, iou_summ_op,
                                             acc_metric, acc_summ_op])

    sess.close()
    coord.request_stop()
    # coord.join(threads)

    return iou, iou_summ, acc, acc_summ
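

# A minimal sketch of the streaming-metric pattern used above: tf.metrics keep
# their running totals in *local* variables, so initialize those, run the
# update op once per batch, and read the final value afterwards. The data fed
# here is made up.
import numpy as np
import tensorflow as tf

labels_ph = tf.placeholder(tf.int32, [None])
preds_ph = tf.placeholder(tf.int32, [None])
miou, miou_update = tf.metrics.mean_iou(labels_ph, preds_ph, num_classes=3)

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())  # the metric accumulators live here
    for _ in range(5):  # one update per "batch"
        labels = np.random.randint(0, 3, size=64)
        preds = np.random.randint(0, 3, size=64)
        sess.run(miou_update, feed_dict={labels_ph: labels, preds_ph: preds})
    print('streaming mIoU:', sess.run(miou))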
def main(argv=None):
    input_size = (cfg.IMAGE_HEIGHT, cfg.IMAGE_WIDTH)

    # Create queue coordinator.
    coord = tf.train.Coordinator()

    # Load reader.
    print('Train ' + cfg.train_data_list)
    with tf.name_scope("create_inputs"):
        reader = ImageReader(cfg.train_data_dir, cfg.train_data_list, input_size,
                             cfg.random_scale, cfg.random_resize, cfg.random_mirror,
                             cfg.random_color, cfg.random_crop_pad, cfg.ignore_label,
                             cfg.IMG_MEAN, coord)
        image_batch, label_batch = reader.dequeue(cfg.batch_size)

    # Define network.
    pred_annotation, logits = inference_deeplabv3_plus_16(image_batch, is_training=True)  # Modified
    logits_loss = cross_entropy_loss(logits, label_batch)  # loss1 for ECP dataset
    # logits_loss = weighted_cross_entropy_loss(logits, label_batch)  # loss2 for RueMonge dataset
    # logits_loss = weighted_cross_entropy_loss_4class(logits, label_batch)

    # # PSPNet
    # pred_annotation, logits, logits_dsn = inference_pspnet(image_batch, is_training=True)
    # # logits_loss = cross_entropy_loss(logits, label_batch) + \
    # #     cross_entropy_loss(logits_dsn, label_batch)  # loss1 for ECP dataset
    # logits_loss = weighted_cross_entropy_loss(logits, label_batch) + \
    #     weighted_cross_entropy_loss(logits_dsn, label_batch)  # loss2 for RueMonge dataset

    ce_loss = logits_loss  # cross-entropy loss

    # Show accuracy for the validation or train dataset.
    if cfg.is_time_acc or cfg.is_epoch_acc:
        with tf.variable_scope('', reuse=True):
            val_image_batch = tf.placeholder(tf.float32,
                                             shape=[1, IMAGE_HEIGHT, IMAGE_WIDTH, 3],
                                             name="input_image")
            f = open(cfg.val_data_list, 'r')
            val_img_list = []
            val_label_list = []
            for line in f:
                try:
                    image_name, label = line.strip("\n").split(' ')
                except ValueError:  # Ad hoc for test.
                    image_name = label = line.strip("\n")
                val_img_list.append(cfg.val_data_dir + image_name)
                val_label_list.append(cfg.val_data_dir + label)
            _, val_logits = inference_deeplabv3_plus_16_init(val_image_batch,
                                                             is_training=False)  # Modified
            # _, val_logits, _ = inference_pspnet(val_image_batch, is_training=False)  # PSPNet
            val_logits_softmax = tf.nn.softmax(val_logits)

    # Weight decay on weight variables only. The original condition
    # `'weights' or 'w' in v.name or 'W' in v.name` was always true; the intent
    # (encode: W, facade: weights) is spelled out explicitly here.
    l2_loss = [weight_decay * tf.nn.l2_loss(v) for v in tf.trainable_variables()
               if 'weights' in v.name or 'w' in v.name or 'W' in v.name]
    l2_losses = tf.add_n(l2_loss)

    # Total loss.
    loss = ce_loss + l2_losses  # + stru_loss
    tf.summary.scalar("loss_ce", ce_loss)
    tf.summary.scalar("l2_losses", l2_losses)
    tf.summary.scalar("total_loss", loss)

    step_ph = tf.placeholder(dtype=tf.float32, shape=())

    # Poly learning rate policy.
    base_lr = tf.constant(cfg.learning_rate)
    learning_rate = tf.scalar_mul(base_lr, tf.pow(1 - step_ph / global_step, power))

    trainable_var = tf.trainable_variables()

    # Optimizer.
    if cfg.optimizer == 'Adam':
        optimizer = tf.train.AdamOptimizer(learning_rate)
        print('Optimizer: Adam')
    elif cfg.optimizer == 'Adam2':
        optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9, beta2=0.99)
    elif cfg.optimizer == 'SGD':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    elif cfg.optimizer == 'Momentum':
        optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)
        print('Optimizer: Momentum')
    elif cfg.optimizer == 'RMSProp':
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
    # grads = optimizer.compute_gradients(loss, var_list=trainable_var)
    # train_op = optimizer.apply_gradients(grads)

    # Optimizer definition - nothing different from any classical example.
    opt = optimizer

    # Retrieve all trainable variables defined in the graph.
    if cfg.freeze_bn:
        tvs = [v for v in tf.trainable_variables()
               if 'beta' not in v.name and 'gamma' not in v.name]
    else:
        tvs = [v for v in tf.trainable_variables()]

    # Create a list of variables with the same shapes as the trainable ones,
    # initialized with zeros, to accumulate gradients across mini-batches.
    accum_vars = [tf.Variable(tf.zeros_like(tv.initialized_value()), trainable=False)
                  for tv in tvs]
    zero_ops = [av.assign(tf.zeros_like(av)) for av in accum_vars]

    # Call the optimizer's compute_gradients to obtain the list of gradients.
    gvs = opt.compute_gradients(loss, tvs)

    # Add each gradient to its accumulator (works because accum_vars and gvs
    # are in the same order). Note that the gradients are summed, not averaged,
    # over the accumulation steps.
    accum_ops = [accum_vars[i].assign_add(gv[0]) for i, gv in enumerate(gvs)]

    # Define the training step (the part that actually updates the variables).
    train_step = opt.apply_gradients([(accum_vars[i], gv[1]) for i, gv in enumerate(gvs)])

    print("Setting up summary op...")
    summary_op = tf.summary.merge_all()

    # Set GPU usage.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 1.0
    sess = tf.Session(config=config)

    print("Setting up Saver...")
    saver = tf.train.Saver(max_to_keep=cfg.model_save_num)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Create two summary writers to show training loss and validation loss on
    # the same graph; this requires 'train' and 'test' folders inside cfg.logs_dir.
    if not os.path.exists(cfg.logs_dir):
        os.makedirs(cfg.logs_dir)
    train_writer = tf.summary.FileWriter(cfg.logs_dir + 'train', sess.graph)
    test_writer = tf.summary.FileWriter(cfg.logs_dir + 'test')

    if not os.path.exists(cfg.save_dir):
        os.makedirs(cfg.save_dir)

    count = 0
    files = os.path.join(cfg.save_dir + 'model.ckpt-*.index')
    sfile = glob.glob(files)
    if len(sfile) > 0:
        sess.run(tf.global_variables_initializer())
        sfile = glob.glob(files)
        steps = []
        for s in sfile:
            part = s.split('.')
            step = int(part[1].split('-')[1])
            steps.append(step)
        count = max(steps)
        model = cfg.save_dir + 'model.ckpt-' + str(count)
        print('\nRestoring weights from: ' + model)
        saver.restore(sess, model)
        print('End Restore')
    else:
        # Restore from pre-training on ImageNet.
        variables = tf.global_variables()
        sess.run(tf.variables_initializer(variables, name='init'))
        if os.path.exists(cfg.pre_trained_model) or \
                os.path.exists(cfg.pre_trained_model + '.index'):
            var_keep_dic = get_variables_in_checkpoint_file(cfg.pre_trained_model)
            # Get the variables to restore, ignoring the variables to fix.
            variables_to_restore = get_variables_to_restore(variables, var_keep_dic)
            if len(variables_to_restore) > 0:
                restorer = tf.train.Saver(variables_to_restore)
                restorer.restore(sess, cfg.pre_trained_model)
                print('Model pre-train loaded from ' + cfg.pre_trained_model)
            else:
                print('Model inited random.')
        else:
            print('Model inited random.')

        # RGB -> BGR for the first conv layer of the pre-trained backbone.
        if 'res' in cfg.pre_trained_model:
            conv1_rgb = tf.get_variable("conv1_rgb", [7, 7, 3, 64], trainable=False)
            restorer_fc = tf.train.Saver({'resnet_v1_50/conv1/weights': conv1_rgb})
            restorer_fc.restore(sess, cfg.pre_trained_model)
            sess.run(tf.assign(variables[0], tf.reverse(conv1_rgb, [2])))
            print('ResNet Conv 1 RGB->BGR')
        elif 'vgg' in cfg.pre_trained_model:
            conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False)
            restorer_fc = tf.train.Saver({'vgg_16/conv1/conv1_1/weights': conv1_rgb})
            restorer_fc.restore(sess, cfg.pre_trained_model)
            sess.run(tf.assign(variables[0], tf.reverse(conv1_rgb, [2])))
            print('Vgg Conv 1 RGB->BGR')

    _mask = pred_annotation[0]
    _img = image_batch[0]
    _gt = label_batch[0]

    if not os.path.exists(cfg.save_dir + 'temp_img'):
        os.mkdir(cfg.save_dir + 'temp_img')

    print('Start train ' + cfg.data_dir)
    print('---------------Hyper Paras---------------')
    print('-- batch_size: ', cfg.batch_size)
    print('-- Gradient Accumulation: ', cfg.Gradient_Accumulation)
    print('-- image height: ', cfg.IMAGE_HEIGHT)
    print('-- image width: ', cfg.IMAGE_WIDTH)
    print('-- learning rate: ', cfg.learning_rate)
    print('-- GPU: ', cfg.use_gpu)
    print('-- optimizer: ', cfg.optimizer)
    print('-- class num: ', cfg.NUM_OF_CLASSESS)
    print('-- total iter: ', cfg.total_iter)
    print('-- Time acc: ', cfg.is_time_acc)
    print('-- Acc interval: ', cfg.acc_interval)
    print('-- Start Acc iter: ', cfg.start_show_iter)
    print('-- Is save step: ', cfg.is_save_step)
    print('-- Start save step: ', cfg.start_save_step)
    print('-- save epoch: ', cfg.save_step_inter)
    print('-- model save num: ', cfg.model_save_num)
    print('-- summary interval: ', cfg.summary_interval)
    print('-- weight decay: ', cfg.weight_decay)
    print('-- Freeze BN: ', cfg.freeze_bn)
    print('-- Decay rate: ', cfg.decay_rate)
    print('-- minScale: ', cfg.minScale)
    print('-- maxScale: ', cfg.maxScale)
    print('-- random scale: ', cfg.random_scale)
    print('-- random mirror: ', cfg.random_mirror)
    print('-- random crop: ', cfg.random_crop_pad)
    print('-- Validation on: ' + str(cfg.val_data_list))
    print('-- Pre-trained: ' + cfg.pre_trained_model)
    print('----------------End---------------------')

    fcfg = open(cfg.save_dir + 'cfg.txt', 'w')
    fcfg.write('-- batch_size: ' + str(cfg.batch_size) + '\n')
    fcfg.write('-- Gradient Accumulation: ' + str(cfg.Gradient_Accumulation) + '\n')
    fcfg.write('-- image height: ' + str(cfg.IMAGE_HEIGHT) + '\n')
    fcfg.write('-- image width: ' + str(cfg.IMAGE_WIDTH) + '\n')
    fcfg.write('-- learning rate: ' + str(cfg.learning_rate) + '\n')
    fcfg.write('-- GPU: ' + str(cfg.use_gpu) + '\n')
    fcfg.write('-- optimizer: ' + str(cfg.optimizer) + '\n')
    fcfg.write('-- class num: ' + str(cfg.NUM_OF_CLASSESS) + '\n')
    fcfg.write('-- total iter: ' + str(cfg.total_iter) + '\n')
    fcfg.write('-- Time acc: ' + str(cfg.is_time_acc) + '\n')
    fcfg.write('-- Acc interval: ' + str(cfg.acc_interval) + '\n')
    fcfg.write('-- Start Acc iter: ' + str(cfg.start_show_iter) + '\n')
    fcfg.write('-- Is save step: ' + str(cfg.is_save_step) + '\n')
    fcfg.write('-- Start save step: ' + str(cfg.start_save_step) + '\n')
    fcfg.write('-- save epoch: ' + str(cfg.save_step_inter) + '\n')
    fcfg.write('-- model save num: ' + str(cfg.model_save_num) + '\n')
    fcfg.write('-- summary interval: ' + str(cfg.summary_interval) + '\n')
    fcfg.write('-- weight decay: ' + str(cfg.weight_decay) + '\n')
    fcfg.write('-- Freeze BN: ' + str(cfg.freeze_bn) + '\n')
    fcfg.write('-- Decay rate: ' + str(cfg.decay_rate) + '\n')
    fcfg.write('-- minScale: ' + str(cfg.minScale) + '\n')
    fcfg.write('-- maxScale: ' + str(cfg.maxScale) + '\n')
    fcfg.write('-- random scale: ' + str(cfg.random_scale) + '\n')
    fcfg.write('-- random mirror: ' + str(cfg.random_mirror) + '\n')
    fcfg.write('-- random crop: ' + str(cfg.random_crop_pad) + '\n')
    fcfg.write('-- Validation on: ' + str(cfg.val_data_list) + '\n')
    fcfg.write('-- Pre-trained: ' + cfg.pre_trained_model + '\n')
    fcfg.close()

    last_summary_time = time.time()
    last_acc_time = time.time()
    record = train_number / cfg.batch_size  # iterations per epoch

    if cfg.is_save_step:  # resume counters when saving by step
        running_count = count
        epo = int(count / record)
    if cfg.is_save_epoch:  # resume counters when saving by epoch
        running_count = int(epo * record)
        epo = count

    best_acc = 0.5
    best_step = 0

    train_start_time = time.time()
    start_step = running_count
    lossTr_list = []
    stepes = []
    Acc_val_list = []

    # Finalize the graph: it is read-only from here on.
    sess.graph.finalize()

    while running_count < cfg.total_iter:
        time_start = time.time()
        itr = 0
        while itr < int(record):
            itr += 1
            running_count += 1

            # Log the last 10 models.
            if running_count > (cfg.total_iter - 10) and cfg.is_save_last10_model:
                saver.save(sess, cfg.save_dir + 'model.ckpt', int(running_count))
                print('Model has been saved:' + str(running_count))

            # More than total iter: stop training.
            if running_count > cfg.total_iter:
                break

            feed_dict = {step_ph: running_count}

            # Save summary (and a side-by-side image/gt/prediction dump).
            now = time.time()
            if now - last_summary_time > cfg.summary_interval:
                summary_str = sess.run(summary_op, feed_dict={step_ph: running_count})
                train_writer.add_summary(summary_str, running_count)
                last_summary_time = now
                score_map, img, gt = sess.run([_mask, _img, _gt], feed_dict=feed_dict)
                img = np.array(img + cfg.IMG_MEAN, np.uint8)
                score_map = score_map * 20
                gt = gt * 20
                save_temp = np.zeros((cfg.IMAGE_HEIGHT, cfg.IMAGE_WIDTH * 3, 3), np.uint8)
                save_temp[0:cfg.IMAGE_HEIGHT, 0:cfg.IMAGE_WIDTH, :] = img
                save_temp[0:cfg.IMAGE_HEIGHT, cfg.IMAGE_WIDTH:cfg.IMAGE_WIDTH * 2, :] = gt
                save_temp[0:cfg.IMAGE_HEIGHT, cfg.IMAGE_WIDTH * 2:cfg.IMAGE_WIDTH * 3, :] = score_map
                cv2.imwrite(cfg.save_dir + 'temp_img/' + str(now) + '_mask.jpg', save_temp)

            time_s = time.time()

            # Run zero_ops to reset the accumulators.
            sess.run(zero_ops)
            # Accumulate gradients 'Gradient_Accumulation' times in accum_vars via accum_ops.
            for i in range(cfg.Gradient_Accumulation):
                sess.run(accum_ops, feed_dict=feed_dict)
            train_loss, ls_ce, ls_l2, lr = sess.run([loss, ce_loss, l2_losses, learning_rate],
                                                    feed_dict=feed_dict)
            if running_count > 50:
                lossTr_list.append(ls_ce)
                if start_step == 0:
                    start_step = 50
            # Run train_step to update the weights from the accumulated gradients.
            sess.run(train_step, feed_dict=feed_dict)

            time_e = time.time()
            print("Epo: %d, Step: %d, Train_loss:%g, ce: %g, l2:%g, lr:%g, time:%g" %
                  (epo, running_count, train_loss, ls_ce, ls_l2, lr, time_e - time_s))

            # Check accuracy per step of training data.
            if cfg.is_time_acc and running_count >= cfg.start_show_iter and \
                    running_count <= cfg.total_iter and (now - last_acc_time) > cfg.acc_interval:
                # Test accuracy on val.
                hist = np.zeros((cfg.NUM_OF_CLASSESS, cfg.NUM_OF_CLASSESS))
                for i, img_name in enumerate(val_img_list):
                    true_val = np.expand_dims(misc.imread(val_label_list[i]), axis=2)
                    pred_val = evaluate_accuracy(val_logits_softmax, sess,
                                                 val_image_batch, img_name)
                    hist += fast_hist(true_val.flatten(), pred_val.flatten(),
                                      cfg.NUM_OF_CLASSESS)
                hist[0, :] = 0
                # Overall accuracy.
                over_acc = np.diag(hist).sum() / hist.sum()
                print('>>> Step', running_count, 'overall accuracy', over_acc)
                if over_acc > best_acc:
                    saver.save(sess, cfg.save_dir + 'best.ckpt')
                    best_acc = over_acc
                    best_step = running_count
                    fshow = open(cfg.save_dir + 'acc: ' + str(best_acc) +
                                 ', step: ' + str(best_step), 'w')
                print('>>> best acc: ', best_acc, 'best step: ', best_step)
                # Per-class accuracy.
                acc = np.diag(hist) / hist.sum(0)
                print('>>> Step', running_count, 'mean accuracy', acc)
                last_acc_time = now
                stepes.append(running_count)
                Acc_val_list.append(over_acc)

                # Draw plots for visualization, refreshed with each accuracy check.
                import matplotlib.pyplot as plt
                fig1, ax1 = plt.subplots(figsize=(11, 8))
                ax1.plot(range(start_step, running_count), lossTr_list)
                ax1.set_title("Average training loss vs steps")
                ax1.set_xlabel("Steps")
                ax1.set_ylabel("Current loss")
                plt.savefig(cfg.save_dir + "loss_vs_steps.png")
                plt.clf()
                fig2, ax2 = plt.subplots(figsize=(11, 8))
                ax2.plot(stepes, Acc_val_list, label="Val total acc.")
                ax2.set_title("Acc vs steps")
                ax2.set_xlabel("Steps")
                ax2.set_ylabel("Current Acc")
                plt.legend(loc='lower right')
                plt.savefig(cfg.save_dir + "acc_vs_steps.png")
                plt.close('all')

            # Save step model.
            if cfg.is_save_step and (running_count % cfg.save_step_inter) == 0 \
                    and running_count >= cfg.start_save_step:
                saver.save(sess, cfg.save_dir + 'model.ckpt', int(running_count))
                print('Model has been saved:' + str(running_count))
                files = os.path.join(cfg.save_dir + 'model.ckpt-*.data-00000-of-00001')
                sfile = glob.glob(files)
                if len(sfile) > cfg.model_save_num:
                    steps = []
                    for s in sfile:
                        part = s.split('.')
                        steps.append(int(part[1].split('-')[1]))
                    ckpt_step = min(steps)
                    model = cfg.save_dir + 'model.ckpt-' + str(ckpt_step)
                    os.remove(model + '.data-00000-of-00001')
                    os.remove(model + '.index')
                    os.remove(model + '.meta')
                    print('Remove Model:' + model)

        # Check accuracy per epoch of training data.
        if cfg.is_epoch_acc and running_count >= cfg.start_show_iter \
                and running_count <= cfg.total_iter:
            # Test accuracy on val. The fast_hist argument order and the masked
            # row are aligned with the per-step check above (the original
            # swapped both here).
            hist = np.zeros((cfg.NUM_OF_CLASSESS, cfg.NUM_OF_CLASSESS))
            for i, img_name in enumerate(val_img_list):
                true_val = np.expand_dims(misc.imread(val_label_list[i]), axis=2)
                pred_val = evaluate_accuracy(val_logits_softmax, sess,
                                             val_image_batch, img_name)
                hist += fast_hist(true_val.flatten(), pred_val.flatten(),
                                  cfg.NUM_OF_CLASSESS)
            hist[0, :] = 0
            # Overall accuracy.
            over_acc = np.diag(hist).sum() / hist.sum()
            print('>>> Step', running_count, 'overall accuracy', over_acc)
            if over_acc > best_acc:
                saver.save(sess, cfg.save_dir + 'best.ckpt')
                best_acc = over_acc
                best_step = running_count
                fshow = open(cfg.save_dir + 'acc: ' + str(best_acc) +
                             ', step: ' + str(best_step), 'w')
            print('>>> best acc: ', best_acc, 'best step: ', best_step)
            # Per-class accuracy.
            acc = np.diag(hist) / hist.sum(0)
            print('>>> Step', running_count, 'mean accuracy', acc)

        epo += 1

        # Save epoch model.
        if cfg.is_save_epoch and (epo % cfg.save_epoch_inter) == 0 \
                and epo >= cfg.start_save_epoch:
            saver.save(sess, cfg.save_dir + 'model.ckpt', epo)
            print('Model has been saved:' + str(epo))
            files = os.path.join(cfg.save_dir + 'model.ckpt-*.data-00000-of-00001')
            sfile = glob.glob(files)
            if len(sfile) > cfg.model_save_num:
                steps = []
                for s in sfile:
                    part = s.split('.')
                    steps.append(int(part[1].split('-')[1]))
                ckpt_step = min(steps)
                model = cfg.save_dir + 'model.ckpt-' + str(ckpt_step)
                os.remove(model + '.data-00000-of-00001')
                os.remove(model + '.index')
                os.remove(model + '.meta')
                print('Remove Model:' + model)

        time_end = time.time()
        print('Epo ' + str(epo) + ' use time: ' + str(time_end - time_start))

    # saver.save(sess, cfg.save_dir + 'last.ckpt')  # save last model
    train_end_time = time.time()
    print('Train total use: ' + str((train_end_time - train_start_time) / 3600) + ' h')

    coord.request_stop()
    coord.join(threads)
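

# A minimal, self-contained sketch of the gradient-accumulation pattern used
# above: zero the accumulators, add up gradients over n mini-batches, then
# apply them once. All names and values here are hypothetical.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 2])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.Variable(tf.zeros([2, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

opt = tf.train.GradientDescentOptimizer(0.1)
tvs = tf.trainable_variables()
accum = [tf.Variable(tf.zeros_like(v.initialized_value()), trainable=False) for v in tvs]
zero_ops = [a.assign(tf.zeros_like(a)) for a in accum]
gvs = opt.compute_gradients(loss, tvs)
accum_ops = [accum[i].assign_add(g) for i, (g, v) in enumerate(gvs)]
# Dividing by n_minibatches in accum_ops would average instead of summing.
apply_op = opt.apply_gradients([(accum[i], v) for i, (g, v) in enumerate(gvs)])

n_minibatches = 4
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(zero_ops)
    for _ in range(n_minibatches):
        sess.run(accum_ops, feed_dict={x: [[1., 2.]], y: [[3.]]})
    sess.run(apply_op)  # one weight update from the accumulated gradients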
def main():
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(DATA_DIR, DATA_LIST_PATH, input_size,
                             args.random_scale, args.random_mirror,
                             args.ignore_label, IMG_MEAN, coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    net = ICNet_BN({'data': image_batch}, is_training=True,
                   num_classes=args.num_classes, filter_scale=args.filter_scale)

    sub4_out = net.layers['sub4_out']
    sub24_out = net.layers['sub24_out']
    sub124_out = net.layers['conv6_cls']

    restore_var = tf.global_variables()
    all_trainable = [v for v in tf.trainable_variables()
                     if ('beta' not in v.name and 'gamma' not in v.name)
                     or args.train_beta_gamma]

    with tf.name_scope('loss'):
        loss_sub4 = create_loss(sub4_out, label_batch, args.num_classes, args.ignore_label)
        loss_sub24 = create_loss(sub24_out, label_batch, args.num_classes, args.ignore_label)
        loss_sub124 = create_loss(sub124_out, label_batch, args.num_classes, args.ignore_label)
        l2_losses = [args.weight_decay * tf.nn.l2_loss(v)
                     for v in tf.trainable_variables() if 'weights' in v.name]
        reduced_loss = (LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 +
                        LAMBDA3 * loss_sub124 + tf.add_n(l2_losses))

        tf.summary.scalar('sub4', loss_sub4)
        tf.summary.scalar('sub24', loss_sub24)
        tf.summary.scalar('sub124', loss_sub124)
        tf.summary.scalar('total_loss', reduced_loss)

    # Poly learning rate policy.
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow(1 - step_ph / args.num_steps, args.power))

    # Get moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS.
    if not args.update_mean_var:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        grads = tf.gradients(reduced_loss, all_trainable)
        train_op = opt_conv.apply_gradients(zip(grads, all_trainable))

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=20)

    summ = tf.summary.merge_all()
    tenboard_dir = tfboard_dir + str(LEARNING_RATE) + '_' + str(NUM_STEPS)
    writer = tf.summary.FileWriter(tenboard_dir)
    writer.add_graph(sess.graph)

    ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
    # net.load(args.restore_from, sess)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.train.Saver(var_list=restore_var)
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        # Note: this restores from a hard-coded snapshot at START_STEP rather
        # than from ckpt.model_checkpoint_path.
        load(loader, sess, './snapshots/3wDataSet/model.ckpt-' + str(START_STEP))
    else:
        print('Restore from pre-trained model...')
        net.load(args.restore_from, sess)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(START_STEP, args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}
        if step % args.save_pred_every == 0:
            s, loss_value, loss1, loss2, loss3, _ = sess.run(
                [summ, reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                feed_dict=feed_dict)
            save(saver, sess, args.snapshot_dir, step)
            writer.add_summary(s, step)
        else:
            s, loss_value, loss1, loss2, loss3, _ = sess.run(
                [summ, reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                feed_dict=feed_dict)
            writer.add_summary(s, step)
        duration = time.time() - start_time
        print('step {:d} \t total loss = {:.3f}, sub4 = {:.3f}, sub24 = {:.3f}, '
              'sub124 = {:.3f} ({:.3f} sec/step)'.format(step, loss_value, loss1,
                                                         loss2, loss3, duration))

    coord.request_stop()
    coord.join(threads)
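

# A minimal sketch of the summary flow used above: scalar summaries are merged
# into one op, evaluated alongside the training ops, and written with the step
# number so TensorBoard can plot them over time. The log path and the dummy
# "loss" variable here are hypothetical.
import tensorflow as tf

loss = tf.Variable(1.0)
tf.summary.scalar('total_loss', loss)
summ = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('/tmp/tb_example', sess.graph)
    for step in range(3):
        s, _ = sess.run([summ, loss.assign(loss * 0.5)])
        writer.add_summary(s, step)
    writer.close()  # inspect with: tensorboard --logdir /tmp/tb_example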
def main():
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(' ', args.data_list, input_size,
                             args.random_scale, args.random_mirror,
                             args.ignore_label, IMG_MEAN, coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    net = ICNet_BN({'data': image_batch}, is_training=True, num_classes=args.num_classes)

    sub4_out = net.layers['sub4_out']
    sub24_out = net.layers['sub24_out']
    sub124_out = net.layers['conv6_cls']

    restore_var = tf.global_variables()
    all_trainable = [v for v in tf.trainable_variables()
                     if ('beta' not in v.name and 'gamma' not in v.name)
                     or args.train_beta_gamma]

    loss_sub4 = create_loss(sub4_out, label_batch, args.num_classes, args.ignore_label)
    loss_sub24 = create_loss(sub24_out, label_batch, args.num_classes, args.ignore_label)
    loss_sub124 = create_loss(sub124_out, label_batch, args.num_classes, args.ignore_label)
    l2_losses = [args.weight_decay * tf.nn.l2_loss(v)
                 for v in tf.trainable_variables() if 'weights' in v.name]
    reduced_loss = (LAMBDA1 * loss_sub4 + LAMBDA2 * loss_sub24 +
                    LAMBDA3 * loss_sub124 + tf.add_n(l2_losses))

    # Poly learning rate policy.
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow(1 - step_ph / args.num_steps, args.power))

    # Get moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS.
    if not args.update_mean_var:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        grads = tf.gradients(reduced_loss, all_trainable)
        train_op = opt_conv.apply_gradients(zip(grads, all_trainable))

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=5)
    ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.train.Saver(var_list=restore_var)
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('Restore from pre-trained model...')
        net.load(args.restore_from, sess)

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}
        if step % args.save_pred_every == 0:
            loss_value, loss1, loss2, loss3, _ = sess.run(
                [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                feed_dict=feed_dict)
            save(saver, sess, args.snapshot_dir, step)
        else:
            loss_value, loss1, loss2, loss3, _ = sess.run(
                [reduced_loss, loss_sub4, loss_sub24, loss_sub124, train_op],
                feed_dict=feed_dict)
        duration = time.time() - start_time
        print('step {:d} \t total loss = {:.3f}, sub4 = {:.3f}, sub24 = {:.3f}, '
              'sub124 = {:.3f} ({:.3f} sec/step)'.format(step, loss_value, loss1,
                                                         loss2, loss3, duration))

    coord.request_stop()
    coord.join(threads)
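

# A minimal sketch of the queue lifecycle these scripts rely on: a Coordinator
# plus start_queue_runners drives the input pipeline, and request_stop/join
# shuts the threads down cleanly. The queue contents here are hypothetical.
import tensorflow as tf

queue = tf.train.input_producer(tf.constant([1.0, 2.0, 3.0]), shuffle=False)
value = queue.dequeue()

coord = tf.train.Coordinator()
with tf.Session() as sess:
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)
    for _ in range(5):
        print(sess.run(value))  # cycles through the queued values
    coord.request_stop()
    coord.join(threads)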
def main():
    """Create the model and start the training."""
    args = get_arguments()
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)

    tf.set_random_seed(args.random_seed)
    coord = tf.train.Coordinator()

    with tf.name_scope("create_inputs"):
        reader = ImageReader(args.data_dir, args.data_list, input_size,
                             args.random_scale, args.random_mirror,
                             args.ignore_label, IMG_MEAN, coord)
        image_batch, label_batch = reader.dequeue(args.batch_size)

    net = PSPNet101({'data': image_batch}, is_training=True, num_classes=args.num_classes)
    raw_output = net.layers['conv6']

    # According to the prototxt in the Caffe implementation, the learning rate
    # must be multiplied by 10.0 in the pyramid module.
    fc_list = ['conv5_3_pool1_conv', 'conv5_3_pool2_conv', 'conv5_3_pool3_conv',
               'conv5_3_pool6_conv', 'conv6', 'conv5_4']
    restore_var = [v for v in tf.global_variables()]
    all_trainable = [v for v in tf.trainable_variables()
                     if ('beta' not in v.name and 'gamma' not in v.name)
                     or args.train_beta_gamma]
    fc_trainable = [v for v in all_trainable if v.name.split('/')[0] in fc_list]
    conv_trainable = [v for v in all_trainable if v.name.split('/')[0] not in fc_list]  # lr * 1.0
    fc_w_trainable = [v for v in fc_trainable if 'weights' in v.name]  # lr * 10.0
    fc_b_trainable = [v for v in fc_trainable if 'biases' in v.name]   # lr * 20.0
    assert len(all_trainable) == len(fc_trainable) + len(conv_trainable)
    assert len(fc_trainable) == len(fc_w_trainable) + len(fc_b_trainable)

    # Predictions: ignore all predictions with labels >= num_classes.
    raw_prediction = tf.reshape(raw_output, [-1, args.num_classes])
    label_proc = prepare_label(label_batch,
                               tf.stack(raw_output.get_shape()[1:3]),
                               num_classes=args.num_classes,
                               one_hot=False)  # [batch_size, h, w]
    raw_gt = tf.reshape(label_proc, [-1])
    indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, args.num_classes - 1)), 1)
    gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
    prediction = tf.gather(raw_prediction, indices)

    # Pixel-wise softmax loss.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt)
    l2_losses = [args.weight_decay * tf.nn.l2_loss(v)
                 for v in tf.trainable_variables() if 'weights' in v.name]
    reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)

    # Poly learning rate policy.
    base_lr = tf.constant(args.learning_rate)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    learning_rate = tf.scalar_mul(base_lr, tf.pow(1 - step_ph / args.num_steps, args.power))

    # Get moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS.
    if not args.update_mean_var:
        update_ops = None
    else:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        opt_conv = tf.train.MomentumOptimizer(learning_rate, args.momentum)
        opt_fc_w = tf.train.MomentumOptimizer(learning_rate * 10.0, args.momentum)
        opt_fc_b = tf.train.MomentumOptimizer(learning_rate * 20.0, args.momentum)

        grads = tf.gradients(reduced_loss, conv_trainable + fc_w_trainable + fc_b_trainable)
        grads_conv = grads[:len(conv_trainable)]
        grads_fc_w = grads[len(conv_trainable):len(conv_trainable) + len(fc_w_trainable)]
        grads_fc_b = grads[len(conv_trainable) + len(fc_w_trainable):]

        train_op_conv = opt_conv.apply_gradients(zip(grads_conv, conv_trainable))
        train_op_fc_w = opt_fc_w.apply_gradients(zip(grads_fc_w, fc_w_trainable))
        train_op_fc_b = opt_fc_b.apply_gradients(zip(grads_fc_b, fc_b_trainable))
        train_op = tf.group(train_op_conv, train_op_fc_w, train_op_fc_b)

    # Set up tf session and initialize variables.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=10)
    ckpt = tf.train.get_checkpoint_state(SNAPSHOT_DIR)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.train.Saver(var_list=restore_var)
        load_step = int(os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint file found.')
        load_step = 0

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    for step in range(args.num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step}
        if step % args.save_pred_every == 0:
            loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)
            save(saver, sess, args.snapshot_dir, step)
        else:
            loss_value, _ = sess.run([reduced_loss, train_op], feed_dict=feed_dict)
        duration = time.time() - start_time
        print('step {:d} \t loss = {:.3f}, ({:.3f} sec/step)'.format(
            step, loss_value, duration))

    coord.request_stop()
    coord.join(threads)
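

# A minimal sketch of the valid-pixel masking used throughout these scripts:
# flatten the logits and labels, keep only pixels whose label is a real class
# (ignored pixels carry a label >= num_classes), and compute the sparse softmax
# loss on the survivors. Shapes and values here are made up.
import tensorflow as tf

num_classes = 3
raw_prediction = tf.random_normal([6, num_classes])  # 6 flattened pixels
raw_gt = tf.constant([0, 2, 255, 1, 255, 2])          # 255 = ignore label

indices = tf.squeeze(tf.where(tf.less_equal(raw_gt, num_classes - 1)), 1)
gt = tf.cast(tf.gather(raw_gt, indices), tf.int32)
prediction = tf.gather(raw_prediction, indices)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=gt))

with tf.Session() as sess:
    print(sess.run([indices, loss]))  # only the 4 valid pixels contribute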
class Evaluator():
    def __init__(self):
        self.stop = False
        if not tf.gfile.Exists(FLAGS.test_load_queue_path):
            self.settings = {
                "best_acc": None,
                "best_checkpoint": None,
                "last_checkpoint": None,
                "acc_increasing": None,
                "last_accs": deque()
            }
        else:
            self.settings = np.load(FLAGS.test_load_queue_path)[()]
        self.setup()

    def setup(self):
        self.recreate_directory_structure()
        self.coord = tf.train.Coordinator()

        # Load reader.
        with tf.name_scope("create_inputs"):
            self.reader = ImageReader("./val.npy", True, self.coord)
            self.image_batch, self.label_list_batch = self.reader.dequeue(FLAGS.batch_size)

        global_step = tf.Variable(0, dtype=tf.int32, name='global_step', trainable=False)
        self.net = CatznDogs({'data': self.image_batch}, global_step)

        # Set up tf session and initialize variables.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Start queue threads.
        self.threads = tf.train.start_queue_runners(coord=self.coord, sess=self.sess)
        self.ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)

    def run(self):
        while not self.stop:
            all_models_paths = self.ckpt.all_model_checkpoint_paths
            index_current_model = list(all_models_paths).index(self.ckpt.model_checkpoint_path)
            if self.settings["last_checkpoint"]:
                index_last_evaluated_model = list(all_models_paths).index(
                    self.settings["last_checkpoint"])
            else:
                index_last_evaluated_model = -1

            if index_current_model != index_last_evaluated_model:
                index_model_under_evaluation = index_last_evaluated_model + 1
                self.settings["last_checkpoint"] = all_models_paths[index_model_under_evaluation]
                print("Evaluator started evaluating")
                acc_pred = self.net.test(self.image_batch, self.label_list_batch,
                                         self.coord, self.sess, self.reader.nb_samples,
                                         checkpoint_iteration=index_model_under_evaluation)
                self.settings["last_accs"].append(acc_pred)
                # Caution: this keeps the checkpoint with the *lowest* acc_pred,
                # which is only correct if net.test returns an error rate; if it
                # returns an accuracy, the comparison should be `>`.
                if not self.settings["best_acc"] or acc_pred < self.settings["best_acc"]:
                    self.settings["best_acc"] = acc_pred
                    self.settings["best_checkpoint"] = self.settings["last_checkpoint"]
                    self.settings["acc_increasing"] = 0
                else:
                    self.settings["acc_increasing"] += 1
                    if self.settings["acc_increasing"] >= 5:
                        self.stop = True
                np.save(FLAGS.test_load_queue_path, self.settings)
                print("Best model is {} with best Acc {}".format(
                    self.settings["best_checkpoint"], self.settings["best_acc"]))
            else:
                time.sleep(10)

        self.coord.request_stop()
        self.coord.join(self.threads)
        print("Best model is {} with best Acc {}".format(
            self.settings["best_checkpoint"], self.settings["best_acc"]))

    def recreate_directory_structure(self):
        if not tf.gfile.Exists(FLAGS.test_summaries_dir):
            tf.gfile.MakeDirs(FLAGS.test_summaries_dir)
        else:
            tf.gfile.DeleteRecursively(FLAGS.test_summaries_dir)
            tf.gfile.MakeDirs(FLAGS.test_summaries_dir)
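

# A minimal, pure-Python sketch of the patience rule the Evaluator implements:
# keep the best score seen so far and stop once it has failed to improve 5
# times in a row (here written for a higher-is-better score). The score stream
# below is hypothetical.
def should_stop(scores, patience=5):
    best = None
    misses = 0
    for s in scores:
        if best is None or s > best:
            best, misses = s, 0  # new best: reset the patience counter
        else:
            misses += 1
            if misses >= patience:
                return True, best
    return False, best

print(should_stop([0.6, 0.7, 0.71, 0.70, 0.69, 0.68, 0.67, 0.66]))  # (True, 0.71)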