def show_tf_record(tf_record_dir):
    """
    Debug helper: build the train/val tfrecord input pipelines rooted at
    ``tf_record_dir`` and fetch a single training batch so the decoded record
    contents can be inspected (e.g. with a debugger parked on the final
    ``print``).

    :param tf_record_dir: directory containing the lanenet tfrecord files
    :return: None
    """
    train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=tf_record_dir, flags='train')
    val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=tf_record_dir, flags='val')

    # set compute graph node for training (batch size 10, debug-sized)
    train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(10)

    # set compute graph node for validation
    # NOTE(review): these validation tensors are built but never fetched below;
    # they only verify that the val pipeline constructs without error.
    val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(10)

    # Set sess configuration
    # ============================== config GPU
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = 0.8
    sess_config.gpu_options.allow_growth = True
    sess_config.gpu_options.allocator_type = 'BFC'
    # ==============================

    # Fix: use the session as a context manager so it is always closed, even
    # if sess.run raises — the original leaked the Session object. Entering
    # the `with tf.Session(...)` block also installs it as the default
    # session, matching the old `sess.as_default()` behavior.
    with tf.Session(config=sess_config) as sess:
        train_images_np, train_binary_labels_np, train_instance_labels_np = sess.run(
            [train_images, train_binary_labels, train_instance_labels])
        # Breakpoint anchor: inspect the fetched numpy arrays here.
        print()
def train_lanenet_multi_gpu(dataset_dir, weights_path=None, net_flag='vgg'):
    """
    Train lanenet on multiple GPUs with synchronous gradient averaging.

    Builds one training tower and one validation tower per GPU, averages the
    tower gradients on the chief device, and runs a one-session-run-per-epoch
    training loop with tensorboard summaries and periodic checkpoints.

    :param dataset_dir: root directory of the tfrecord dataset fed to
        LaneNetDataFeeder
    :param weights_path: optional checkpoint to restore from; when None the
        network is initialized from scratch
    :param net_flag: backbone selector passed to lanenet.LaneNet (e.g. 'vgg')
    :return: None
    """
    # set lanenet dataset
    train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='train')
    val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='val')

    # set lanenet — val net reuses the training variables (reuse=True), so the
    # train net MUST be constructed first
    train_net = lanenet.LaneNet(net_flag=net_flag, phase='train', reuse=False)
    val_net = lanenet.LaneNet(net_flag=net_flag, phase='val', reuse=True)

    # set compute graph node
    train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(
        CFG.TRAIN.BATCH_SIZE, 1)
    val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(
        CFG.TRAIN.VAL_BATCH_SIZE, 1)

    # set average container — filled per GPU tower below
    tower_grads = []
    train_tower_loss = []
    val_tower_loss = []
    batchnorm_updates = None
    train_summary_op_updates = None

    # set lr — the loop below does one sess.run per epoch, so global_step
    # advances once per epoch and the polynomial decay spans EPOCHS steps
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.polynomial_decay(
        learning_rate=CFG.TRAIN.LEARNING_RATE,
        global_step=global_step,
        decay_steps=CFG.TRAIN.EPOCHS,
        power=0.9)

    # set optimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=CFG.TRAIN.MOMENTUM)

    # set distributed train op — one train tower and one val tower per GPU
    with tf.variable_scope(tf.get_variable_scope()):
        for i in range(CFG.TRAIN.GPU_NUM):
            with tf.device('/gpu:{:d}'.format(i)):
                with tf.name_scope('tower_{:d}'.format(i)) as _:
                    train_loss, grads = compute_net_gradients(
                        train_images, train_binary_labels,
                        train_instance_labels, train_net, optimizer)

                    # Only use the mean and var in the first gpu tower to update the parameter
                    if i == 0:
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)
                        train_summary_op_updates = tf.get_collection(
                            tf.GraphKeys.SUMMARIES)
                    tower_grads.append(grads)
                    train_tower_loss.append(train_loss)
                with tf.name_scope('validation_{:d}'.format(i)) as _:
                    # gradients are discarded for the val tower; only the loss is kept
                    val_loss, _ = compute_net_gradients(
                        val_images, val_binary_labels, val_instance_labels,
                        val_net, optimizer)
                    val_tower_loss.append(val_loss)

    # average the per-tower gradients and losses
    grads = average_gradients(tower_grads)
    avg_train_loss = tf.reduce_mean(train_tower_loss)
    avg_val_loss = tf.reduce_mean(val_tower_loss)

    # Track the moving averages of all trainable variables
    variable_averages = tf.train.ExponentialMovingAverage(
        CFG.TRAIN.MOVING_AVERAGE_DECAY, num_updates=global_step)
    variables_to_average = tf.trainable_variables() + tf.moving_average_variables()
    variables_averages_op = variable_averages.apply(variables_to_average)

    # Group all the op needed for training: gradient step + EMA update + BN
    # statistics update run together as a single train_op
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)

    # Set tf summary save path
    tboard_save_path = 'tboard/tusimple_lanenet_multi_gpu_{:s}'.format(net_flag)
    if not os.path.exists(tboard_save_path):
        os.makedirs(tboard_save_path)
    summary_writer = tf.summary.FileWriter(tboard_save_path)
    avg_train_loss_scalar = tf.summary.scalar(name='average_train_loss',
                                              tensor=avg_train_loss)
    avg_val_loss_scalar = tf.summary.scalar(name='average_val_loss',
                                            tensor=avg_val_loss)
    learning_rate_scalar = tf.summary.scalar(name='learning_rate_scalar',
                                             tensor=learning_rate)
    train_merge_summary_op = tf.summary.merge(
        [avg_train_loss_scalar, learning_rate_scalar] + train_summary_op_updates)
    val_merge_summary_op = tf.summary.merge([avg_val_loss_scalar])

    # set tensorflow saver — checkpoint name is timestamped per run
    saver = tf.train.Saver()
    model_save_dir = 'model/tusimple_lanenet_multi_gpu_{:s}'.format(net_flag)
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'tusimple_lanenet_{:s}_{:s}.ckpt'.format(
        net_flag, str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # set sess config
    sess_config = tf.ConfigProto(device_count={'GPU': CFG.TRAIN.GPU_NUM},
                                 allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'

    # Set the training parameters
    train_epochs = CFG.TRAIN.EPOCHS

    log.info('Global configuration is as follows:')
    log.info(CFG)

    sess = tf.Session(config=sess_config)

    summary_writer.add_graph(sess.graph)

    with sess.as_default():
        # export the (untrained) graph definition alongside the checkpoints
        tf.train.write_graph(
            graph_or_graph_def=sess.graph, logdir='',
            name='{:s}/lanenet_model.pb'.format(model_save_dir))

        if weights_path is None:
            log.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            log.info('Restore model from last model checkpoint {:s}'.format(
                weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        train_cost_time_mean = []
        val_cost_time_mean = []

        for epoch in range(train_epochs):
            # training part — a single run executes gradient step, EMA and BN
            # updates (grouped in train_op) and fetches the merged summaries
            t_start = time.time()

            _, train_loss_value, train_summary, lr = \
                sess.run(
                    fetches=[train_op, avg_train_loss,
                             train_merge_summary_op, learning_rate])

            if math.isnan(train_loss_value):
                # abort on divergence; nothing is saved for this epoch
                log.error('Train loss is nan')
                return

            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)

            summary_writer.add_summary(summary=train_summary, global_step=epoch)

            # validation part
            t_start_val = time.time()

            val_loss_value, val_summary = \
                sess.run(fetches=[avg_val_loss, val_merge_summary_op])

            summary_writer.add_summary(val_summary, global_step=epoch)

            cost_time_val = time.time() - t_start_val
            val_cost_time_mean.append(cost_time_val)

            if epoch % CFG.TRAIN.DISPLAY_STEP == 0:
                log.info('Epoch_Train: {:d} total_loss= {:6f} '
                         'lr= {:6f} mean_cost_time= {:5f}s '.format(
                             epoch + 1, train_loss_value, lr,
                             np.mean(train_cost_time_mean)))
                # reset the timing window after each report
                del train_cost_time_mean[:]

            if epoch % CFG.TRAIN.VAL_DISPLAY_STEP == 0:
                log.info('Epoch_Val: {:d} total_loss= {:6f}'
                         ' mean_cost_time= {:5f}s '.format(
                             epoch + 1, val_loss_value,
                             np.mean(val_cost_time_mean)))
                del val_cost_time_mean[:]

            # NOTE(review): epoch 0 also matches this condition, so a
            # checkpoint is written on the very first epoch
            if epoch % 2000 == 0:
                saver.save(sess=sess, save_path=model_save_path,
                           global_step=epoch)
    return
def train_lanenet(dataset_dir, weights_path=None, net_flag='vgg'):
    """
    Train lanenet on a single GPU.

    Builds the train and validation loss/summary graphs, then runs an epoch
    loop that alternates one training step and one validation step, writing
    tensorboard summaries, periodic intermediate result images, and
    checkpoints.

    :param dataset_dir: root directory of the tfrecord dataset
    :param net_flag: choose which base network to use (e.g. 'vgg')
    :param weights_path: optional checkpoint to restore from; when None and
        net_flag == 'vgg', ImageNet-pretrained vgg16 weights are loaded
    :return: None
    """
    train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='train')
    val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='val')

    # NOTE(review): graph is pinned to '/gpu:1' — assumes at least two GPUs;
    # confirm this matches the deployment machine
    with tf.device('/gpu:1'):
        # set lanenet — val net shares variables with the train net
        # (reuse=True), so construction order matters
        train_net = lanenet.LaneNet(net_flag=net_flag, phase='train',
                                    reuse=False)
        val_net = lanenet.LaneNet(net_flag=net_flag, phase='val', reuse=True)

        # set compute graph node for training
        train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(
            CFG.TRAIN.BATCH_SIZE, 1)

        train_compute_ret = train_net.compute_loss(
            input_tensor=train_images, binary_label=train_binary_labels,
            instance_label=train_instance_labels, name='lanenet_model')
        train_total_loss = train_compute_ret['total_loss']
        train_binary_seg_loss = train_compute_ret['binary_seg_loss']
        train_disc_loss = train_compute_ret['discriminative_loss']
        train_pix_embedding = train_compute_ret['instance_seg_logits']

        # binary segmentation prediction = argmax over softmax scores
        train_prediction_logits = train_compute_ret['binary_seg_logits']
        train_prediction_score = tf.nn.softmax(logits=train_prediction_logits)
        train_prediction = tf.argmax(train_prediction_score, axis=-1)

        # precision / false-positive / false-negative metrics on the binary mask
        train_accuracy = evaluate_model_utils.calculate_model_precision(
            train_compute_ret['binary_seg_logits'], train_binary_labels)
        train_fp = evaluate_model_utils.calculate_model_fp(
            train_compute_ret['binary_seg_logits'], train_binary_labels)
        train_fn = evaluate_model_utils.calculate_model_fn(
            train_compute_ret['binary_seg_logits'], train_binary_labels)
        train_binary_seg_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=train_prediction)
        train_embedding_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=train_pix_embedding)

        # tensorboard summaries for the training branch
        train_cost_scalar = tf.summary.scalar(name='train_cost',
                                              tensor=train_total_loss)
        train_accuracy_scalar = tf.summary.scalar(name='train_accuracy',
                                                  tensor=train_accuracy)
        train_binary_seg_loss_scalar = tf.summary.scalar(
            name='train_binary_seg_loss', tensor=train_binary_seg_loss)
        train_instance_seg_loss_scalar = tf.summary.scalar(
            name='train_instance_seg_loss', tensor=train_disc_loss)
        train_fn_scalar = tf.summary.scalar(name='train_fn', tensor=train_fn)
        train_fp_scalar = tf.summary.scalar(name='train_fp', tensor=train_fp)
        train_binary_seg_ret_img = tf.summary.image(
            name='train_binary_seg_ret',
            tensor=train_binary_seg_ret_for_summary)
        train_embedding_feats_ret_img = tf.summary.image(
            name='train_embedding_feats_ret',
            tensor=train_embedding_ret_for_summary)
        train_merge_summary_op = tf.summary.merge([
            train_accuracy_scalar, train_cost_scalar,
            train_binary_seg_loss_scalar, train_instance_seg_loss_scalar,
            train_fn_scalar, train_fp_scalar, train_binary_seg_ret_img,
            train_embedding_feats_ret_img
        ])

        # set compute graph node for validation — mirrors the training branch
        val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(
            CFG.TRAIN.VAL_BATCH_SIZE, 1)

        val_compute_ret = val_net.compute_loss(
            input_tensor=val_images, binary_label=val_binary_labels,
            instance_label=val_instance_labels, name='lanenet_model')
        val_total_loss = val_compute_ret['total_loss']
        val_binary_seg_loss = val_compute_ret['binary_seg_loss']
        val_disc_loss = val_compute_ret['discriminative_loss']
        val_pix_embedding = val_compute_ret['instance_seg_logits']

        val_prediction_logits = val_compute_ret['binary_seg_logits']
        val_prediction_score = tf.nn.softmax(logits=val_prediction_logits)
        val_prediction = tf.argmax(val_prediction_score, axis=-1)

        val_accuracy = evaluate_model_utils.calculate_model_precision(
            val_compute_ret['binary_seg_logits'], val_binary_labels)
        val_fp = evaluate_model_utils.calculate_model_fp(
            val_compute_ret['binary_seg_logits'], val_binary_labels)
        val_fn = evaluate_model_utils.calculate_model_fn(
            val_compute_ret['binary_seg_logits'], val_binary_labels)
        val_binary_seg_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=val_prediction)
        val_embedding_ret_for_summary = evaluate_model_utils.get_image_summary(
            img=val_pix_embedding)

        # tensorboard summaries for the validation branch
        val_cost_scalar = tf.summary.scalar(name='val_cost',
                                            tensor=val_total_loss)
        val_accuracy_scalar = tf.summary.scalar(name='val_accuracy',
                                                tensor=val_accuracy)
        val_binary_seg_loss_scalar = tf.summary.scalar(
            name='val_binary_seg_loss', tensor=val_binary_seg_loss)
        val_instance_seg_loss_scalar = tf.summary.scalar(
            name='val_instance_seg_loss', tensor=val_disc_loss)
        val_fn_scalar = tf.summary.scalar(name='val_fn', tensor=val_fn)
        val_fp_scalar = tf.summary.scalar(name='val_fp', tensor=val_fp)
        val_binary_seg_ret_img = tf.summary.image(
            name='val_binary_seg_ret', tensor=val_binary_seg_ret_for_summary)
        val_embedding_feats_ret_img = tf.summary.image(
            name='val_embedding_feats_ret',
            tensor=val_embedding_ret_for_summary)
        val_merge_summary_op = tf.summary.merge([
            val_accuracy_scalar, val_cost_scalar, val_binary_seg_loss_scalar,
            val_instance_seg_loss_scalar, val_fn_scalar, val_fp_scalar,
            val_binary_seg_ret_img, val_embedding_feats_ret_img
        ])

        # set optimizer — global_step advances once per training sess.run, so
        # decay_steps=EPOCHS makes the polynomial decay span the whole run
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.polynomial_decay(
            learning_rate=CFG.TRAIN.LEARNING_RATE,
            global_step=global_step,
            decay_steps=CFG.TRAIN.EPOCHS,
            power=0.9)

        # batch-norm statistics update ops must run with each training step
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=CFG.TRAIN.MOMENTUM).minimize(
                    loss=train_total_loss,
                    var_list=tf.trainable_variables(),
                    global_step=global_step)

    # Set tf model save path — checkpoint file name is timestamped per run
    model_save_dir = 'model/tusimple_lanenet_{:s}'.format(net_flag)
    if not ops.exists(model_save_dir):
        os.makedirs(model_save_dir)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'tusimple_lanenet_{:s}_{:s}.ckpt'.format(
        net_flag, str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)
    saver = tf.train.Saver()

    # Set tf summary save path
    tboard_save_path = 'tboard/tusimple_lanenet_{:s}'.format(net_flag)
    if not ops.exists(tboard_save_path):
        os.makedirs(tboard_save_path)

    # Set sess configuration
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'

    sess = tf.Session(config=sess_config)

    summary_writer = tf.summary.FileWriter(tboard_save_path)
    summary_writer.add_graph(sess.graph)

    # Set the training parameters
    train_epochs = CFG.TRAIN.EPOCHS

    log.info('Global configuration is as follows:')
    log.info(CFG)

    with sess.as_default():
        if weights_path is None:
            log.info('Training from scratch')
            init = tf.global_variables_initializer()
            sess.run(init)
        else:
            log.info('Restore model from last model checkpoint {:s}'.format(
                weights_path))
            saver.restore(sess=sess, save_path=weights_path)

        # when training a vgg backbone from scratch, warm-start the
        # convolution weights from a pretrained vgg16 numpy dump
        if net_flag == 'vgg' and weights_path is None:
            load_pretrained_weights(tf.trainable_variables(),
                                    './data/vgg16.npy', sess)

        train_cost_time_mean = []
        for epoch in range(train_epochs):
            # training part — fetch losses, metrics, summaries and raw
            # batches (for periodic snapshot images) in a single run
            t_start = time.time()

            _, train_c, train_accuracy_figure, train_fn_figure, train_fp_figure, lr, train_summary, train_binary_loss, \
                train_instance_loss, train_embeddings, train_binary_seg_imgs, train_gt_imgs, \
                train_binary_gt_labels, train_instance_gt_labels = \
                sess.run([optimizer, train_total_loss, train_accuracy,
                          train_fn, train_fp, learning_rate,
                          train_merge_summary_op, train_binary_seg_loss,
                          train_disc_loss, train_pix_embedding,
                          train_prediction, train_images,
                          train_binary_labels, train_instance_labels])

            if math.isnan(train_c) or math.isnan(
                    train_binary_loss) or math.isnan(train_instance_loss):
                # abort on divergence of any loss component
                log.error('cost is: {:.5f}'.format(train_c))
                log.error('binary cost is: {:.5f}'.format(train_binary_loss))
                log.error(
                    'instance cost is: {:.5f}'.format(train_instance_loss))
                return

            # dump intermediate result images every 100 epochs
            if epoch % 100 == 0:
                record_training_intermediate_result(
                    gt_images=train_gt_imgs,
                    gt_binary_labels=train_binary_gt_labels,
                    gt_instance_labels=train_instance_gt_labels,
                    binary_seg_images=train_binary_seg_imgs,
                    pix_embeddings=train_embeddings)
            summary_writer.add_summary(summary=train_summary,
                                       global_step=epoch)

            if epoch % CFG.TRAIN.DISPLAY_STEP == 0:
                log.info(
                    'Epoch: {:d} total_loss= {:6f} binary_seg_loss= {:6f} '
                    'instance_seg_loss= {:6f} accuracy= {:6f} fp= {:6f} fn= {:6f}'
                    ' lr= {:6f} mean_cost_time= {:5f}s '.format(
                        epoch + 1, train_c, train_binary_loss,
                        train_instance_loss, train_accuracy_figure,
                        train_fp_figure, train_fn_figure, lr,
                        np.mean(train_cost_time_mean)))
                # reset the timing window after each report
                del train_cost_time_mean[:]

            # validation part — no optimizer fetch, so no weight update
            val_c, val_accuracy_figure, val_fn_figure, val_fp_figure, val_summary, val_binary_loss, \
                val_instance_loss, val_embeddings, val_binary_seg_imgs, val_gt_imgs, \
                val_binary_gt_labels, val_instance_gt_labels = \
                sess.run([val_total_loss, val_accuracy, val_fn, val_fp,
                          val_merge_summary_op, val_binary_seg_loss,
                          val_disc_loss, val_pix_embedding, val_prediction,
                          val_images, val_binary_labels,
                          val_instance_labels])

            if math.isnan(val_c) or math.isnan(val_binary_loss) or math.isnan(
                    val_instance_loss):
                log.error('cost is: {:.5f}'.format(val_c))
                log.error('binary cost is: {:.5f}'.format(val_binary_loss))
                log.error('instance cost is: {:.5f}'.format(val_instance_loss))
                return

            if epoch % 100 == 0:
                record_training_intermediate_result(
                    gt_images=val_gt_imgs,
                    gt_binary_labels=val_binary_gt_labels,
                    gt_instance_labels=val_instance_gt_labels,
                    binary_seg_images=val_binary_seg_imgs,
                    pix_embeddings=val_embeddings,
                    flag='val')

            # NOTE(review): cost_time covers train + validation of this epoch
            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)
            summary_writer.add_summary(summary=val_summary, global_step=epoch)

            if epoch % CFG.TRAIN.VAL_DISPLAY_STEP == 0:
                log.info(
                    'Epoch_Val: {:d} total_loss= {:6f} binary_seg_loss= {:6f} '
                    'instance_seg_loss= {:6f} accuracy= {:6f} fp= {:6f} fn= {:6f}'
                    ' mean_cost_time= {:5f}s '.format(
                        epoch + 1, val_c, val_binary_loss, val_instance_loss,
                        val_accuracy_figure, val_fp_figure, val_fn_figure,
                        np.mean(train_cost_time_mean)))
                del train_cost_time_mean[:]

            # checkpoint every 2000 epochs (including epoch 0)
            if epoch % 2000 == 0:
                saver.save(sess=sess, save_path=model_save_path,
                           global_step=global_step)
    return
def __init__(self):
    """
    Initialize the lanenet multi-GPU trainer.

    Reads all solver/training hyper-parameters from the global ``CFG``,
    builds the per-GPU training towers with averaged gradients, the shared
    validation branch, the EMA/batch-norm-grouped train op, inference and
    (optional) mIoU nodes, savers, and tensorboard summary ops. The whole
    TF1 graph is constructed here; construction order matters (variable
    reuse, collections), so do not reorder statements.
    """
    # define solver params and dataset
    self._train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        flags='train')
    self._val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        flags='val')
    # number of batches per epoch, derived from dataset length
    self._steps_per_epoch = len(self._train_dataset)
    self._val_steps_per_epoch = len(self._val_dataset)

    self._model_name = '{:s}_{:s}'.format(CFG.MODEL.FRONT_END,
                                          CFG.MODEL.MODEL_NAME)

    self._train_epoch_nums = CFG.TRAIN.EPOCH_NUMS
    self._batch_size = CFG.TRAIN.BATCH_SIZE
    self._val_batch_size = CFG.TRAIN.VAL_BATCH_SIZE
    self._snapshot_epoch = CFG.TRAIN.SNAPSHOT_EPOCH
    self._model_save_dir = ops.join(CFG.TRAIN.MODEL_SAVE_DIR,
                                    self._model_name)
    self._tboard_save_dir = ops.join(CFG.TRAIN.TBOARD_SAVE_DIR,
                                     self._model_name)
    self._enable_miou = CFG.TRAIN.COMPUTE_MIOU.ENABLE
    if self._enable_miou:
        self._record_miou_epoch = CFG.TRAIN.COMPUTE_MIOU.EPOCH
    self._input_tensor_size = [int(tmp) for tmp in CFG.AUG.TRAIN_CROP_SIZE]

    # multi-GPU layout: global batch is split evenly across the devices
    self._gpu_devices = CFG.TRAIN.MULTI_GPU.GPU_DEVICES
    self._gpu_nums = len(self._gpu_devices)
    self._chief_gpu_index = CFG.TRAIN.MULTI_GPU.CHIEF_DEVICE_INDEX
    self._batch_size_per_gpu = int(self._batch_size / self._gpu_nums)

    self._init_learning_rate = CFG.SOLVER.LR
    self._moving_ave_decay = CFG.SOLVER.MOVING_AVE_DECAY
    self._momentum = CFG.SOLVER.MOMENTUM
    self._lr_polynimal_decay_power = CFG.SOLVER.LR_POLYNOMIAL_POWER
    self._optimizer_mode = CFG.SOLVER.OPTIMIZER.lower()

    if CFG.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
        self._initial_weight = CFG.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
    else:
        self._initial_weight = None
    if CFG.TRAIN.WARM_UP.ENABLE:
        self._warmup_epoches = CFG.TRAIN.WARM_UP.EPOCH_NUMS
        # warm-up starts at 1/1000 of the configured learning rate
        self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
    else:
        self._warmup_epoches = 0

    # define tensorflow session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.GPU.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.GPU.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    self._sess = tf.Session(config=sess_config)

    # define graph input tensor — one input batch per GPU tower
    with tf.variable_scope(name_or_scope='graph_input_node'):
        self._input_src_image_list = []
        self._input_binary_label_image_list = []
        self._input_instance_label_image_list = []
        for i in range(self._gpu_nums):
            src_imgs, binary_label_imgs, instance_label_imgs = self._train_dataset.next_batch(
                batch_size=self._batch_size_per_gpu)
            self._input_src_image_list.append(src_imgs)
            self._input_binary_label_image_list.append(binary_label_imgs)
            self._input_instance_label_image_list.append(
                instance_label_imgs)
        self._val_input_src_image, self._val_input_binary_label_image, self._val_input_instance_label_image = \
            self._val_dataset.next_batch(batch_size=self._val_batch_size)

    # define model
    self._model = lanenet.LaneNet(phase='train')
    self._val_model = lanenet.LaneNet(phase='test')

    # define average container — filled per tower in the loop below
    tower_grads = []
    tower_total_loss = []
    tower_binary_seg_loss = []
    tower_instance_seg_loss = []
    batchnorm_updates = None

    # define learning rate: warm-up phase, then polynomial decay
    with tf.variable_scope('learning_rate'):
        self._global_step = tf.Variable(1.0, dtype=tf.float32,
                                        trainable=False, name='global_step')
        self._val_global_step = tf.Variable(1.0, dtype=tf.float32,
                                            trainable=False,
                                            name='val_global_step')
        self._val_global_step_update = tf.assign_add(
            self._val_global_step, 1.0)
        warmup_steps = tf.constant(self._warmup_epoches * self._steps_per_epoch,
                                   dtype=tf.float32, name='warmup_steps')
        train_steps = tf.constant(self._train_epoch_nums * self._steps_per_epoch,
                                  dtype=tf.float32, name='train_steps')
        self._learn_rate = tf.cond(
            pred=self._global_step < warmup_steps,
            true_fn=lambda: self._compute_warmup_lr(
                warmup_steps=warmup_steps, name='warmup_lr'),
            false_fn=lambda: tf.train.polynomial_decay(
                learning_rate=self._init_learning_rate,
                global_step=self._global_step,
                decay_steps=train_steps,
                end_learning_rate=0.000000001,
                power=self._lr_polynimal_decay_power))
        # named 'lr' so it can be excluded from the loader var list below
        self._learn_rate = tf.identity(self._learn_rate, 'lr')

    # define optimizer
    if self._optimizer_mode == 'sgd':
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self._learn_rate, momentum=self._momentum)
    elif self._optimizer_mode == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self._learn_rate, )
    else:
        raise NotImplementedError(
            'Not support optimizer: {:s} for now'.format(
                self._optimizer_mode))

    # define distributed train op — one tower per GPU, variables shared
    with tf.variable_scope(tf.get_variable_scope()):
        is_network_initialized = False
        for i in range(self._gpu_nums):
            with tf.device('/gpu:{:d}'.format(i)):
                with tf.name_scope('tower_{:d}'.format(i)) as _:
                    input_images = self._input_src_image_list[i]
                    input_binary_labels = self._input_binary_label_image_list[i]
                    input_instance_labels = self._input_instance_label_image_list[i]
                    tmp_loss, tmp_grads = self._compute_net_gradients(
                        input_images, input_binary_labels,
                        input_instance_labels, optimizer,
                        is_net_first_initialized=is_network_initialized)
                    # towers after the first reuse the first tower's variables
                    is_network_initialized = True

                    # Only use the mean and var in the chief gpu tower to update the parameter
                    if i == self._chief_gpu_index:
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)

                    tower_grads.append(tmp_grads)
                    tower_total_loss.append(tmp_loss['total_loss'])
                    tower_binary_seg_loss.append(
                        tmp_loss['binary_seg_loss'])
                    tower_instance_seg_loss.append(
                        tmp_loss['discriminative_loss'])
    grads = self._average_gradients(tower_grads)
    self._loss = tf.reduce_mean(tower_total_loss,
                                name='reduce_mean_tower_total_loss')
    self._binary_loss = tf.reduce_mean(
        tower_binary_seg_loss, name='reduce_mean_tower_binary_loss')
    self._instance_loss = tf.reduce_mean(
        tower_instance_seg_loss, name='reduce_mean_tower_instance_loss')

    # validation loss reuses the shared 'LaneNet' variables
    ret = self._val_model.compute_loss(
        input_tensor=self._val_input_src_image,
        binary_label=self._val_input_binary_label_image,
        instance_label=self._val_input_instance_label_image,
        name='LaneNet', reuse=True)
    self._val_loss = ret['total_loss']
    self._val_binary_loss = ret['binary_seg_loss']
    self._val_instance_loss = ret['discriminative_loss']

    # define moving average op — optionally exclude frozen BN beta/gamma
    with tf.variable_scope(name_or_scope='moving_avg'):
        if CFG.TRAIN.FREEZE_BN.ENABLE:
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        moving_ave_op = tf.train.ExponentialMovingAverage(
            self._moving_ave_decay).apply(train_var_list +
                                          tf.moving_average_variables())

    # group all the op needed for training: gradient step + EMA + BN stats
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    apply_gradient_op = optimizer.apply_gradients(
        grads, global_step=self._global_step)
    self._train_op = tf.group(apply_gradient_op, moving_ave_op,
                              batchnorm_updates_op)

    # define prediction nodes (inference graph reuses trained variables)
    self._binary_prediciton, self._instance_prediciton = self._model.inference(
        input_tensor=self._input_src_image_list[self._chief_gpu_index],
        name='LaneNet', reuse=True)
    self._binary_prediciton = tf.identity(
        self._binary_prediciton, name='binary_segmentation_result')
    self._val_binary_prediction, self._val_instance_prediciton = self._val_model.inference(
        input_tensor=self._val_input_src_image, name='LaneNet', reuse=True)
    self._val_binary_prediction = tf.identity(
        self._val_binary_prediction, name='val_binary_segmentation_result')

    # define miou — streaming mean IoU over valid (in-range) pixels only
    if self._enable_miou:
        with tf.variable_scope('miou'):
            pred = tf.reshape(self._binary_prediciton, [-1, ])
            gt = tf.reshape(
                self._input_binary_label_image_list[self._chief_gpu_index],
                [-1, ])
            # keep only pixels whose label is a valid class id
            indices = tf.squeeze(
                tf.where(tf.less_equal(gt, CFG.DATASET.NUM_CLASSES - 1)), 1)
            gt = tf.gather(gt, indices)
            pred = tf.gather(pred, indices)
            self._miou, self._miou_update_op = tf.metrics.mean_iou(
                labels=gt, predictions=pred,
                num_classes=CFG.DATASET.NUM_CLASSES)

            val_pred = tf.reshape(self._val_binary_prediction, [-1, ])
            val_gt = tf.reshape(self._val_input_binary_label_image, [-1, ])
            indices = tf.squeeze(
                tf.where(tf.less_equal(val_gt, CFG.DATASET.NUM_CLASSES - 1)),
                1)
            val_gt = tf.gather(val_gt, indices)
            val_pred = tf.gather(val_pred, indices)
            self._val_miou, self._val_miou_update_op = tf.metrics.mean_iou(
                labels=val_gt, predictions=val_pred,
                num_classes=CFG.DATASET.NUM_CLASSES)

    # define saver and loader — loader skips learning-rate ('lr') variables
    with tf.variable_scope('loader_and_saver'):
        self._net_var = [
            vv for vv in tf.global_variables() if 'lr' not in vv.name
        ]
        self._loader = tf.train.Saver(self._net_var)
        self._saver = tf.train.Saver(max_to_keep=10)

    # define summary
    with tf.variable_scope('summary'):
        summary_merge_list = [
            tf.summary.scalar("learn_rate", self._learn_rate),
            tf.summary.scalar("total_loss", self._loss),
            tf.summary.scalar('binary_loss', self._binary_loss),
            tf.summary.scalar('instance_loss', self._instance_loss),
        ]
        val_summary_merge_list = [
            tf.summary.scalar('val_total_loss', self._val_loss),
            tf.summary.scalar('val_binary_loss', self._val_binary_loss),
            tf.summary.scalar('val_instance_loss', self._val_instance_loss),
        ]
        if self._enable_miou:
            # evaluating these summaries also runs the miou update ops
            with tf.control_dependencies([self._miou_update_op]):
                summary_merge_list_with_miou = [
                    tf.summary.scalar("learn_rate", self._learn_rate),
                    tf.summary.scalar("total_loss", self._loss),
                    tf.summary.scalar('binary_loss', self._binary_loss),
                    tf.summary.scalar('instance_loss', self._instance_loss),
                    tf.summary.scalar('miou', self._miou)
                ]
                self._write_summary_op_with_miou = tf.summary.merge(
                    summary_merge_list_with_miou)
            with tf.control_dependencies(
                    [self._val_miou_update_op, self._val_global_step_update]):
                # NOTE(review): these val summaries log the TRAIN losses
                # (self._loss etc.), not the val losses — confirm intended
                val_summary_merge_list_with_miou = [
                    tf.summary.scalar("total_loss", self._loss),
                    tf.summary.scalar('binary_loss', self._binary_loss),
                    tf.summary.scalar('instance_loss', self._instance_loss),
                    tf.summary.scalar('val_miou', self._val_miou),
                ]
                self._val_write_summary_op_with_miou = tf.summary.merge(
                    val_summary_merge_list_with_miou)
        # wipe any previous tensorboard run and persist the config used
        if ops.exists(self._tboard_save_dir):
            shutil.rmtree(self._tboard_save_dir)
        os.makedirs(self._tboard_save_dir, exist_ok=True)
        model_params_file_save_path = ops.join(
            self._tboard_save_dir, CFG.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
        with open(model_params_file_save_path, 'w', encoding='utf-8') as f_obj:
            CFG.dump_to_json_file(f_obj)
        self._write_summary_op = tf.summary.merge(summary_merge_list)
        self._val_write_summary_op = tf.summary.merge(
            val_summary_merge_list)
        self._summary_writer = tf.summary.FileWriter(
            self._tboard_save_dir, graph=self._sess.graph)

    LOG.info('Initialize tusimple lanenet multi gpu trainner complete')
def __init__(self, cfg):
    """
    Initialize the lanenet single-GPU trainer.

    Reads all solver/training hyper-parameters from ``cfg``, then builds the
    full TF1 training graph: input pipeline, loss and inference nodes,
    optional streaming mIoU metric, warm-up + polynomial-decay learning
    rate, EMA of the weights, the grouped train op, savers and tensorboard
    summaries. Construction order matters (variable reuse, collections), so
    do not reorder statements.

    :param cfg: parsed configuration object (same shape as the global CFG)
    """
    self._cfg = cfg
    # define solver params and dataset
    self._train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        flags='train')
    # number of batches per epoch, derived from dataset length
    self._steps_per_epoch = len(self._train_dataset)

    self._model_name = '{:s}_{:s}'.format(self._cfg.MODEL.FRONT_END,
                                          self._cfg.MODEL.MODEL_NAME)

    self._train_epoch_nums = self._cfg.TRAIN.EPOCH_NUMS
    self._batch_size = self._cfg.TRAIN.BATCH_SIZE
    self._snapshot_epoch = self._cfg.TRAIN.SNAPSHOT_EPOCH
    self._model_save_dir = ops.join(self._cfg.TRAIN.MODEL_SAVE_DIR,
                                    self._model_name)
    self._tboard_save_dir = ops.join(self._cfg.TRAIN.TBOARD_SAVE_DIR,
                                     self._model_name)
    self._enable_miou = self._cfg.TRAIN.COMPUTE_MIOU.ENABLE
    if self._enable_miou:
        self._record_miou_epoch = self._cfg.TRAIN.COMPUTE_MIOU.EPOCH
    self._input_tensor_size = [
        int(tmp) for tmp in self._cfg.AUG.TRAIN_CROP_SIZE
    ]

    self._init_learning_rate = self._cfg.SOLVER.LR
    self._moving_ave_decay = self._cfg.SOLVER.MOVING_AVE_DECAY
    self._momentum = self._cfg.SOLVER.MOMENTUM
    self._lr_polynimal_decay_power = self._cfg.SOLVER.LR_POLYNOMIAL_POWER
    self._optimizer_mode = self._cfg.SOLVER.OPTIMIZER.lower()

    if self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
        self._initial_weight = self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
    else:
        self._initial_weight = None
    if self._cfg.TRAIN.WARM_UP.ENABLE:
        self._warmup_epoches = self._cfg.TRAIN.WARM_UP.EPOCH_NUMS
        # warm-up starts at 1/1000 of the configured learning rate
        self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
    else:
        self._warmup_epoches = 0

    # define tensorflow session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = self._cfg.GPU.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = self._cfg.GPU.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    self._sess = tf.Session(config=sess_config)

    # define graph input tensor
    with tf.compat.v1.variable_scope(name_or_scope='graph_input_node'):
        self._input_src_image, self._input_binary_label_image, self._input_instance_label_image = \
            self._train_dataset.next_batch(batch_size=self._batch_size)

    # define model loss — loss graph is built first (reuse=False), then the
    # inference graph reuses the same 'LaneNet' variables (reuse=True)
    self._model = lanenet.LaneNet(phase='train', cfg=self._cfg)
    loss_set = self._model.compute_loss(
        input_tensor=self._input_src_image,
        binary_label=self._input_binary_label_image,
        instance_label=self._input_instance_label_image,
        name='LaneNet', reuse=False)
    self._binary_prediciton, self._instance_prediction = self._model.inference(
        input_tensor=self._input_src_image, name='LaneNet', reuse=True)
    self._loss = loss_set['total_loss']
    self._binary_seg_loss = loss_set['binary_seg_loss']
    self._disc_loss = loss_set['discriminative_loss']
    self._pix_embedding = loss_set['instance_seg_logits']
    self._binary_prediciton = tf.identity(
        self._binary_prediciton, name='binary_segmentation_result')

    # define miou — streaming mean IoU over valid (in-range) pixels only
    if self._enable_miou:
        with tf.compat.v1.variable_scope('miou'):
            pred = tf.reshape(self._binary_prediciton, [-1, ])
            gt = tf.reshape(self._input_binary_label_image, [-1, ])
            # keep only pixels whose label is a valid class id
            indices = tf.squeeze(
                tf.where(
                    tf.less_equal(gt, self._cfg.DATASET.NUM_CLASSES - 1)),
                1)
            gt = tf.gather(gt, indices)
            pred = tf.gather(pred, indices)
            self._miou, self._miou_update_op = tf.metrics.mean_iou(
                labels=gt, predictions=pred,
                num_classes=self._cfg.DATASET.NUM_CLASSES)

    # define learning rate: warm-up phase, then polynomial decay
    with tf.compat.v1.variable_scope('learning_rate'):
        self._global_step = tf.Variable(1.0, dtype=tf.float32,
                                        trainable=False, name='global_step')
        warmup_steps = tf.constant(self._warmup_epoches * self._steps_per_epoch,
                                   dtype=tf.float32, name='warmup_steps')
        train_steps = tf.constant(self._train_epoch_nums * self._steps_per_epoch,
                                  dtype=tf.float32, name='train_steps')
        self._learn_rate = tf.cond(
            pred=self._global_step < warmup_steps,
            true_fn=lambda: self._compute_warmup_lr(
                warmup_steps=warmup_steps, name='warmup_lr'),
            false_fn=lambda: tf.train.polynomial_decay(
                learning_rate=self._init_learning_rate,
                global_step=self._global_step,
                decay_steps=train_steps,
                end_learning_rate=0.000001,
                power=self._lr_polynimal_decay_power))
        # named 'lr' so it can be excluded from the loader var list below
        self._learn_rate = tf.identity(self._learn_rate, 'lr')
        # global_step is advanced manually as part of the train op below
        global_step_update = tf.assign_add(self._global_step, 1.0)

    # define moving average op — optionally exclude frozen BN beta/gamma
    with tf.compat.v1.variable_scope(name_or_scope='moving_avg'):
        if self._cfg.TRAIN.FREEZE_BN.ENABLE:
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        moving_ave_op = tf.train.ExponentialMovingAverage(
            self._moving_ave_decay).apply(train_var_list +
                                          tf.moving_average_variables())
    # define saver — loads the EMA shadow variables (for eval-style restore)
    self._loader = tf.train.Saver(tf.moving_average_variables())

    # define training op
    with tf.compat.v1.variable_scope(name_or_scope='train_step'):
        if self._cfg.TRAIN.FREEZE_BN.ENABLE:
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        if self._optimizer_mode == 'sgd':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=self._learn_rate, momentum=self._momentum)
        elif self._optimizer_mode == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self._learn_rate, )
        else:
            raise ValueError('Not support optimizer: {:s}'.format(
                self._optimizer_mode))
        optimize_op = optimizer.minimize(self._loss,
                                         var_list=train_var_list)
        # nested control deps enforce order: BN stats update -> gradient
        # step + global_step increment -> EMA update
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies([optimize_op, global_step_update]):
                with tf.control_dependencies([moving_ave_op]):
                    self._train_op = tf.no_op()

    # define saver and loader — saver keeps the 5 most recent checkpoints
    with tf.compat.v1.variable_scope('loader_and_saver'):
        self._net_var = [
            vv for vv in tf.global_variables() if 'lr' not in vv.name
        ]
        self._saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    # define summary
    with tf.compat.v1.variable_scope('summary'):
        summary_merge_list = [
            tf.summary.scalar('learn_rate', self._learn_rate),
            tf.summary.scalar('total_loss', self._loss),
            tf.summary.scalar('binary_seg_loss', self._binary_seg_loss),
            tf.summary.scalar('discriminative_loss', self._disc_loss),
        ]
        if self._enable_miou:
            # evaluating these summaries also runs the miou update op
            with tf.control_dependencies([self._miou_update_op]):
                summary_merge_list_with_miou = [
                    tf.summary.scalar('learn_rate', self._learn_rate),
                    tf.summary.scalar('total_loss', self._loss),
                    tf.summary.scalar('binary_seg_loss',
                                      self._binary_seg_loss),
                    tf.summary.scalar('discriminative_loss',
                                      self._disc_loss),
                    tf.summary.scalar('miou', self._miou)
                ]
                self._write_summary_op_with_miou = tf.summary.merge(
                    summary_merge_list_with_miou)
        # wipe any previous tensorboard run and persist the config used
        if ops.exists(self._tboard_save_dir):
            shutil.rmtree(self._tboard_save_dir)
        os.makedirs(self._tboard_save_dir, exist_ok=True)
        model_params_file_save_path = ops.join(
            self._tboard_save_dir,
            self._cfg.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
        with open(model_params_file_save_path, 'w',
                  encoding='utf-8') as f_obj:
            self._cfg.dump_to_json_file(f_obj)
        self._write_summary_op = tf.summary.merge(summary_merge_list)
        self._summary_writer = tf.summary.FileWriter(
            self._tboard_save_dir, graph=self._sess.graph)

    LOG.info('Initialize tusimple lanenet trainner complete')
def __init__(self, cfg):
    """
    Initialize the LaneNet trainer: build the full TF1 training graph
    (input pipeline, losses, LR schedule, optimizer, EMA, savers, summaries).

    :param cfg: configuration object (attribute-style access, e.g. cfg.TRAIN.BATCH_SIZE)
    """
    self._cfg = cfg  # keep the configuration object
    # define solver params and dataset
    self._train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(flags='train')  # load training dataset
    self._steps_per_epoch = len(self._train_dataset)  # iterations per epoch, derived from dataset length
    self._model_name = '{:s}_{:s}'.format(self._cfg.MODEL.FRONT_END, self._cfg.MODEL.MODEL_NAME)  # model name used in save paths
    self._train_epoch_nums = self._cfg.TRAIN.EPOCH_NUMS  # number of training epochs
    self._batch_size = self._cfg.TRAIN.BATCH_SIZE  # batch size
    self._snapshot_epoch = self._cfg.TRAIN.SNAPSHOT_EPOCH  # how many epochs between snapshots
    self._model_save_dir = ops.join(self._cfg.TRAIN.MODEL_SAVE_DIR, self._model_name)  # checkpoint save directory
    self._tboard_save_dir = ops.join(self._cfg.TRAIN.TBOARD_SAVE_DIR, self._model_name)  # tensorboard save directory
    self._enable_miou = self._cfg.TRAIN.COMPUTE_MIOU.ENABLE  # whether to compute mIoU for evaluation
    if self._enable_miou:
        self._record_miou_epoch = self._cfg.TRAIN.COMPUTE_MIOU.EPOCH  # epoch interval for recording mIoU
    self._input_tensor_size = [int(tmp) for tmp in self._cfg.AUG.TRAIN_CROP_SIZE]  # training crop size, e.g. (512, 256)
    self._init_learning_rate = self._cfg.SOLVER.LR  # initial learning rate
    self._moving_ave_decay = self._cfg.SOLVER.MOVING_AVE_DECAY  # exponential moving average decay
    self._momentum = self._cfg.SOLVER.MOMENTUM  # momentum for the SGD optimizer
    self._lr_polynimal_decay_power = self._cfg.SOLVER.LR_POLYNOMIAL_POWER  # power of the polynomial LR decay
    self._optimizer_mode = self._cfg.SOLVER.OPTIMIZER.lower()  # optimizer selector: 'sgd' or 'adam'
    if self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
        # restore weights from an existing snapshot
        self._initial_weight = self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
    else:
        # no snapshot: train from scratch
        self._initial_weight = None
    if self._cfg.TRAIN.WARM_UP.ENABLE:
        # warm-up enabled: start from LR/1000 for the first epochs
        self._warmup_epoches = self._cfg.TRAIN.WARM_UP.EPOCH_NUMS
        self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
    else:
        # warm-up disabled
        self._warmup_epoches = 0

    # define tensorflow session (GPU options)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = self._cfg.GPU.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = self._cfg.GPU.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    self._sess = tf.Session(config=sess_config)

    # define graph input tensors
    with tf.variable_scope(name_or_scope='graph_input_node'):
        # one batch of (source image, binary segmentation label, instance segmentation label)
        self._input_src_image, self._input_binary_label_image, self._input_instance_label_image = \
            self._train_dataset.next_batch(batch_size=self._batch_size)

    # define model loss
    self._model = lanenet.LaneNet(phase='train', cfg=self._cfg)
    loss_set = self._model.compute_loss(  # graph node computing the training losses
        input_tensor=self._input_src_image,
        binary_label=self._input_binary_label_image,
        instance_label=self._input_instance_label_image,
        name='LaneNet',
        reuse=False
    )
    self._binary_prediciton, self._instance_prediction = self._model.inference(
        input_tensor=self._input_src_image,
        name='LaneNet',
        reuse=True
    )
    self._loss = loss_set['total_loss']  # total loss
    self._binary_seg_loss = loss_set['binary_seg_loss']  # binary segmentation loss
    self._disc_loss = loss_set['discriminative_loss']  # discriminative (embedding) loss
    self._pix_embedding = loss_set['instance_seg_logits']  # instance segmentation logits
    # named identity so the prediction tensor can be addressed by name (e.g. for export)
    self._binary_prediciton = tf.identity(self._binary_prediciton, name='binary_segmentation_result')

    # define mIoU metric for evaluation
    if self._enable_miou:
        with tf.variable_scope('miou'):
            pred = tf.reshape(self._binary_prediciton, [-1, ])  # flattened predictions
            gt = tf.reshape(self._input_binary_label_image, [-1, ])  # flattened ground truth
            # keep only pixels whose label is a valid class id
            indices = tf.squeeze(tf.where(tf.less_equal(gt, self._cfg.DATASET.NUM_CLASSES - 1)), 1)
            gt = tf.gather(gt, indices)
            pred = tf.gather(pred, indices)
            self._miou, self._miou_update_op = tf.metrics.mean_iou(
                labels=gt,
                predictions=pred,
                num_classes=self._cfg.DATASET.NUM_CLASSES
            )

    # define learning rate schedule: warm-up followed by polynomial decay
    with tf.variable_scope('learning_rate'):
        self._global_step = tf.Variable(1.0, dtype=tf.float32, trainable=False,
                                        name='global_step')
        warmup_steps = tf.constant(  # number of warm-up steps
            self._warmup_epoches * self._steps_per_epoch, dtype=tf.float32,
            name='warmup_steps'
        )
        train_steps = tf.constant(  # total number of training steps
            self._train_epoch_nums * self._steps_per_epoch, dtype=tf.float32,
            name='train_steps'
        )
        self._learn_rate = tf.cond(  # warm-up LR before warmup_steps, polynomial decay after
            pred=self._global_step < warmup_steps,
            true_fn=lambda: self._compute_warmup_lr(warmup_steps=warmup_steps, name='warmup_lr'),
            false_fn=lambda: tf.train.polynomial_decay(
                learning_rate=self._init_learning_rate,
                global_step=self._global_step,
                decay_steps=train_steps,
                end_learning_rate=0.000001,
                power=self._lr_polynimal_decay_power)
        )
        self._learn_rate = tf.identity(self._learn_rate, 'lr')
        global_step_update = tf.assign_add(self._global_step, 1.0)

    # define moving average op
    with tf.variable_scope(name_or_scope='moving_avg'):
        if self._cfg.TRAIN.FREEZE_BN.ENABLE:
            # frozen BN: exclude batch-norm beta/gamma from the averaged set
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        moving_ave_op = tf.train.ExponentialMovingAverage(
            self._moving_ave_decay).apply(train_var_list + tf.moving_average_variables())
        # define loader restoring the shadow (moving-average) variables
        # NOTE(review): placement inside the 'moving_avg' scope inferred from adjacency
        # in the collapsed source; scope placement does not change Saver behavior.
        self._loader = tf.train.Saver(tf.moving_average_variables())

    # define training op
    with tf.variable_scope(name_or_scope='train_step'):
        if self._cfg.TRAIN.FREEZE_BN.ENABLE:  # frozen BN (default: False)
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:  # default path: optimize all trainable variables
            train_var_list = tf.trainable_variables()
        if self._optimizer_mode == 'sgd':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=self._learn_rate,
                momentum=self._momentum
            )
        elif self._optimizer_mode == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self._learn_rate,
            )
        else:  # unsupported optimizer string
            raise ValueError('Not support optimizer: {:s}'.format(self._optimizer_mode))
        optimize_op = optimizer.minimize(self._loss, var_list=train_var_list)
        # chain ops so one _train_op run performs: BN updates -> optimize + step bump -> EMA update
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies([optimize_op, global_step_update]):
                with tf.control_dependencies([moving_ave_op]):
                    self._train_op = tf.no_op()

    # define saver and loader
    with tf.variable_scope('loader_and_saver'):
        self._net_var = [vv for vv in tf.global_variables() if 'lr' not in vv.name]
        self._saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    # define summaries
    with tf.variable_scope('summary'):
        summary_merge_list = [  # scalar summaries written on every logging step
            tf.summary.scalar('learn_rate', self._learn_rate),
            tf.summary.scalar('total_loss', self._loss),
            tf.summary.scalar('binary_seg_loss', self._binary_seg_loss),
            tf.summary.scalar('discriminative_loss', self._disc_loss),
        ]
        if self._enable_miou:
            # extra summary op that also updates and records mIoU
            with tf.control_dependencies([self._miou_update_op]):
                summary_merge_list_with_miou = [
                    tf.summary.scalar('learn_rate', self._learn_rate),
                    tf.summary.scalar('total_loss', self._loss),
                    tf.summary.scalar('binary_seg_loss', self._binary_seg_loss),
                    tf.summary.scalar('discriminative_loss', self._disc_loss),
                    tf.summary.scalar('miou', self._miou)
                ]
                self._write_summary_op_with_miou = tf.summary.merge(summary_merge_list_with_miou)
        if ops.exists(self._tboard_save_dir):
            # wipe any previous tensorboard run for this model name
            shutil.rmtree(self._tboard_save_dir)
        os.makedirs(self._tboard_save_dir, exist_ok=True)
        # dump the training configuration as JSON next to the tensorboard logs
        model_params_file_save_path = ops.join(self._tboard_save_dir,
                                               self._cfg.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
        with open(model_params_file_save_path, 'w', encoding='utf-8') as f_obj:
            self._cfg.dump_to_json_file(f_obj)
        self._write_summary_op = tf.summary.merge(summary_merge_list)
        self._summary_writer = tf.summary.FileWriter(self._tboard_save_dir, graph=self._sess.graph)

    LOG.info('Initialize tusimple lanenet trainner complete')
def train_lanenet(dataset_dir, weights_path=None, net_flag='vgg', version_flag='', scratch=False):
    """
    Train LaneNet with one GPU.

    :param dataset_dir: root directory of the tfrecord dataset
    :param weights_path: checkpoint to restore from; None means train from scratch
    :param net_flag: backbone selector (e.g. 'vgg')
    :param version_flag: suffix appended to model/tboard save directories
    :param scratch: if True, restore weights but drop Momentum slots and
                    global_step so training restarts from step 0
    :return: None
    """
    train_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='train')
    val_dataset = lanenet_data_feed_pipline.LaneNetDataFeeder(
        dataset_dir=dataset_dir, flags='val')

    # ================================================================ #
    #                          Define Network                          #
    # ================================================================ #
    train_net = lanenet.LaneNet(net_flag=net_flag, phase='train', reuse=tf.AUTO_REUSE)
    val_net = lanenet.LaneNet(net_flag=net_flag, phase='val', reuse=True)

    # ================================================================ #
    #                       Train Input & Output                       #
    # ================================================================ #
    # set compute graph node for training
    train_images, train_binary_labels, train_instance_labels = train_dataset.inputs(
        CFG.TRAIN.BATCH_SIZE)
    train_compute_ret = train_net.compute_loss(
        input_tensor=train_images,
        binary_label=train_binary_labels,
        instance_label=train_instance_labels,
        name='lanenet_model')
    train_total_loss = train_compute_ret['total_loss']
    train_binary_seg_loss = train_compute_ret['binary_seg_loss']  # semantic segmentation loss
    train_disc_loss = train_compute_ret['discriminative_loss']  # embedding loss
    train_pix_embedding = train_compute_ret['instance_seg_logits']  # embedding feature, HxWxN
    train_l2_reg_loss = train_compute_ret['l2_reg_loss']
    train_prediction_logits = train_compute_ret['binary_seg_logits']  # segmentation logits, HxWx2
    train_prediction_score = tf.nn.softmax(logits=train_prediction_logits)
    train_prediction = tf.argmax(train_prediction_score, axis=-1)  # binary segmentation map
    train_accuracy = evaluate_model_utils.calculate_model_precision(
        train_compute_ret['binary_seg_logits'], train_binary_labels)
    train_fp = evaluate_model_utils.calculate_model_fp(
        train_compute_ret['binary_seg_logits'], train_binary_labels)
    train_fn = evaluate_model_utils.calculate_model_fn(
        train_compute_ret['binary_seg_logits'], train_binary_labels)
    # (I - min) * 255 / (max - min), normalized to 0-255 for image summaries
    train_binary_seg_ret_for_summary = evaluate_model_utils.get_image_summary(
        img=train_prediction)
    train_embedding_ret_for_summary = evaluate_model_utils.get_image_summary(
        img=train_pix_embedding)

    # ================================================================ #
    #                         Define Optimizer                         #
    # ================================================================ #
    global_step = tf.Variable(0, trainable=False, name='global_step')
    learning_rate = tf.train.polynomial_decay(  # cyclic polynomial decay
        learning_rate=CFG.TRAIN.LEARNING_RATE,  # initial learning rate
        global_step=global_step,  # current iteration
        decay_steps=CFG.TRAIN.STEPS / 4,  # steps per decay cycle
        end_learning_rate=CFG.TRAIN.LEARNING_RATE / 10,  # minimum learning rate
        power=0.9,
        cycle=True)
    learning_rate_scalar = tf.summary.scalar(name='learning_rate',
                                             tensor=learning_rate)
    update_ops = tf.get_collection(
        tf.GraphKeys.UPDATE_OPS)  # for batch normalization
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate, momentum=CFG.TRAIN.MOMENTUM).minimize(
                loss=train_total_loss,
                var_list=tf.trainable_variables(),
                global_step=global_step)

    # ================================================================ #
    #                          Train Summary                           #
    # ================================================================ #
    train_cost_scalar = tf.summary.scalar(name='train_cost',
                                          tensor=train_total_loss)
    train_accuracy_scalar = tf.summary.scalar(name='train_accuracy',
                                              tensor=train_accuracy)
    train_binary_seg_loss_scalar = tf.summary.scalar(
        name='train_binary_seg_loss', tensor=train_binary_seg_loss)
    train_instance_seg_loss_scalar = tf.summary.scalar(
        name='train_instance_seg_loss', tensor=train_disc_loss)
    train_fn_scalar = tf.summary.scalar(name='train_fn', tensor=train_fn)
    train_fp_scalar = tf.summary.scalar(name='train_fp', tensor=train_fp)
    train_binary_seg_ret_img = tf.summary.image(
        name='train_binary_seg_ret', tensor=train_binary_seg_ret_for_summary)
    train_embedding_feats_ret_img = tf.summary.image(
        name='train_embedding_feats_ret',
        tensor=train_embedding_ret_for_summary)
    train_merge_summary_op = tf.summary.merge([
        train_accuracy_scalar, train_cost_scalar, train_binary_seg_loss_scalar,
        train_instance_seg_loss_scalar, train_fn_scalar, train_fp_scalar,
        train_binary_seg_ret_img, train_embedding_feats_ret_img,
        learning_rate_scalar
    ])

    # ================================================================ #
    #                        Val Input & Output                        #
    # ================================================================ #
    # set compute graph node for validation
    val_images, val_binary_labels, val_instance_labels = val_dataset.inputs(
        CFG.TEST.BATCH_SIZE)
    val_compute_ret = val_net.compute_loss(input_tensor=val_images,
                                           binary_label=val_binary_labels,
                                           instance_label=val_instance_labels,
                                           name='lanenet_model')
    val_total_loss = val_compute_ret['total_loss']
    val_binary_seg_loss = val_compute_ret['binary_seg_loss']
    val_disc_loss = val_compute_ret['discriminative_loss']
    val_pix_embedding = val_compute_ret['instance_seg_logits']
    val_prediction_logits = val_compute_ret['binary_seg_logits']
    val_prediction_score = tf.nn.softmax(logits=val_prediction_logits)
    val_prediction = tf.argmax(val_prediction_score, axis=-1)
    val_accuracy = evaluate_model_utils.calculate_model_precision(
        val_compute_ret['binary_seg_logits'], val_binary_labels)
    val_fp = evaluate_model_utils.calculate_model_fp(
        val_compute_ret['binary_seg_logits'], val_binary_labels)
    val_fn = evaluate_model_utils.calculate_model_fn(
        val_compute_ret['binary_seg_logits'], val_binary_labels)
    val_binary_seg_ret_for_summary = evaluate_model_utils.get_image_summary(
        img=val_prediction)
    val_embedding_ret_for_summary = evaluate_model_utils.get_image_summary(
        img=val_pix_embedding)

    # ================================================================ #
    #                           VAL Summary                            #
    # ================================================================ #
    val_cost_scalar = tf.summary.scalar(name='val_cost', tensor=val_total_loss)
    val_accuracy_scalar = tf.summary.scalar(name='val_accuracy',
                                            tensor=val_accuracy)
    val_binary_seg_loss_scalar = tf.summary.scalar(name='val_binary_seg_loss',
                                                   tensor=val_binary_seg_loss)
    val_instance_seg_loss_scalar = tf.summary.scalar(
        name='val_instance_seg_loss', tensor=val_disc_loss)
    val_fn_scalar = tf.summary.scalar(name='val_fn', tensor=val_fn)
    val_fp_scalar = tf.summary.scalar(name='val_fp', tensor=val_fp)
    val_binary_seg_ret_img = tf.summary.image(
        name='val_binary_seg_ret', tensor=val_binary_seg_ret_for_summary)
    val_embedding_feats_ret_img = tf.summary.image(
        name='val_embedding_feats_ret', tensor=val_embedding_ret_for_summary)
    val_merge_summary_op = tf.summary.merge([
        val_accuracy_scalar, val_cost_scalar, val_binary_seg_loss_scalar,
        val_instance_seg_loss_scalar, val_fn_scalar, val_fp_scalar,
        val_binary_seg_ret_img, val_embedding_feats_ret_img
    ])

    # ================================================================ #
    #                     Config Saver & Session                       #
    # ================================================================ #
    # Set tf model save path
    model_save_dir = 'model/tusimple_lanenet_{:s}_{:s}'.format(
        net_flag, version_flag)
    os.makedirs(model_save_dir, exist_ok=True)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                     time.localtime(time.time()))
    model_name = 'tusimple_lanenet_{:s}_{:s}.ckpt'.format(
        net_flag, str(train_start_time))
    # NOTE(review): model_save_path is currently unused (the periodic save was
    # disabled); kept for parity with the max-accuracy save path below.
    model_save_path = ops.join(model_save_dir, model_name)
    if scratch:
        # Drop Momentum slot variables and global_step from the restore set.
        # tf saves global_step inside the checkpoint, so restoring it would
        # resume counting from the old step; removing it restarts from 0.
        variables = tf.contrib.framework.get_variables_to_restore()
        variables_to_resotre = [
            v for v in variables if 'Momentum' not in v.name.split('/')[-1]
        ]
        variables_to_resotre = [
            v for v in variables_to_resotre
            if 'global_step' not in v.name.split('/')[-1]
        ]
        restore_saver = tf.train.Saver(variables_to_resotre)
    else:
        restore_saver = tf.train.Saver()
    saver = tf.train.Saver(max_to_keep=10)
    # Set tf summary save path
    tboard_save_path = 'tboard/tusimple_lanenet_{:s}_{:s}'.format(
        net_flag, version_flag)
    os.makedirs(tboard_save_path, exist_ok=True)
    # Set sess configuration (GPU options)
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.TRAIN.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    sess = tf.Session(config=sess_config)
    summary_writer = tf.summary.FileWriter(tboard_save_path)
    summary_writer.add_graph(sess.graph)

    # Set the training parameters
    import math
    train_steps = CFG.TRAIN.STEPS
    val_steps = math.ceil(CFG.TRAIN.VAL_SIZE / CFG.TEST.BATCH_SIZE)  # batches per validation epoch
    one_epoch2step = math.ceil(CFG.TRAIN.TRAIN_SIZE / CFG.TRAIN.BATCH_SIZE)  # batches per training epoch
    log.info('Global configuration is as follows:')
    log.info(CFG)
    max_acc = 0.9
    save_num = 0

    # ================================================================ #
    #                           Train & Val                            #
    # ================================================================ #
    with sess.as_default():
        # ============================== load pretrained / snapshot weights
        # BUGFIX: the original checked `weights_path is None` first, which made
        # the `net_flag == 'vgg' and weights_path is None` branch unreachable,
        # so VGG-16 pretrained weights were never loaded. Test the VGG case first.
        if weights_path is None and net_flag == 'vgg':
            log.info('Training from scratch')
            sess.run(tf.global_variables_initializer())
            try:
                load_pretrained_weights(tf.trainable_variables(),
                                        './data/vgg16.npy', sess)
            except Exception:
                # best-effort: fall back to random init if the npy file is missing
                log.info('model maybe is not exist!')
        elif weights_path is None:
            log.info('Training from scratch')
            sess.run(tf.global_variables_initializer())
        elif scratch:  # restore weights but restart counting, like Caffe --weights
            sess.run(tf.global_variables_initializer())
            log.info('Restore model from last model checkpoint {:s}, scratch'.
                     format(weights_path))
            try:
                restore_saver.restore(sess=sess, save_path=weights_path)
            except Exception:
                log.info('model maybe is not exist!')
        else:  # resume training, like Caffe --snapshot
            log.info('Restore model from last model checkpoint {:s}'.format(
                weights_path))
            try:
                restore_saver.restore(sess=sess, save_path=weights_path)
            except Exception:
                log.info('model maybe is not exist!')
        # ==============================
        train_cost_time_mean = []  # per-batch training time statistics
        for step in range(train_steps):
            # ============================================================ #
            #                            Train                             #
            # ============================================================ #
            t_start = time.time()
            _, train_loss, train_accuracy_figure, train_fn_figure, train_fp_figure, \
                lr, train_summary, train_binary_loss, \
                train_instance_loss, train_embeddings, train_binary_seg_imgs, train_gt_imgs, \
                train_binary_gt_labels, train_instance_gt_labels, train_l2_loss = \
                sess.run([optimizer, train_total_loss, train_accuracy, train_fn,
                          train_fp, learning_rate, train_merge_summary_op,
                          train_binary_seg_loss, train_disc_loss,
                          train_pix_embedding, train_prediction, train_images,
                          train_binary_labels, train_instance_labels,
                          train_l2_reg_loss])
            cost_time = time.time() - t_start
            train_cost_time_mean.append(cost_time)
            # abort on diverged (NaN) losses
            if math.isnan(train_loss) or math.isnan(
                    train_binary_loss) or math.isnan(train_instance_loss):
                log.error('cost is: {:.5f}'.format(train_loss))
                log.error('binary cost is: {:.5f}'.format(train_binary_loss))
                log.error(
                    'instance cost is: {:.5f}'.format(train_instance_loss))
                return
            summary_writer.add_summary(summary=train_summary, global_step=step)
            # print losses every DISPLAY_STEP steps
            if step % CFG.TRAIN.DISPLAY_STEP == 0:
                epoch_num = step // one_epoch2step
                log.info(
                    'Epoch: {:d} Step: {:d} total_loss= {:6f} binary_seg_loss= {:6f} '
                    'instance_seg_loss= {:6f} l2_reg_loss= {:6f} accuracy= {:6f} fp= {:6f} fn= {:6f}'
                    ' lr= {:6f} mean_cost_time= {:5f}s '.format(
                        epoch_num + 1, step + 1, train_loss, train_binary_loss,
                        train_instance_loss, train_l2_loss,
                        train_accuracy_figure, train_fp_figure,
                        train_fn_figure, lr, np.mean(train_cost_time_mean)))
                train_cost_time_mean.clear()

            # ============================================================ #
            #                             Val                              #
            # ============================================================ #
            # evaluate the whole validation set every VAL_DISPLAY_STEP steps
            if step % CFG.TRAIN.VAL_DISPLAY_STEP == 0:
                val_t_start = time.time()
                mean_val_c = 0.0
                mean_val_binary_loss = 0.0
                mean_val_instance_loss = 0.0
                mean_val_accuracy_figure = 0.0
                mean_val_fp_figure = 0.0
                mean_val_fn_figure = 0.0
                for val_step in range(val_steps):
                    # validation part
                    val_c, val_accuracy_figure, val_fn_figure, val_fp_figure, \
                        val_summary, val_binary_loss, val_instance_loss, \
                        val_embeddings, val_binary_seg_imgs, val_gt_imgs, \
                        val_binary_gt_labels, val_instance_gt_labels = \
                        sess.run([val_total_loss, val_accuracy, val_fn, val_fp,
                                  val_merge_summary_op, val_binary_seg_loss,
                                  val_disc_loss, val_pix_embedding,
                                  val_prediction, val_images,
                                  val_binary_labels, val_instance_labels])
                    # abort on diverged (NaN) losses
                    if math.isnan(val_c) or math.isnan(
                            val_binary_loss) or math.isnan(val_instance_loss):
                        log.error('cost is: {:.5f}'.format(val_c))
                        log.error(
                            'binary cost is: {:.5f}'.format(val_binary_loss))
                        log.error('instance cost is: {:.5f}'.format(
                            val_instance_loss))
                        return
                    mean_val_c += val_c
                    mean_val_binary_loss += val_binary_loss
                    mean_val_instance_loss += val_instance_loss
                    mean_val_accuracy_figure += val_accuracy_figure
                    mean_val_fp_figure += val_fp_figure
                    mean_val_fn_figure += val_fn_figure
                    summary_writer.add_summary(summary=val_summary,
                                               global_step=step)
                # BUGFIX: the original accumulated `time.time() - val_t_start`
                # inside the loop, which over-counts quadratically; measure the
                # total validation wall time once after the loop instead.
                val_cost_time = time.time() - val_t_start
                mean_val_c /= val_steps
                mean_val_binary_loss /= val_steps
                mean_val_instance_loss /= val_steps
                mean_val_accuracy_figure /= val_steps
                mean_val_fp_figure /= val_steps
                mean_val_fn_figure /= val_steps
                # ============================== save on new best accuracy
                if mean_val_accuracy_figure > max_acc:
                    max_acc = mean_val_accuracy_figure
                    if save_num < 3:  # first three saves don't raise the bar
                        max_acc = 0.9
                    log.info('MAX_ACC change to {}'.format(
                        mean_val_accuracy_figure))
                    model_save_path_max = ops.join(
                        model_save_dir, 'tusimple_lanenet_{}.ckpt'.format(
                            mean_val_accuracy_figure))
                    saver.save(sess=sess,
                               save_path=model_save_path_max,
                               global_step=global_step)
                    save_num += 1
                # ==============================
                log.info(
                    'MEAN Val: total_loss= {:6f} binary_seg_loss= {:6f} '
                    'instance_seg_loss= {:6f} accuracy= {:6f} fp= {:6f} fn= {:6f}'
                    ' mean_cost_time= {:5f}s '.format(
                        mean_val_c, mean_val_binary_loss,
                        mean_val_instance_loss, mean_val_accuracy_figure,
                        mean_val_fp_figure, mean_val_fn_figure,
                        val_cost_time))
    return