示例#1
0
def generate_tfrecords():
    """
    Write the cityscapes tfrecords through the project's tf io helper.

    Builds a ``CityScapesTfIO`` object from the module-level ``CFG`` and
    calls ``write_tfrecords()`` on its ``writer`` attribute.

    :return: None
    """
    tfrecords_writer = cityscapes_tf_io.CityScapesTfIO(cfg=CFG).writer
    tfrecords_writer.write_tfrecords()
    def __init__(self):
        """
        Initialize the bisenetv2 trainer and build its complete training graph.

        Everything is configured from the module-level ``CFG`` object. In order,
        the constructor:

        * creates the cityscapes dataset io object and keeps its train reader,
        * copies training / solver hyper-parameters from ``CFG`` onto ``self``,
        * creates the ``tf.Session`` used for training,
        * fetches one (image, label) batch tensor pair from the train reader,
        * builds the BiseNetV2 loss and inference outputs on that batch,
        * optionally builds a streaming mean-iou metric,
        * builds the learning-rate schedule, the moving-average op, the
          optimizer and the final ``self._train_op``,
        * builds the checkpoint loader / saver and the summary ops.

        Side effects on disk: an existing directory at ``self._tboard_save_dir``
        is removed and recreated, and the configuration is written into it
        through ``CFG.dump_to_json_file``.

        :raises ValueError: if the lower-cased ``CFG.SOLVER.OPTIMIZER`` is
            neither ``'sgd'`` nor ``'adam'``
        """
        # define solver params and dataset
        # NOTE(review): the io object is built without a cfg argument here -
        # confirm the constructor in use has a default for it.
        self._cityscapes_io = cityscapes_tf_io.CityScapesTfIO()
        self._train_dataset = self._cityscapes_io.train_dataset_reader
        # the length of the train reader is used as the number of steps in one epoch
        self._steps_per_epoch = len(self._train_dataset)

        self._model_name = CFG.MODEL.MODEL_NAME

        self._train_epoch_nums = CFG.TRAIN.EPOCH_NUMS
        self._batch_size = CFG.TRAIN.BATCH_SIZE
        self._snapshot_epoch = CFG.TRAIN.SNAPSHOT_EPOCH
        # checkpoints and tensorboard logs each go to a sub directory named after the model
        self._model_save_dir = ops.join(CFG.TRAIN.MODEL_SAVE_DIR, self._model_name)
        self._tboard_save_dir = ops.join(CFG.TRAIN.TBOARD_SAVE_DIR, self._model_name)
        self._enable_miou = CFG.TRAIN.COMPUTE_MIOU.ENABLE
        # _record_miou_epoch only exists as an attribute when miou is enabled
        if self._enable_miou:
            self._record_miou_epoch = CFG.TRAIN.COMPUTE_MIOU.EPOCH
        # half of every TRAIN_CROP_SIZE entry, truncated to int
        self._input_tensor_size = [int(tmp / 2) for tmp in CFG.AUG.TRAIN_CROP_SIZE]

        self._init_learning_rate = CFG.SOLVER.LR
        self._moving_ave_decay = CFG.SOLVER.MOVING_AVE_DECAY
        self._momentum = CFG.SOLVER.MOMENTUM
        self._lr_polynimal_decay_power = CFG.SOLVER.LR_POLYNOMIAL_POWER
        self._optimizer_mode = CFG.SOLVER.OPTIMIZER.lower()

        # None means no snapshot path was configured to restore from
        if CFG.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
            self._initial_weight = CFG.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
        else:
            self._initial_weight = None
        # NOTE(review): _warmup_init_learning_rate is only defined when warm up
        # is enabled, yet _compute_warmup_lr is still called while the tf.cond
        # below is constructed (graph-mode tf.cond builds both branches) -
        # confirm that helper does not need the attribute when warm up is off.
        if CFG.TRAIN.WARM_UP.ENABLE:
            self._warmup_epoches = CFG.TRAIN.WARM_UP.EPOCH_NUMS
            # warm up starts from one thousandth of the configured learning rate
            self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
        else:
            self._warmup_epoches = 0

        # define tensorflow session
        # gpu memory fraction and allow_growth come from CFG.GPU
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.GPU.GPU_MEMORY_FRACTION
        sess_config.gpu_options.allow_growth = CFG.GPU.TF_ALLOW_GROWTH
        sess_config.gpu_options.allocator_type = 'BFC'
        self._sess = tf.Session(config=sess_config)

        # define graph input tensor
        # the same batch tensors feed the loss, the prediction and the miou metric below
        with tf.variable_scope(name_or_scope='graph_input_node'):
            self._input_src_image, self._input_label_image = self._train_dataset.next_batch(
                batch_size=self._batch_size
            )

        # define model loss
        self._model = bisenet_v2.BiseNetV2(phase='train', cfg=CFG)
        # the loss is built first with reuse=False, then inference is built
        # under the same name with reuse=True (presumably so both share one
        # set of weights - confirm in bisenet_v2)
        loss_set = self._model.compute_loss(
            input_tensor=self._input_src_image,
            label_tensor=self._input_label_image,
            name='BiseNetV2',
            reuse=False
        )
        self._prediciton = self._model.inference(
            input_tensor=self._input_src_image,
            name='BiseNetV2',
            reuse=True
        )
        self._loss = loss_set['total_loss']
        self._l2_loss = loss_set['l2_loss']

        # define miou
        if self._enable_miou:
            with tf.variable_scope('miou'):
                # flatten prediction and label, then keep only the positions whose
                # label value is <= NUM_CLASSES - 1 (presumably this drops
                # ignore labels - confirm against the dataset encoding)
                pred = tf.reshape(self._prediciton, [-1, ])
                gt = tf.reshape(self._input_label_image, [-1, ])
                indices = tf.squeeze(tf.where(tf.less_equal(gt, CFG.DATASET.NUM_CLASSES - 1)), 1)
                gt = tf.gather(gt, indices)
                pred = tf.gather(pred, indices)
                # keep both the metric value tensor and its update op
                self._miou, self._miou_update_op = tf.metrics.mean_iou(
                    labels=gt,
                    predictions=pred,
                    num_classes=CFG.DATASET.NUM_CLASSES
                )

        # define learning rate
        with tf.variable_scope('learning_rate'):
            # the global step is a float32 variable that starts at 1.0
            self._global_step = tf.Variable(1.0, dtype=tf.float32, trainable=False, name='global_step')
            warmup_steps = tf.constant(
                self._warmup_epoches * self._steps_per_epoch, dtype=tf.float32, name='warmup_steps'
            )
            train_steps = tf.constant(
                self._train_epoch_nums * self._steps_per_epoch, dtype=tf.float32, name='train_steps'
            )
            # warm up lr while global_step < warmup_steps, otherwise polynomial
            # decay from the initial lr towards 1e-6 over train_steps
            self._learn_rate = tf.cond(
                pred=self._global_step < warmup_steps,
                true_fn=lambda: self._compute_warmup_lr(warmup_steps=warmup_steps, name='warmup_lr'),
                false_fn=lambda: tf.train.polynomial_decay(
                    learning_rate=self._init_learning_rate,
                    global_step=self._global_step,
                    decay_steps=train_steps,
                    end_learning_rate=0.000001,
                    power=self._lr_polynimal_decay_power)
            )
            self._learn_rate = tf.identity(self._learn_rate, 'lr')
            # increments the global step by 1.0; attached to the train op below
            global_step_update = tf.assign_add(self._global_step, 1.0)

        # define moving average op
        with tf.variable_scope(name_or_scope='moving_avg'):
            # with FREEZE_BN enabled, variables whose name contains 'beta' or
            # 'gamma' are left out
            if CFG.TRAIN.FREEZE_BN.ENABLE:
                train_var_list = [
                    v for v in tf.trainable_variables() if 'beta' not in v.name and 'gamma' not in v.name
                ]
            else:
                train_var_list = tf.trainable_variables()
            # the average covers the selected trainable variables plus
            # whatever tf.moving_average_variables() returns
            moving_ave_op = tf.train.ExponentialMovingAverage(
                self._moving_ave_decay).apply(train_var_list + tf.moving_average_variables())

        # define training op
        with tf.variable_scope(name_or_scope='train_step'):
            # same variable filter as in the moving average section above
            if CFG.TRAIN.FREEZE_BN.ENABLE:
                train_var_list = [
                    v for v in tf.trainable_variables() if 'beta' not in v.name and 'gamma' not in v.name
                ]
            else:
                train_var_list = tf.trainable_variables()
            if self._optimizer_mode == 'sgd':
                optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self._learn_rate,
                    momentum=self._momentum
                )
            elif self._optimizer_mode == 'adam':
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self._learn_rate,
                )
            else:
                raise ValueError('Not support optimizer: {:s}'.format(self._optimizer_mode))
            optimize_op = optimizer.minimize(self._loss, var_list=train_var_list)
            # the train op is a no_op created inside three nested control
            # dependency contexts, so running it first runs the UPDATE_OPS
            # collection, the optimizer step, the global step increment and
            # the moving average update
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                with tf.control_dependencies([optimize_op, global_step_update]):
                    with tf.control_dependencies([moving_ave_op]):
                        self._train_op = tf.no_op()

        # define saver and loader
        with tf.variable_scope('loader_and_saver'):
            # the loader skips every global variable whose name contains the
            # substring 'lr'; the saver stores all global variables and keeps
            # at most 5 checkpoints
            self._net_var = [vv for vv in tf.global_variables() if 'lr' not in vv.name]
            self._loader = tf.train.Saver(self._net_var)
            self._saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # define summary
        with tf.variable_scope('summary'):
            summary_merge_list = [
                tf.summary.scalar("learn_rate", self._learn_rate),
                tf.summary.scalar("total", self._loss),
                tf.summary.scalar('l2_loss', self._l2_loss)
            ]
            if self._enable_miou:
                # these summary ops are created under a control dependency on the
                # miou update op, so running the merged op also updates the metric
                with tf.control_dependencies([self._miou_update_op]):
                    summary_merge_list_with_miou = [
                        tf.summary.scalar("learn_rate", self._learn_rate),
                        tf.summary.scalar("total", self._loss),
                        tf.summary.scalar('l2_loss', self._l2_loss),
                        tf.summary.scalar('miou', self._miou)
                    ]
                    self._write_summary_op_with_miou = tf.summary.merge(summary_merge_list_with_miou)
            # any existing tensorboard dir for this model is wiped before the
            # new run writes into it
            if ops.exists(self._tboard_save_dir):
                shutil.rmtree(self._tboard_save_dir)
            os.makedirs(self._tboard_save_dir, exist_ok=True)
            # store the configuration used for this run next to the event files
            model_params_file_save_path = ops.join(self._tboard_save_dir, CFG.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
            with open(model_params_file_save_path, 'w', encoding='utf-8') as f_obj:
                CFG.dump_to_json_file(f_obj)
            self._write_summary_op = tf.summary.merge(summary_merge_list)
            self._summary_writer = tf.summary.FileWriter(self._tboard_save_dir, graph=self._sess.graph)

        LOG.info('Initialize cityscapes bisenetv2 trainner complete')
    def __init__(self, cfg):
        """
        Initialize the multi gpu trainer and build the train / validation graph.

        In order, the constructor:

        * creates the cityscapes dataset io object and keeps its train and
          validation readers,
        * copies training / solver / multi-gpu settings from ``cfg`` onto ``self``,
        * creates the ``tf.Session`` used for training,
        * fetches one train batch per gpu and one validation batch,
        * builds one gradient tower per gpu, averages the tower gradients and
          builds ``self._train_op``,
        * builds validation loss, predictions and optional mean-iou metrics,
        * builds the checkpoint loader / saver and the summary ops.

        Side effects on disk: an existing directory at ``self._tboard_save_dir``
        is removed and recreated, and the configuration is written into it
        through ``cfg.dump_to_json_file``.

        NOTE(review): the variable scopes below are named ``'SFNet'`` while the
        model class is ``resnet_fcn.ResNetFCN`` and the final log message says
        "resnet fcn" - confirm which name is intended.

        :param cfg: configuration object; its MODEL, TRAIN, SOLVER, GPU and
            DATASET sections are read here
        :raises NotImplementedError: if the lower-cased ``cfg.SOLVER.OPTIMIZER``
            is neither ``'sgd'`` nor ``'adam'``
        """
        self._cfg = cfg
        # define solver params and dataset
        self._cityscapes_io = cityscapes_tf_io.CityScapesTfIO(cfg=cfg)
        self._train_dataset = self._cityscapes_io.train_dataset_reader
        self._val_dataset = self._cityscapes_io.val_dataset_reader
        # the length of each reader is used as the number of steps in one epoch
        self._steps_per_epoch = len(self._train_dataset)
        self._val_steps_per_epoch = len(self._val_dataset)

        self._model_name = self._cfg.MODEL.MODEL_NAME

        self._train_epoch_nums = self._cfg.TRAIN.EPOCH_NUMS
        self._batch_size = self._cfg.TRAIN.BATCH_SIZE
        self._val_batch_size = self._cfg.TRAIN.VAL_BATCH_SIZE
        self._snapshot_epoch = self._cfg.TRAIN.SNAPSHOT_EPOCH
        # checkpoints and tensorboard logs each go to a sub directory named after the model
        self._model_save_dir = ops.join(self._cfg.TRAIN.MODEL_SAVE_DIR,
                                        self._model_name)
        self._tboard_save_dir = ops.join(self._cfg.TRAIN.TBOARD_SAVE_DIR,
                                         self._model_name)
        self._enable_miou = self._cfg.TRAIN.COMPUTE_MIOU.ENABLE
        # _record_miou_epoch only exists as an attribute when miou is enabled
        if self._enable_miou:
            self._record_miou_epoch = self._cfg.TRAIN.COMPUTE_MIOU.EPOCH
        self._gpu_devices = self._cfg.TRAIN.MULTI_GPU.GPU_DEVICES
        self._gpu_nums = len(self._gpu_devices)
        self._chief_gpu_index = self._cfg.TRAIN.MULTI_GPU.CHIEF_DEVICE_INDEX
        # the total batch is split across the gpus; int() truncates, so a batch
        # size that is not a multiple of the gpu count loses the remainder
        self._batch_size_per_gpu = int(self._batch_size / self._gpu_nums)

        self._init_learning_rate = self._cfg.SOLVER.LR
        self._moving_ave_decay = self._cfg.SOLVER.MOVING_AVE_DECAY
        self._momentum = self._cfg.SOLVER.MOMENTUM
        self._lr_polynimal_decay_power = self._cfg.SOLVER.LR_POLYNOMIAL_POWER
        self._optimizer_mode = self._cfg.SOLVER.OPTIMIZER.lower()

        # None means no snapshot path was configured to restore from
        if self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
            self._initial_weight = self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
        else:
            self._initial_weight = None
        # NOTE(review): _warmup_init_learning_rate is only defined when warm up
        # is enabled, yet _compute_warmup_lr is still called while the tf.cond
        # below is constructed (graph-mode tf.cond builds both branches) -
        # confirm that helper does not need the attribute when warm up is off.
        if self._cfg.TRAIN.WARM_UP.ENABLE:
            self._warmup_epoches = self._cfg.TRAIN.WARM_UP.EPOCH_NUMS
            # warm up starts from one thousandth of the configured learning rate
            self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
        else:
            self._warmup_epoches = 0

        # define tensorflow session
        # gpu memory fraction and allow_growth come from cfg.GPU
        sess_config = tf.ConfigProto(allow_soft_placement=True)
        sess_config.gpu_options.per_process_gpu_memory_fraction = self._cfg.GPU.GPU_MEMORY_FRACTION
        sess_config.gpu_options.allow_growth = self._cfg.GPU.TF_ALLOW_GROWTH
        sess_config.gpu_options.allocator_type = 'BFC'
        self._sess = tf.Session(config=sess_config)

        # define graph input tensor
        with tf.variable_scope(name_or_scope='graph_input_node'):
            self._input_src_image_list = []
            self._input_label_image_list = []
            # one next_batch call per gpu, each asking for the per-gpu batch size
            for i in range(self._gpu_nums):
                src_imgs, label_imgs = self._train_dataset.next_batch(
                    batch_size=self._batch_size_per_gpu)
                self._input_src_image_list.append(src_imgs)
                self._input_label_image_list.append(label_imgs)
            self._val_input_src_image, self._val_input_label_image = self._val_dataset.next_batch(
                batch_size=self._val_batch_size)

        # define model
        # two model objects over the same cfg: phase 'train' for the towers and
        # phase 'test' for the validation branch
        self._model = resnet_fcn.ResNetFCN(phase='train', cfg=self._cfg)
        self._val_model = resnet_fcn.ResNetFCN(phase='test', cfg=self._cfg)

        # define average container
        tower_grads = []
        tower_total_loss = []
        tower_l2_loss = []
        # filled inside the tower loop when the loop index equals the chief gpu index
        batchnorm_updates = None

        # define learning rate
        with tf.variable_scope('learning_rate'):
            # both step counters are float32 variables that start at 1.0
            self._global_step = tf.Variable(1.0,
                                            dtype=tf.float32,
                                            trainable=False,
                                            name='global_step')
            self._val_global_step = tf.Variable(1.0,
                                                dtype=tf.float32,
                                                trainable=False,
                                                name='val_global_step')
            # increments the validation step; attached to the validation miou
            # summary further down
            self._val_global_step_update = tf.assign_add(
                self._val_global_step, 1.0)
            warmup_steps = tf.constant(self._warmup_epoches *
                                       self._steps_per_epoch,
                                       dtype=tf.float32,
                                       name='warmup_steps')
            train_steps = tf.constant(self._train_epoch_nums *
                                      self._steps_per_epoch,
                                      dtype=tf.float32,
                                      name='train_steps')
            # warm up lr while global_step < warmup_steps, otherwise polynomial
            # decay from the initial lr towards 1e-9 over train_steps
            self._learn_rate = tf.cond(
                pred=self._global_step < warmup_steps,
                true_fn=lambda: self._compute_warmup_lr(
                    warmup_steps=warmup_steps, name='warmup_lr'),
                false_fn=lambda: tf.train.polynomial_decay(
                    learning_rate=self._init_learning_rate,
                    global_step=self._global_step,
                    decay_steps=train_steps,
                    end_learning_rate=0.000000001,
                    power=self._lr_polynimal_decay_power))
            self._learn_rate = tf.identity(self._learn_rate, 'lr')

        # define optimizer
        if self._optimizer_mode == 'sgd':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=self._learn_rate, momentum=self._momentum)
        elif self._optimizer_mode == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self._learn_rate, )
        else:
            raise NotImplementedError(
                'Not support optimizer: {:s} for now'.format(
                    self._optimizer_mode))

        # define distributed train op
        with tf.variable_scope(tf.get_variable_scope()):
            # False only for the first tower (presumably so later towers reuse
            # the variables created by the first - confirm in
            # _compute_net_gradients)
            is_network_initialized = False
            for i in range(self._gpu_nums):
                # NOTE(review): the device string is built from the loop index,
                # not from the entries of self._gpu_devices - confirm this is
                # intended.
                with tf.device('/gpu:{:d}'.format(i)):
                    with tf.name_scope('tower_{:d}'.format(i)) as _:
                        input_images = self._input_src_image_list[i]
                        input_labels = self._input_label_image_list[i]
                        # returns a loss dict ('total_loss', 'l2_loss') and the
                        # gradients for this tower
                        tmp_loss, tmp_grads = self._compute_net_gradients(
                            input_images,
                            input_labels,
                            optimizer,
                            is_net_first_initialized=is_network_initialized)
                        is_network_initialized = True

                        # Only use the mean and var in the chief gpu tower to update the parameter
                        # NOTE(review): get_collection is called without a
                        # scope, so this returns every update op registered so
                        # far; with a chief index above 0 that would also
                        # include earlier towers. If the chief index matches no
                        # tower, batchnorm_updates stays None and the tf.group
                        # call below fails.
                        if i == self._chief_gpu_index:
                            batchnorm_updates = tf.get_collection(
                                tf.GraphKeys.UPDATE_OPS)

                        tower_grads.append(tmp_grads)
                        tower_total_loss.append(tmp_loss['total_loss'])
                        tower_l2_loss.append(tmp_loss['l2_loss'])
        # combine the per-tower gradients and report the mean of the tower losses
        grads = self._average_gradients(tower_grads)
        self._loss = tf.reduce_mean(tower_total_loss,
                                    name='reduce_mean_tower_total_loss')
        self._l2_loss = tf.reduce_mean(tower_l2_loss,
                                       name='reduce_mean_tower_l2_loss')
        # validation loss is built with reuse=True under the 'SFNet' name
        # (presumably the scope used inside _compute_net_gradients - confirm)
        ret = self._val_model.compute_loss(
            input_tensor=self._val_input_src_image,
            label_tensor=self._val_input_label_image,
            name='SFNet',
            reuse=True)
        self._val_loss = ret['total_loss']
        self._val_l2_loss = ret['l2_loss']

        # define moving average op
        with tf.variable_scope(name_or_scope='moving_avg'):
            # with FREEZE_BN enabled, variables whose name contains 'beta' or
            # 'gamma' are left out
            if self._cfg.TRAIN.FREEZE_BN.ENABLE:
                train_var_list = [
                    v for v in tf.trainable_variables()
                    if 'beta' not in v.name and 'gamma' not in v.name
                ]
            else:
                train_var_list = tf.trainable_variables()
            # the average covers the selected trainable variables plus
            # whatever tf.moving_average_variables() returns
            moving_ave_op = tf.train.ExponentialMovingAverage(
                self._moving_ave_decay).apply(train_var_list +
                                              tf.moving_average_variables())

        # group all the op needed for training
        batchnorm_updates_op = tf.group(*batchnorm_updates)
        # self._global_step is handed to apply_gradients as its global_step,
        # so no separate step-increment op is created in this constructor
        apply_gradient_op = optimizer.apply_gradients(
            grads, global_step=self._global_step)
        self._train_op = tf.group(apply_gradient_op, moving_ave_op,
                                  batchnorm_updates_op)

        # define prediction
        # the train prediction is built on the chief gpu's input batch only
        self._prediciton = self._model.inference(
            input_tensor=self._input_src_image_list[self._chief_gpu_index],
            name='SFNet',
            reuse=True)
        self._val_prediction = self._val_model.inference(
            input_tensor=self._val_input_src_image, name='SFNet', reuse=True)

        # define miou
        if self._enable_miou:
            with tf.variable_scope('miou'):
                # train miou: flatten prediction and label of the chief gpu
                # batch, then keep only the positions whose label value is
                # <= NUM_CLASSES - 1 (presumably this drops ignore labels -
                # confirm against the dataset encoding)
                pred = tf.reshape(self._prediciton, [
                    -1,
                ])
                gt = tf.reshape(
                    self._input_label_image_list[self._chief_gpu_index], [
                        -1,
                    ])
                indices = tf.squeeze(
                    tf.where(
                        tf.less_equal(gt, self._cfg.DATASET.NUM_CLASSES - 1)),
                    1)
                gt = tf.gather(gt, indices)
                pred = tf.gather(pred, indices)
                # keep both the metric value tensor and its update op
                self._miou, self._miou_update_op = tf.metrics.mean_iou(
                    labels=gt,
                    predictions=pred,
                    num_classes=self._cfg.DATASET.NUM_CLASSES)

                # validation miou: same filtering applied to the validation batch
                val_pred = tf.reshape(self._val_prediction, [
                    -1,
                ])
                val_gt = tf.reshape(self._val_input_label_image, [
                    -1,
                ])
                indices = tf.squeeze(
                    tf.where(
                        tf.less_equal(val_gt,
                                      self._cfg.DATASET.NUM_CLASSES - 1)), 1)
                val_gt = tf.gather(val_gt, indices)
                val_pred = tf.gather(val_pred, indices)
                self._val_miou, self._val_miou_update_op = tf.metrics.mean_iou(
                    labels=val_gt,
                    predictions=val_pred,
                    num_classes=self._cfg.DATASET.NUM_CLASSES)

        # define saver and loader
        with tf.variable_scope('loader_and_saver'):
            # the loader skips every global variable whose name contains the
            # substring 'lr'; the saver is created without a variable list and
            # keeps at most 10 checkpoints
            self._net_var = [
                vv for vv in tf.global_variables() if 'lr' not in vv.name
            ]
            self._loader = tf.train.Saver(self._net_var)
            self._saver = tf.train.Saver(max_to_keep=10)

        # define summary
        with tf.variable_scope('summary'):
            summary_merge_list = [
                tf.summary.scalar("learn_rate", self._learn_rate),
                tf.summary.scalar("total_loss", self._loss),
                tf.summary.scalar('l2_loss', self._l2_loss)
            ]
            val_summary_merge_list = [
                tf.summary.scalar('val_total_loss', self._val_loss),
                tf.summary.scalar('val_l2_loss', self._val_l2_loss)
            ]
            if self._enable_miou:
                # these summary ops are created under a control dependency on the
                # train miou update op, so running the merged op also updates
                # the metric
                with tf.control_dependencies([self._miou_update_op]):
                    summary_merge_list_with_miou = [
                        tf.summary.scalar("learn_rate", self._learn_rate),
                        tf.summary.scalar("total_loss", self._loss),
                        tf.summary.scalar('l2_loss', self._l2_loss),
                        tf.summary.scalar('miou', self._miou)
                    ]
                    self._write_summary_op_with_miou = tf.summary.merge(
                        summary_merge_list_with_miou)
                # the validation miou summary additionally depends on the
                # validation step increment
                with tf.control_dependencies(
                    [self._val_miou_update_op, self._val_global_step_update]):
                    val_summary_merge_list_with_miou = [
                        tf.summary.scalar('val_total_loss', self._val_loss),
                        tf.summary.scalar('val_l2_loss', self._val_l2_loss),
                        tf.summary.scalar('val_miou', self._val_miou),
                    ]
                    self._val_write_summary_op_with_miou = tf.summary.merge(
                        val_summary_merge_list_with_miou)
            # any existing tensorboard dir for this model is wiped before the
            # new run writes into it
            if ops.exists(self._tboard_save_dir):
                shutil.rmtree(self._tboard_save_dir)
            os.makedirs(self._tboard_save_dir, exist_ok=True)
            # store the configuration used for this run next to the event files
            model_params_file_save_path = ops.join(
                self._tboard_save_dir,
                self._cfg.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
            with open(model_params_file_save_path, 'w',
                      encoding='utf-8') as f_obj:
                self._cfg.dump_to_json_file(f_obj)
            self._write_summary_op = tf.summary.merge(summary_merge_list)
            self._val_write_summary_op = tf.summary.merge(
                val_summary_merge_list)
            self._summary_writer = tf.summary.FileWriter(
                self._tboard_save_dir, graph=self._sess.graph)

        LOG.info(
            'Initialize cityscapes resnet fcn multi gpu trainner complete')