Example #1
def load_weights(self, weights, weight_type='pretrain'):
    assert weight_type in ['pretrain', 'resume', 'finetune'], \
            "weight_type can only be 'pretrain', 'resume', 'finetune'"
    if weight_type == 'resume':
        self.start_epoch = load_weight(self.model, weights, self.optimizer)
        logger.debug("Resume weights of epoch {}".format(self.start_epoch))
    else:
        self.start_epoch = 0
        load_pretrain_weight(self.model, weights,
                             self.cfg.get('load_static_weights', False),
                             weight_type)
        logger.debug("Load {} weights {} to start training".format(
            weight_type, weights))
    self._weights_loaded = True
Example #2
    def __init__(self, cfg, slim_cfg):
        super(DistillModel, self).__init__()

        self.student_model = create(cfg.architecture)
        logger.debug('Load student model pretrain_weights:{}'.format(
            cfg.pretrain_weights))
        load_pretrain_weight(self.student_model, cfg.pretrain_weights)

        slim_cfg = load_config(slim_cfg)
        self.teacher_model = create(slim_cfg.architecture)
        self.distill_loss = create(slim_cfg.distill_loss)
        logger.debug('Load teacher model pretrain_weights:{}'.format(
            slim_cfg.pretrain_weights))
        load_pretrain_weight(self.teacher_model, slim_cfg.pretrain_weights)

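        # Freeze the teacher model's parameters so they are not updated during training.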
        for param in self.teacher_model.parameters():
            param.trainable = False
Example #3
def build_slim_model(cfg, slim_cfg, mode='train'):
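    # Read the slim YAML first to decide which compression strategy to build.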
    with open(slim_cfg) as f:
        slim_load_cfg = yaml.load(f, Loader=yaml.Loader)
    if mode != 'train' and slim_load_cfg['slim'] == 'Distill':
        return cfg

    if slim_load_cfg['slim'] == 'Distill':
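        # Distillation: build a combined student/teacher model from the base and slim configs.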
        model = DistillModel(cfg, slim_cfg)
        cfg['model'] = model
    elif slim_load_cfg['slim'] == 'DistillPrune':
        if mode == 'train':
            model = DistillModel(cfg, slim_cfg)
            pruner = create(cfg.pruner)
            pruner(model.student_model)
        else:
            model = create(cfg.architecture)
            weights = cfg.weights
            load_config(slim_cfg)
            pruner = create(cfg.pruner)
            model = pruner(model)
            load_pretrain_weight(model, weights)
        cfg['model'] = model
        cfg['slim_type'] = cfg.slim
    elif slim_load_cfg['slim'] == 'PTQ':
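        # Post-training quantization: load trained weights, then wrap the model with the slim object.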
        model = create(cfg.architecture)
        load_config(slim_cfg)
        load_pretrain_weight(model, cfg.weights)
        slim = create(cfg.slim)
        cfg['slim_type'] = cfg.slim
        cfg['model'] = slim(model)
        cfg['slim'] = slim
    elif slim_load_cfg['slim'] == 'UnstructuredPruner':
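        # Unstructured pruning: only the pruner object and flags are recorded; no model is built here.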
        load_config(slim_cfg)
        slim = create(cfg.slim)
        cfg['slim_type'] = cfg.slim
        cfg['slim'] = slim
        cfg['unstructured_prune'] = True
    else:
        load_config(slim_cfg)
        model = create(cfg.architecture)
        if mode == 'train':
            load_pretrain_weight(model, cfg.pretrain_weights)
        slim = create(cfg.slim)
        cfg['slim_type'] = cfg.slim
        # TODO: fix quant export model in framework.
        if mode == 'test' and slim_load_cfg['slim'] == 'QAT':
            slim.quant_config['activation_preprocess_type'] = None
        cfg['model'] = slim(model)
        cfg['slim'] = slim
        if mode != 'train':
            load_pretrain_weight(cfg['model'], cfg.weights)

    return cfg
Example #4
def build_slim_model(cfg, slim_cfg, mode='train'):
    with open(slim_cfg) as f:
        slim_load_cfg = yaml.load(f, Loader=yaml.Loader)
    if mode != 'train' and slim_load_cfg['slim'] == 'Distill':
        return cfg

    if slim_load_cfg['slim'] == 'Distill':
        model = DistillModel(cfg, slim_cfg)
        cfg['model'] = model
    elif slim_load_cfg['slim'] == 'DistillPrune':
        if mode == 'train':
            model = DistillModel(cfg, slim_cfg)
            pruner = create(cfg.pruner)
            pruner(model.student_model)
        else:
            model = create(cfg.architecture)
            weights = cfg.weights
            load_config(slim_cfg)
            pruner = create(cfg.pruner)
            model = pruner(model)
            load_pretrain_weight(model, weights)
        cfg['model'] = model
    else:
        load_config(slim_cfg)
        model = create(cfg.architecture)
        if mode == 'train':
            load_pretrain_weight(model, cfg.pretrain_weights)
        slim = create(cfg.slim)
        cfg['model'] = slim(model)
        cfg['slim'] = slim
        if mode != 'train':
            load_pretrain_weight(cfg['model'], cfg.weights)

    return cfg
Example #5
def load_weights(self, weights):
    if self.is_loaded_weights:
        return
    self.start_epoch = 0
    if hasattr(self.model, 'detector'):
        if self.model.__class__.__name__ == 'FairMOT':
            load_pretrain_weight(self.model, weights)
        else:
            load_pretrain_weight(self.model.detector, weights)
    else:
        load_pretrain_weight(self.model, weights)
    logger.debug("Load weights {} to start training".format(weights))
Example #6
def run(FLAGS, cfg, place):
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()

    # Data
    dataset = cfg.TrainDataset
    train_loader, step_per_epoch = create('TrainReader')(dataset,
                                                         cfg['worker_num'],
                                                         place)

    # Model
    model = create(cfg.architecture)

    # Optimizer
    lr = create('LearningRate')(step_per_epoch)
    optimizer = create('OptimizerBuilder')(lr, model.parameters())

    # Init Model & Optimizer
    if FLAGS.weight_type == 'resume':
        load_weight(model, cfg.pretrain_weights, optimizer)
    else:
        load_pretrain_weight(model, cfg.pretrain_weights,
                             cfg.get('load_static_weights', False),
                             FLAGS.weight_type)

    if getattr(model.backbone, 'norm_type', None) == 'sync_bn':
        assert cfg.use_gpu and ParallelEnv(
        ).nranks > 1, 'you should use bn rather than sync_bn while using a single gpu'
    # sync_bn = (getattr(model.backbone, 'norm_type', None) == 'sync_bn' and
    #            cfg.use_gpu and ParallelEnv().nranks > 1)
    # if sync_bn:
    #     model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # Parallel Model
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    fields = train_loader.collate_fn.output_fields
    # Timing stats for ETA and throughput logging
    time_stat = deque(maxlen=cfg.log_iter)
    start_time = time.time()
    end_time = time.time()
    # Run Train
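    # Resume the epoch index from the LR scheduler state attached to the optimizer.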
    start_epoch = optimizer.state_dict()['LR_Scheduler']['last_epoch']
    for epoch_id in range(int(cfg.epoch)):
        cur_eid = epoch_id + start_epoch
        train_loader.dataset.epoch = cur_eid
        for iter_id, data in enumerate(train_loader):
            start_time = end_time
            end_time = time.time()
            time_stat.append(end_time - start_time)
            time_cost = np.mean(time_stat)
            eta_sec = (
                (cfg.epoch - cur_eid) * step_per_epoch - iter_id) * time_cost
            eta = str(datetime.timedelta(seconds=int(eta_sec)))

            # Model Forward
            model.train()
            outputs = model(data, fields, 'train')

            # Model Backward
            loss = outputs['loss']
            if ParallelEnv().nranks > 1:
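                # Scale the loss and all-reduce gradients manually under DataParallel.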
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.step()
            curr_lr = optimizer.get_lr()
            lr.step()
            optimizer.clear_grad()

            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                # Log state
                if epoch_id == 0 and iter_id == 0:
                    train_stats = TrainingStats(cfg.log_iter, outputs.keys())
                train_stats.update(outputs)
                logs = train_stats.log()
                if iter_id % cfg.log_iter == 0:
                    ips = float(cfg['TrainReader']['batch_size']) / time_cost
                    strs = 'Epoch:{}: iter: {}, lr: {:.6f}, {}, eta: {}, batch_cost: {:.5f} sec, ips: {:.5f} images/sec'.format(
                        cur_eid, iter_id, curr_lr, logs, eta, time_cost, ips)
                    logger.info(strs)

        # Save Stage
        if ParallelEnv().local_rank == 0 and (cur_eid % cfg.snapshot_epoch == 0
                                              or
                                              (cur_eid + 1) == int(cfg.epoch)):
            cfg_name = os.path.basename(FLAGS.config).split('.')[0]
            save_name = str(cur_eid) if cur_eid + 1 != int(
                cfg.epoch) else "model_final"
            save_dir = os.path.join(cfg.save_dir, cfg_name)
            save_model(model, optimizer, save_dir, save_name)
Example #7
def load_weights(self, weights):
    if self.is_loaded_weights:
        return
    self.start_epoch = 0
    load_pretrain_weight(self.model, weights)
    logger.debug("Load weights {} to start training".format(weights))
Example #8
def run(FLAGS, cfg, place):
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()

    # Data
    datasets = cfg.TrainDataset
    train_loader = create('TrainReader')(datasets, cfg['worker_num'])
    steps = len(train_loader)

    # Model
    model = create(cfg.architecture)

    # Optimizer
    lr = create('LearningRate')(steps)
    optimizer = create('OptimizerBuilder')(lr, model.parameters())

    # Init Model & Optimizer
    start_epoch = 0
    if FLAGS.weight_type == 'resume':
        start_epoch = load_weight(model, cfg.pretrain_weights, optimizer)
    else:
        load_pretrain_weight(model, cfg.pretrain_weights,
                             cfg.get('load_static_weights', False),
                             FLAGS.weight_type)

    if getattr(model.backbone, 'norm_type', None) == 'sync_bn':
        assert cfg.use_gpu and ParallelEnv(
        ).nranks > 1, 'you should use bn rather than sync_bn while using a single gpu'
    # sync_bn = (getattr(model.backbone, 'norm_type', None) == 'sync_bn' and
    #            cfg.use_gpu and ParallelEnv().nranks > 1)
    # if sync_bn:
    #     model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    # The parameter filter below is a temporary fix for training because of #28997 in Paddle.
    def no_grad(param):
        if param.name.startswith("conv1_") or param.name.startswith("res2a_") \
            or param.name.startswith("res2b_") or param.name.startswith("res2c_"):
            return True

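    # Stop gradients for the early ResNet stages matched by no_grad above.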
    for param in filter(no_grad, model.parameters()):
        param.stop_gradient = True

    # Parallel Model
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)

    # Training schedule, logging stats and timers
    end_epoch = int(cfg.epoch)
    batch_size = int(cfg['TrainReader']['batch_size'])
    total_steps = (end_epoch - start_epoch) * steps
    step_id = 0

    train_stats = stats.TrainingStats(cfg.log_iter)
    batch_time = stats.SmoothedValue(fmt='{avg:.4f}')
    data_time = stats.SmoothedValue(fmt='{avg:.4f}')

    end_time = time.time()
    space_fmt = ':' + str(len(str(steps))) + 'd'
    # Run Train
    for cur_eid in range(start_epoch, end_epoch):
        datasets.set_epoch(cur_eid)
        for iter_id, data in enumerate(train_loader):
            data_time.update(time.time() - end_time)
            # Model Forward
            model.train()
            outputs = model(data, mode='train')
            loss = outputs['loss']
            # Model Backward
            loss.backward()
            optimizer.step()
            curr_lr = optimizer.get_lr()
            lr.step()
            optimizer.clear_grad()

            batch_time.update(time.time() - end_time)
            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                train_stats.update(outputs)
                logs = train_stats.log()
                if iter_id % cfg.log_iter == 0:
                    eta_sec = (total_steps - step_id) * batch_time.global_avg
                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                    ips = float(batch_size) / batch_time.avg
                    fmt = ' '.join([
                        'Epoch: [{}]',
                        '[{' + space_fmt + '}/{}]',
                        '{meters}',
                        'eta: {eta}',
                        'batch_cost: {btime}',
                        'data_cost: {dtime}',
                        'ips: {ips:.4f} images/s',
                    ])
                    fmt = fmt.format(cur_eid,
                                     iter_id,
                                     steps,
                                     meters=logs,
                                     eta=eta_str,
                                     btime=str(batch_time),
                                     dtime=str(data_time),
                                     ips=ips)
                    logger.info(fmt)
            step_id += 1
            end_time = time.time()  # after copy outputs to CPU.
        # Save Stage
        if ParallelEnv().local_rank == 0 and (
                (cur_eid % cfg.snapshot_epoch) == 0 or (cur_eid + 1) == end_epoch):
            save_name = str(
                cur_eid) if cur_eid + 1 != end_epoch else "model_final"
            save_model(model, optimizer, save_dir, save_name, cur_eid + 1)