Example #1
 def _get_data(self, mode):
     """ Check and download Dataset """
     dl_paths = {}
     version = self.config.get("version", "3.0.0")
     if version not in ["1.0.0", "2.0.0", "3.0.0"]:
         raise ValueError("Unsupported version: %s" % version)
     dl_paths["version"] = version
     default_root = os.path.join(DATA_HOME, self.__class__.__name__)
     for k, v in self.cnn_dailymail.items():
         dir_path = os.path.join(default_root, k)
         if not os.path.exists(dir_path):
             get_path_from_url(v["url"], default_root, v["md5"])
         unique_endpoints = _get_unique_endpoints(ParallelEnv()
                                                  .trainer_endpoints[:])
         if ParallelEnv().current_endpoint in unique_endpoints:
             file_num = len(os.listdir(os.path.join(dir_path, "stories")))
             if file_num != v["file_num"]:
                 logger.warning(
                     "Number of %s stories is %d != %d, decompress again." %
                     (k, file_num, v["file_num"]))
                 shutil.rmtree(os.path.join(dir_path, "stories"))
                 _decompress(
                     os.path.join(default_root, os.path.basename(v["url"])))
         dl_paths[k] = dir_path
     filename, url, data_hash = self.SPLITS[mode]
     fullname = os.path.join(default_root, filename)
     if not os.path.exists(fullname) or (data_hash and
                                         not md5file(fullname) == data_hash):
         get_path_from_url(url, default_root, data_hash)
     dl_paths[mode] = fullname
     return dl_paths
Example #2
def _download_dist(url, path, md5sum=None):
    env = os.environ
    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
        if num_trainers <= 1:
            return _download(url, path, md5sum)
        else:
            fname = osp.split(url)[-1]
            fullname = osp.join(path, fname)
            lock_path = fullname + '.download.lock'

            if not osp.isdir(path):
                os.makedirs(path)

            if not osp.exists(fullname):
                from paddle.distributed import ParallelEnv
                unique_endpoints = _get_unique_endpoints(ParallelEnv()
                                                         .trainer_endpoints[:])
                with open(lock_path, 'w'):  # touch    
                    os.utime(lock_path, None)
                if ParallelEnv().current_endpoint in unique_endpoints:
                    _download(url, path, md5sum)
                    os.remove(lock_path)
                else:
                    while os.path.exists(lock_path):
                        time.sleep(0.5)
            return fullname
    else:
        return _download(url, path, md5sum)
Example #3
def get_weights_path_dist(path):
    env = os.environ
    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
        if num_trainers <= 1:
            path = get_weights_path(path)
        else:
            from ppdet.utils.download import map_path, WEIGHTS_HOME
            weight_path = map_path(path, WEIGHTS_HOME)
            lock_path = weight_path + '.lock'
            if not os.path.exists(weight_path):
                from paddle.distributed import ParallelEnv
                unique_endpoints = _get_unique_endpoints(
                    ParallelEnv().trainer_endpoints[:])
                try:
                    os.makedirs(os.path.dirname(weight_path))
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                with open(lock_path, 'w'):  # touch
                    os.utime(lock_path, None)
                if ParallelEnv().current_endpoint in unique_endpoints:
                    get_weights_path(path)
                    os.remove(lock_path)
                else:
                    while os.path.exists(lock_path):
                        time.sleep(1)
            path = weight_path
    else:
        path = get_weights_path(path)

    return path
Example #4
    def _get_data(self, mode, **kwargs):
        """Downloads dataset."""
        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
        filename, data_hash, url, zipfile_hash = self.SPLITS[mode]
        fullname = os.path.join(default_root, filename)
        if mode == 'train':
            if not os.path.exists(fullname):
                get_path_from_url(url, default_root, zipfile_hash)
            unique_endpoints = _get_unique_endpoints(
                ParallelEnv().trainer_endpoints[:])
            if ParallelEnv().current_endpoint in unique_endpoints:
                file_num = len(os.listdir(fullname))
                if file_num != len(ALL_LANGUAGES):
                    logger.warning(
                        "Number of train files is %d != %d, decompress again."
                        % (file_num, len(ALL_LANGUAGES)))
                    shutil.rmtree(fullname)
                    _decompress(
                        os.path.join(default_root, os.path.basename(url)))
        else:
            if not os.path.exists(fullname) or (
                    data_hash and not md5file(fullname) == data_hash):
                get_path_from_url(url, default_root, zipfile_hash)

        return fullname
Example #5
    def __init__(self,
                 dataset,
                 batch_size,
                 is_train,
                 num_workers=4,
                 distributed=True):

        self.dataset = DictDataset(dataset)

        place = paddle.CUDAPlace(ParallelEnv().dev_id) \
                    if ParallelEnv().nranks > 1 else paddle.CUDAPlace(0)

        if distributed:
            sampler = DistributedBatchSampler(
                self.dataset,
                batch_size=batch_size,
                shuffle=True if is_train else False,
                drop_last=True if is_train else False)

            self.dataloader = paddle.io.DataLoader(self.dataset,
                                                   batch_sampler=sampler,
                                                   places=place,
                                                   num_workers=num_workers)
        else:
            self.dataloader = paddle.io.DataLoader(
                self.dataset,
                batch_size=batch_size,
                shuffle=True if is_train else False,
                drop_last=True if is_train else False,
                places=place,
                num_workers=num_workers)

        self.batch_size = batch_size
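
A brief usage sketch of this wrapper; the class name `DictDataLoader` and the dataset object are assumptions for illustration, since the snippet above only shows the constructor:

# hypothetical usage; class name and dataset are illustrative
loader = DictDataLoader(dataset, batch_size=8, is_train=True, num_workers=4)
for batch in loader.dataloader:
    ...  # each batch is a dict of tensors produced by DictDataset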
Example #6
def _decompress_dist(fname):
    env = os.environ
    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
        if num_trainers <= 1:
            _decompress(fname)
        else:
            lock_path = fname + '.decompress.lock'
            from paddle.distributed import ParallelEnv
            unique_endpoints = _get_unique_endpoints(ParallelEnv()
                                                     .trainer_endpoints[:])
            # NOTE(dkp): _decompress_dist is always performed after
            # _download_dist. In _download_dist the sub-trainers wait for
            # the download lock file to be released by sleeping; if
            # decompression is very fast and finishes within that sleeping
            # gap (e.g. for tiny datasets such as coco_ce or spine_coco),
            # the main trainer may finish decompressing and remove the lock
            # file before the sub-trainers ever see it. So the lock file is
            # created only in the main trainer, and every sub-trainer first
            # waits 1s (twice the sleeping gap) for the main trainer to
            # create it, which keeps all trainer pipelines in order.
            # **change this if you have a more elegant method**
            if ParallelEnv().current_endpoint in unique_endpoints:
                with open(lock_path, 'w'):  # touch    
                    os.utime(lock_path, None)
                _decompress(fname)
                os.remove(lock_path)
            else:
                time.sleep(1)
                while os.path.exists(lock_path):
                    time.sleep(0.5)
    else:
        _decompress(fname)
Example #7
 def on_epoch_end(self, status):
     if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
         if self.model.mode == 'eval':
             sample_num = status['sample_num']
             cost_time = status['cost_time']
             logger.info('Total sample number: {}, average FPS: {}'.format(
                 sample_num, sample_num / cost_time))
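
The guard `ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0` recurs in several examples on this page; it restricts logging and checkpointing to the master trainer (or to the single process when the job is not distributed). A minimal sketch with the check factored into a helper; the helper name is ours, not a Paddle API:

from paddle.distributed import ParallelEnv

def is_master_process():
    # True on a single device, or on the rank-0 trainer of a
    # multi-device job; callbacks use this to log/save only once.
    env = ParallelEnv()
    return env.nranks < 2 or env.local_rank == 0

# hypothetical usage inside a callback:
# if is_master_process():
#     logger.info(...)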
Example #8
    def __init__(self, cfg, mode='train'):
        self.cfg = cfg
        assert mode.lower() in ['train', 'eval', 'test'], \
                "mode should be 'train', 'eval' or 'test'"
        self.mode = mode.lower()
        self.optimizer = None

        # build model
        self.model = create(cfg.architecture)

        # model slim build
        if 'slim' in cfg and cfg.slim:
            if self.mode == 'train':
                self.load_weights(cfg.pretrain_weights, cfg.weight_type)
            slim = create(cfg.slim)
            slim(self.model)

        # build data loader
        self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
        if self.mode == 'train':
            self.loader = create('{}Reader'.format(self.mode.capitalize()))(
                self.dataset, cfg.worker_num)
        # EvalDataset is built with a BatchSampler to evaluate on a single device
        # TODO: multi-device evaluate
        if self.mode == 'eval':
            self._eval_batch_sampler = paddle.io.BatchSampler(
                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
            self.loader = create('{}Reader'.format(self.mode.capitalize()))(
                self.dataset, cfg.worker_num, self._eval_batch_sampler)
        # TestDataset is built after the user sets images, so skip loader creation here

        # build optimizer in train mode
        if self.mode == 'train':
            steps_per_epoch = len(self.loader)
            self.lr = create('LearningRate')(steps_per_epoch)
            self.optimizer = create('OptimizerBuilder')(
                self.lr, self.model.parameters())

        self._nranks = ParallelEnv().nranks
        self._local_rank = ParallelEnv().local_rank

        self.status = {}

        self.start_epoch = 0
        self.end_epoch = cfg.epoch

        self._weights_loaded = False

        # initial default callbacks
        self._init_callbacks()

        # initial default metrics
        self._init_metrics()
        self._reset_metrics()
Example #9
def main():
    paddle.enable_static() if FLAGS.static else None
    device = paddle.set_device(FLAGS.device)

    model_list = [x for x in models.__dict__["__all__"]]
    assert FLAGS.arch in model_list, "Expected FLAGS.arch in {}, but received {}".format(
        model_list, FLAGS.arch)
    net = models.__dict__[FLAGS.arch](
        pretrained=FLAGS.eval_only and not FLAGS.resume)

    inputs = [Input([None, 3, 224, 224], 'float32', name='image')]
    labels = [Input([None, 1], 'int64', name='label')]

    model = paddle.Model(net, inputs, labels)

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    train_dataset = ImageNetDataset(os.path.join(FLAGS.data, 'train'),
                                    mode='train',
                                    image_size=FLAGS.image_size,
                                    resize_short_size=FLAGS.resize_short_size)

    val_dataset = ImageNetDataset(os.path.join(FLAGS.data, 'val'),
                                  mode='val',
                                  image_size=FLAGS.image_size,
                                  resize_short_size=FLAGS.resize_short_size)

    optim = make_optimizer(np.ceil(
        len(train_dataset) * 1. / FLAGS.batch_size / ParallelEnv().nranks),
                           parameter_list=model.parameters())

    model.prepare(optim, paddle.nn.CrossEntropyLoss(),
                  paddle.metric.Accuracy(topk=(1, 5)))

    if FLAGS.eval_only:
        model.evaluate(val_dataset,
                       batch_size=FLAGS.batch_size,
                       num_workers=FLAGS.num_workers)
        return

    output_dir = os.path.join(
        FLAGS.output_dir, FLAGS.arch,
        time.strftime('%Y-%m-%d-%H-%M', time.localtime()))
    if ParallelEnv().local_rank == 0 and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model.fit(train_dataset,
              val_dataset,
              batch_size=FLAGS.batch_size,
              epochs=FLAGS.epoch,
              save_dir=output_dir,
              num_workers=FLAGS.num_workers)
Example #10
 def on_epoch_end(self, status):
     assert self.model.mode == 'train', \
         "Checkpointer can only be set during training"
     if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
         epoch_id = status['epoch_id']
         end_epoch = self.model.cfg.epoch
         if epoch_id % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
             save_dir = os.path.join(self.model.cfg.save_dir,
                                     self.model.cfg.filename)
             save_name = str(
                 epoch_id) if epoch_id != end_epoch - 1 else "model_final"
             save_model(self.model.model, self.model.optimizer, save_dir,
                        save_name, epoch_id + 1)
Example #11
    def train(self):
        assert self.mode == 'train', "Model not in 'train' mode"

        # if no weights were explicitly loaded, load the backbone pretrain weights by default
        if not self._weights_loaded:
            self.load_weights(self.cfg.pretrain_weights)

        self.status.update({
            'epoch_id': self.start_epoch,
            'step_id': 0,
            'steps_per_epoch': len(self.loader)
        })

        self.status['batch_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['data_time'] = stats.SmoothedValue(
            self.cfg.log_iter, fmt='{avg:.4f}')
        self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

        for epoch_id in range(self.start_epoch, self.cfg.epoch):
            self.status['epoch_id'] = epoch_id
            self._compose_callback.on_epoch_begin(self.status)
            self.loader.dataset.set_epoch(epoch_id)
            iter_tic = time.time()
            for step_id, data in enumerate(self.loader):
                self.status['data_time'].update(time.time() - iter_tic)
                self.status['step_id'] = step_id
                self._compose_callback.on_step_begin(self.status)

                # model forward
                self.model.train()
                outputs = self.model(data)
                loss = outputs['loss']

                # model backward
                loss.backward()
                self.optimizer.step()
                curr_lr = self.optimizer.get_lr()
                self.lr.step()
                self.optimizer.clear_grad()
                self.status['learning_rate'] = curr_lr

                if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                    self.status['training_staus'].update(outputs)

                self.status['batch_time'].update(time.time() - iter_tic)
                self._compose_callback.on_step_end(self.status)

            self._compose_callback.on_epoch_end(self.status)
Example #12
def setup(args, cfg):
    if args.evaluate_only:
        cfg.isTrain = False

    cfg.timestamp = time.strftime('-%Y-%m-%d-%H-%M', time.localtime())
    cfg.output_dir = os.path.join(cfg.output_dir,
                                  str(cfg.model.name) + cfg.timestamp)

    logger = setup_logger(cfg.output_dir)

    logger.info('Configs: {}'.format(cfg))

    place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) \
                    if ParallelEnv().nranks > 1 else paddle.fluid.CUDAPlace(0)
    paddle.disable_static(place)
Example #13
    def backward_D_basic(self, netD, real, fake):
        """Calculate GAN loss for the discriminator

        Parameters:
            netD (network)      -- the discriminator D
            real (tensor array) -- real images
            fake (tensor array) -- images generated by a generator

        Return the discriminator loss.
        We also call loss_D.backward() to calculate the gradients.
        """
        # Real
        pred_real = netD(real)
        loss_D_real = self.criterionGAN(pred_real, True)
        # Fake
        pred_fake = netD(fake.detach())
        loss_D_fake = self.criterionGAN(pred_fake, False)
        # Combined loss and calculate gradients
        loss_D = (loss_D_real + loss_D_fake) * 0.5
        # loss_D.backward()
        if ParallelEnv().nranks > 1:
            loss_D = netD.scale_loss(loss_D)
            loss_D.backward()
            netD.apply_collective_grads()
        else:
            loss_D.backward()
        return loss_D
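
In CycleGAN-style models this helper is typically invoked once per discriminator. A hedged usage sketch, assuming member names such as `netD_A`, `real_B`, `fake_B` and an image pool `fake_B_pool`, which may differ from the surrounding model:

    def backward_D_A(self):
        """Calculate GAN loss for discriminator D_A (sketch)."""
        # query a previously generated fake image from the pool to stabilize training
        fake_B = self.fake_B_pool.query(self.fake_B)
        self.loss_D_A = self.backward_D_basic(self.netD_A, self.real_B, fake_B)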
Example #14
def main():
    args = parse_args()
    operators = create_operators(args.interpolation)
    # assign the place
    place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu'
    place = paddle.set_device(place)

    net = ResNet50()
    load_dygraph_pretrain(net, args.pretrained_model)

    img = cv2.imread(args.image_file, cv2.IMREAD_COLOR)
    data = preprocess(img, operators)
    data = np.expand_dims(data, axis=0)
    data = paddle.to_tensor(data)
    net.eval()
    _, fm = net(data)
    assert args.channel_num >= 0 and args.channel_num <= fm.shape[
        1], "the channel is out of the range, should be in {} but got {}".format(
            [0, fm.shape[1]], args.channel_num)

    fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8)
    fm = cv2.resize(fm, (img.shape[1], img.shape[0]))
    if args.save_path is not None:
        print("the feature map is saved in path: {}".format(args.save_path))
        cv2.imwrite(args.save_path, fm)
Example #15
    def on_step_end(self, status):
        if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
            mode = status['mode']
            if mode == 'train':
                epoch_id = status['epoch_id']
                step_id = status['step_id']
                steps_per_epoch = status['steps_per_epoch']
                training_staus = status['training_staus']
                batch_time = status['batch_time']
                data_time = status['data_time']

                epoches = self.model.cfg.epoch
                batch_size = self.model.cfg['{}Reader'.format(
                    mode.capitalize())]['batch_size']

                logs = training_staus.log()
                space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
                if step_id % self.model.cfg.log_iter == 0:
                    eta_steps = (epoches -
                                 epoch_id) * steps_per_epoch - step_id
                    eta_sec = eta_steps * batch_time.global_avg
                    eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                    ips = float(batch_size) / batch_time.avg
                    fmt = ' '.join([
                        'Epoch: [{}]',
                        '[{' + space_fmt + '}/{}]',
                        'learning_rate: {lr:.6f}',
                        '{meters}',
                        'eta: {eta}',
                        'batch_cost: {btime}',
                        'data_cost: {dtime}',
                        'ips: {ips:.4f} images/s',
                    ])
                    fmt = fmt.format(epoch_id,
                                     step_id,
                                     steps_per_epoch,
                                     lr=status['learning_rate'],
                                     meters=logs,
                                     eta=eta_str,
                                     btime=str(batch_time),
                                     dtime=str(data_time),
                                     ips=ips)
                    logger.info(fmt)
            if mode == 'eval':
                step_id = status['step_id']
                if step_id % 100 == 0:
                    logger.info("Eval iter: {}".format(step_id))
Example #16
def main():
    paddle.enable_static() if FLAGS.static else None
    device = paddle.set_device(FLAGS.device)

    train_transform = Compose([
        GroupScale(),
        GroupMultiScaleCrop(),
        GroupRandomCrop(),
        GroupRandomFlip(),
        NormalizeImage()
    ])
    train_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'train_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'train_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        transform=train_transform)
    val_transform = Compose(
        [GroupScale(), GroupCenterCrop(),
         NormalizeImage()])
    val_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'val_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'val_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        mode='val',
        transform=val_transform)

    pretrained = FLAGS.eval_only and FLAGS.weights is None
    model = tsm_resnet50(num_classes=train_dataset.num_classes,
                         pretrained=pretrained)

    step_per_epoch = int(len(train_dataset) / FLAGS.batch_size \
                         / ParallelEnv().nranks)
    optim = make_optimizer(step_per_epoch, model.parameters())

    model.prepare(optimizer=optim,
                  loss=paddle.nn.CrossEntropyLoss(),
                  metrics=paddle.metric.Accuracy(topk=(1, 5)))

    if FLAGS.eval_only:
        if FLAGS.weights is not None:
            model.load(FLAGS.weights, reset_optimizer=True)

        model.evaluate(val_dataset,
                       batch_size=FLAGS.batch_size,
                       num_workers=FLAGS.num_workers)
        return

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    model.fit(train_data=train_dataset,
              eval_data=val_dataset,
              epochs=FLAGS.epoch,
              batch_size=FLAGS.batch_size,
              save_dir=FLAGS.save_dir or 'tsm_checkpoint',
              num_workers=FLAGS.num_workers,
              drop_last=True,
              shuffle=True)
Example #17
    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
        self.dataset = dataset

        assert isinstance(batch_size, int) and batch_size > 0, \
                "batch_size should be a positive integer"
        self.batch_size = batch_size
        assert isinstance(shuffle, bool), \
                "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
                "drop_last should be a boolean number"

        self.drop_last = drop_last
        self.nranks = ParallelEnv().nranks
        self.local_rank = ParallelEnv().local_rank
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
        self.total_size = self.num_samples * self.nranks
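
The `num_samples`/`total_size` values computed above are what a distributed sampler's iterator typically uses: pad the (optionally shuffled) index list so it divides evenly across ranks, keep this rank's shard, and yield it in mini-batches. A minimal sketch of such an `__iter__`, assuming `numpy` is imported as `np`; this is not the actual Paddle implementation:

    def __iter__(self):
        indices = np.arange(len(self.dataset)).tolist()
        if self.shuffle:
            # self.epoch is assumed to be advanced elsewhere (e.g. via set_epoch)
            np.random.RandomState(self.epoch).shuffle(indices)
        # pad so the index list divides evenly across ranks
        indices += indices[:(self.total_size - len(indices))]
        # take every nranks-th index starting at this rank's offset
        indices = indices[self.local_rank:self.total_size:self.nranks]
        # yield mini-batches of indices
        batch = []
        for idx in indices:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yield batch
                batch = []
        if batch and not self.drop_last:
            yield batch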
Example #18
    def __init__(self, cfg):

        # build train dataloader
        self.train_dataloader = build_dataloader(cfg.dataset.train)

        if 'lr_scheduler' in cfg.optimizer:
            cfg.optimizer.lr_scheduler.step_per_epoch = len(
                self.train_dataloader)

        # build model
        self.model = build_model(cfg)
        # multiple gpus prepare
        if ParallelEnv().nranks > 1:
            self.distributed_data_parallel()

        self.logger = logging.getLogger(__name__)
        self.enable_visualdl = cfg.get('enable_visualdl', False)
        if self.enable_visualdl:
            import visualdl
            self.vdl_logger = visualdl.LogWriter(logdir=cfg.output_dir)

        # base config
        self.output_dir = cfg.output_dir
        self.epochs = cfg.epochs
        self.start_epoch = 1
        self.current_epoch = 1
        self.batch_id = 0
        self.global_steps = 0
        self.weight_interval = cfg.snapshot_config.interval
        self.log_interval = cfg.log_config.interval
        self.visual_interval = cfg.log_config.visiual_interval
        self.validate_interval = -1
        if cfg.get('validate', None) is not None:
            self.validate_interval = cfg.validate.get('interval', -1)
        self.cfg = cfg

        self.local_rank = ParallelEnv().local_rank

        # time count
        self.steps_per_epoch = len(self.train_dataloader)
        self.total_steps = self.epochs * self.steps_per_epoch

        self.time_count = {}
        self.best_metric = {}
Example #19
 def __init__(self,
              dataset,
              batch_size,
              shuffle=False,
              drop_last=True,
              seed=None):
     self._dataset = dataset
     self._batch_size = batch_size
     self._shuffle = shuffle
     self._drop_last = drop_last
     self._random = np.random
     self._random.seed(seed)
     self._nranks = ParallelEnv().nranks
     self._local_rank = ParallelEnv().local_rank
     self._device_id = ParallelEnv().dev_id
     self._num_samples = int(
         math.ceil(len(self._dataset) * 1.0 / self._nranks))
     self._total_size = self._num_samples * self._nranks
     self._epoch = 0
Example #20
def init_parallel_env():
    env = os.environ
    dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()
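
A helper like this is typically called once at the top of a training entry point, before models and dataloaders are built, and the script is then started with the distributed launcher. A hedged sketch; the builder names are placeholders and the launcher flags depend on the Paddle version:

import paddle
from paddle.distributed import ParallelEnv

def train_entry():
    # seed every trainer and initialize collective communication
    # when more than one rank is present
    init_parallel_env()

    model = build_model()        # placeholder builder
    if ParallelEnv().nranks > 1:
        model = paddle.DataParallel(model)
    # ... build dataloader, optimizer and run the training loop

# launched, for example, with:
#   python -m paddle.distributed.launch --gpus 0,1,2,3 train.py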
Example #21
def main():
    FLAGS = parse_args()
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)

    check_config(cfg)
    check_gpu(cfg.use_gpu)
    check_version()

    place = 'gpu:{}'.format(ParallelEnv().dev_id) if cfg.use_gpu else 'cpu'
    place = paddle.set_device(place)
    run(FLAGS, cfg)
Example #22
    def on_epoch_end(self, status):
        # the Checkpointer is only applied during training
        mode = status['mode']
        if mode != 'train':
            return

        if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
            epoch_id = status['epoch_id']
            end_epoch = self.model.cfg.epoch
            if epoch_id % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1:
                save_dir = os.path.join(self.model.cfg.save_dir,
                                        self.model.cfg.filename)
                save_name = str(
                    epoch_id) if epoch_id != end_epoch - 1 else "model_final"
                if self.use_ema:
                    state_dict = self.ema.apply()
                    save_model(state_dict, self.model.optimizer, save_dir,
                               save_name, epoch_id + 1)
                else:
                    save_model(self.model.model, self.model.optimizer,
                               save_dir, save_name, epoch_id + 1)
Example #23
def prepare_distributed_context(place=None):
    if place is None:
        place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
            else fluid.CUDAPlace(0)

    strategy = ParallelStrategy()
    strategy.nranks = ParallelEnv().nranks
    strategy.local_rank = ParallelEnv().local_rank
    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
    strategy.current_endpoint = ParallelEnv().current_endpoint

    if strategy.nranks < 2:
        return

    global _parallel_context_initialized

    if not _parallel_context_initialized and isinstance(place, fluid.CUDAPlace):

        def _init_context():
            communicator_prog = fluid.Program()
            init_communicator(communicator_prog, strategy.local_rank,
                              strategy.nranks, True, strategy.current_endpoint,
                              strategy.trainer_endpoints)
            exe = fluid.Executor(place)
            exe.run(communicator_prog)

        fluid.disable_dygraph()
        _init_context()
        fluid.enable_dygraph(place)

    else:
        assert ("Only support CUDAPlace for now.")

    _parallel_context_initialized = True
    return strategy
Example #24
    def __init__(self,
                 dataset,
                 batch_sizes,
                 num_replicas=None,
                 rank=None,
                 shuffle=False,
                 drop_last=False):
        self.dataset = dataset

        assert all(isinstance(batch_size, int) and batch_size > 0 for batch_size in batch_sizes), \
            "every batch_size should be a positive integer"
        self.batch_sizes = batch_sizes
        self.len_batch_sizes = len(self.batch_sizes)
        assert isinstance(shuffle, bool), \
            "shuffle should be a boolean value"
        self.shuffle = shuffle
        assert isinstance(drop_last, bool), \
            "drop_last should be a boolean number"

        from paddle.distributed import ParallelEnv

        if num_replicas is not None:
            assert isinstance(num_replicas, int) and num_replicas > 0, \
                "num_replicas should be a positive integer"
            self.nranks = num_replicas
        else:
            self.nranks = ParallelEnv().nranks

        if rank is not None:
            assert isinstance(rank, int) and rank >= 0, \
                "rank should be a non-negative integer"
            self.local_rank = rank
        else:
            self.local_rank = ParallelEnv().local_rank

        self.drop_last = drop_last
        self.epoch = 0
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
        self.total_size = self.num_samples * self.nranks
Example #25
def setup_logger(output=None, name="ppgan"):
    """
    Initialize the ppgan logger and set its verbosity level to "INFO".

    Args:
        output (str): a file name or a directory to save log. If None, will not save log file.
            If ends with ".txt" or ".log", assumed to be a file name.
            Otherwise, logs will be saved to `output/log.txt`.
        name (str): the root module name of this logger

    Returns:
        logging.Logger: a logger
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger
    logger.setLevel(logging.INFO)
    logger.propagate = False

    plain_formatter = logging.Formatter(
        "[%(asctime)s] %(name)s %(levelname)s: %(message)s",
        datefmt="%m/%d %H:%M:%S")
    # stdout logging: master only
    local_rank = ParallelEnv().local_rank
    if local_rank == 0:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = plain_formatter
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    # file logging: all workers
    if output is not None:
        if output.endswith(".txt") or output.endswith(".log"):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")
        if local_rank > 0:
            filename = filename + ".rank{}".format(local_rank)

        # PathManager.mkdirs(os.path.dirname(filename))
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # fh = logging.StreamHandler(_cached_log_stream(filename)
        fh = logging.FileHandler(filename, mode='a')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
    logger_initialized.append(name)
    return logger
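
Per the docstring above, a short usage sketch; the output directory name is only illustrative:

logger = setup_logger(output='output_dir', name='ppgan')
logger.info('training started')
# rank 0 also prints to stdout; ranks > 0 write to output_dir/log.txt.rank{N}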
Example #26
    def __init__(self, cfg):
        self.batch_size = cfg.batch_size
        self.file_path = cfg.file_path

        self.seg_num = cfg.seg_num
        self.seglen = cfg.seglen
        self.short_size = cfg.short_size
        self.target_size = cfg.target_size

        # set num_shards and shard_id when distributed training is implemented
        self.num_shards = dist.get_world_size()
        self.shard_id = ParallelEnv().local_rank
        self.dali_mean = cfg.mean * (self.seg_num * self.seglen)
        self.dali_std = cfg.std * (self.seg_num * self.seglen)
Example #27
    def on_eval_end(self, logs=None):
        if logs is None or self.monitor not in logs:
            warnings.warn(
                'Monitor of ReduceLROnPlateau should be loss or metric name.')
            return
        else:
            try:
                lr = self.model._optimizer._learning_rate
                if not isinstance(lr, float):
                    warnings.warn(
                        'Expected learning_rate to be a float, but got {}.'.format(
                            type(lr)))
                    return
            except Exception as e:
                warnings.warn(
                    'Something went wrong when getting learning_rate from the optimizer: {}.'
                    .format(e))
                return

        current = logs[self.monitor]
        if isinstance(current, (list, tuple)):
            current = current[0]
        elif isinstance(current, numbers.Number):
            current = current
        else:
            return

        if self.in_cooldown():
            self.cooldown_counter -= 1
            self.wait = 0

        if self.monitor_op(current, self.best):
            self.best = current
            self.wait = 0
        elif not self.in_cooldown():
            self.wait += 1
            if self.wait >= self.patience:
                old_lr = self.model._optimizer.get_lr()
                if old_lr > np.float32(self.min_lr):
                    new_lr = old_lr * self.factor
                    new_lr = max(new_lr, self.min_lr)
                    self.model._optimizer._learning_rate = new_lr
                    if self.verbose > 0 and ParallelEnv().local_rank == 0:
                        print(
                            '\nEpoch %d: ReduceLROnPlateau reducing learning '
                            'rate to %s.' % (self.epoch + 1, new_lr))
                    self.cooldown_counter = self.cooldown
                    self.wait = 0
        self.epoch += 1
Example #28
def get_path_from_url(url, md5sum=None, check_exist=True):
    """ Download from given url to root_dir.
    If the file or directory specified by the url already exists under
    root_dir, return the path directly; otherwise download it from the
    url, decompress it, and return the path.

    Args:
        url (str): download url
        md5sum (str): md5 sum of download package

    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """

    from paddle.distributed import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)
    root_dir = PPGAN_HOME
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)

    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger = get_logger('ppgan')
        logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().local_rank == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)

    if ParallelEnv().local_rank == 0:
        if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath):
            fullpath = _decompress(fullpath)

    return fullpath
Example #29
    def backward_G(self):
        """Calculate the loss for generators G_A and G_B"""
        lambda_idt = self.opt.lambda_identity
        lambda_A = self.opt.lambda_A
        lambda_B = self.opt.lambda_B
        # Identity loss
        if lambda_idt > 0:
            # G_A should be identity if real_B is fed: ||G_A(B) - B||
            self.idt_A = self.netG_A(self.real_B)
            self.loss_idt_A = self.criterionIdt(
                self.idt_A, self.real_B) * lambda_B * lambda_idt
            # G_B should be identity if real_A is fed: ||G_B(A) - A||
            self.idt_B = self.netG_B(self.real_A)
            self.loss_idt_B = self.criterionIdt(
                self.idt_B, self.real_A) * lambda_A * lambda_idt
        else:
            self.loss_idt_A = 0
            self.loss_idt_B = 0

        # GAN loss D_A(G_A(A))
        self.loss_G_A = self.criterionGAN(self.netD_A(self.fake_B), True)
        # GAN loss D_B(G_B(B))
        self.loss_G_B = self.criterionGAN(self.netD_B(self.fake_A), True)
        # Forward cycle loss || G_B(G_A(A)) - A||
        self.loss_cycle_A = self.criterionCycle(self.rec_A,
                                                self.real_A) * lambda_A
        # Backward cycle loss || G_A(G_B(B)) - B||
        self.loss_cycle_B = self.criterionCycle(self.rec_B,
                                                self.real_B) * lambda_B

        self.losses['G_idt_A_loss'] = self.loss_idt_A
        self.losses['G_idt_B_loss'] = self.loss_idt_B
        self.losses['G_A_adv_loss'] = self.loss_G_A
        self.losses['G_B_adv_loss'] = self.loss_G_B
        self.losses['G_A_cycle_loss'] = self.loss_cycle_A
        self.losses['G_B_cycle_loss'] = self.loss_cycle_B
        # combined loss and calculate gradients
        self.loss_G = self.loss_G_A + self.loss_G_B + self.loss_cycle_A + self.loss_cycle_B + self.loss_idt_A + self.loss_idt_B

        if ParallelEnv().nranks > 1:
            self.loss_G = self.netG_A.scale_loss(self.loss_G)
            self.loss_G.backward()
            self.netG_A.apply_collective_grads()
            self.netG_B.apply_collective_grads()
        else:
            self.loss_G.backward()
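
Collecting the terms accumulated above (with ||.|| denoting the configured cycle/identity criterion, typically L1), the total generator objective minimized by the single backward pass is:

\mathcal{L}_G = \mathcal{L}_{GAN}(G_A) + \mathcal{L}_{GAN}(G_B)
              + \lambda_A \lVert G_B(G_A(A)) - A \rVert + \lambda_B \lVert G_A(G_B(B)) - B \rVert
              + \lambda_{idt}\,\bigl(\lambda_B \lVert G_A(B) - B \rVert + \lambda_A \lVert G_B(A) - A \rVert\bigr)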
Example #30
def main():
    FLAGS = parse_args()

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    if FLAGS.slim_config:
        slim_cfg = load_config(FLAGS.slim_config)
        merge_config(slim_cfg)
        if 'weight_type' not in cfg:
            cfg.weight_type = FLAGS.weight_type
    check.check_config(cfg)
    check.check_gpu(cfg.use_gpu)
    check.check_version()

    place = 'gpu:{}'.format(ParallelEnv().dev_id) if cfg.use_gpu else 'cpu'
    place = paddle.set_device(place)

    run(FLAGS, cfg)