def _get_data(self, mode):
    """ Check and download Dataset """
    dl_paths = {}
    version = self.config.get("version", "3.0.0")
    if version not in ["1.0.0", "2.0.0", "3.0.0"]:
        raise ValueError("Unsupported version: %s" % version)
    dl_paths["version"] = version
    default_root = os.path.join(DATA_HOME, self.__class__.__name__)
    for k, v in self.cnn_dailymail.items():
        dir_path = os.path.join(default_root, k)
        if not os.path.exists(dir_path):
            get_path_from_url(v["url"], default_root, v["md5"])
            unique_endpoints = _get_unique_endpoints(
                ParallelEnv().trainer_endpoints[:])
            if ParallelEnv().current_endpoint in unique_endpoints:
                file_num = len(os.listdir(os.path.join(dir_path, "stories")))
                if file_num != v["file_num"]:
                    logger.warning(
                        "Number of %s stories is %d != %d, decompress again." %
                        (k, file_num, v["file_num"]))
                    shutil.rmtree(os.path.join(dir_path, "stories"))
                    _decompress(
                        os.path.join(default_root, os.path.basename(v["url"])))
        dl_paths[k] = dir_path
    filename, url, data_hash = self.SPLITS[mode]
    fullname = os.path.join(default_root, filename)
    if not os.path.exists(fullname) or (
            data_hash and not md5file(fullname) == data_hash):
        get_path_from_url(url, default_root, data_hash)
    dl_paths[mode] = fullname
    return dl_paths
def _download_dist(url, path, md5sum=None):
    env = os.environ
    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
        if num_trainers <= 1:
            return _download(url, path, md5sum)
        else:
            fname = osp.split(url)[-1]
            fullname = osp.join(path, fname)
            lock_path = fullname + '.download.lock'

            if not osp.isdir(path):
                os.makedirs(path)

            if not osp.exists(fullname):
                from paddle.distributed import ParallelEnv
                unique_endpoints = _get_unique_endpoints(
                    ParallelEnv().trainer_endpoints[:])
                with open(lock_path, 'w'):  # touch
                    os.utime(lock_path, None)
                if ParallelEnv().current_endpoint in unique_endpoints:
                    _download(url, path, md5sum)
                    os.remove(lock_path)
                else:
                    while os.path.exists(lock_path):
                        time.sleep(0.5)
            return fullname
    else:
        return _download(url, path, md5sum)
def get_weights_path_dist(path):
    env = os.environ
    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
        if num_trainers <= 1:
            path = get_weights_path(path)
        else:
            from ppdet.utils.download import map_path, WEIGHTS_HOME
            weight_path = map_path(path, WEIGHTS_HOME)
            lock_path = weight_path + '.lock'
            if not os.path.exists(weight_path):
                from paddle.distributed import ParallelEnv
                unique_endpoints = _get_unique_endpoints(
                    ParallelEnv().trainer_endpoints[:])
                try:
                    os.makedirs(os.path.dirname(weight_path))
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                with open(lock_path, 'w'):  # touch
                    os.utime(lock_path, None)
                if ParallelEnv().current_endpoint in unique_endpoints:
                    get_weights_path(path)
                    os.remove(lock_path)
                else:
                    while os.path.exists(lock_path):
                        time.sleep(1)
            path = weight_path
    else:
        path = get_weights_path(path)

    return path
def _get_data(self, mode, **kwargs):
    """Downloads dataset."""
    default_root = os.path.join(DATA_HOME, self.__class__.__name__)
    filename, data_hash, url, zipfile_hash = self.SPLITS[mode]
    fullname = os.path.join(default_root, filename)
    if mode == 'train':
        if not os.path.exists(fullname):
            get_path_from_url(url, default_root, zipfile_hash)
            unique_endpoints = _get_unique_endpoints(
                ParallelEnv().trainer_endpoints[:])
            if ParallelEnv().current_endpoint in unique_endpoints:
                file_num = len(os.listdir(fullname))
                if file_num != len(ALL_LANGUAGES):
                    logger.warning(
                        "Number of train files is %d != %d, decompress again."
                        % (file_num, len(ALL_LANGUAGES)))
                    shutil.rmtree(fullname)
                    _decompress(
                        os.path.join(default_root, os.path.basename(url)))
    else:
        if not os.path.exists(fullname) or (
                data_hash and not md5file(fullname) == data_hash):
            get_path_from_url(url, default_root, zipfile_hash)

    return fullname
def __init__(self, dataset, batch_size, is_train, num_workers=4,
             distributed=True):
    self.dataset = DictDataset(dataset)

    place = paddle.CUDAPlace(ParallelEnv().dev_id) \
        if ParallelEnv().nranks > 1 else paddle.CUDAPlace(0)

    if distributed:
        sampler = DistributedBatchSampler(
            self.dataset,
            batch_size=batch_size,
            shuffle=True if is_train else False,
            drop_last=True if is_train else False)

        self.dataloader = paddle.io.DataLoader(
            self.dataset,
            batch_sampler=sampler,
            places=place,
            num_workers=num_workers)
    else:
        self.dataloader = paddle.io.DataLoader(
            self.dataset,
            batch_size=batch_size,
            shuffle=True if is_train else False,
            drop_last=True if is_train else False,
            places=place,
            num_workers=num_workers)

    self.batch_size = batch_size
def _decompress_dist(fname):
    env = os.environ
    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
        if num_trainers <= 1:
            _decompress(fname)
        else:
            lock_path = fname + '.decompress.lock'
            from paddle.distributed import ParallelEnv
            unique_endpoints = _get_unique_endpoints(
                ParallelEnv().trainer_endpoints[:])
            # NOTE(dkp): _decompress_dist is always performed after
            # _download_dist, where sub-trainers wait for the download lock
            # file to be released by sleeping. If decompression is very fast
            # and finishes within that sleeping gap (e.g. for tiny datasets
            # such as coco_ce or spine_coco), the main trainer may finish
            # decompressing and remove the lock file before sub-trainers see
            # it. So only the main trainer creates the lock file, and all
            # sub-trainers wait 1s (twice the sleeping gap) for it to appear,
            # which keeps the trainer pipelines in order.
            # **change this if you have a more elegant method**
            if ParallelEnv().current_endpoint in unique_endpoints:
                with open(lock_path, 'w'):  # touch
                    os.utime(lock_path, None)
                _decompress(fname)
                os.remove(lock_path)
            else:
                time.sleep(1)
                while os.path.exists(lock_path):
                    time.sleep(0.5)
    else:
        _decompress(fname)
def on_epoch_end(self, status):
    if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
        if self.model.mode == 'eval':
            sample_num = status['sample_num']
            cost_time = status['cost_time']
            logger.info('Total sample number: {}, average FPS: {}'.format(
                sample_num, sample_num / cost_time))
def __init__(self, cfg, mode='train'):
    self.cfg = cfg
    assert mode.lower() in ['train', 'eval', 'test'], \
        "mode should be 'train', 'eval' or 'test'"
    self.mode = mode.lower()
    self.optimizer = None

    # build model
    self.model = create(cfg.architecture)

    # model slim build
    if 'slim' in cfg and cfg.slim:
        if self.mode == 'train':
            self.load_weights(cfg.pretrain_weights, cfg.weight_type)
        slim = create(cfg.slim)
        slim(self.model)

    # build data loader
    self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
    if self.mode == 'train':
        self.loader = create('{}Reader'.format(self.mode.capitalize()))(
            self.dataset, cfg.worker_num)
    # EvalDataset build with BatchSampler to evaluate in single device
    # TODO: multi-device evaluate
    if self.mode == 'eval':
        self._eval_batch_sampler = paddle.io.BatchSampler(
            self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
        self.loader = create('{}Reader'.format(self.mode.capitalize()))(
            self.dataset, cfg.worker_num, self._eval_batch_sampler)
    # TestDataset build after user set images, skip loader creation here

    # build optimizer in train mode
    if self.mode == 'train':
        steps_per_epoch = len(self.loader)
        self.lr = create('LearningRate')(steps_per_epoch)
        self.optimizer = create('OptimizerBuilder')(self.lr,
                                                    self.model.parameters())

    self._nranks = ParallelEnv().nranks
    self._local_rank = ParallelEnv().local_rank

    self.status = {}

    self.start_epoch = 0
    self.end_epoch = cfg.epoch

    self._weights_loaded = False

    # initial default callbacks
    self._init_callbacks()

    # initial default metrics
    self._init_metrics()
    self._reset_metrics()
def main():
    paddle.enable_static() if FLAGS.static else None
    device = paddle.set_device(FLAGS.device)

    model_list = [x for x in models.__dict__["__all__"]]
    assert FLAGS.arch in model_list, \
        "Expected FLAGS.arch in {}, but received {}".format(
            model_list, FLAGS.arch)
    net = models.__dict__[FLAGS.arch](
        pretrained=FLAGS.eval_only and not FLAGS.resume)

    inputs = [Input([None, 3, 224, 224], 'float32', name='image')]
    labels = [Input([None, 1], 'int64', name='label')]

    model = paddle.Model(net, inputs, labels)

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    train_dataset = ImageNetDataset(
        os.path.join(FLAGS.data, 'train'),
        mode='train',
        image_size=FLAGS.image_size,
        resize_short_size=FLAGS.resize_short_size)

    val_dataset = ImageNetDataset(
        os.path.join(FLAGS.data, 'val'),
        mode='val',
        image_size=FLAGS.image_size,
        resize_short_size=FLAGS.resize_short_size)

    optim = make_optimizer(
        np.ceil(len(train_dataset) * 1. / FLAGS.batch_size /
                ParallelEnv().nranks),
        parameter_list=model.parameters())

    model.prepare(optim, paddle.nn.CrossEntropyLoss(),
                  paddle.metric.Accuracy(topk=(1, 5)))

    if FLAGS.eval_only:
        model.evaluate(
            val_dataset,
            batch_size=FLAGS.batch_size,
            num_workers=FLAGS.num_workers)
        return

    output_dir = os.path.join(
        FLAGS.output_dir, FLAGS.arch,
        time.strftime('%Y-%m-%d-%H-%M', time.localtime()))
    if ParallelEnv().local_rank == 0 and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    model.fit(train_dataset,
              val_dataset,
              batch_size=FLAGS.batch_size,
              epochs=FLAGS.epoch,
              save_dir=output_dir,
              num_workers=FLAGS.num_workers)
def on_epoch_end(self, status):
    assert self.model.mode == 'train', \
        "Checkpointer can only be set during training"
    if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
        epoch_id = status['epoch_id']
        end_epoch = self.model.cfg.epoch
        if epoch_id % self.model.cfg.snapshot_epoch == 0 or \
                epoch_id == end_epoch - 1:
            save_dir = os.path.join(self.model.cfg.save_dir,
                                    self.model.cfg.filename)
            save_name = str(
                epoch_id) if epoch_id != end_epoch - 1 else "model_final"
            save_model(self.model.model, self.model.optimizer, save_dir,
                       save_name, epoch_id + 1)
def train(self):
    assert self.mode == 'train', "Model not in 'train' mode"

    # if no given weights loaded, load backbone pretrain weights as default
    if not self._weights_loaded:
        self.load_weights(self.cfg.pretrain_weights)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })
    self.status['batch_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(
        self.cfg.log_iter, fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)

        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)

            # model forward
            self.model.train()
            outputs = self.model(data)
            loss = outputs['loss']

            # model backward
            loss.backward()
            self.optimizer.step()
            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            # reset the timer so data/batch cost are measured per iteration
            iter_tic = time.time()

        self._compose_callback.on_epoch_end(self.status)
def setup(args, cfg):
    if args.evaluate_only:
        cfg.isTrain = False
    cfg.timestamp = time.strftime('-%Y-%m-%d-%H-%M', time.localtime())
    cfg.output_dir = os.path.join(cfg.output_dir,
                                  str(cfg.model.name) + cfg.timestamp)

    logger = setup_logger(cfg.output_dir)
    logger.info('Configs: {}'.format(cfg))

    place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) \
        if ParallelEnv().nranks > 1 else paddle.fluid.CUDAPlace(0)
    paddle.disable_static(place)
def backward_D_basic(self, netD, real, fake):
    """Calculate GAN loss for the discriminator

    Parameters:
        netD (network)      -- the discriminator D
        real (tensor array) -- real images
        fake (tensor array) -- images generated by a generator

    Return the discriminator loss.
    We also call loss_D.backward() to calculate the gradients.
    """
    # Real
    pred_real = netD(real)
    loss_D_real = self.criterionGAN(pred_real, True)
    # Fake
    pred_fake = netD(fake.detach())
    loss_D_fake = self.criterionGAN(pred_fake, False)
    # Combined loss and calculate gradients
    loss_D = (loss_D_real + loss_D_fake) * 0.5
    # loss_D.backward()
    if ParallelEnv().nranks > 1:
        loss_D = netD.scale_loss(loss_D)
        loss_D.backward()
        netD.apply_collective_grads()
    else:
        loss_D.backward()
    return loss_D
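# Illustrative usage sketch (not from the original source): in a CycleGAN-style
# model, backward_D_basic is typically wrapped once per discriminator, roughly
# as below. `fake_B_pool`, `netD_A`, `real_B` and `fake_B` are assumed attribute
# names, shown only to make the call pattern concrete.
def backward_D_A(self):
    """Compute loss and gradients for discriminator D_A (hypothetical wrapper)."""
    fake_B = self.fake_B_pool.query(self.fake_B)  # reuse a previously generated image
    self.loss_D_A = self.backward_D_basic(self.netD_A, self.real_B, fake_B)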
def main():
    args = parse_args()
    operators = create_operators(args.interpolation)
    # assign the place
    place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu'
    place = paddle.set_device(place)

    net = ResNet50()
    load_dygraph_pretrain(net, args.pretrained_model)

    img = cv2.imread(args.image_file, cv2.IMREAD_COLOR)
    data = preprocess(img, operators)
    data = np.expand_dims(data, axis=0)
    data = paddle.to_tensor(data)
    net.eval()
    _, fm = net(data)
    assert args.channel_num >= 0 and args.channel_num <= fm.shape[1], \
        "the channel is out of the range, should be in {} but got {}".format(
            [0, fm.shape[1]], args.channel_num)

    fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8)
    fm = cv2.resize(fm, (img.shape[1], img.shape[0]))
    if args.save_path is not None:
        print("the feature map is saved in path: {}".format(args.save_path))
        cv2.imwrite(args.save_path, fm)
def on_step_end(self, status):
    if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
        mode = status['mode']
        if mode == 'train':
            epoch_id = status['epoch_id']
            step_id = status['step_id']
            steps_per_epoch = status['steps_per_epoch']
            training_staus = status['training_staus']
            batch_time = status['batch_time']
            data_time = status['data_time']

            epoches = self.model.cfg.epoch
            batch_size = self.model.cfg['{}Reader'.format(
                mode.capitalize())]['batch_size']

            logs = training_staus.log()
            space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
            if step_id % self.model.cfg.log_iter == 0:
                eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id
                eta_sec = eta_steps * batch_time.global_avg
                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                ips = float(batch_size) / batch_time.avg
                fmt = ' '.join([
                    'Epoch: [{}]',
                    '[{' + space_fmt + '}/{}]',
                    'learning_rate: {lr:.6f}',
                    '{meters}',
                    'eta: {eta}',
                    'batch_cost: {btime}',
                    'data_cost: {dtime}',
                    'ips: {ips:.4f} images/s',
                ])
                fmt = fmt.format(
                    epoch_id,
                    step_id,
                    steps_per_epoch,
                    lr=status['learning_rate'],
                    meters=logs,
                    eta=eta_str,
                    btime=str(batch_time),
                    dtime=str(data_time),
                    ips=ips)
                logger.info(fmt)
        if mode == 'eval':
            step_id = status['step_id']
            if step_id % 100 == 0:
                logger.info("Eval iter: {}".format(step_id))
def main():
    paddle.enable_static() if FLAGS.static else None
    device = paddle.set_device(FLAGS.device)

    train_transform = Compose([
        GroupScale(),
        GroupMultiScaleCrop(),
        GroupRandomCrop(),
        GroupRandomFlip(),
        NormalizeImage()
    ])
    train_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'train_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'train_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        transform=train_transform)
    val_transform = Compose(
        [GroupScale(), GroupCenterCrop(), NormalizeImage()])
    val_dataset = KineticsDataset(
        file_list=os.path.join(FLAGS.data, 'val_10.list'),
        pickle_dir=os.path.join(FLAGS.data, 'val_10'),
        label_list=os.path.join(FLAGS.data, 'label_list'),
        mode='val',
        transform=val_transform)

    pretrained = FLAGS.eval_only and FLAGS.weights is None
    model = tsm_resnet50(
        num_classes=train_dataset.num_classes, pretrained=pretrained)

    step_per_epoch = int(
        len(train_dataset) / FLAGS.batch_size / ParallelEnv().nranks)
    optim = make_optimizer(step_per_epoch, model.parameters())

    model.prepare(
        optimizer=optim,
        loss=paddle.nn.CrossEntropyLoss(),
        metrics=paddle.metric.Accuracy(topk=(1, 5)))

    if FLAGS.eval_only:
        if FLAGS.weights is not None:
            model.load(FLAGS.weights, reset_optimizer=True)
        model.evaluate(
            val_dataset,
            batch_size=FLAGS.batch_size,
            num_workers=FLAGS.num_workers)
        return

    if FLAGS.resume is not None:
        model.load(FLAGS.resume)

    model.fit(
        train_data=train_dataset,
        eval_data=val_dataset,
        epochs=FLAGS.epoch,
        batch_size=FLAGS.batch_size,
        save_dir=FLAGS.save_dir or 'tsm_checkpoint',
        num_workers=FLAGS.num_workers,
        drop_last=True,
        shuffle=True)
def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
    self.dataset = dataset

    assert isinstance(batch_size, int) and batch_size > 0, \
        "batch_size should be a positive integer"
    self.batch_size = batch_size
    assert isinstance(shuffle, bool), \
        "shuffle should be a boolean value"
    self.shuffle = shuffle
    assert isinstance(drop_last, bool), \
        "drop_last should be a boolean number"
    self.drop_last = drop_last

    self.nranks = ParallelEnv().nranks
    self.local_rank = ParallelEnv().local_rank
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
    self.total_size = self.num_samples * self.nranks
def __init__(self, cfg):
    # build train dataloader
    self.train_dataloader = build_dataloader(cfg.dataset.train)

    if 'lr_scheduler' in cfg.optimizer:
        cfg.optimizer.lr_scheduler.step_per_epoch = len(self.train_dataloader)

    # build model
    self.model = build_model(cfg)

    # multiple gpus prepare
    if ParallelEnv().nranks > 1:
        self.distributed_data_parallel()

    self.logger = logging.getLogger(__name__)
    self.enable_visualdl = cfg.get('enable_visualdl', False)
    if self.enable_visualdl:
        import visualdl
        self.vdl_logger = visualdl.LogWriter(logdir=cfg.output_dir)

    # base config
    self.output_dir = cfg.output_dir
    self.epochs = cfg.epochs
    self.start_epoch = 1
    self.current_epoch = 1
    self.batch_id = 0
    self.global_steps = 0
    self.weight_interval = cfg.snapshot_config.interval
    self.log_interval = cfg.log_config.interval
    self.visual_interval = cfg.log_config.visiual_interval
    self.validate_interval = -1
    if cfg.get('validate', None) is not None:
        self.validate_interval = cfg.validate.get('interval', -1)
    self.cfg = cfg

    self.local_rank = ParallelEnv().local_rank

    # time count
    self.steps_per_epoch = len(self.train_dataloader)
    self.total_steps = self.epochs * self.steps_per_epoch
    self.time_count = {}
    self.best_metric = {}
def __init__(self,
             dataset,
             batch_size,
             shuffle=False,
             drop_last=True,
             seed=None):
    self._dataset = dataset
    self._batch_size = batch_size
    self._shuffle = shuffle
    self._drop_last = drop_last
    self._random = np.random
    self._random.seed(seed)

    self._nranks = ParallelEnv().nranks
    self._local_rank = ParallelEnv().local_rank
    self._device_id = ParallelEnv().dev_id

    self._num_samples = int(
        math.ceil(len(self._dataset) * 1.0 / self._nranks))
    self._total_size = self._num_samples * self._nranks
    self._epoch = 0
def init_parallel_env():
    env = os.environ
    dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if ParallelEnv().nranks > 1:
        paddle.distributed.init_parallel_env()
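# Illustrative usage sketch (an assumption, not from the original source):
# init_parallel_env() is meant to be called once at startup, before the model
# and optimizer are built; the model is then wrapped for gradient sync when
# more than one rank is present. `create` and `cfg` follow the names used in
# the trainer code above.
init_parallel_env()
model = create(cfg.architecture)
if ParallelEnv().nranks > 1:
    model = paddle.DataParallel(model)  # synchronize gradients across GPUs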
def main():
    FLAGS = parse_args()

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    check_gpu(cfg.use_gpu)
    check_version()

    place = 'gpu:{}'.format(ParallelEnv().dev_id) if cfg.use_gpu else 'cpu'
    place = paddle.set_device(place)

    run(FLAGS, cfg)
def on_epoch_end(self, status):
    # Checkpointer only performed during training
    mode = status['mode']
    if mode != 'train':
        return
    if ParallelEnv().nranks < 2 or ParallelEnv().local_rank == 0:
        epoch_id = status['epoch_id']
        end_epoch = self.model.cfg.epoch
        if epoch_id % self.model.cfg.snapshot_epoch == 0 or \
                epoch_id == end_epoch - 1:
            save_dir = os.path.join(self.model.cfg.save_dir,
                                    self.model.cfg.filename)
            save_name = str(
                epoch_id) if epoch_id != end_epoch - 1 else "model_final"
            if self.use_ema:
                state_dict = self.ema.apply()
                save_model(state_dict, self.model.optimizer, save_dir,
                           save_name, epoch_id + 1)
            else:
                save_model(self.model.model, self.model.optimizer, save_dir,
                           save_name, epoch_id + 1)
def prepare_distributed_context(place=None):
    if place is None:
        place = fluid.CUDAPlace(ParallelEnv().dev_id) \
            if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0)

    strategy = ParallelStrategy()
    strategy.nranks = ParallelEnv().nranks
    strategy.local_rank = ParallelEnv().local_rank
    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
    strategy.current_endpoint = ParallelEnv().current_endpoint

    if strategy.nranks < 2:
        return

    global _parallel_context_initialized

    if not _parallel_context_initialized and isinstance(place,
                                                        fluid.CUDAPlace):

        def _init_context():
            communicator_prog = fluid.Program()
            init_communicator(communicator_prog, strategy.local_rank,
                              strategy.nranks, True,
                              strategy.current_endpoint,
                              strategy.trainer_endpoints)
            exe = fluid.Executor(place)
            exe.run(communicator_prog)

        fluid.disable_dygraph()
        _init_context()
        fluid.enable_dygraph(place)
    else:
        assert ("Only support CUDAPlace for now.")

    _parallel_context_initialized = True
    return strategy
def __init__(self,
             dataset,
             batch_sizes,
             num_replicas=None,
             rank=None,
             shuffle=False,
             drop_last=False):
    self.dataset = dataset

    assert any(isinstance(batch_size, int) and batch_size > 0
               for batch_size in batch_sizes), \
        "batch_size should be a positive integer"
    self.batch_sizes = batch_sizes
    self.len_batch_sizes = len(self.batch_sizes)
    assert isinstance(shuffle, bool), \
        "shuffle should be a boolean value"
    self.shuffle = shuffle
    assert isinstance(drop_last, bool), \
        "drop_last should be a boolean number"

    from paddle.distributed import ParallelEnv

    if num_replicas is not None:
        assert isinstance(num_replicas, int) and num_replicas > 0, \
            "num_replicas should be a positive integer"
        self.nranks = num_replicas
    else:
        self.nranks = ParallelEnv().nranks

    if rank is not None:
        assert isinstance(rank, int) and rank >= 0, \
            "rank should be a non-negative integer"
        self.local_rank = rank
    else:
        self.local_rank = ParallelEnv().local_rank

    self.drop_last = drop_last
    self.epoch = 0
    self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.nranks))
    self.total_size = self.num_samples * self.nranks
def setup_logger(output=None, name="ppgan"):
    """
    Initialize the ppgan logger and set its verbosity level to "INFO".

    Args:
        output (str): a file name or a directory to save log. If None, will not
            save log file. If ends with ".txt" or ".log", assumed to be a file
            name. Otherwise, logs will be saved to `output/log.txt`.
        name (str): the root module name of this logger

    Returns:
        logging.Logger: a logger
    """
    logger = logging.getLogger(name)
    if name in logger_initialized:
        return logger
    logger.setLevel(logging.INFO)
    logger.propagate = False

    plain_formatter = logging.Formatter(
        "[%(asctime)s] %(name)s %(levelname)s: %(message)s",
        datefmt="%m/%d %H:%M:%S")

    # stdout logging: master only
    local_rank = ParallelEnv().local_rank
    if local_rank == 0:
        ch = logging.StreamHandler(stream=sys.stdout)
        ch.setLevel(logging.DEBUG)
        formatter = plain_formatter
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    # file logging: all workers
    if output is not None:
        if output.endswith(".txt") or output.endswith(".log"):
            filename = output
        else:
            filename = os.path.join(output, "log.txt")
        if local_rank > 0:
            filename = filename + ".rank{}".format(local_rank)

        # PathManager.mkdirs(os.path.dirname(filename))
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # fh = logging.StreamHandler(_cached_log_stream(filename)
        fh = logging.FileHandler(filename, mode='a')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(plain_formatter)
        logger.addHandler(fh)
    logger_initialized.append(name)

    return logger
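# Illustrative usage sketch (assumed call site, not from the original source):
# rank 0 logs to stdout and to <output>/log.txt, while other ranks only write
# to per-rank files such as log.txt.rank1.
logger = setup_logger(output="./output", name="ppgan")
logger.info("training started")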
def __init__(self, cfg):
    self.batch_size = cfg.batch_size
    self.file_path = cfg.file_path

    self.seg_num = cfg.seg_num
    self.seglen = cfg.seglen
    self.short_size = cfg.short_size
    self.target_size = cfg.target_size

    # set num_shards and shard_id when distributed training is implemented
    self.num_shards = dist.get_world_size()
    self.shard_id = ParallelEnv().local_rank
    self.dali_mean = cfg.mean * (self.seg_num * self.seglen)
    self.dali_std = cfg.std * (self.seg_num * self.seglen)
def on_eval_end(self, logs=None):
    if logs is None or self.monitor not in logs:
        warnings.warn(
            'Monitor of ReduceLROnPlateau should be loss or metric name.')
        return
    else:
        try:
            lr = self.model._optimizer._learning_rate
            if not isinstance(lr, float):
                warnings.warn(
                    'Expected learning_rate to be float, but got {}.'.format(
                        type(lr)))
                return
        except Exception as e:
            warnings.warn(
                'Something went wrong when getting learning_rate from '
                'optimizer: {}.'.format(e))
            return

        current = logs[self.monitor]
        if isinstance(current, (list, tuple)):
            current = current[0]
        elif isinstance(current, numbers.Number):
            current = current
        else:
            return

        if self.in_cooldown():
            self.cooldown_counter -= 1
            self.wait = 0

        if self.monitor_op(current, self.best):
            self.best = current
            self.wait = 0
        elif not self.in_cooldown():
            self.wait += 1
            if self.wait >= self.patience:
                old_lr = self.model._optimizer.get_lr()
                if old_lr > np.float32(self.min_lr):
                    new_lr = old_lr * self.factor
                    new_lr = max(new_lr, self.min_lr)
                    self.model._optimizer._learning_rate = new_lr
                    if self.verbose > 0 and ParallelEnv().local_rank == 0:
                        print('\nEpoch %d: ReduceLROnPlateau reducing '
                              'learning rate to %s.' % (self.epoch + 1,
                                                        new_lr))
                    self.cooldown_counter = self.cooldown
                    self.wait = 0

    self.epoch += 1
def get_path_from_url(url, md5sum=None, check_exist=True):
    """Download from the given url into root_dir.

    If the file or directory specified by url already exists under root_dir,
    return that path directly; otherwise download from url, decompress it,
    and return the resulting path.

    Args:
        url (str): download url
        md5sum (str): md5 sum of download package

    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """
    from paddle.distributed import ParallelEnv

    assert is_url(url), "downloading from {} not a url".format(url)
    root_dir = PPGAN_HOME
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)

    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger = get_logger('ppgan')
        logger.info("Found {}".format(fullpath))
    else:
        if ParallelEnv().local_rank == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)

    if ParallelEnv().local_rank == 0:
        if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath):
            fullpath = _decompress(fullpath)

    return fullpath
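# Illustrative usage sketch (the URL below is a placeholder, not a real asset):
# only local_rank 0 downloads and decompresses; other ranks poll until the file
# appears, so every process ends up with the same local path.
weights_path = get_path_from_url(
    "https://example.com/models/generator.pdparams", md5sum=None)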
def backward_G(self):
    """Calculate the loss for generators G_A and G_B"""
    lambda_idt = self.opt.lambda_identity
    lambda_A = self.opt.lambda_A
    lambda_B = self.opt.lambda_B
    # Identity loss
    if lambda_idt > 0:
        # G_A should be identity if real_B is fed: ||G_A(B) - B||
        self.idt_A = self.netG_A(self.real_B)
        self.loss_idt_A = self.criterionIdt(
            self.idt_A, self.real_B) * lambda_B * lambda_idt
        # G_B should be identity if real_A is fed: ||G_B(A) - A||
        self.idt_B = self.netG_B(self.real_A)
        self.loss_idt_B = self.criterionIdt(
            self.idt_B, self.real_A) * lambda_A * lambda_idt
    else:
        self.loss_idt_A = 0
        self.loss_idt_B = 0

    # GAN loss D_A(G_A(A))
    self.loss_G_A = self.criterionGAN(self.netD_A(self.fake_B), True)
    # GAN loss D_B(G_B(B))
    self.loss_G_B = self.criterionGAN(self.netD_B(self.fake_A), True)
    # Forward cycle loss || G_B(G_A(A)) - A||
    self.loss_cycle_A = self.criterionCycle(self.rec_A,
                                            self.real_A) * lambda_A
    # Backward cycle loss || G_A(G_B(B)) - B||
    self.loss_cycle_B = self.criterionCycle(self.rec_B,
                                            self.real_B) * lambda_B

    self.losses['G_idt_A_loss'] = self.loss_idt_A
    self.losses['G_idt_B_loss'] = self.loss_idt_B
    self.losses['G_A_adv_loss'] = self.loss_G_A
    self.losses['G_B_adv_loss'] = self.loss_G_B
    self.losses['G_A_cycle_loss'] = self.loss_cycle_A
    self.losses['G_B_cycle_loss'] = self.loss_cycle_B
    # combined loss and calculate gradients
    self.loss_G = self.loss_G_A + self.loss_G_B + self.loss_cycle_A + \
        self.loss_cycle_B + self.loss_idt_A + self.loss_idt_B

    if ParallelEnv().nranks > 1:
        self.loss_G = self.netG_A.scale_loss(self.loss_G)
        self.loss_G.backward()
        self.netG_A.apply_collective_grads()
        self.netG_B.apply_collective_grads()
    else:
        self.loss_G.backward()
def main():
    FLAGS = parse_args()
    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    if FLAGS.slim_config:
        slim_cfg = load_config(FLAGS.slim_config)
        merge_config(slim_cfg)
    if 'weight_type' not in cfg:
        cfg.weight_type = FLAGS.weight_type
    check.check_config(cfg)
    check.check_gpu(cfg.use_gpu)
    check.check_version()

    place = 'gpu:{}'.format(ParallelEnv().dev_id) if cfg.use_gpu else 'cpu'
    place = paddle.set_device(place)

    run(FLAGS, cfg)