import paddle.distributed as dist

def train():
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. get current ParallelEnv
    parallel_env = dist.ParallelEnv()
    print("rank: ", parallel_env.rank)
    print("world_size: ", parallel_env.world_size)
import paddle.distributed as dist

def train():
    """parallelenv"""
    # 1. initialize parallel env
    dist.init_parallel_env()

    # 2. get current ParallelEnv
    parallel_env = dist.ParallelEnv()
    assert parallel_env.rank == 0
    assert parallel_env.world_size == 2
    print("test_ParallelEnv ... ok")
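# Hedged launch sketch (assumption: at least 2 visible GPUs).
# paddle.distributed.spawn starts `nprocs` worker processes and runs `train`
# in each of them; dist.init_parallel_env() inside train() then gives every
# worker its own rank / world_size. Note that the asserting variant above is
# written from the rank-0 worker's point of view.
import paddle.distributed as dist

if __name__ == "__main__":
    dist.spawn(train, nprocs=2)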
def preprocess(is_train=False):
    FLAGS = ArgsParser().parse_args()
    profiler_options = FLAGS.profiler_options
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    profile_dic = {"profiler_options": FLAGS.profiler_options}
    merge_config(profile_dic)

    if is_train:
        # save_config
        save_model_dir = config['Global']['save_model_dir']
        os.makedirs(save_model_dir, exist_ok=True)
        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
            yaml.dump(
                dict(config), f, default_flow_style=False, sort_keys=False)
        log_file = '{}/train.log'.format(save_model_dir)
    else:
        log_file = None
    logger = get_logger(name='root', log_file=log_file)

    # check whether use_gpu=True was set with a CPU-only paddlepaddle build
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    alg = config['Architecture']['algorithm']
    assert alg in [
        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
        'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
        'SEED', 'SDMGR'
    ]
    windows_not_support_list = ['PSE']
    if platform.system() == "Windows" and alg in windows_not_support_list:
        logger.warning('{} is not supported on Windows now'.format(
            windows_not_support_list))
        sys.exit()

    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
    device = paddle.set_device(device)

    config['Global']['distributed'] = dist.get_world_size() != 1

    if config['Global']['use_visualdl']:
        from visualdl import LogWriter
        save_model_dir = config['Global']['save_model_dir']
        vdl_writer_path = '{}/vdl/'.format(save_model_dir)
        os.makedirs(vdl_writer_path, exist_ok=True)
        vdl_writer = LogWriter(logdir=vdl_writer_path)
    else:
        vdl_writer = None
    print_dict(config, logger)
    logger.info('train with paddle {} and device {}'.format(paddle.__version__,
                                                            device))
    return config, device, logger, vdl_writer
def preprocess(is_train=False):
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    alg = config['Architecture']['algorithm']
    assert alg in [
        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
        'CLS'
    ]

    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
    device = paddle.set_device(device)

    config['Global']['distributed'] = dist.get_world_size() != 1

    if is_train:
        # save_config
        save_model_dir = config['Global']['save_model_dir']
        os.makedirs(save_model_dir, exist_ok=True)
        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
            yaml.dump(
                dict(config), f, default_flow_style=False, sort_keys=False)
        log_file = '{}/train.log'.format(save_model_dir)
    else:
        log_file = None
    logger = get_logger(name='root', log_file=log_file)

    if config['Global']['use_visualdl']:
        from visualdl import LogWriter
        save_model_dir = config['Global']['save_model_dir']
        vdl_writer_path = '{}/vdl/'.format(save_model_dir)
        os.makedirs(vdl_writer_path, exist_ok=True)
        vdl_writer = LogWriter(logdir=vdl_writer_path)
    else:
        vdl_writer = None
    print_dict(config, logger)
    logger.info('train with paddle {} and device {}'.format(paddle.__version__,
                                                            device))
    return config, device, logger, vdl_writer
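# Hedged usage sketch for preprocess() above: it returns the parsed config,
# the selected device, a logger and an optional VisualDL writer.
# Assumption: the YAML config path is supplied on the command line and parsed
# by ArgsParser, e.g. `-c configs/det/det_db.yml` (hypothetical path).
config, device, logger, vdl_writer = preprocess(is_train=True)
logger.info("distributed: {}".format(config['Global']['distributed']))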
"""
* @file test.py
* @author [email protected]
* @date 2020-12-30 15:53
* @brief
* **************************************************************************/
"""
import os
import sys

import paddle
import paddle.distributed as dist
from utils import run_priority

# os.system() runs in a subshell and does not change this process's
# environment, so set CUDA_VISIBLE_DEVICES via os.environ instead.
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

dist.init_parallel_env()
parallel_env = dist.ParallelEnv()


@run_priority(level="P0")
def test_get_rank():
    """parallelenv"""
    assert parallel_env.rank == 0
    print("{} ... ok".format(sys._getframe().f_code.co_name))


@run_priority(level="P0")
def test_get_world_size():
    """parallelenv"""
    assert parallel_env.world_size == 1
    print("{} ... ok".format(sys._getframe().f_code.co_name))
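# Hedged runner sketch: invoke the two checks above in a single-card process
# (assumption: exactly one GPU is visible, so world_size == 1 and rank == 0).
if __name__ == "__main__":
    test_get_rank()
    test_get_world_size()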
def read_datasets(self, splits=None, data_files=None):
    def remove_if_exit(filepath):
        if isinstance(filepath, (list, tuple)):
            for file in filepath:
                try:
                    os.remove(file)
                except OSError:
                    pass
        else:
            try:
                os.remove(filepath)
            except OSError:
                pass

    if data_files is None:
        if splits is None:
            splits = list(self.BUILDER_CONFIGS[self.name]['splits'].keys(
            )) if hasattr(self,
                          "BUILDER_CONFIGS") else list(self.SPLITS.keys())

        assert isinstance(splits, str) or (
            isinstance(splits, list) and isinstance(splits[0], str)
        ) or (
            isinstance(splits, tuple) and isinstance(splits[0], str)
        ), "`splits` should be a string or list of string or a tuple of string."

        if isinstance(splits, str):
            splits = [splits]
        datasets = DatasetTuple(splits)
        parallel_env = dist.ParallelEnv()
        unique_endpoints = _get_unique_endpoints(
            parallel_env.trainer_endpoints[:])
        # Move the register hook to the front and register all splits together.
        lock_files = []
        for split in splits:
            lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
            if self.name is not None:
                lock_file = lock_file + "." + self.name
            lock_file += "." + split + ".done" + "." + str(os.getppid())
            lock_files.append(lock_file)
        # Must register in all procs so that the lock file can be removed
        # when any proc breaks. Otherwise, the single registered proc may
        # not receive the proper signal sent by the parent proc to exit.
        atexit.register(lambda: remove_if_exit(lock_files))
        for split in splits:
            filename = self._get_data(split)
            lock_file = os.path.join(DATA_HOME, self.__class__.__name__)
            if self.name is not None:
                lock_file = lock_file + "." + self.name
            lock_file += "." + split + ".done" + "." + str(os.getppid())
            # `lock_file` indicates the finished status of `_get_data`.
            # `_get_data` only works in the proc specified by
            # `unique_endpoints` since `get_path_from_url` only works for it.
            # The other procs wait for `_get_data` to finish.
            if parallel_env.current_endpoint in unique_endpoints:
                f = open(lock_file, "w")
                f.close()
            else:
                while not os.path.exists(lock_file):
                    time.sleep(1)
            datasets[split] = self.read(filename=filename, split=split)
    else:
        assert isinstance(data_files, str) or isinstance(
            data_files, tuple) or isinstance(
                data_files, list
            ), "`data_files` should be a string or tuple or list of strings."
        if isinstance(data_files, str):
            data_files = [data_files]
        default_split = 'train'
        if splits:
            if isinstance(splits, str):
                splits = [splits]
            datasets = DatasetTuple(splits)
            assert len(splits) == len(
                data_files
            ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of local data file."
            for i in range(len(data_files)):
                datasets[splits[i]] = self.read(
                    filename=data_files[i], split=splits[i])
        else:
            datasets = DatasetTuple(
                ["split" + str(i) for i in range(len(data_files))])
            for i in range(len(data_files)):
                datasets["split" + str(i)] = self.read(
                    filename=data_files[i], split=default_split)

    return datasets if len(datasets) > 1 else datasets[0]
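# Hedged sketch of the lock-file handshake used by read_datasets() above:
# only the proc(s) listed in unique_endpoints prepare the data and then touch
# a lock file; every other proc simply waits for that file to appear.
# `barrier_on_lock_file` and `prepare_data` are hypothetical names used for
# illustration only.
import os
import time
import paddle.distributed as dist


def barrier_on_lock_file(lock_file, unique_endpoints, prepare_data):
    env = dist.ParallelEnv()
    if env.current_endpoint in unique_endpoints:
        prepare_data()                  # e.g. download + extract the split
        open(lock_file, "w").close()    # signal completion to the other procs
    else:
        while not os.path.exists(lock_file):
            time.sleep(1)               # wait for the preparing proc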
def __init__(self,
             layers,
             num_stages=None,
             topology=None,
             loss_fn=None,
             seg_method="uniform",
             recompute_interval=0,
             recompute_offload=False,
             recompute_partition=False):
    super(PipelineLayer, self).__init__()
    if num_stages is None and topology is None:
        raise ValueError("should provide num_stages or topology")

    # lazy import
    import paddle.distributed as dist
    from paddle.distributed import fleet

    self.device_id = dist.ParallelEnv().device_id
    self.layers = layers
    self._loss_fn = loss_fn
    self._topo = topology
    self._recompute_interval = recompute_interval
    self._recompute_offload = recompute_offload
    self._recompute_partition = recompute_partition

    if recompute_interval > 0:
        logger.info(
            "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}"
            .format(recompute_offload, recompute_partition))
    _initialize_recompute_setting(recompute_offload, recompute_partition)

    world_size = dist.get_world_size()
    self.global_rank = dist.get_rank()

    if self._topo:
        self._stage_id = self._topo.get_coord(self.global_rank).pipe
        self._num_stages = self._topo.get_dim_size("pipe")
        if num_stages:
            assert self._num_stages == num_stages, "num_stages should be equal to %d" % (
                self._num_stages)
    else:
        # construct default topology
        if world_size % num_stages != 0:
            raise ValueError("should provide correct num_stages({}) "
                             "which can be divided by world_size({})".format(
                                 num_stages, world_size))
        dp_num = world_size // num_stages
        self._topo = fleet.CommunicateTopology(["data", "pipe", "model"],
                                               [dp_num, num_stages, 1])
        self._stage_id = self._topo.get_coord(self.global_rank).pipe
        self._num_stages = self._topo.get_dim_size("pipe")

    # initialize segment
    self._layers_desc = list(self.layers)
    self._num_layers = len(self._layers_desc)
    self._start_pos = 0
    self._end_pos = self._num_layers - 1
    self._segment_network(seg_method)
    self.shared_layers = paddle.nn.LayerDict()
    self.shared_weight_attrs = {}

    # construct layer
    self.run_function = []
    self._build_layer()

    self.shared_comm = self._construct_shared_comm()
    self._synchronize_shared_weights()
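# Hedged sketch of how the default topology built above maps a global rank to
# a pipeline stage (assumption: 4 cards split into dp_num=2 data-parallel
# groups x num_stages=2 pipeline stages; only the calls already used in the
# constructor above are relied on).
import paddle.distributed as dist
from paddle.distributed import fleet

topo = fleet.CommunicateTopology(["data", "pipe", "model"], [2, 2, 1])
coord = topo.get_coord(dist.get_rank())
stage_id = coord.pipe                    # pipeline stage this rank runs
num_stages = topo.get_dim_size("pipe")   # == 2 in this sketch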
def train_model_multigrid(cfg, world_size=1, validate=True):
    """Train model entry

    Args:
        cfg (dict): configuration.
        world_size (int): number of trainer processes; multi-card training
            is used when world_size != 1. Default: 1.
        validate (bool): whether to do evaluation. Default: True.
    """
    # Init multigrid.
    multigrid = None
    if cfg.MULTIGRID.LONG_CYCLE or cfg.MULTIGRID.SHORT_CYCLE:
        multigrid = MultigridSchedule()
        cfg = multigrid.init_multigrid(cfg)
        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, _ = multigrid.update_long_cycle(cfg, cur_epoch=0)
    multi_save_epoch = [i[-1] - 1 for i in multigrid.schedule]

    parallel = world_size != 1
    logger = get_logger("paddlevideo")
    batch_size = cfg.DATASET.get('batch_size', 2)
    places = paddle.set_device('gpu')
    model_name = cfg.model_name
    output_dir = cfg.get("output_dir", f"./output/{model_name}")
    mkdir(output_dir)
    local_rank = dist.ParallelEnv().local_rank
    precise_bn = cfg.get("PRECISEBN")
    num_iters_precise_bn = cfg.PRECISEBN.num_iters_preciseBN

    # 1. Construct model
    model = build_model(cfg.MODEL)
    if parallel:
        model = paddle.DataParallel(model)

    # 2. Construct dataloader
    train_loader, valid_loader, precise_bn_loader = \
        construct_loader(cfg, places, validate, precise_bn,
                         num_iters_precise_bn, world_size)

    # 3. Construct optimizer
    lr = build_lr(cfg.OPTIMIZER.learning_rate, len(train_loader))
    optimizer = build_optimizer(cfg.OPTIMIZER,
                                lr,
                                parameter_list=model.parameters())

    # Resume
    resume_epoch = cfg.get("resume_epoch", 0)
    if resume_epoch:
        filename = osp.join(
            output_dir,
            model_name + str(local_rank) + '_' + f"{resume_epoch:05d}")
        subn_load(model, filename, optimizer)

    # 4. Train Model
    best = 0.
    total_epochs = int(cfg.epochs * cfg.MULTIGRID.epoch_factor)
    for epoch in range(total_epochs):
        if epoch < resume_epoch:
            logger.info(
                f"| epoch: [{epoch+1}] <= resume_epoch: [{resume_epoch}], continue..."
            )
            continue

        if cfg.MULTIGRID.LONG_CYCLE:
            cfg, changed = multigrid.update_long_cycle(cfg, epoch)
            if changed:
                logger.info("====== Rebuild model/optimizer/loader =====")
                (
                    model,
                    lr,
                    optimizer,
                    train_loader,
                    valid_loader,
                    precise_bn_loader,
                ) = build_trainer(cfg, places, parallel, validate, precise_bn,
                                  num_iters_precise_bn, world_size)

                # load checkpoint after re-building the model
                if epoch != 0:
                    # no need to subtract 1 from epoch; 1 was added on save
                    filename = osp.join(
                        output_dir,
                        model_name + str(local_rank) + '_' + f"{(epoch):05d}")
                    subn_load(model, filename, optimizer)
                # update the lr scheduler's last_epoch; do not use saved lr params
                lr.last_epoch = epoch
                lr.step(rebuild=True)

        model.train()
        record_list = build_record(cfg.MODEL)
        tic = time.time()
        for i, data in enumerate(train_loader):
            record_list['reader_time'].update(time.time() - tic)
            # 4.1 forward
            if parallel:
                outputs = model._layers.train_step(data)
                # required for DataParallel, will be removed in the next version
                model._reducer.prepare_for_backward(
                    list(model._find_varbase(outputs)))
            else:
                outputs = model.train_step(data)
            # 4.2 backward
            avg_loss = outputs['loss']
            avg_loss.backward()
            # 4.3 minimize
            optimizer.step()
            optimizer.clear_grad()

            # log record
            record_list['lr'].update(
                optimizer._global_learning_rate().numpy()[0], batch_size)
            for name, value in outputs.items():
                record_list[name].update(value.numpy()[0], batch_size)
            record_list['batch_time'].update(time.time() - tic)
            tic = time.time()

            if i % cfg.get("log_interval", 10) == 0:
                ips = "ips: {:.5f} instance/sec.".format(
                    batch_size / record_list["batch_time"].val)
                log_batch(record_list, i, epoch + 1, total_epochs, "train",
                          ips)

            # learning rate iter step
            if cfg.OPTIMIZER.learning_rate.get("iter_step"):
                lr.step()

        # learning rate epoch step
        if not cfg.OPTIMIZER.learning_rate.get("iter_step"):
            lr.step()

        ips = "ips: {:.5f} instance/sec.".format(
            batch_size * record_list["batch_time"].count /
            record_list["batch_time"].sum)
        log_epoch(record_list, epoch + 1, "train", ips)

        def evaluate(best):
            model.eval()
            record_list = build_record(cfg.MODEL)
            record_list.pop('lr')
            tic = time.time()
            for i, data in enumerate(valid_loader):
                if parallel:
                    outputs = model._layers.val_step(data)
                else:
                    outputs = model.val_step(data)

                # log_record
                for name, value in outputs.items():
                    record_list[name].update(value.numpy()[0], batch_size)
                record_list['batch_time'].update(time.time() - tic)
                tic = time.time()

                if i % cfg.get("log_interval", 10) == 0:
                    ips = "ips: {:.5f} instance/sec.".format(
                        batch_size / record_list["batch_time"].val)
                    log_batch(record_list, i, epoch + 1, total_epochs, "val",
                              ips)

            ips = "ips: {:.5f} instance/sec.".format(
                batch_size * record_list["batch_time"].count /
                record_list["batch_time"].sum)
            log_epoch(record_list, epoch + 1, "val", ips)

            best_flag = False
            if record_list.get('top1') and record_list['top1'].avg > best:
                best = record_list['top1'].avg
                best_flag = True
            return best, best_flag

        # use precise bn to improve acc
        if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
            logger.info(f"do precise BN in {epoch+1} ...")
            do_preciseBN(model, precise_bn_loader, parallel,
                         min(num_iters_precise_bn, len(precise_bn_loader)))

        # aggregate sub_BN stats
        logger.info("Aggregate sub_BatchNorm stats...")
        aggregate_sub_bn_stats(model)

        # 5. Validation
        if is_eval_epoch(cfg, epoch, total_epochs, multigrid.schedule):
            logger.info(f"eval in {epoch+1} ...")
            with paddle.fluid.dygraph.no_grad():
                best, save_best_flag = evaluate(best)
            # save best
            if save_best_flag:
                save(optimizer.state_dict(),
                     osp.join(output_dir, model_name + "_best.pdopt"))
                save(model.state_dict(),
                     osp.join(output_dir, model_name + "_best.pdparams"))
                logger.info(
                    f"Already saved the best model (top1 acc: {int(best * 10000) / 10000})"
                )

        # 6. Save model and optimizer
        if is_eval_epoch(cfg, epoch, total_epochs,
                         multigrid.schedule) or epoch % cfg.get(
                             "save_interval",
                             10) == 0 or epoch in multi_save_epoch:
            logger.info("[Save parameters] ======")
            subn_save(output_dir, model_name + str(local_rank) + '_',
                      epoch + 1, model, optimizer)

    logger.info(f'training {model_name} finished')