def finetune(args):
    paddle.set_device(args.device)
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    pos_file = os.path.join(args.data_dir, 'rt-polarity.pos')
    neg_file = os.path.join(args.data_dir, 'rt-polarity.neg')
    x_text, y = load_data_and_labels(pos_file, neg_file)
    x_train, x_test, y_train, y_test = train_test_split(
        x_text, y, test_size=0.1, random_state=args.seed)

    if not args.init_from_ckpt:
        raise ValueError('`init_from_ckpt` should be set.')
    model = ELMoBowTextClassification(args.init_from_ckpt, args.batch_size,
                                      args.sent_embedding_dim, args.dropout,
                                      args.num_classes)
    if dist.get_world_size() > 1:
        model = paddle.DataParallel(model)
    model.train()

    adam = paddle.optimizer.Adam(parameters=model.parameters(),
                                 learning_rate=args.lr,
                                 weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()

    vocab = load_vocab()

    train_dataset = SentencePolarityDatasetV1(x_train, y_train, vocab,
                                              args.max_seq_len)
    test_dataset = SentencePolarityDatasetV1(x_test, y_test, vocab,
                                             args.max_seq_len)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              return_list=True,
                              shuffle=True,
                              collate_fn=lambda batch: generate_batch(batch))
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             return_list=True,
                             shuffle=False,
                             collate_fn=lambda batch: generate_batch(batch))

    for epoch in range(args.epochs):
        print('Epoch {}/{}'.format(epoch + 1, args.epochs))
        for step, batch_data in enumerate(train_loader, start=1):
            ids, ids_reverse, label = batch_data

            output = model((ids, ids_reverse))
            loss = criterion(output, label)
            loss.backward()
            adam.step()
            adam.clear_grad()

            if step % args.logging_step == 0:
                print('step {}, loss {}'.format(step, loss.numpy()[0]))

        acc = test(model, test_loader)
        print('\ntest acc {}\n'.format(acc))
def train():
    """begin train"""
    arr1 = []
    arr2 = []
    dist.init_parallel_env()
    set_seed(2021)

    layer = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer = paddle.DataParallel(layer)
    else:
        dp_layer = layer

    layer2 = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer2 = paddle.DataParallel(layer2)
    else:
        dp_layer2 = layer2
    dp_layer2.set_state_dict(dp_layer.state_dict())

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
    adam2 = opt.Adam(learning_rate=0.001, parameters=dp_layer2.parameters())

    for i in range(2):
        batch_size = 10
        shard = int(batch_size / dist.get_world_size())
        start_no = shard * dist.get_rank()
        end_no = start_no + shard

        inputs = paddle.randn([10, 10], 'float32')[start_no:end_no]
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')[start_no:end_no]
        loss = loss_fn(outputs, labels)
        if dist.get_rank() == 0:
            arr1.append(loss.numpy()[0])
        loss.backward()
        adam.step()
        adam.clear_grad()

        outputs = dp_layer2(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        if dist.get_rank() == 0:
            arr2.append(loss.numpy()[0])
        adam2.step()
        adam2.clear_grad()

    check_data(arr1, arr2)
def train():
    # 1. initialize parallel environment
    dist.init_parallel_env()
    set_seed(2021)

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer = paddle.DataParallel(layer)
    else:
        dp_layer = layer

    layer2 = LinearNet()
    if dist.get_world_size() > 1:
        dp_layer2 = paddle.DataParallel(layer2)
    else:
        dp_layer2 = layer2
    dp_layer2.set_state_dict(dp_layer.state_dict())

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
    adam2 = opt.Adam(learning_rate=0.001, parameters=dp_layer2.parameters())

    # 3. run layer
    print("Start")
    for i in range(10):
        batch_size = 10
        shard = int(batch_size / dist.get_world_size())
        start_no = shard * dist.get_rank()
        end_no = start_no + shard

        inputs = paddle.randn([10, 10], 'float32')[start_no:end_no]
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')[start_no:end_no]
        loss = loss_fn(outputs, labels)
        if dist.get_rank() == 0:
            print("Loss1", loss.numpy()[0])
            print(dp_layer.parameters())
        loss.backward()
        adam.step()
        adam.clear_grad()

        outputs = dp_layer2(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        if dist.get_rank() == 0:
            print("Loss2", loss.numpy()[0])
            print(dp_layer2.parameters())
        adam2.step()
        adam2.clear_grad()
def shard(self, num_shards=None, index=None):
    """
    Split the dataset into `num_shards` pieces.

    Args:
        num_shards (int, optional): An integer representing the number of
            data shards. If None, `num_shards` would be the number of
            trainers. Defaults to None.
        index (int, optional): An integer representing the index of the
            current shard. If None, `index` would be the current trainer
            rank id. Defaults to None.
    """
    if num_shards is None:
        num_shards = dist.get_world_size()
    if index is None:
        index = dist.get_rank()

    def sharder(num_shards, index, num_samples):
        # `num_samples` is the running index of the sample being filtered;
        # keep the sample only when it falls on this shard.
        if num_samples % num_shards == index:
            return True
        else:
            return False

    fn = partial(sharder, num_shards=num_shards, index=index)
    self._shard_filter = fn
    return self
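# A self-contained sketch (not part of the original API) of what the filter
# installed by `shard` above selects: the predicate receives the running
# sample index, so shard `index` keeps exactly the samples whose index mod
# `num_shards` equals `index`.
def _demo_filter_sharding(num_samples=10, num_shards=3):
    return {
        index: [i for i in range(num_samples) if i % num_shards == index]
        for index in range(num_shards)
    }

# _demo_filter_sharding() -> {0: [0, 3, 6, 9], 1: [1, 4, 7], 2: [2, 5, 8]}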
def __init__(self,
             filepattern,
             batch_size,
             pad_token_id,
             bos_token_id,
             sort_pool_size=2**16,
             seed=1,
             n_gpus=None,
             rank=None,
             mode='test'):
    super(DialogueDataset, self).__init__()

    self.file_list = glob(filepattern)
    self.sort_pool_size = 0 if mode == 'test' else sort_pool_size
    self.n_gpus = n_gpus if n_gpus else dist.get_world_size()
    self.rank = rank if rank else dist.get_rank()
    # len(batch) * max_len <= this value
    self.batch_size = batch_size * self.n_gpus
    self.shuffle = True if mode == 'train' else False
    self.mode = mode
    self.pad_id = pad_token_id  # [PAD]
    self.bos_id = bos_token_id  # [CLS]
    self.global_rng = np.random.RandomState(seed)

    assert len(self.file_list) > 0, 'There are no files in %s.' % filepattern
def batch_norm_1d(num_channels):
    """Create a BatchNorm1D layer, converted to SyncBatchNorm when running
    with multiple trainers."""
    if dist.get_world_size() > 1:
        return nn.SyncBatchNorm.convert_sync_batchnorm(
            nn.BatchNorm1D(num_channels))
    else:
        return nn.BatchNorm1D(num_channels)
def all_gather(v):
    if dist.get_world_size() <= 1:
        return v.item()
    ret = []
    dist.all_gather(ret, v)
    concat = paddle.concat(ret, axis=0)
    return concat.mean().item()
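# A minimal usage sketch for `all_gather` above (assumes a job launched with
# `paddle.distributed.launch` and `dist.init_parallel_env()` already called;
# `correct` and `total` are hypothetical per-rank counters):
#
#   local_acc = paddle.to_tensor([correct / total], dtype='float32')
#   mean_acc = all_gather(local_acc)  # mean over ranks, or v.item() on 1 card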
def on_epoch_end(self, status):
    # Checkpointer only performed during training
    mode = status['mode']
    epoch_id = status['epoch_id']
    weight = None
    save_name = None
    if dist.get_world_size() < 2 or dist.get_rank() == 0:
        if mode == 'train':
            end_epoch = self.model.cfg.epoch
            if epoch_id % self.model.cfg.snapshot_epoch == 0 \
                    or epoch_id == end_epoch - 1:
                save_name = str(
                    epoch_id) if epoch_id != end_epoch - 1 else "model_final"
                weight = self.weight
        elif mode == 'eval':
            if 'save_best_model' in status and status['save_best_model']:
                for metric in self.model._metrics:
                    map_res = metric.get_results()
                    key = 'bbox' if 'bbox' in map_res else 'mask'
                    if key not in map_res:
                        logger.warn(
                            "Evaluation results empty, this may be due to "
                            "training iterations being too few or not "
                            "loading the correct weights.")
                        return
                    if map_res[key][0] > self.best_ap:
                        self.best_ap = map_res[key][0]
                        save_name = 'best_model'
                        weight = self.weight
                    logger.info("Best test {} ap is {:0.3f}.".format(
                        key, self.best_ap))
        if weight:
            save_model(weight, self.model.optimizer, self.save_dir,
                       save_name, epoch_id + 1)
def _shard_edges_by_dst(self, edges, edge_feat):
    """Shard edges by dst.

    Args:
        edges: list of (u, v) tuples, 2D numpy.ndarray or 2D paddle.Tensor
        edge_feat (optional): a dict of numpy arrays as edge features
            (should have a consistent order with edges)

    Returns:
        A tuple (shard_edges, shard_edge_feat) as the shard results.
    """
    shard_flag = edges[:, 1]
    mask = (shard_flag % dist.get_world_size()) == dist.get_rank()
    if type(mask) == paddle.Tensor:
        eid = paddle.masked_select(paddle.arange(edges.shape[0]), mask)
        shard_edges = paddle.gather(edges, eid)
        shard_edge_feat = {}
        for key, value in edge_feat.items():
            shard_edge_feat[key] = paddle.gather(value, eid)
    else:
        eid = np.arange(edges.shape[0])[mask]
        shard_edges = edges[eid]
        shard_edge_feat = {}
        for key, value in edge_feat.items():
            shard_edge_feat[key] = value[eid]
    return shard_edges, shard_edge_feat
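# A self-contained numpy sketch (hypothetical data) of the dst-sharding rule
# above: an edge (u, v) is owned by the rank equal to v % world_size.
import numpy as np

edges = np.array([[0, 1], [2, 3], [4, 5], [6, 6]])
world_size, rank = 2, 1
mask = (edges[:, 1] % world_size) == rank
print(edges[mask])  # destinations 1, 3, 5 are odd -> [[0 1], [2 3], [4 5]]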
def shard(self, num_replicas=None, rank=None):
    """
    Slice the sampler so that each of multiple GPUs works on a disjoint
    share of the data.

    Args:
        num_replicas (int, optional): The number of training processes,
            which is also the number of GPU cards used in training.
            Default: None.
        rank (int, optional): The rank of the current training process.
            Equal to the value of the environment variable
            PADDLE_TRAINER_ID. Default: None.

    Returns:
        SamplerHelper
    """
    if num_replicas is None:
        num_replicas = dist.get_world_size()
    if rank is None:
        rank = dist.get_rank()

    def _impl():
        for i, idx in enumerate(self):
            if i % num_replicas == rank:
                yield idx
        if i % num_replicas != num_replicas - 1 and rank > i % num_replicas:
            # use the last sample to make the shards evenly divisible
            yield idx

    sampler = type(self)(self.data_source, _impl)
    if self.length is not None:
        sampler.length = int(math.ceil(self.length * 1.0 / num_replicas))
    else:
        sampler.length = None
    return sampler
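# A self-contained sketch of the rule `_impl` above implements: indices are
# dealt round-robin to ranks, and when the last round is incomplete the
# trailing ranks re-yield the final index so every rank gets the same count.
def _demo_shard(indices, num_replicas, rank):
    out = []
    for i, idx in enumerate(indices):
        if i % num_replicas == rank:
            out.append(idx)
    if i % num_replicas != num_replicas - 1 and rank > i % num_replicas:
        out.append(idx)  # pad with the last sample
    return out

# 7 samples over 3 ranks -> each rank gets 3 indices:
# _demo_shard(range(7), 3, 0) -> [0, 3, 6]
# _demo_shard(range(7), 3, 1) -> [1, 4, 6]
# _demo_shard(range(7), 3, 2) -> [2, 5, 6]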
def test_class_center_sample(self):
    rank_id = dist.get_rank()
    nranks = dist.get_world_size()

    seed = 1025
    set_random_seed(seed)
    paddle.seed(rank_id * 10)
    random.seed(seed)
    np.random.seed(seed)

    batch_size = 20
    num_samples = 6

    for dtype in ('int32', 'int64'):
        for _ in range(5):
            classes_list = np.random.randint(10, 15, (nranks, ))
            num_class = np.sum(classes_list)

            np_label = np.random.randint(0,
                                         num_class, (batch_size, ),
                                         dtype=dtype)
            label = paddle.to_tensor(np_label, dtype=dtype)
            np_remapped_label, np_sampled_class_center_per_device = \
                class_center_sample_numpy(np_label, classes_list, num_samples)
            remapped_label, sampled_class_index = \
                paddle.nn.functional.class_center_sample(
                    label, classes_list[rank_id], num_samples)
            np.testing.assert_allclose(remapped_label.numpy(),
                                       np_remapped_label)
            np_sampled_class_index = \
                np_sampled_class_center_per_device[rank_id]
            np.testing.assert_allclose(
                sampled_class_index.numpy()[:len(np_sampled_class_index)],
                np_sampled_class_index)
def infer(self):
    assert self.mode == "infer" and self.eval_mode == "classification"
    total_trainer = dist.get_world_size()
    local_rank = dist.get_rank()
    image_list = get_image_list(self.config["Infer"]["infer_imgs"])
    # data split
    image_list = image_list[local_rank::total_trainer]

    batch_size = self.config["Infer"]["batch_size"]
    self.model.eval()
    batch_data = []
    image_file_list = []
    for idx, image_file in enumerate(image_list):
        with open(image_file, 'rb') as f:
            x = f.read()
        for process in self.preprocess_func:
            x = process(x)
        batch_data.append(x)
        image_file_list.append(image_file)
        if len(batch_data) >= batch_size or idx == len(image_list) - 1:
            batch_tensor = paddle.to_tensor(batch_data)
            out = self.model(batch_tensor)
            if isinstance(out, list):
                out = out[0]
            if isinstance(out, dict) and "logits" in out:
                out = out["logits"]
            if isinstance(out, dict) and "output" in out:
                out = out["output"]
            result = self.postprocess_func(out, image_file_list)
            print(result)
            batch_data.clear()
            image_file_list.clear()
def forward(self, input):
    dtype = input.dtype
    flatten = input.reshape([-1, self.dim])
    dist = (flatten.pow(2).sum(1, keepdim=True) -
            2 * flatten.transpose([0, 1]).matmul(self.embed) +
            self.embed.pow(2).sum(0, keepdim=True))
    embed_ind = (-dist).argmax(1)
    embed_onehot = F.one_hot(embed_ind, self.n_embed).astype(dtype)
    embed_ind = embed_ind.reshape(input.shape[:-1])
    quantize = F.embedding(embed_ind,
                           self.embed.transpose([1, 0]),
                           padding_idx=-1)

    if self.training:
        embed_onehot_sum = embed_onehot.sum(0)
        embed_sum = flatten.transpose([1, 0]).matmul(embed_onehot)

        if dist_fn.get_world_size() > 1:
            dist_fn.all_reduce(embed_onehot_sum)
            dist_fn.all_reduce(embed_sum)

        ema_inplace(self.cluster_size, embed_onehot_sum, self.decay)
        ema_inplace(self.embed_avg, embed_sum, self.decay)
        cluster_size = laplace_smoothing(
            self.cluster_size, self.n_embed,
            self.eps) * self.cluster_size.sum()
        embed_normalized = self.embed_avg / cluster_size.unsqueeze(0)
        self.embed[:] = embed_normalized

    loss = F.mse_loss(quantize.detach(), input) * self.commitment
    quantize = input + (quantize - input).detach()

    return quantize, embed_ind, loss
def shard(self, num_shards=None, index=None):
    """
    Update this dataset to keep only the samples whose index mod
    `num_shards` equals `index`, padding short shards so all shards have
    the same length.

    Args:
        num_shards (int, optional): An integer representing the number of
            data shards. If None, `num_shards` would be the number of
            trainers. Default: None.
        index (int, optional): An integer representing the index of the
            current shard. If None, `index` would be the current trainer
            rank id. Default: None.
    """
    if num_shards is None:
        num_shards = dist.get_world_size()
    if index is None:
        index = dist.get_rank()

    num_samples = int(math.ceil(len(self.new_data) * 1.0 / num_shards))
    total_size = num_samples * num_shards
    # add extra samples to make it evenly divisible
    self.new_data = [
        self.new_data[idx] for idx in range(len(self.new_data))
        if idx % num_shards == index
    ]
    if len(self.new_data) < num_samples:
        self.new_data.append(self.new_data[index + 1 - num_shards])
    return self
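# A self-contained sketch (hypothetical data) of the padding rule above:
# 7 samples over 3 shards means each shard must hold ceil(7/3) = 3 samples,
# so short shards append one extra element chosen by `index + 1 - num_shards`.
import math

data = list(range(7))
num_shards = 3
num_samples = int(math.ceil(len(data) * 1.0 / num_shards))  # 3
for index in range(num_shards):
    shard = [data[i] for i in range(len(data)) if i % num_shards == index]
    if len(shard) < num_samples:
        shard.append(shard[index + 1 - num_shards])
    print(index, shard)
# 0 [0, 3, 6]
# 1 [1, 4, 4]  <- padded with shard[-1]
# 2 [2, 5, 2]  <- padded with shard[0]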
def shard(self, num_shards=None, index=None):
    """
    Update this dataset to keep only the samples whose index mod
    `num_shards` equals `index`.

    Args:
        num_shards (int, optional): An integer representing the number of
            data shards. If None, `num_shards` would be the number of
            trainers. Default: None.
        index (int, optional): An integer representing the index of the
            current shard. If None, `index` would be the current trainer
            rank id. Default: None.
    """
    if num_shards is None:
        num_shards = dist.get_world_size()
    if index is None:
        index = dist.get_rank()

    def sharder(num_shards, index, num_samples):
        if num_samples % num_shards == index:
            return True
        else:
            return False

    fn = partial(sharder, num_shards=num_shards, index=index)
    self._shard_filter = fn
    return self
def on_epoch_end(self, status):
    if dist.get_world_size() < 2 or dist.get_rank() == 0:
        mode = status['mode']
        if mode == 'eval':
            sample_num = status['sample_num']
            cost_time = status['cost_time']
            logger.info('Total sample number: {}, average FPS: {}'.format(
                sample_num, sample_num / cost_time))
def train_iter_end(self, trainer):
    # update the target network after each training iteration
    if dist.get_world_size() > 1:
        trainer.model._layers.update_target_network_L1()
    else:
        trainer.model.update_target_network_L1()
def __init__(self, cfg, mode='train'):
    self.cfg = cfg
    assert mode.lower() in ['train', 'eval', 'test'], \
        "mode should be 'train', 'eval' or 'test'"
    self.mode = mode.lower()
    self.optimizer = None
    self.is_loaded_weights = False

    # build model
    if 'model' not in self.cfg:
        self.model = create(cfg.architecture)
    else:
        self.model = self.cfg.model
        self.is_loaded_weights = True

    self.use_ema = ('use_ema' in cfg and cfg['use_ema'])
    if self.use_ema:
        self.ema = ModelEMA(cfg['ema_decay'], self.model,
                            use_thres_step=True)

    # build data loader
    self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
    if self.mode == 'train':
        self.loader = create('{}Reader'.format(self.mode.capitalize()))(
            self.dataset, cfg.worker_num)
    # EvalDataset build with BatchSampler to evaluate in single device
    # TODO: multi-device evaluate
    if self.mode == 'eval':
        self._eval_batch_sampler = paddle.io.BatchSampler(
            self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
        self.loader = create('{}Reader'.format(self.mode.capitalize()))(
            self.dataset, cfg.worker_num, self._eval_batch_sampler)
    # TestDataset build after user set images, skip loader creation here

    # build optimizer in train mode
    if self.mode == 'train':
        steps_per_epoch = len(self.loader)
        self.lr = create('LearningRate')(steps_per_epoch)
        self.optimizer = create('OptimizerBuilder')(self.lr,
                                                    self.model.parameters())

    self._nranks = dist.get_world_size()
    self._local_rank = dist.get_rank()

    self.status = {}

    self.start_epoch = 0
    self.end_epoch = cfg.epoch

    # initial default callbacks
    self._init_callbacks()

    # initial default metrics
    self._init_metrics()
    self._reset_metrics()
def on_epoch_end(self, status):
    mode = status['mode']
    if dist.get_world_size() < 2 or dist.get_rank() == 0:
        if mode == 'eval':
            for metric in self.model._metrics:
                for key, map_value in metric.get_results().items():
                    self.vdl_writer.add_scalar("{}-mAP".format(key),
                                               map_value[0],
                                               self.vdl_mAP_step)
            self.vdl_mAP_step += 1
def create_data_loader(args, places=None, use_all_vocab=False):
    root = None if args.root == "None" else args.root
    if not use_all_vocab:
        WMT14ende.VOCAB_INFO = (os.path.join("WMT14.en-de",
                                             "wmt14_ende_data_bpe",
                                             "vocab_all.bpe.33712"),
                                os.path.join("WMT14.en-de",
                                             "wmt14_ende_data_bpe",
                                             "vocab_all.bpe.33712"),
                                "de485e3c2e17e23acf4b4b70b54682dd",
                                "de485e3c2e17e23acf4b4b70b54682dd")
    (src_vocab, trg_vocab) = WMT14ende.get_vocab(root=root)
    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor
    )
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))
    transform_func = WMT14ende.get_default_transform_func(root=root)
    datasets = [
        WMT14ende.get_datasets(mode=m, root=root,
                               transform_func=transform_func)
        for m in ["train", "dev"]
    ]

    data_loaders = [(None)] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.filter(
            partial(min_max_filer, max_len=args.max_length))
        batch_sampler = TransformerBatchSampler(
            dataset=dataset,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            distribute_mode=True if i == 0 else False,
            world_size=dist.get_world_size(),
            rank=dist.get_rank(),
            pad_seq=args.pad_seq,
            bsz_multi=args.bsz_multi)
        data_loader = DataLoader(dataset=dataset,
                                 places=places,
                                 batch_sampler=batch_sampler,
                                 collate_fn=partial(prepare_train_input,
                                                    bos_idx=args.bos_idx,
                                                    eos_idx=args.eos_idx,
                                                    pad_idx=args.bos_idx,
                                                    pad_seq=args.pad_seq),
                                 num_workers=0)
        data_loaders[i] = (data_loader)
    return data_loaders
def preprocess(is_train=False):
    FLAGS = ArgsParser().parse_args()
    profiler_options = FLAGS.profiler_options
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    profile_dic = {"profiler_options": FLAGS.profiler_options}
    merge_config(profile_dic)

    if is_train:
        # save_config
        save_model_dir = config['Global']['save_model_dir']
        os.makedirs(save_model_dir, exist_ok=True)
        with open(os.path.join(save_model_dir, 'config.yml'), 'w') as f:
            yaml.dump(dict(config),
                      f,
                      default_flow_style=False,
                      sort_keys=False)
        log_file = '{}/train.log'.format(save_model_dir)
    else:
        log_file = None
    logger = get_logger(name='root', log_file=log_file)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    alg = config['Architecture']['algorithm']
    assert alg in [
        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
        'CLS', 'PGNet', 'Distillation', 'NRTR', 'TableAttn', 'SAR', 'PSE',
        'SEED', 'SDMGR'
    ]

    windows_not_support_list = ['PSE']
    if platform.system() == "Windows" and alg in windows_not_support_list:
        logger.warning('{} is not supported on Windows now'.format(
            windows_not_support_list))
        sys.exit()

    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'
    device = paddle.set_device(device)

    config['Global']['distributed'] = dist.get_world_size() != 1

    if config['Global']['use_visualdl']:
        from visualdl import LogWriter
        save_model_dir = config['Global']['save_model_dir']
        vdl_writer_path = '{}/vdl/'.format(save_model_dir)
        os.makedirs(vdl_writer_path, exist_ok=True)
        vdl_writer = LogWriter(logdir=vdl_writer_path)
    else:
        vdl_writer = None
    print_dict(config, logger)
    logger.info('train with paddle {} and device {}'.format(
        paddle.__version__, device))
    return config, device, logger, vdl_writer
def create_data_loader(args, places=None):
    datasets = load_dataset('wmt14ende', splits=('train', 'dev'))
    if not args.benchmark:
        src_vocab = Vocab.load_vocabulary(**datasets[0].vocab_info["bpe"])
    else:
        src_vocab = Vocab.load_vocabulary(
            **datasets[0].vocab_info["benchmark"])
    trg_vocab = src_vocab

    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor
    )
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))

    def convert_samples(sample):
        source = sample[args.src_lang].split()
        target = sample[args.trg_lang].split()

        source = src_vocab.to_indices(source)
        target = trg_vocab.to_indices(target)

        return source, target

    data_loaders = [(None)] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.map(convert_samples, lazy=False).filter(
            partial(min_max_filer, max_len=args.max_length))
        batch_sampler = TransformerBatchSampler(
            dataset=dataset,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            distribute_mode=True if i == 0 else False,
            world_size=dist.get_world_size(),
            rank=dist.get_rank(),
            pad_seq=args.pad_seq,
            bsz_multi=args.bsz_multi)
        data_loader = DataLoader(dataset=dataset,
                                 places=places,
                                 batch_sampler=batch_sampler,
                                 collate_fn=partial(prepare_train_input,
                                                    bos_idx=args.bos_idx,
                                                    eos_idx=args.eos_idx,
                                                    pad_idx=args.bos_idx,
                                                    pad_seq=args.pad_seq),
                                 num_workers=0)
        data_loaders[i] = (data_loader)
    return data_loaders
def get_steps_per_epoch(args):
    """Compute the number of optimizer steps per epoch for the current
    dataset, batch size, and (optionally) number of distributed workers."""
    # TODO: pass the train set size as an argument instead of hard-coding it
    if args.dataset == 'zinc':
        train_num = int(20000000 * (1 - args.test_ratio))
    else:
        raise ValueError(args.dataset)
    if args.DEBUG:
        train_num = 100
    steps_per_epoch = int(train_num / args.batch_size)
    if args.distributed:
        steps_per_epoch = int(steps_per_epoch / dist.get_world_size())
    return steps_per_epoch
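# Worked example of the arithmetic above (hypothetical settings: zinc with
# test_ratio=0.1, batch_size=256, 8 distributed workers):
train_num = int(20000000 * (1 - 0.1))  # 18,000,000 training samples
steps = int(train_num / 256)           # 70,312 steps on a single card
steps = int(steps / 8)                 # 8,789 steps per card when distributed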
def __init__(self, cfg):
    # base config
    self.logger = logging.getLogger(__name__)
    self.cfg = cfg
    self.output_dir = cfg.output_dir

    self.local_rank = dist.get_rank()
    self.log_interval = cfg.log_config.interval

    self.start_epoch = 0
    self.current_epoch = 0
    self.current_iter = 0
    self.inner_iter = 0
    self.batch_id = 0
    self.global_steps = 0
    self.timestamp = cfg.timestamp
    self.logs = OrderedDict()

    # build model
    self.model = build_model(cfg.model)
    # multiple gpus prepare
    if dist.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        self.model = DistributedDataParallel(self.model)

    # build train dataloader
    self.train_dataloader = build_dataloader(cfg.dataloader.train)
    self.iters_per_epoch = len(self.train_dataloader)

    # build lr scheduler
    self.lr_scheduler = build_lr_scheduler(cfg.lr_scheduler,
                                           self.iters_per_epoch)

    # build optimizer
    self.optimizer = build_optimizer(cfg.optimizer, self.lr_scheduler,
                                     self.model.parameters())

    # build hooks
    self.hooks = []
    self.add_train_hooks()
    self.add_custom_hooks()

    self.epochs = cfg.get('epochs', None)
    if self.epochs:
        self.total_iters = self.epochs * self.iters_per_epoch
        self.by_epoch = True
    else:
        self.by_epoch = False
        self.total_iters = cfg.total_iters
def __init__(self, cfg):
    self.batch_size = cfg.batch_size
    self.file_path = cfg.file_path

    self.seg_num = cfg.seg_num
    self.seglen = cfg.seglen
    self.short_size = cfg.short_size
    self.target_size = cfg.target_size

    # set num_shards and shard_id when distributed training is implemented
    self.num_shards = dist.get_world_size()
    self.shard_id = ParallelEnv().local_rank
    self.dali_mean = cfg.mean * (self.seg_num * self.seglen)
    self.dali_std = cfg.std * (self.seg_num * self.seglen)
def _get_size(self):
    # random_interval is 10 by default: self._input_size changes every
    # 10 iterations
    image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
    if self._step % self.random_interval == 0:
        size_factor = random.randint(*self.size_range)
        size = [
            self.size_stride * size_factor,
            self.size_stride * int(size_factor * image_ratio)
        ]
        size = paddle.to_tensor(size)
        if dist.get_world_size() > 1 and paddle_distributed_is_initialized():
            dist.barrier()
            dist.broadcast(size, 0)
        self._input_size = size
    self._step += 1
def all_gather_tokens(data):
    """Gathers num of tokens from all nodes.
    `data` should be a tensor of num of tokens.
    """
    if dist.get_world_size() < 2:
        return data
    if not hasattr(all_gather_tokens,
                   '_in_buffer') or all_gather_tokens._in_buffer is None:
        all_gather_tokens._in_buffer = data
        all_gather_tokens._out_buffers = []
    in_buffer = all_gather_tokens._in_buffer
    out_buffers = all_gather_tokens._out_buffers
    dist.all_gather(out_buffers, in_buffer)
    return paddle.add_n(out_buffers)
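# A minimal usage sketch for `all_gather_tokens` above (assumes a multi-card
# job with the parallel env initialized; `num_tokens_this_rank` is a
# hypothetical per-rank count used, e.g., for whole-job tokens/sec logging):
#
#   local_tokens = paddle.to_tensor([num_tokens_this_rank], dtype='int64')
#   total_tokens = all_gather_tokens(local_tokens)  # summed over all ranks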
def on_step_end(self, status):
    if dist.get_world_size() < 2 or dist.get_rank() == 0:
        mode = status['mode']
        if mode == 'train':
            epoch_id = status['epoch_id']
            step_id = status['step_id']
            steps_per_epoch = status['steps_per_epoch']
            training_staus = status['training_staus']
            batch_time = status['batch_time']
            data_time = status['data_time']

            epoches = self.model.cfg.epoch
            batch_size = self.model.cfg['{}Reader'.format(
                mode.capitalize())]['batch_size']

            logs = training_staus.log()
            space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd'
            if step_id % self.model.cfg.log_iter == 0:
                eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id
                eta_sec = eta_steps * batch_time.global_avg
                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
                ips = float(batch_size) / batch_time.avg
                fmt = ' '.join([
                    'Epoch: [{}]',
                    '[{' + space_fmt + '}/{}]',
                    'learning_rate: {lr:.6f}',
                    '{meters}',
                    'eta: {eta}',
                    'batch_cost: {btime}',
                    'data_cost: {dtime}',
                    'ips: {ips:.4f} images/s',
                ])
                fmt = fmt.format(epoch_id,
                                 step_id,
                                 steps_per_epoch,
                                 lr=status['learning_rate'],
                                 meters=logs,
                                 eta=eta_str,
                                 btime=str(batch_time),
                                 dtime=str(data_time),
                                 ips=ips)
                logger.info(fmt)
        if mode == 'eval':
            step_id = status['step_id']
            if step_id % 100 == 0:
                logger.info("Eval iter: {}".format(step_id))
def __init__(self, cfg, mode='train'):
    self.cfg = cfg
    assert mode.lower() in ['train', 'eval', 'test'], \
        "mode should be 'train', 'eval' or 'test'"
    self.mode = mode.lower()
    self.optimizer = None

    # init distillation config
    self.distill_model = None
    self.distill_loss = None

    # build data loader
    self.dataset = cfg['{}Dataset'.format(self.mode.capitalize())]
    if self.mode == 'train':
        self.loader = create('{}Reader'.format(self.mode.capitalize()))(
            self.dataset, cfg.worker_num)

    self.model = create(cfg.architecture)

    # normalize params for deploy
    self.model.load_meanstd(cfg['TestReader']['sample_transforms'])

    # EvalDataset build with BatchSampler to evaluate in single device
    if self.mode == 'eval':
        self._eval_batch_sampler = paddle.io.BatchSampler(
            self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
        self.loader = create('{}Reader'.format(self.mode.capitalize()))(
            self.dataset, cfg.worker_num, self._eval_batch_sampler)
    # TestDataset build after user set images, skip loader creation here

    self._nranks = dist.get_world_size()
    self._local_rank = dist.get_rank()

    self.status = {}

    self.start_epoch = 0
    self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch

    # initial default callbacks
    self._init_callbacks()

    # initial default metrics
    self._init_metrics()
    self._reset_metrics()
def create_data_loader(args):
    root = None if args.root == "None" else args.root
    (src_vocab, trg_vocab) = WMT14ende.get_vocab(root=root)
    padding_vocab = (
        lambda x: (x + args.pad_factor - 1) // args.pad_factor * args.pad_factor
    )
    args.src_vocab_size = padding_vocab(len(src_vocab))
    args.trg_vocab_size = padding_vocab(len(trg_vocab))
    transform_func = WMT14ende.get_default_transform_func(root=root)
    datasets = [
        WMT14ende.get_datasets(mode=m, root=root,
                               transform_func=transform_func)
        for m in ["train", "dev"]
    ]

    data_loaders = [(None)] * 2
    for i, dataset in enumerate(datasets):
        dataset = dataset.filter(
            partial(min_max_filer, max_len=args.max_length))
        batch_sampler = TransformerBatchSampler(
            dataset=dataset,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            use_token_batch=True,
            max_length=args.max_length,
            distribute_mode=True if i == 0 else False,
            world_size=dist.get_world_size(),
            rank=dist.get_rank())
        data_loader = DataLoader(dataset=dataset,
                                 batch_sampler=batch_sampler,
                                 collate_fn=partial(prepare_train_input,
                                                    bos_idx=args.bos_idx,
                                                    eos_idx=args.eos_idx,
                                                    pad_idx=args.bos_idx),
                                 num_workers=0,
                                 return_list=True)
        data_loaders[i] = (data_loader)
    return data_loaders