import numpy as np
import paddle

# Import paths below follow the PaddleNLP ERNIE pretraining example layout and
# may need adjusting to the actual project structure.
from paddlenlp.data import Stack
from paddlenlp.utils.batch_sampler import DistributedBatchSampler
from data_tools.dataset_utils import build_train_valid_test_datasets


def create_pretrained_dataset(
    args,
    data_file,
    tokenizer,
    data_world_size,
    data_world_rank,
    max_seq_len,
    places=None,
    data_holders=None,
    current_step=0,
):
    # Total number of samples each split must provide for the whole run.
    train_valid_test_num_samples = [
        args.global_batch_size * args.max_steps,
        args.micro_batch_size * (args.max_steps // args.eval_freq + 1) *
        args.eval_iters * data_world_size,
        args.micro_batch_size * args.test_iters * data_world_size,
    ]
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=data_file,
        args=args,
        tokenizer=tokenizer,
        splits_string=args.split,
        train_valid_test_num_samples=train_valid_test_num_samples,
        max_seq_length=args.max_seq_len,
        masked_lm_prob=args.masked_lm_prob,
        short_seq_prob=args.short_seq_prob,
        seed=args.seed,
        skip_warmup=True,
        binary_head=True,
        max_seq_length_dec=None,
        dataset_type='ernie')

    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels
        for i in (0, 1, 2, 5):
            out[i] = stack_fn([x[i] for x in data])
        out[5] = out[5].reshape([-1, 1])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[3]) for x in data)
        # masked_lm_positions
        # Organize as a 1D tensor for gather or use gather_nd
        # Pad the number of masked positions up to a multiple of 8; padded label
        # slots stay -1 so they can be ignored downstream.
        if size % 8 != 0:
            size += 8 - (size % 8)
        out[3] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[4] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[3]):
                # Flatten (sample index, position) into a single offset over the
                # [batch_size * seq_length] axis.
                out[3][mask_token_num] = i * seq_length + pos
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1
        return out

    def loader(dataset, consumed_samples=0):
        # consumed_samples lets the sampler skip data already seen before a restart.
        batch_sampler = DistributedBatchSampler(
            dataset,
            batch_size=args.micro_batch_size,
            num_replicas=data_world_size,
            rank=data_world_rank,
            shuffle=False,
            drop_last=True,
            consumed_samples=consumed_samples)
        data_loader = paddle.io.DataLoader(
            dataset=dataset,
            batch_sampler=batch_sampler,
            num_workers=args.num_workers,
            worker_init_fn=None,
            collate_fn=_collate_data,
            return_list=False)
        return data_loader

    train_dl = loader(train_ds, args.global_batch_size * current_step)
    valid_dl = loader(
        valid_ds,
        args.micro_batch_size * ((current_step + 1) // args.eval_freq) *
        args.eval_iters * data_world_size)
    test_dl = loader(test_ds, 0)

    return train_dl, valid_dl, test_dl
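# A minimal sketch (not part of the original script) of why masked positions are
# flattened to offsets over [batch_size * seq_length]: after reshaping the model's
# sequence output to 2D, a single 1D gather picks out the masked tokens. Shapes
# and values here are made up purely for illustration.
import numpy as np

seq_length, hidden = 4, 3
masked_positions_per_sample = [
    [1, 3],   # sample 0: tokens 1 and 3 are masked
    [2],      # sample 1: token 2 is masked
]
flat_positions = np.array(
    [i * seq_length + pos
     for i, positions in enumerate(masked_positions_per_sample)
     for pos in positions],
    dtype=np.int32)                       # -> [1, 3, 6]

sequence_output = np.arange(2 * seq_length * hidden, dtype=np.float32).reshape(
    [2, seq_length, hidden])              # fake [batch, seq, hidden] activations
masked_output = sequence_output.reshape([-1, hidden])[flat_positions]
print(masked_output.shape)                # (3, hidden): one row per masked token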
def create_pretrained_dataset(data_args, training_args, data_file, tokenizer):
    # Sample budgets per split, now derived from Trainer-style arguments
    # (per-device batch sizes, world size and gradient accumulation).
    train_valid_test_num_samples = [
        training_args.per_device_train_batch_size * training_args.world_size *
        training_args.max_steps * training_args.gradient_accumulation_steps,
        training_args.per_device_eval_batch_size * training_args.world_size *
        training_args.eval_iters *
        (training_args.max_steps // training_args.eval_steps + 1),
        training_args.per_device_eval_batch_size * training_args.world_size *
        training_args.test_iters,
    ]
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=data_file,
        args=data_args,
        tokenizer=tokenizer,
        splits_string=data_args.split,
        train_valid_test_num_samples=train_valid_test_num_samples,
        max_seq_length=data_args.max_seq_length,
        masked_lm_prob=data_args.masked_lm_prob,
        short_seq_prob=data_args.short_seq_prob,
        seed=training_args.seed,
        skip_warmup=True,
        binary_head=True,
        max_seq_length_dec=None,
        dataset_type='ernie')

    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels
        for i in (0, 1, 2, 5):
            out[i] = stack_fn([x[i] for x in data])
        out[5] = out[5].reshape([-1, 1])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[3]) for x in data)
        # masked_lm_positions
        # Organize as a 1D tensor for gather or use gather_nd
        if size % 8 != 0:
            size += 8 - (size % 8)
        out[3] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[4] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[3]):
                out[3][mask_token_num] = i * seq_length + pos
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1
        # Return a keyword batch so the Trainer can feed the model by argument name.
        return {
            "input_ids": out[0],
            "token_type_ids": out[1],
            "attention_mask": out[2],
            "masked_positions": out[3],
            "labels": (out[4], out[5]),
        }

    # The Trainer builds its own samplers and DataLoaders, so only the datasets
    # and the collate function are returned here.
    return train_ds, valid_ds, test_ds, _collate_data
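# A rough sketch of how the refactored return values could be wired into
# PaddleNLP's Trainer. `model`, `data_args`, `training_args`, `data_file` and
# `tokenizer` are assumed to be defined by the surrounding script, and the
# keyword names follow the HF-style Trainer API.
from paddlenlp.trainer import Trainer

train_ds, valid_ds, test_ds, data_collator = create_pretrained_dataset(
    data_args, training_args, data_file, tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,   # the _collate_data closure defined above
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
)
trainer.train()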