def make_loaders(args): """makes training/val/test""" if args.use_tfrecords: return make_tfrecord_loaders(args) world_size = torch.distributed.get_world_size( group=mpu.get_data_parallel_group()) batch_size = args.batch_size * world_size eval_batch_size = batch_size if args.eval_batch_size is not None: eval_batch_size = args.eval_batch_size * world_size seq_length = args.seq_length if seq_length < 0: seq_length = seq_length * world_size eval_seq_length = args.eval_seq_length if eval_seq_length is not None and eval_seq_length < 0: eval_seq_length = eval_seq_length * world_size split = get_split(args) data_set_args = { 'local_rank': args.local_rank, 'path': args.train_data, 'seq_length': seq_length, 'mem_length': args.mem_length, 'lazy': args.lazy_loader, 'xl_style': args.transformer_xl, 'delim': args.delim, 'text_key': args.text_key, 'label_key': 'label', 'non_binary_cols': None, 'ds_type': args.data_set_type, 'split': split, 'loose': args.loose_json, 'tokenizer_type': args.tokenizer_type, 'tokenizer_model_path': args.tokenizer_path, 'vocab_size': args.vocab_size, 'model_type': args.tokenizer_model_type, 'cache_dir': args.cache_dir, 'max_preds_per_seq': args.max_preds_per_seq, 'presplit_sentences': args.presplit_sentences, 'sample_one_document': args.sample_one_document, 'pre_tokenize': not args.not_pre_tokenize } eval_set_args = copy.copy(data_set_args) eval_set_args['split'] = [1.] # if optional eval args were set then replace their # equivalent values in the arg dict if eval_seq_length: eval_set_args['seq_length'] = eval_seq_length if args.eval_max_preds_per_seq: eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq if args.eval_text_key is not None: eval_set_args['text_key'] = args.eval_text_key # make datasets splits and tokenizer train = None valid = None test = None if args.train_data is not None: train, tokenizer = data_utils.make_dataset(**data_set_args) if data_utils.should_split(split): train, valid, test = train eval_set_args['tokenizer'] = tokenizer # make training and val dataset if necessary if valid is None and args.valid_data is not None: eval_set_args['path'] = args.valid_data valid, tokenizer = data_utils.make_dataset(**eval_set_args) eval_set_args['tokenizer'] = tokenizer if test is None and args.test_data is not None: eval_set_args['path'] = args.test_data test, tokenizer = data_utils.make_dataset(**eval_set_args) # wrap datasets with data loader if train is not None and args.batch_size > 0: train = make_data_loader(train, batch_size, args) args.do_train = True else: args.do_train = False eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size if valid is not None: valid = make_data_loader(valid, eval_batch_size, args) args.do_valid = True else: args.do_valid = False if test is not None: test = make_data_loader(test, eval_batch_size, args) args.do_test = True else: args.do_test = False return (train, valid, test), tokenizer
def make_loaders(args, tokenizer):
    """Makes training/val/test data loaders."""
    if args.use_tfrecords:
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    if args.loader_scatter is not None:
        assert world_size % args.loader_scatter == 0
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'mem_length': args.mem_length,
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'sample_one_document': args.sample_one_document,
        'filter_english': args.filter_english,
        'pre_tokenize': not args.no_pre_tokenize,
        'tokenizer': tokenizer,
        'save_splits': args.save_splits,
        'load_splits': args.load_splits,
        'save_test_data': args.save_test_data,
        'no_lazy_loader': args.no_lazy_loader,
        'loader_scatter': args.loader_scatter,
        'data_parallel_rank': mpu.get_data_parallel_rank(),
        'non_sentence_start': args.non_sentence_start,
        'half_lazy_loader': args.half_lazy_loader
    }
    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make dataset splits and tokenizer
    train, valid, test = None, None, None

    if args.train_data is not None:
        train = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make validation and test datasets if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loaders
    use_block = args.block_lm or args.encoder_decoder

    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, tokenizer, batch_size, args.train_iters, args,
                                 shuffle=args.shuffle, block_collate=use_block)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, tokenizer, eval_batch_size, args.train_iters, args,
                                 shuffle=args.shuffle, block_collate=use_block)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, tokenizer, eval_batch_size, len(test) // eval_batch_size + 1, args,
                                shuffle=args.shuffle, block_collate=use_block)
        args.do_test = True
    else:
        args.do_test = False

    return train, valid, test
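# Usage note (hypothetical call sites, shown for comparison only): this variant
# differs from the other two in tokenizer ownership. It expects a tokenizer
# built by the caller and returns only the loaders, whereas the first and third
# variants build the tokenizer inside data_utils.make_dataset and return it
# alongside the loaders:
#
#     train, valid, test = make_loaders(args, tokenizer)       # this variant
#     (train, valid, test), tokenizer = make_loaders(args)     # the other variants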
def make_loaders(args): """makes training/val/test""" if args.use_tfrecords: return make_tfrecord_loaders(args) batch_size = args.batch_size * args.world_size eval_batch_size = batch_size if args.eval_batch_size is not None: eval_batch_size = args.eval_batch_size * args.world_size seq_length = args.seq_length if seq_length < 0: seq_length = seq_length * args.world_size eval_seq_length = args.eval_seq_length if eval_seq_length is not None and eval_seq_length < 0: eval_seq_length = eval_seq_length * args.world_size split = get_split(args) data_set_args = { 'path': args.train_data, 'seq_length': seq_length, 'lazy': args.lazy_loader, 'delim': args.delim, 'text_key': args.text_key, 'label_key': 'label', 'non_binary_cols': None, 'ds_type': args.data_set_type, 'split': split, 'loose': args.loose_json, 'tokenizer_type': args.tokenizer_type, 'tokenizer_model_path': args.tokenizer_path, 'vocab_size': args.vocab_size, 'model_type': args.tokenizer_model_type, 'cache_dir': args.cache_dir, 'max_preds_per_seq': args.max_preds_per_seq } eval_set_args = copy.copy(data_set_args) eval_set_args['split'] = [1.] # if optional eval args were set then replace their # equivalent values in the arg dict if eval_seq_length: eval_set_args['seq_length'] = eval_seq_length if args.eval_max_preds_per_seq: eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq if args.eval_text_key is not None: eval_set_args['text_key'] = args.eval_text_key # make datasets splits and tokenizer train = None valid = None test = None if args.train_data is not None: train, tokenizer = data_utils.make_dataset(**data_set_args) if data_utils.should_split(split): train, valid, test = train eval_set_args['tokenizer'] = tokenizer # make training and val dataset if necessary if valid is None and args.valid_data is not None: eval_set_args['path'] = args.valid_data valid, _ = data_utils.make_dataset(**eval_set_args) if test is None and args.test_data is not None: eval_set_args['path'] = args.test_data test, _ = data_utils.make_dataset(**eval_set_args) # wrap datasets with data loader if train is not None and args.batch_size > 0: train = make_data_loader(train, batch_size, args) eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size if valid is not None: valid = make_data_loader(valid, eval_batch_size, args) if test is not None: test = make_data_loader(test, eval_batch_size, args) return (train, valid, test), tokenizer