args.device = device
# logger.info("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
#             args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

# -------------------------------------------------------------------------------------------
# init tokenizer & Converter
logger.info("start setting tokenizer, dataset and dataloader (local_rank = {})... ".format(args.local_rank))
tokenizer = tokenizer_class[args.pretrain_model_type].from_pretrained(args.cache_dir)

# -------------------------------------------------------------------------------------------
# select the train/test collate (batchify) functions for this model class
batchify_features_for_train, batchify_features_for_test = dataloader.get_class(args.model_class)

# -------------------------------------------------------------------------------------------
# build dev dataloader
dev_dataset = dataloader.build_dataset(**{'args': args, 'tokenizer': tokenizer, 'mode': 'dev'})
args.test_batch_size = args.per_gpu_test_batch_size * max(1, args.n_gpu)
dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
dev_data_loader = torch.utils.data.DataLoader(
    dev_dataset,
    batch_size=args.test_batch_size,
    sampler=dev_sampler,
    num_workers=args.data_workers,
    collate_fn=batchify_features_for_test,
    pin_memory=args.cuda,
)

# -------------------------------------------------------------------------------------------
# build eval dataloader (kp20k only)
if args.dataset_class == 'kp20k':
    eval_dataset = dataloader.build_dataset(**{'args': args, 'tokenizer': tokenizer, 'mode': 'eval'})
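# -------------------------------------------------------------------------------------------
# NOTE: illustrative sketch only, not used by this script. `batchify_features_for_test`
# above is resolved by dataloader.get_class(args.model_class); a minimal collate_fn for the
# SequentialSampler-driven dev loader typically pads variable-length features into dense
# tensors, along these lines. The 'input_ids' field name is an assumption for illustration,
# not necessarily the repo's actual feature schema.
def _example_batchify_for_test(batch):
    # Pad every example in the batch to the length of the longest sequence.
    max_len = max(len(ex['input_ids']) for ex in batch)
    input_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    attention_mask = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, ex in enumerate(batch):
        ids = torch.tensor(ex['input_ids'], dtype=torch.long)
        input_ids[i, : len(ids)] = ids
        attention_mask[i, : len(ids)] = 1
    return input_ids, attention_mask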
# -------------------------------------------------------------------------------------------
# build train dataloader
train_dataset = dataloader.build_dataset(**{'args': args, 'tokenizer': tokenizer, 'mode': 'train'})
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = (torch.utils.data.sampler.RandomSampler(train_dataset)
                 if args.local_rank == -1 else DistributedSampler(train_dataset))
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.train_batch_size,
    sampler=train_sampler,
    num_workers=args.data_workers,
    collate_fn=batchify_features_for_train,
    pin_memory=args.cuda,
)
logger.info("Successfully preprocessed training features!")
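# -------------------------------------------------------------------------------------------
# NOTE: usage sketch only, not part of the original script. When args.local_rank != -1, the
# DistributedSampler above shards train_dataset across processes, and its shuffle order is
# derived from the epoch number, so the training loop should call set_epoch() before each
# epoch to get a fresh ordering. `num_epochs` / `is_distributed` below are hypothetical
# stand-ins for the script's real arguments.
def _example_epoch_loop(train_data_loader, train_sampler, num_epochs, is_distributed):
    for epoch in range(num_epochs):
        if is_distributed:
            # Reseed the sampler so each epoch sees a different shard ordering.
            train_sampler.set_epoch(epoch)
        for batch in train_data_loader:
            pass  # forward / backward / optimizer step would run here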