def load_and_cache_examples(args, task, tokenizer, data_type='train'):
    """Load the features of one data split and wrap them in a ``TensorDataset``.

    Features are read from a cache file under ``args.data_dir`` when it exists
    and ``args.overwrite_cache`` is false; otherwise they are built with
    ``convert_examples_to_features`` and, on rank -1/0, saved to that cache
    file.

    Args:
        args: Run configuration. Reads ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``model_type``, ``train_max_seq_length``,
            ``eval_max_seq_length`` and ``overwrite_cache``.
        task: Key into the module-level ``processors`` mapping.
        tokenizer: Tokenizer providing ``cls_token``, ``sep_token``,
            ``pad_token`` and ``convert_tokens_to_ids``.
        data_type: ``'train'``, ``'dev'``, or anything else (treated as the
            test split).

    Returns:
        A ``TensorDataset`` of ``(input_ids, input_mask, segment_ids, lens,
        label_ids)``, all ``torch.long``.
    """
    # The barrier pair must be keyed on something this function actually
    # receives; the split name is the only such signal.
    # NOTE(review): assumes only the training split is loaded by every rank,
    # so dev/test loading skips the barriers -- confirm against callers.
    is_training = data_type == 'train'
    if args.local_rank not in [-1, 0] and is_training:
        # Non-zero ranks wait here until rank 0 has written the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    max_seq_length = args.train_max_seq_length if is_training else args.eval_max_seq_length
    is_xlnet = args.model_type in ["xlnet"]

    # Load data features from cache or dataset file
    model_name = list(filter(None, args.model_name_or_path.split('/'))).pop()
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_soft-{}_{}_{}_{}'.format(data_type, model_name, str(max_seq_length), str(task)))

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if data_type == 'train':
            examples = processor.get_train_examples(args.data_dir)
        elif data_type == 'dev':
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_test_examples(args.data_dir)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            label_list=label_list,
            max_seq_length=max_seq_length,
            cls_token_at_end=is_xlnet,
            # pad on the left for xlnet
            pad_on_left=is_xlnet,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and is_training:
        # Rank 0 joins the barrier last, releasing the ranks waiting above.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lens, all_label_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, lines, data_type='train'):
    """Build a ``TensorDataset`` from in-memory ``lines``; no cache file is used.

    Args:
        args: Run configuration. Reads ``local_rank``, ``model_type``,
            ``train_max_seq_length`` and ``eval_max_seq_length``.
        task: Key into the module-level ``processors`` mapping.
        tokenizer: Tokenizer providing ``cls_token``, ``sep_token``,
            ``pad_token`` and ``convert_tokens_to_ids``.
        lines: Raw input passed unchanged to
            ``processor.get_predict_examples``.
        data_type: ``'train'`` selects ``args.train_max_seq_length``; any
            other value selects ``args.eval_max_seq_length``.

    Returns:
        A ``TensorDataset`` of ``(input_ids, input_mask, segment_ids, lens,
        label_ids)``, all ``torch.long``.
    """
    if args.local_rank not in [-1, 0]:
        # Non-zero ranks wait until rank 0 reaches the matching barrier below.
        torch.distributed.barrier()

    processor = processors[task]()
    label_list = processor.get_labels()
    examples = processor.get_predict_examples(lines)

    is_xlnet = args.model_type in ["xlnet"]
    if data_type == 'train':
        max_seq_length = args.train_max_seq_length
    else:
        max_seq_length = args.eval_max_seq_length

    features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        label_list=label_list,
        max_seq_length=max_seq_length,
        cls_token_at_end=is_xlnet,
        # pad on the left for xlnet
        pad_on_left=is_xlnet,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if is_xlnet else 0,
        sep_token=tokenizer.sep_token,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if is_xlnet else 0,
    )

    if args.local_rank == 0:
        # A barrier is a collective: every rank must enter it. Without this
        # call the ranks blocked at the top of the function are never released
        # by this function.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lens, all_label_ids)
    return dataset
def load_and_cache_examples(args, task_name, tokenizer, data_type="train"):
    """Load the features of one data split and wrap them in a ``TensorDataset``.

    Features are read from a cache file under ``args.data_dir`` when it exists
    and ``args.overwrite_cache`` is false; otherwise they are built with
    ``convert_examples_to_features`` and, on rank -1/0, saved to that cache
    file.

    Args:
        args: Run configuration. Reads ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``model_type``, ``train_max_seq_length``,
            ``eval_max_seq_length`` and ``overwrite_cache``.
        task_name: Key into the module-level ``processors`` mapping.
        tokenizer: Tokenizer providing ``cls_token``, ``sep_token``,
            ``pad_token`` and ``convert_tokens_to_ids``.
        data_type: ``"train"``, ``"dev"``, or anything else (treated as the
            test split).

    Returns:
        A ``TensorDataset`` of ``(input_ids, input_masks, segment_ids, lens,
        label_ids)``, all ``torch.long``.
    """
    # The barrier pair must be keyed on something this function actually
    # receives; the split name is the only such signal.
    # NOTE(review): assumes only the training split is loaded by every rank,
    # so dev/test loading skips the barriers -- confirm against callers.
    is_training = data_type == "train"
    if args.local_rank not in [-1, 0] and is_training:
        # Non-zero ranks wait here until rank 0 has written the cache.
        torch.distributed.barrier()

    processor = processors[task_name]()
    max_seq_length = args.train_max_seq_length if is_training else args.eval_max_seq_length
    is_xlnet = args.model_type in ["xlnet"]

    # load data features from cache or datafile
    model_name = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        args.data_dir,
        "cache_crf-{}_{}_{}_{}".format(data_type, model_name, str(max_seq_length), str(task_name)))

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cache file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        # Features are built from the raw files in data_dir, so report that
        # directory rather than the (not yet existing) cache path.
        logger.info("Creating features from dateset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if data_type == "train":
            examples = processor.get_train_examples(args.data_dir)
        elif data_type == "dev":
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_test_examples(args.data_dir)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            label_list=label_list,
            max_seq_length=max_seq_length,
            cls_token_at_end=is_xlnet,
            # pad on left for xlnet
            pad_on_left=is_xlnet,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("save features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and is_training:
        # Rank 0 joins the barrier last, releasing the ranks waiting above.
        torch.distributed.barrier()

    # convert to tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_masks = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    # NOTE(review): this reads `segment_id` (singular); confirm the attribute
    # name against the feature class produced by convert_examples_to_features.
    all_segment_ids = torch.tensor([f.segment_id for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_lens, all_label_ids)
    return dataset