def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    label_list = processor.get_labels()
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
            # HACK: label indices are swapped in the RoBERTa pretrained model.
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for XLNet
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,  # segment id used for padding positions
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # The first process has built the cache; release the other processes.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
    return dataset, label_list
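
# Usage sketch (illustrative, not part of the original script): how the loader
# above is typically consumed for evaluation. The `loader` default argument is
# bound at definition time, so this helper keeps pointing at the GLUE-style
# variant above even though the same name is redefined below. The helper name
# and `args.eval_batch_size` are assumptions, not names from the original code.
def build_eval_dataloader(args, task, tokenizer, loader=load_and_cache_examples):
    from torch.utils.data import DataLoader, SequentialSampler

    eval_dataset, label_list = loader(args, task, tokenizer, evaluate=True)
    # A sequential sampler keeps batch order aligned with the stored pairIDs.
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    return eval_dataloader, label_list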

def load_and_cache_examples(data_dir, max_seq_length, model_type, model_name_or_path, tokenizer, overwrite_cache):
    processor = HansProcessor()
    # Load data features from cache or dataset file. HANS ships only an
    # evaluation set, so the cache is always keyed as "dev".
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev",
            list(filter(None, model_name_or_path.split("/"))).pop(),
            str(max_seq_length),
            "hans",
        ),
    )
    label_list = processor.get_labels()
    if os.path.exists(cached_features_file) and not overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        if model_type in ["roberta"]:
            # HACK: label indices are swapped in the RoBERTa pretrained model.
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = processor.get_dev_examples(data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=max_seq_length,
            output_mode="classification",
            pad_on_left=bool(model_type in ["xlnet"]),  # pad on the left for XLNet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if model_type in ["xlnet"] else 0,  # XLNet pads segments with id 4
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
    return dataset, label_list
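
# Usage sketch (illustrative, not part of the original script): running a model
# over the HANS dataset built above and writing one "pairID,gold_label" line per
# example, the format expected by HANS's evaluate_heur_output.py script.
# `model`, `device`, the batch size, and the output file name are assumptions.
def write_hans_predictions(model, dataset, label_list, device, batch_size=32, out_file="hans_predictions.txt"):
    from torch.utils.data import DataLoader, SequentialSampler

    model.eval()
    preds, pair_ids = [], []
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
    for batch in dataloader:
        input_ids, attention_mask, token_type_ids, _labels, batch_pair_ids = (t.to(device) for t in batch)
        with torch.no_grad():
            # Older transformers models return a tuple with the logits first
            # when no labels are passed.
            logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]
        preds.extend(logits.argmax(dim=-1).tolist())
        pair_ids.extend(batch_pair_ids.tolist())

    with open(out_file, "w") as writer:
        writer.write("pairID,gold_label\n")
        for pid, pred in zip(pair_ids, preds):
            # pairIDs were stored as ints above; HANS expects them prefixed with "ex".
            writer.write("ex{},{}\n".format(pid, label_list[int(pred)]))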