def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False, meta_training=False, sampled_keys=None):
    """Load (or build and pickle-cache) SQuAD-style QA features and wrap them in TensorDataset(s).

    Reads examples from ``args.train_file`` / ``args.predict_file``, converts them to
    model features, caches both to pickle files next to the input file, and returns:

    * ``meta_training=False``: a single ``TensorDataset`` (eval datasets additionally
      carry an example-index tensor instead of answer positions);
    * ``meta_training=True`` (and not evaluating): a list of two ``TensorDataset``s,
      ``[train_split, test_split]``.

    Args:
        args: argparse namespace; fields read here include ``predict_file``, ``train_file``,
            ``model_type``, ``qa_dataset``, ``max_seq_length``, ``overwrite_cache``,
            ``version_2_with_negative``, ``doc_stride``, ``max_query_length``,
            ``local_rank``, ``matching_sample``, ``truncate_task_dataset``, ``fix_task``.
        tokenizer: tokenizer providing ``cls_token``, ``sep_token``, ``pad_token`` and
            ``convert_tokens_to_ids``.
        evaluate: if True, use the predict file and build an evaluation dataset.
        output_examples: if True, also return the raw examples and features.
        meta_training: if True, return a [train, test] pair of datasets instead of one.
        sampled_keys: context-id keys used to pick the train subset when
            ``args.matching_sample`` is set.

    Returns:
        ``dataset`` or, when ``output_examples`` is True, ``(dataset, examples, features)``.

    NOTE(review): a second function with this same name appears later in this file; if
    both live in the same module, the later ``def`` shadows this one at import time —
    confirm this is intentional (e.g. one of them is meant to be removed/renamed).
    """
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    model_name = args.model_type
    # Cache file name encodes dataset, split, model type and sequence length so
    # different configurations do not collide.
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        'cached_{}_{}_{}_{}'.format(args.qa_dataset, 'dev' if evaluate else 'train',
                                    model_name, str(args.max_seq_length)))
    print("Load %s" % cached_features_file)
    cached_examples_file = os.path.join(
        os.path.dirname(input_file),
        "examples_{}_valid.pkl".format(args.qa_dataset))
    # Branch 1: features cached and caller does not need raw examples.
    # NOTE(review): pickle.load on these cache files assumes the cache directory is
    # trusted; do not point it at untrusted data.
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as f:
            features = pickle.load(f)
    # Branch 2: both features and examples are cached.
    # NOTE(review): unlike branch 1, this branch ignores args.overwrite_cache — with
    # overwrite_cache set and output_examples requested, stale caches are still used.
    # Confirm whether that is intended.
    elif os.path.exists(cached_features_file) and output_examples and os.path.exists(cached_examples_file):
        logger.info("Loading features and examples from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as f:
            features = pickle.load(f)
        with open(cached_examples_file, 'rb') as f:
            examples = pickle.load(f)
    # Branch 3: build everything from the raw input file.
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])
        if not evaluate:
            # For training, regroup the flat feature list into a dict keyed by
            # context id (as a string), so features from the same context stay together.
            # Assumes examples are ordered by context_id so examples[-1] holds the max id
            # — TODO confirm against read_squad_examples.
            max_id = examples[-1].context_id
            tmp_features = {str(i): [] for i in range(max_id + 1)}
            for feature in features:
                context_id = feature.context_id
                tmp_features[str(context_id)].append(feature)
            features = tmp_features
        # Only rank -1 (non-distributed) or rank 0 writes the caches.
        if evaluate and args.local_rank in [-1, 0]:
            logger.info("Dumping evaluation examples to cached file %s", cached_examples_file)
            with open(cached_examples_file, 'wb') as f:
                pickle.dump(examples, f)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            # torch.save(features, cached_features_file)
            with open(cached_features_file, 'wb') as f:
                pickle.dump(features, f)
    if not evaluate and args.matching_sample:
        # Training subset selected by externally-supplied context keys: concatenate
        # the per-context feature lists for every sampled key.
        train_features = []
        for key in sampled_keys:
            feature = features[key]
            train_features += feature
        # Negative truncate_task_dataset means "no truncation".
        train_feature_length = len(train_features) if args.truncate_task_dataset < 0 \
            else min(args.truncate_task_dataset, len(train_features))
        # random.sample over all (key, feature_list) pairs == a shuffled copy.
        extracted_features = random.sample(features.items(), len(features))
        if args.fix_task > 0:
            # Use (or create once and cache) a fixed validation feature set of size
            # args.fix_task, so the valid task stays identical across runs.
            print("Load Fixed Valid Task...")
            feature_length = args.fix_task
            valid_features_file = cached_features_file + "_valid_" + str(feature_length)
            if os.path.exists(valid_features_file):
                with open(valid_features_file, 'rb') as f:
                    test_features = pickle.load(f)
            else:
                sub_features = []
                for feature in features.items():
                    sub_features += feature[1]
                sub_len = len(sub_features)
                feature_length = sub_len if args.truncate_task_dataset < 0 \
                    else min(sub_len, feature_length)
                test_features = random.sample(sub_features, feature_length)
                with open(valid_features_file, 'wb') as f:
                    pickle.dump(test_features, f)
        else:
            # Otherwise draw a test pool from shuffled contexts until we have at
            # least train_feature_length features, then shuffle and trim to size.
            test_features = []
            count = 0
            for feature in extracted_features:
                if count >= train_feature_length:
                    break
                test_features += feature[1]
                count += len(feature[1])
            test_features = random.sample(test_features, len(test_features))
            test_features = test_features[:train_feature_length]
        if args.truncate_task_dataset > 0:
            # Subsample the train side down to the truncated length as well.
            train_features = random.sample(train_features, train_feature_length)
        print('Extracted train features --> %s' % len(train_features))
        print('Extracted test features --> %s' % len(test_features))
        if not meta_training:
            features = train_features
    elif not evaluate:
        # Plain training: flatten the per-context dict back into a single list.
        all_features = []
        for idx, feature in features.items():
            all_features += feature
        features = all_features
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    if not meta_training:
        # Convert to Tensors and build dataset
        if args.truncate_task_dataset > 0 and not evaluate:
            features = features[:args.truncate_task_dataset]
        print('Extracted features --> %s' % len(features))
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        if evaluate:
            # Eval datasets carry the example index (for mapping predictions back)
            # instead of gold start/end positions.
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            all_start_positions = torch.tensor(
                [f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor(
                [f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)
    else:
        # Meta-training: produce a [train, test] pair of datasets.
        if not evaluate and not args.matching_sample:
            # No externally-sampled split available — make an 80/20 split here.
            if args.truncate_task_dataset > 0 and not evaluate:
                random.shuffle(features)
                features = features[:args.truncate_task_dataset]
            train_len = int(0.8 * len(features))
            test_features = features[train_len:]
            train_features = features[:train_len]
            print('Extracted train features --> %s' % len(train_features))
            print('Extracted test features --> %s' % len(test_features))
        dataset = []
        for features in [train_features, test_features]:
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
            all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
            all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
            all_start_positions = torch.tensor(
                [f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor(
                [f.end_position for f in features], dtype=torch.long)
            dataset_ = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                     all_start_positions, all_end_positions,
                                     all_cls_index, all_p_mask)
            dataset.append(dataset_)
    if output_examples:
        # NOTE(review): `examples` is only bound in branches 2 and 3 above; with
        # output_examples=True and only the features cache present, this raises
        # NameError — presumably callers always have the examples cache. Verify.
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False, meta_training=False, sampled_keys=None):
    """Load (or build and pickle-cache) classification features for ``task`` and build TensorDataset(s).

    GLUE-style counterpart of the QA loader above: uses a task ``processor`` to read
    examples from ``args.data_dir``, converts them to features, caches the features as
    a pickle, and returns either one ``TensorDataset`` (``meta_training=False``) or a
    ``[train, test]`` list of datasets (``meta_training=True``).

    Args:
        args: argparse namespace; fields read here include ``local_rank``, ``model_type``,
            ``data_dir``, ``max_seq_length``, ``task``, ``matching_sample``,
            ``truncate_task_dataset``.
        task: task name used to pick the processor, output mode and cache file name.
        tokenizer: tokenizer providing ``cls_token``, ``sep_token``, ``pad_token`` and
            ``convert_tokens_to_ids``.
        evaluate: if True, load the dev split and skip the train/test splitting logic.
        meta_training: if True, return ``[train_dataset, test_dataset]``.
        sampled_keys: integer-convertible indices selecting the train subset when
            ``args.matching_sample`` is set.

    Returns:
        A ``TensorDataset`` or a list of two ``TensorDataset``s (see above).

    NOTE(review): this definition has the same name as the QA loader earlier in the
    file but a different signature (extra ``task``, no ``output_examples``); if both
    are in the same module this one shadows the other — confirm intended.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    model_name = args.model_type
    cached_features_file = os.path.join(args.data_dir,
                                        'cached_{}_{}_{}_{}'.format(
                                            'dev' if evaluate else 'train', model_name,
                                            str(args.max_seq_length), str(task)))
    print("Load {}".format(cached_features_file))
    # NOTE(review): unlike the QA loader, there is no overwrite_cache escape hatch here;
    # an existing cache is always used. pickle.load assumes the cache dir is trusted.
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, 'rb') as f:
            features = pickle.load(f)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        # NOTE(review): sep_token=False for the "chemical" task looks deliberate
        # (disable the separator entirely?) but is unusual — confirm against
        # convert_examples_to_features' handling of a falsy sep_token.
        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=False if args.task == "chemical" else tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ['roberta']),  # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        # Only rank -1 (non-distributed) or rank 0 writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, 'wb') as f:
                pickle.dump(features, f)
    if not evaluate and args.matching_sample:
        # Training subset chosen by externally-supplied indices (deduplicated).
        train_features = []
        sampled_keys = list(set(sampled_keys))
        for key in sampled_keys:
            feature = features[int(key)]
            train_features.append(feature)
        # Negative truncate_task_dataset means "no truncation".
        train_feature_length = len(train_features) if args.truncate_task_dataset < 0 \
            else min(args.truncate_task_dataset, len(train_features))
        # random.sample of the whole list == a shuffled copy.
        extracted_features = random.sample(features, len(features))
        # Take the first train_feature_length shuffled features as the test pool.
        # NOTE(review): nothing excludes sampled train features from this pool, so
        # train and test may overlap — confirm whether that is acceptable here.
        test_features = []
        count = 0
        for feature in extracted_features:
            if count >= train_feature_length:
                break
            test_features.append(feature)
            count += 1
        test_features = random.sample(test_features, len(test_features))
        test_features = test_features[:train_feature_length]
        if args.truncate_task_dataset > 0:
            train_features = random.sample(train_features, train_feature_length)
        print('Extracted train features --> %s' % len(train_features))
        print('Extracted test features --> %s' % len(test_features))
        if not meta_training:
            features = train_features
    elif not evaluate:
        # Plain training: copy the feature list, optionally truncate, then 80/20 split.
        all_features = []
        for feature in features:
            all_features.append(feature)
        features = all_features
        if args.truncate_task_dataset > 0:
            features = features[:args.truncate_task_dataset]
        train_feature_length = int(0.8 * len(features))
        train_features = features[:train_feature_length]
        test_features = features[train_feature_length:]
        print('Extracted train features --> %s' % len(train_features))
        print('Extracted test features --> %s' % len(test_features))
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    if not meta_training:
        if args.truncate_task_dataset > 0 and not evaluate:
            features = features[:args.truncate_task_dataset]
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    else:
        # Meta-training: one TensorDataset per split.
        # NOTE(review): with meta_training=True and evaluate=True, train_features /
        # test_features are never assigned above — this would raise NameError.
        # Presumably callers never use that combination; verify.
        dataset = []
        for features in [train_features, test_features]:
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
            dataset_ = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            dataset.append(dataset_)
    return dataset