def load_squad_for_bert(self):
    if args['datasetsize'] == '1.1':
        self.basedir = '../MR/SQUAD/'
        self.corpus_file_train = self.basedir + 'train-v1.1.json'
        self.corpus_file_dev = self.basedir + 'dev-v1.1.json'
        self.data_dump_path = args['rootDir'] + '/SQUAD_1_bert.pkl'
        # self.vocfile = args['rootDir'] + '/voc_squad_1.txt'
        self.processor = SquadV1Processor()
    elif args['datasetsize'] == '2.0':
        self.basedir = '../MR/SQUAD/'
        self.corpus_file_train = self.basedir + 'train-v2.0.json'
        # The v2.0 branch previously assigned self.corpus_file_test, leaving
        # corpus_file_dev undefined; both branches now set corpus_file_dev.
        self.corpus_file_dev = self.basedir + 'dev-v2.0.json'
        self.data_dump_path = args['rootDir'] + '/SQUAD_2_bert.pkl'
        # self.vocfile = args['rootDir'] + '/voc_squad_2.txt'
        self.processor = SquadV2Processor()

    datasetExist = os.path.isfile(self.data_dump_path)
    if not datasetExist:
        datasets = {'train': {}, 'dev': {}, 'test': {}}

        # Use the filenames selected above instead of hardcoding the v1.1
        # files, so the '2.0' setting actually loads the v2.0 data.
        examples = self.processor.get_train_examples(
            self.basedir, filename=os.path.basename(self.corpus_file_train))
        features, data = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=True,
            return_dataset="pt",
            threads=1,
        )
        datasets['train']['dataset'] = data
        datasets['train']['features'] = features
        datasets['train']['examples'] = examples

        examples_dev = self.processor.get_dev_examples(
            self.basedir, filename=os.path.basename(self.corpus_file_dev))
        features_dev, data_dev = squad_convert_examples_to_features(
            examples=examples_dev,
            tokenizer=self.tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )
        datasets['dev']['dataset'] = data_dev
        datasets['dev']['features'] = features_dev
        datasets['dev']['examples'] = examples_dev

        print('Saving dataset...')
        self.saveDataset(self.data_dump_path, datasets, dataonly=True)  # Saving tf samples
    else:
        datasets = self.loadDataset(self.data_dump_path, dataonly=True)
        # print('train size:\t', len(dataset['train']))
        # print('test size:\t', len(dataset['test']))
        print('loaded')
    return datasets
def load_and_cache_examples(tokenizer, is_training=True, max_seq_length=384):
    # Load data features from cache or dataset file
    cached_features_file = "cached_{}".format("train" if is_training else "dev")

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        print("Loading features from cached file ", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        print("Creating features from dataset file")
        if is_training:
            examples = SquadV1Processor().get_train_examples('')
        else:
            examples = SquadV1Processor().get_dev_examples('')
        # max_seq_length was previously referenced without being defined;
        # it is now a keyword argument with the usual default of 384.
        features, dataset = squad_convert_examples_to_features(
            examples,
            tokenizer,
            max_seq_length,
            doc_stride=128,
            max_query_length=64,
            is_training=is_training,
            return_dataset="pt")
        print("Saving features into cached file", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples},
                   cached_features_file)
    return dataset, examples, features
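# A minimal, hedged usage sketch for the loader above (not part of the original
# snippet): the tokenizer name and batch size are illustrative assumptions. The
# returned TensorDataset is typically wrapped in a DataLoader; for training
# features, recent transformers versions order the tensors roughly as
# (input_ids, attention_mask, token_type_ids, start_positions, end_positions,
# cls_index, p_mask, ...), but the exact layout varies across versions.
from torch.utils.data import DataLoader, RandomSampler
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset, examples, features = load_and_cache_examples(tokenizer, is_training=True)
loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=8)
for batch in loader:
    input_ids, attention_mask = batch[0], batch[1]  # first two tensors
    break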
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_distillation_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        try:
            features, dataset, examples = (
                features_and_dataset["features"],
                features_and_dataset["dataset"],
                features_and_dataset["examples"],
            )
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script. "
                "Please delete the file %s so that it can be created again." % cached_features_file
            )
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples},
                       cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def convert_to_features(in_file, evaluate, doc_stride, max_query_length,
                        max_seq_length, num_workers, tokenizer,
                        debug_features=False, v2=False):
    processor = SquadV2Processor() if v2 else SquadV1Processor()
    data_dir = os.path.dirname(in_file)
    file_name = os.path.basename(in_file)

    if doc_stride >= max_seq_length - max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if evaluate:
        examples = processor.get_dev_examples(data_dir, filename=file_name)
    else:
        examples = processor.get_train_examples(data_dir, filename=file_name)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=num_workers,
    )

    if debug_features:
        debug_features_examples_dataset(dataset, examples, features, tokenizer)
    return dataset, examples, features
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."

    # cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
    #     'dev' if evaluate else 'train',
    #     list(filter(None, args.model_name_or_path.split('/'))).pop(),
    #     str(args.max_seq_length))
    # )
    #
    # # Init features and dataset from cache if it exists
    # if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
    #     logger.info("Loading features from cached file %s", cached_features_file)
    #     features_and_dataset = torch.load(cached_features_file)
    #     features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
    # else:
    logger.info("Creating features from dataset file at %s", input_dir)

    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")

        if args.version_2_with_negative:
            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset='pt',
        regression=True
    )

    # if args.local_rank in [-1, 0]:
    #     logger.info("Saving features into cached file %s", cached_features_file)
    #     torch.save({"features": features, "dataset": dataset}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def load_bert():
    # The following code is adapted from HuggingFace transformers
    # https://github.com/huggingface/transformers/blob/master/examples/run_squad.py

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = (
        BertConfig, BertForQuestionAnswering, BertTokenizer)
    config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    tokenizer = tokenizer_class.from_pretrained(model_name_or_path,
                                                do_lower_case=True,
                                                cache_dir=cache_dir)
    model = model_class.from_pretrained(model_name_or_path,
                                        from_tf=False,
                                        config=config,
                                        cache_dir=cache_dir)

    # Load some examples
    processor = SquadV1Processor()
    examples = processor.get_dev_examples(None, filename=predict_file)

    # Convert examples to features
    features, dataset = squad_convert_examples_to_features(
        examples=examples[:total_samples],  # convert just enough examples for this notebook
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset='pt')

    return model, features, dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = args.data_dir
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)
        # With filename=None, the processor falls back to its default filenames.
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=None)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=None)
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(
            {"features": features, "dataset": dataset, "examples": examples},
            cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def load_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from the dataset file (no caching in this variant)
    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If no data_dir is specified, tensorflow_datasets needs to be installed.")

        if args.version_2_with_negative:
            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(
                os.path.join(args.data_dir, args.task), filename=args.predict_file)
        else:
            examples = processor.get_train_examples(
                os.path.join(args.data_dir, args.task), filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    # The original called torch.save without a destination file, which raises a
    # TypeError; since this variant does not cache, the call is left commented out.
    # torch.save({"features": features, "dataset": dataset, "examples": examples},
    #            cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def _create_dataset(self, examples, evaluate=False, output_examples=False):
    # The two branches previously issued the same conversion call; it is done
    # once here and only the return value differs. (`output_examples` is
    # currently unused, matching the original.)
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self._tokenizer,
        max_seq_length=self._max_seq_length,
        doc_stride=self._doc_stride,
        max_query_length=self._max_query_length,
        is_training=not evaluate,
        return_dataset="pt")  # pytorch
    if evaluate:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = "deeplearning_needed/SQUAD_data"
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, model_type.split("/"))).pop(),
            str(384),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(input_dir, filename='dev-v2.0.json')
        else:
            examples = processor.get_train_examples(input_dir, filename='train-v2.0.json')
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=not evaluate,
            return_dataset="pt",
            threads=1,
        )
        torch.save(
            {"features": features, "dataset": dataset, "examples": examples},
            cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = args.output_dir if args.output_dir else "."
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)
        if evaluate:
            examples = read_squad_examples(args.predict_file, is_training=False)
        else:
            examples = read_squad_examples(args.train_file, is_training=True)
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt"
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples},
                   cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(data_dir: str, tokenizer, task, max_seq_length,
                            doc_stride, max_query_length, evaluate=False):
    if task == "SQuAD1.1":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif task == "SQuAD2.0":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    if evaluate:
        with urllib.request.urlopen(validation_url) as url:
            with open(data_dir + "/" + validation_file, 'w') as f:
                f.write(url.read().decode())
        examples = processor.get_dev_examples(data_dir, filename=validation_file)
    else:
        with urllib.request.urlopen(train_url) as url:
            with open(data_dir + "/" + train_file, 'w') as f:
                f.write(url.read().decode())
        examples = processor.get_train_examples(data_dir, filename=train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
    )
    return dataset, examples, features
def __transform_to_features(self, samples):
    features, dataset = squad_convert_examples_to_features(
        examples=samples,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq,
        doc_stride=self.doc_stride,
        max_query_length=self.max_query_length,
        is_training=False,
        return_dataset='pt',
        threads=self.workers)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=self.batch_size)
    return features, dataloader
def setup_squadv1_dataset(data_dir: str, tokenizer: PreTrainedTokenizer,
                          test: bool = False,
                          **kwargs) -> Tuple[Dataset, list, list]:
    # The annotations previously declared the tokenizer as nn.Module and the
    # examples/features as tensors; they are in fact a PreTrainedTokenizer and
    # lists of SquadExample / SquadFeatures.
    cached_path = os.path.join(data_dir, f"{'dev' if test else 'train'}v1.pth")
    if os.path.isfile(cached_path):
        ckpt = torch.load(cached_path)
        return ckpt["dataset"], ckpt["examples"], ckpt["features"]

    processor = SquadV1Processor()
    fname = f"{'dev' if test else 'train'}-v1.1.json"
    getter = processor.get_dev_examples if test else processor.get_train_examples
    examples = getter(data_dir, fname)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        is_training=not test,
        return_dataset="pt",
        **kwargs
    )
    torch.save({"dataset": dataset, "examples": examples, "features": features},
               cached_path)
    return dataset, examples, features
def evaluate(model, tokenizer, device, maxSequenceLength, maxQueryLength,
             documentStride):
    processor = SquadV2Processor()
    devData = processor.get_dev_examples(".")
    features, devDataset = transformers.squad_convert_examples_to_features(
        examples=devData,
        tokenizer=tokenizer,
        max_seq_length=maxSequenceLength,
        max_query_length=maxQueryLength,
        doc_stride=documentStride,
        return_dataset="pt",
        threads=1,
        is_training=False)

    batchSize = 2
    sampler = torch.utils.data.SequentialSampler(devDataset)
    dataLoader = torch.utils.data.DataLoader(devDataset,
                                             sampler=sampler,
                                             batch_size=batchSize)
    results = []
    model.eval()
    # Iterate over the DataLoader (the original looped over the raw dataset,
    # which yields single tensors rather than batches).
    for batch in tqdm.tqdm(dataLoader):
        batch = tuple(bat.to(device) for bat in batch)
        with torch.no_grad():
            # For eval features, batch[3] holds the feature indices.
            exampleIndices = batch[3]
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2])
        for i, index in enumerate(exampleIndices):
            feature = features[index.item()]
            id = int(feature.unique_id)
            output = [o[i].detach().cpu().tolist() for o in outputs]
            # The eval outputs are (start_logits, end_logits); the original
            # indexed output[2], which does not exist here.
            results.append(
                transformers.data.processors.squad.SquadResult(
                    id, output[0], output[1]))

    # compute_predictions_logits requires the full argument list; the values
    # below (n_best_size, output paths, thresholds) are reasonable defaults
    # assumed here, and the tokenizer argument is needed on recent
    # transformers versions.
    predictions = transformers.data.metrics.squad_metrics.compute_predictions_logits(
        devData,
        features,
        results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file="predictions.json",
        output_nbest_file="nbest_predictions.json",
        output_null_log_odds_file="null_odds.json",
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )
    results = transformers.data.metrics.squad_metrics.squad_evaluate(
        devData, predictions)
    return results
def convert_examples_to_features(self, examples, tokenizer, output_mode='', evaluate=False):
    max_seq_length = self.configs.get('max_seq_length', 384)
    doc_stride = self.configs.get('doc_stride', 128)
    max_query_length = self.configs.get('max_query_length', 64)
    threads = self.configs.get('num_threads', 1)
    return squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=threads,
    )
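# A small, hedged sketch (not from the original code) showing the SquadExample
# input that wrappers like the one above expect; the question/context strings
# are made up for illustration. SquadExample comes from
# transformers.data.processors.squad.
from transformers.data.processors.squad import SquadExample

demo_example = SquadExample(
    qas_id="demo-0",
    question_text="Who wrote Hamlet?",
    context_text="Hamlet is a tragedy written by William Shakespeare.",
    answer_text=None,
    start_position_character=None,
    title="demo",
    is_impossible=False,
    answers=[],
)
# e.g. self.convert_examples_to_features([demo_example], tokenizer, evaluate=True)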
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not os.path.isfile(
            os.path.join(self.parent.config.output_dir, "pytorch_model.bin")):
        raise ModelNotTrained("Train model before prediction.")
    self.model = AutoModelForQuestionAnswering.from_pretrained(
        self.parent.config.output_dir)  # , force_download=True)
    self.model.to(self.parent.config.device)
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.parent.config.output_dir,
        do_lower_case=self.parent.config.do_lower_case,
    )
    async for record in sources.records():
        example = SquadExample(
            qas_id=record.key,
            question_text=record.feature("question"),
            context_text=record.feature("context"),
            answer_text=record.feature("answer_text"),
            start_position_character=record.feature("start_pos_char"),
            title=record.feature("title"),
            is_impossible=record.feature("is_impossible"),
            answers=record.feature("answers"),
        )
        features, dataset = squad_convert_examples_to_features(
            examples=[example],
            tokenizer=self.tokenizer,
            max_seq_length=self.parent.config.max_seq_length,
            doc_stride=self.parent.config.doc_stride,
            max_query_length=self.parent.config.max_query_length,
            is_training=False,
            return_dataset="pt",
        )
        prediction = await self._custom_accuracy([example], features, dataset)
        record.predicted("Answer", prediction, "Nan")
        yield record
def answer(self, question):
    hits = self.searcher.search(question, k=self.k)
    ir_scores = []
    paragraphs = []
    for j in range(len(hits)):
        passage = hits[j].raw
        ir_scores.append(hits[j].score)
        paragraphs.append(passage)

    input_ = build_squad_input(question, paragraphs)
    examples = self.processor._create_examples(input_["data"], "dev")
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )
    all_results, predictions = process_one_question(
        features, dataset, self.model, self.tokenizer, examples, self.device,
        self.use_ir_score, self.mu, ir_scores)

    scores = np.array([(p['start_logit'] + p['end_logit'])
                       for p in predictions['0']])
    texts = [p['text'] for p in predictions['0']]
    # Candidate indices ordered best-first. (The original applied a second
    # argsort, which yields ranks rather than indices and would pick
    # arbitrary candidates below.)
    predicted_p_indexes_all = scores.argsort()[::-1]

    # Walk through candidates in score order, skipping "empty" answers.
    iterator_idx = 0
    is_empty = True
    predicted_p_index = 0
    while is_empty and iterator_idx < len(predicted_p_indexes_all):
        predicted_p_index = predicted_p_indexes_all[iterator_idx]
        is_empty = texts[predicted_p_index] == "empty"
        iterator_idx += 1

    predicted_answer = texts[predicted_p_index]
    return predicted_answer
def load_squad_examples(self, mode="train"):
    if self.data_dir:
        processor = SquadV2Processor() if self.version_2_with_negative else SquadV1Processor()
        if mode == "train":
            examples = processor.get_train_examples(self.data_dir, filename="train.json")
        elif mode == "dev":
            # get_train_examples is used here so that start and end positions
            # are retained for the dev set.
            examples = processor.get_train_examples(self.data_dir, filename="dev.json")
        elif mode == "test":
            examples = processor.get_dev_examples(self.data_dir, filename="dev.json")
        else:
            raise KeyError(mode)

    # For debugging on a small set, uncomment the two lines below.
    # N = 10
    # examples = examples[:N]

    is_training = mode != "test"  # keep start and end positions for train/dev
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length,
        doc_stride=self.doc_stride,
        max_query_length=self.max_query_length,
        is_training=is_training,
        return_dataset="pt",  # return a PyTorch TensorDataset
        threads=2)
    if not is_training:
        return dataset, examples, features
    return dataset
async def accuracy(self, sources: Sources):
    if not os.path.isfile(
            os.path.join(self.parent.config.output_dir, "pytorch_model.bin")):
        raise ModelNotTrained("Train model before assessing for accuracy.")
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.parent.config.output_dir,
        do_lower_case=self.parent.config.do_lower_case,
    )
    eval_examples = await self._preprocess_data(sources)
    features, dataset = squad_convert_examples_to_features(
        examples=eval_examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.parent.config.max_seq_length,
        doc_stride=self.parent.config.doc_stride,
        max_query_length=self.parent.config.max_query_length,
        is_training=False,
        return_dataset="pt",
    )
    results = {}
    if self.parent.config.local_rank in [-1, 0]:
        logger.info("Loading checkpoints saved during training for evaluation")
        self.model = AutoModelForQuestionAnswering.from_pretrained(
            self.parent.config.output_dir)
        self.model.to(self.parent.config.device)

        # Evaluate
        predictions = await self._custom_accuracy(eval_examples, features, dataset)
        results = squad_evaluate(eval_examples, predictions)
        logger.info("Results: {}".format(results))

    # return results
    return Accuracy(results["f1"])
def find_answer(self, question, context, n_best_size=20, max_answer_length=30,
                full_sentence=False):
    # heavily inspired by
    # "https://github.com/huggingface/transformers/blob/v2.3.0/examples/run_squad.py#L212-L317"
    example_id = '55555'
    example = SquadExample(example_id, question, context, None, None, None)
    features, dataset = squad_convert_examples_to_features(
        [example], self.tokenizer, self.max_seq_length, self.doc_stride,
        self.max_query_length, False, return_dataset='pt')

    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)

    all_results = []
    for batch in dataloader:
        self.model.eval()
        batch = tuple(t.to(self.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if self.model_type in {"xlm", "roberta", "distilbert"}:
                del inputs["token_type_ids"]

            # With batch_size=1, batch[3] holds a single feature index;
            # .item() unwraps it so it can index the features list below.
            example_index = batch[3].item()

            # XLNet and XLM use more arguments for their predictions
            if self.model_type in {"xlnet", "xlm"}:
                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})

            outputs = self.model(**inputs)

        output = [o.detach().cpu().tolist() for o in outputs]
        unique_id = int(features[example_index].unique_id)

        # Some models (XLNet, XLM) use 5 arguments for their predictions,
        # while the other "simpler" models only use two.
        if len(output) >= 5:
            start_logits = output[0]
            start_top_index = output[1]
            end_logits = output[2]
            end_top_index = output[3]
            cls_logits = output[4]

            squad_result = SquadResult(
                unique_id,
                start_logits[0],
                end_logits[0],
                start_top_index=start_top_index[0],
                end_top_index=end_top_index[0],
                cls_logits=cls_logits[0],
            )
        else:
            start_logits, end_logits = output
            squad_result = SquadResult(unique_id, start_logits[0], end_logits[0])

        all_results.append(squad_result)

    # XLNet and XLM use a more complex post-processing procedure
    if self.model_type in {"xlnet", "xlm"}:
        # The original referenced bare `model` and `tokenizer`, which are
        # undefined in this method; both are attributes of self.
        if hasattr(self.model, "config"):
            start_n_top = self.model.config.start_n_top
            end_n_top = self.model.config.end_n_top
        else:
            start_n_top = self.model.module.config.start_n_top
            end_n_top = self.model.module.config.end_n_top

        predictions = compute_predictions_log_probs(
            [example],
            features,
            all_results,
            n_best_size,
            max_answer_length,
            '/tmp/pred.out',
            '/tmp/nbest.out',
            '/tmp/null.out',
            start_n_top,
            end_n_top,
            self.version_2_with_negative,
            self.tokenizer,
            self.verbose,
        )
    else:
        predictions = compute_predictions_logits(
            [example],
            features,
            all_results,
            n_best_size,
            max_answer_length,
            self.do_lower_case,
            '/tmp/pred.out',
            '/tmp/nbest.out',
            '/tmp/null.out',
            self.verbose,
            self.version_2_with_negative,
            self.null_score_diff_threshold,
        )

    prediction = predictions[example_id]
    logger.debug(f'found prediction: "{prediction}"')

    # An empty prediction indicates an unknown answer.
    if not prediction:
        logger.debug('empty prediction')
        return None

    if full_sentence:
        doc = self.nlp(context)
        for sent in doc.sents:
            if prediction in sent.text:
                prediction = sent.text
                break

    return prediction
def load_and_cache_examples(args_dict, tokenizer, evaluate=False, output_examples=False):
    if args_dict[RunParameters.LOCAL_RANK.value] not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_file = (args_dict[RunParameters.PREDICT_FILE.value]
                  if evaluate else args_dict[RunParameters.TRAIN_FILE.value])
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_distillation_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(
                None,
                args_dict[RunParameters.MODEL_NAME_OR_PATH.value].split("/"))).pop(),
            str(args_dict[SquadArchitectureHyperparameter.MAX_SEQ_LENGTH.value]),
        ),
    )

    if os.path.exists(cached_features_file):
        logging.info("deleting local cache file: {}".format(cached_features_file))
        os.remove(cached_features_file)
    download_cache_from_s3(args_dict, evaluate)

    if (os.path.exists(cached_features_file)
            and args_dict[RunParameters.OVERWRTIE_CACHE.value] is False):
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        try:
            features, dataset, examples = (
                features_and_dataset["features"],
                features_and_dataset["dataset"],
                features_and_dataset["examples"],
            )
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script. "
                "Please delete the file %s so that it can be created again."
                % cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        processor = (SquadV2Processor()
                     if args_dict[RunParameters.VERSION_2.value] else SquadV1Processor())
        if evaluate:
            examples = processor.get_dev_examples(
                args_dict["data_dir"],
                filename=args_dict[RunParameters.PREDICT_FILE.value])
        else:
            examples = processor.get_train_examples(
                args_dict["data_dir"],
                filename=args_dict[RunParameters.TRAIN_FILE.value])
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args_dict[SquadArchitectureHyperparameter.MAX_SEQ_LENGTH.value],
            doc_stride=args_dict[SquadArchitectureHyperparameter.DOC_STRIDE.value],
            max_query_length=args_dict[SquadArchitectureHyperparameter.MAX_QUERY_LENGTH.value],
            is_training=not evaluate,
            return_dataset="pt")

        if args_dict[RunParameters.LOCAL_RANK.value] in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(
                {"features": features, "dataset": dataset, "examples": examples},
                cached_features_file)

    if args_dict[RunParameters.LOCAL_RANK.value] == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
async def train(self, sources: Sources):
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.parent.config.tokenizer_name
        if self.parent.config.tokenizer_name
        else self.parent.config.model_name_or_path,
        do_lower_case=self.parent.config.do_lower_case,
        cache_dir=self.parent.config.cache_dir
        if self.parent.config.cache_dir else None,
    )
    # `config` was previously referenced without being defined; load it from
    # the same checkpoint as the model.
    config = AutoConfig.from_pretrained(
        self.parent.config.model_name_or_path,
        cache_dir=self.parent.config.cache_dir
        if self.parent.config.cache_dir else None,
    )
    self.model = AutoModelForQuestionAnswering.from_pretrained(
        self.parent.config.model_name_or_path,
        from_tf=self.parent.config.from_tf,
        config=config,
        cache_dir=self.parent.config.cache_dir
        if self.parent.config.cache_dir else None,
    )

    if self.parent.config.local_rank == 0:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    self.model.to(self.parent.config.device)

    if self.parent.config.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    train_examples = await self._preprocess_data(sources)
    _, train_dataset = squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.parent.config.max_seq_length,
        doc_stride=self.parent.config.doc_stride,
        max_query_length=self.parent.config.max_query_length,
        is_training=True,
        return_dataset="pt",
    )
    global_step, tr_loss = await self._custom_train(train_dataset)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if (self.parent.config.local_rank == -1
            or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if (not os.path.exists(self.parent.config.output_dir)
                and self.parent.config.local_rank in [-1, 0]):
            os.makedirs(self.parent.config.output_dir)

        logger.info("Saving model checkpoint to %s", self.parent.config.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # Take care of distributed/parallel training
        model_to_save = (self.model.module
                         if hasattr(self.model, "module") else self.model)
        model_to_save.save_pretrained(self.parent.config.output_dir)
        self.tokenizer.save_pretrained(self.parent.config.output_dir)

        # Save training arguments together with the trained model
        torch.save(
            self.parent.config,
            os.path.join(self.parent.config.output_dir, "training_args.bin"),
        )
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

model_type = "bert"
shortcut = "bert-base-uncased"
config = transformers.AutoConfig.from_pretrained(shortcut)
tokenizer = transformers.AutoTokenizer.from_pretrained(shortcut)
model = transformers.AutoModelForQuestionAnswering.from_pretrained(
    shortcut, config=config)
model.to(device)

# read in data
processor = SquadV2Processor()
trainData = processor.get_train_examples(".")
features, trainDataset = transformers.squad_convert_examples_to_features(
    examples=trainData,
    tokenizer=tokenizer,
    max_seq_length=maxSequenceLength,
    max_query_length=maxQueryLength,
    doc_stride=documentStride,
    return_dataset="pt",
    is_training=True)

# build up model
batchSize = 12
trainSampler = torch.utils.data.RandomSampler(trainDataset)
trainDataloader = torch.utils.data.DataLoader(trainDataset,
                                              sampler=trainSampler,
                                              batch_size=batchSize)
trainEpoch = 10
learningRate = 3e-5
# Total optimization steps: batches per epoch times the number of epochs
# (the original divided instead of multiplying).
totalTrainingStep = len(trainDataloader) * trainEpoch
# Pass the parameter tensors; the original comprehension shadowed the
# built-in `next` as a loop variable.
optimizer = transformers.AdamW(params=[p for _, p in model.named_parameters()],
                               lr=learningRate)
# The original snippet was truncated mid-call here; num_warmup_steps=0 is an
# assumed completion.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=totalTrainingStep)
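# A minimal, hedged training-loop sketch continuing the setup above; the tensor
# positions follow the TensorDataset built by squad_convert_examples_to_features
# with is_training=True (input_ids, attention_mask, token_type_ids,
# start_positions, end_positions, ...), but the exact layout can vary across
# transformers versions.
for epoch in range(trainEpoch):
    model.train()
    for batch in trainDataloader:
        batch = tuple(t.to(device) for t in batch)
        outputs = model(
            input_ids=batch[0],
            attention_mask=batch[1],
            token_type_ids=batch[2],
            start_positions=batch[3],
            end_positions=batch[4],
        )
        loss = outputs[0]  # the total span loss is the first output
        loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()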
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warn(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switching to version 1 automatically."
            )
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If no data_dir is specified, tensorflow_datasets needs to be installed.")

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor() if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    train_dataset = (squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=True,
        return_dataset="tf",
    ) if training_args.do_train else None)
    if train_dataset is not None:
        # Guard the cardinality assertion so it is not applied to None when
        # --do_train is off.
        train_dataset = train_dataset.apply(
            tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = (squad_convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=False,
        return_dataset="tf",
    ) if training_args.do_eval else None)
    if eval_dataset is not None:
        eval_dataset = eval_dataset.apply(
            tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
def __call__(self, *texts, **kwargs) -> Iterator[Answer]:
    """
    Args:
        We support multiple use-cases; the following are exclusive:
        X: sequence of SquadExample
        data: sequence of SquadExample
        question: (str, List[str]), batch of question(s) to map along with context
        context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
    Returns:
        dict: {'answer': str, 'score': float, 'start': int, 'end': int}
        answer: the textual answer in the initial context
        score: the score the current answer scored for the model
        start: the character index in the original string corresponding to the beginning of the answer's span
        end: the character index in the original string corresponding to the ending of the answer's span
    """
    # Set default values
    kwargs.setdefault("topk", 1)
    kwargs.setdefault("doc_stride", 128)
    kwargs.setdefault("max_answer_len", 15)
    kwargs.setdefault("max_seq_len", 384)
    kwargs.setdefault("max_question_len", 64)
    kwargs.setdefault("version_2_with_negative", False)
    kwargs.setdefault("batch_size", 1)
    kwargs.setdefault("threads", 1)
    # "min_score" has priority over "topk" and doesn't apply to the null answer.
    kwargs.setdefault("min_score", None)
    kwargs.setdefault("sort_mode", DEFAULT_SORT_MODE)

    if kwargs["topk"] < 1:
        raise ValueError(f"topk parameter should be >= 1 (got {kwargs['topk']})")
    if kwargs["max_answer_len"] < 1:
        raise ValueError(
            f"max_answer_len parameter should be >= 1 (got {kwargs['max_answer_len']})")
    if kwargs["sort_mode"] not in SORT_MODE_CHOICES:
        raise ValueError(
            f"sort_mode parameter should be in {SORT_MODE_CHOICES} (got {kwargs['sort_mode']})")
    sort_with_prob = kwargs["sort_mode"] == "prob"

    # Convert inputs to features
    examples = self._args_parser(*texts, **kwargs)
    features_list_flat = squad_convert_examples_to_features(
        examples,
        self.tokenizer,
        kwargs["max_seq_len"],
        kwargs["doc_stride"],
        kwargs["max_question_len"],
        False,
        threads=kwargs["threads"],
    )

    start_logits_flat = np.empty((len(features_list_flat), kwargs["max_seq_len"]))
    end_logits_flat = np.empty_like(start_logits_flat)

    max_batch_size = kwargs["batch_size"]
    for batch_idx, features_batch in enumerate(
            chunks(features_list_flat, max_batch_size)):
        kwargs_as_lists = self.inputs_for_model(
            [f.__dict__ for f in features_batch])

        # Manage tensor allocation on correct device
        with self.device_placement():
            if self.framework == "tf":
                raise ValueError("tf not supported")
            else:
                with torch.no_grad():
                    kwargs_as_tensors = {
                        k: torch.tensor(v, device=self.device, dtype=torch.int64)
                        for k, v in kwargs_as_lists.items()
                    }
                    start_indices, end_indices = self.model(**kwargs_as_tensors)
                    batch_start_idx = batch_idx * max_batch_size
                    batch_end_idx = (batch_idx + 1) * max_batch_size
                    start_logits_flat[batch_start_idx:batch_end_idx] = \
                        start_indices.cpu().numpy()
                    end_logits_flat[batch_start_idx:batch_end_idx] = \
                        end_indices.cpu().numpy()

    # Don't convert into (batch_size, max_features_len, max_seq_length),
    # because there may be a very long doc (with a lot of features; i.e.,
    # max_features_len may be very large).
    indices_and_features_iterable = groupby(enumerate(features_list_flat),
                                            lambda t: t[1].example_index)

    for example, (_, indices_and_features) in zip(examples,
                                                  indices_and_features_iterable):
        indices, features = zip(*indices_and_features)
        char_to_word = np.array(example.char_to_word_offset)
        start_logits = start_logits_flat[indices, :]
        end_logits = end_logits_flat[indices, :]

        # Normalize logits and spans to retrieve the answer
        start_probs = scipy.special.softmax(start_logits, axis=1)
        end_probs = scipy.special.softmax(end_logits, axis=1)

        # Mask padding and question
        p_mask = np.array([feature.p_mask for feature in features])
        p_bool_mask = p_mask == 1
        start_logits[p_bool_mask], end_logits[p_bool_mask] = -np.inf, -np.inf
        start_probs[p_bool_mask], end_probs[p_bool_mask] = 0, 0

        if kwargs["version_2_with_negative"]:
            null_answer = Answer(
                instance=example,
                text="",
                prob=(start_probs[:, 0] * end_probs[:, 0]).min().item(),
                # The null-answer logit combines the start and end logits of
                # the [CLS] position (the original summed the start logit twice).
                logit=(start_logits[:, 0] + end_logits[:, 0]).min().item(),
                start=0,
                sort_mode=kwargs["sort_mode"],
            )
        else:
            null_answer = None
            start_probs[:, 0] = end_probs[:, 0] = 0
            start_logits[:, 0] = end_logits[:, 0] = -np.inf

        if sort_with_prob:
            start_scores, end_scores = start_probs, end_probs
        else:
            start_scores, end_scores = start_logits, end_logits

        # We increase the top-k because there can be repeated answers here
        # (e.g., they start in different tokens of the same word).
        feature_indices, start_indices, end_indices = self.decode(
            start_scores, end_scores, 5 * kwargs["topk"],
            kwargs["max_answer_len"], sort_with_prob)

        # Convert the answer (tokens) back to the original text
        answers = (
            Answer(
                instance=example,
                text=" ".join(
                    example.doc_tokens[features[f].token_to_orig_map[s]:
                                       features[f].token_to_orig_map[e] + 1]),
                prob=(start_probs[f, s] * end_probs[f, e]).item(),
                logit=(start_logits[f, s] + end_logits[f, e]).item(),
                start=np.where(
                    char_to_word == features[f].token_to_orig_map[s])[0][0].item(),
                null_answer=null_answer,
                sort_mode=kwargs["sort_mode"],
            )
            for f, s, e in zip(feature_indices, start_indices, end_indices)
            # If topk is very large, it may be the case that some invalid
            # answer was selected:
            if s in features[f].token_to_orig_map and e in features[f].token_to_orig_map)

        # We keep only the unique answers.
        # An answer is considered non-unique if it's inside another one.
        #
        # Note that if they have the same text and they're in the same
        # positions then only one will be kept. If they have the same text but
        # are in different positions we leave them, and this is good (i.e.,
        # it's like having more evidence).
        #
        # We use `(start, end)` to uniquely identify an answer (note they
        # answer the same `example`).
        unique_answers_by_start_and_end: MutableMapping[Tuple[int, int], Answer] = {}
        for answer in answers:
            # We iterate over a copy because we may delete items.
            for start_and_end, another_answer in list(
                    unique_answers_by_start_and_end.items()):
                if answer.one_inside_the_other_one(another_answer):
                    if answer.sort_score > another_answer.sort_score:
                        # TODO: we could make a new answer with the widest
                        # answer (keeping the largest score), or other
                        # strategies with overlapping answers.
                        del unique_answers_by_start_and_end[start_and_end]
                        unique_answers_by_start_and_end[(answer.start, answer.end)] = answer
                    break
            else:
                unique_answers_by_start_and_end[(answer.start, answer.end)] = answer

        unique_answers = [
            answer for answer in unique_answers_by_start_and_end.values()
            if kwargs["min_score"] is None or answer.sort_score >= kwargs["min_score"]
        ]
        if kwargs["version_2_with_negative"]:
            unique_answers.append(null_answer)

        for answer in sorted(unique_answers, key=lambda a: a.sort_score,
                             reverse=True)[:kwargs["topk"]]:
            yield answer
def load_dataset(self, split, combine=False, **kwargs):
    cache = os.path.join(
        self.args.data,
        "cached_{}_{}_{}.pth".format(split, self.args.bpe, self.args.max_seq_len))
    if os.path.exists(cache):
        examples, features = torch.load(cache)
    else:
        if split == 'valid':
            examples = self.processor.get_dev_examples(
                self.args.data, self.train_or_dev_file[split])
        else:
            examples = self.processor.get_train_examples(
                self.args.data, self.train_or_dev_file[split])
        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.args.max_seq_len,
            doc_stride=128,
            max_query_length=64,
            is_training=(split != 'valid'),
            return_dataset=False,
        )
        if self.args.distributed_rank == 0:
            torch.save((examples, features), cache)

    if split == 'valid' and self.do_evaluate:
        self.examples = examples
        self.features = features

    src_dataset = BaseWrapperDataset([np.array(f.input_ids) for f in features])
    starts = BaseWrapperDataset(np.array([f.start_position for f in features]))
    ends = BaseWrapperDataset(np.array([f.end_position for f in features]))
    sizes = np.array([len(f.input_ids) for f in features])
    src_lengths = NumelDataset(src_dataset, reduce=False)

    '''
    Input format: <s> question here ? </s> Passage </s>
    '''

    dataset = NestedDictionaryDataset(
        {
            'id': IdDataset(),
            'net_input': {
                'src_tokens': src_dataset,
                'src_lengths': NumelDataset(src_dataset, reduce=False),
            },
            'targets': {
                'starts': starts,
                'ends': ends,
            },
            'nsentences': NumSamplesDataset(),
            'ntokens': NumelDataset(src_dataset, reduce=True),
        },
        sizes=[sizes],
    )

    with data_utils.numpy_seed(self.args.seed):
        dataset = SortDataset(
            dataset,
            sort_order=[np.random.permutation(len(dataset))],
        )

    print('| Loaded {} with {} samples'.format(split, len(dataset)))
    self.datasets[split] = dataset
    return self.datasets[split]
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # COMET model setup
    device = "0"
    comet_model = "pretrained_models/atomic_pretrained_model.pickle"
    sampling_algo = "beam-2"
    opt, state_dict = interactive.load_model_file(comet_model)
    data_loader, text_encoder = interactive.load_data("atomic", opt)
    n_ctx = data_loader.max_event + data_loader.max_effect
    n_vocab = len(text_encoder.encoder) + n_ctx
    model = interactive.make_model(opt, n_vocab, n_ctx, state_dict)
    nlp = spacy.load("en_core_web_sm")

    if device != "cpu":
        cfg.device = int(device)
        cfg.do_gpu = True
        torch.cuda.set_device(cfg.device)
        model.cuda(cfg.device)
    else:
        cfg.device = "cpu"
    sampling_algorithm = sampling_algo
    sampler = interactive.set_sampler(opt, sampling_algorithm, data_loader)

    def augment(article):
        context = article.numpy().decode('UTF-8')
        category_list = ["xNeed", "xIntent", "xWant", "xReact"]
        for category in category_list:
            entity_list = nlp(context)
            input_event = context
            replaced = []
            replacement_list = ["PersonX", "PersonY", "PersonZ"]
            r = 0
            for entity in entity_list.ents:
                if entity.label_ == 'PERSON' or entity.label_ == 'NORP':
                    input_event = input_event.replace(entity.text,
                                                      replacement_list[r])
                    # Record the replaced name so it can be restored below
                    # (the original never populated this list).
                    replaced.append(entity.text)
                    r += 1
                    if r == 3:
                        break
            outputs = interactive.get_atomic_sequence(
                input_event, model, sampler, data_loader, text_encoder, category)
            for key in outputs:
                prefix = ""
                if key[0] == "o":
                    if key == "oEffect":
                        prefix = " Everyone else "
                    elif key == "oReact":
                        prefix = "They are "
                    elif key == "oWant":
                        prefix = "They want "
                else:
                    if len(replaced) != 0:
                        prefix = replaced[0]
                    else:
                        prefix = "Person"
                    if key == "xAttr":
                        prefix += " is "
                    elif key == "xEffect":
                        prefix += " "
                    elif key == "xIntent":
                        prefix += " intends "
                    elif key == "xReact":
                        prefix += " is "
                    elif key == "xNeed":
                        prefix += " needs "
                    elif key == "xWant":
                        prefix += " wants "
                for j in range(5):
                    if outputs[key]["beams"][j] != 'none':
                        comet_inf = outputs[key]["beams"][j]
                        if len(replaced) > 0:
                            comet_inf = comet_inf.replace("personx", replaced[0])
                        if len(replaced) > 1:
                            comet_inf = comet_inf.replace("persony", replaced[1])
                        article += prefix + comet_inf + ". "
                        break
        return article

    def process_example(example):
        example['context'] = tf.py_function(func=augment,
                                            inp=[example['context']],
                                            Tout=tf.string)
        return example
    ## End

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)
        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If no data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            tfds_examples["train"] = tfds_examples["train"].map(
                lambda x: process_example(x))
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(args.data_dir,
                                                      filename=args.predict_file)
            else:
                examples = processor.get_train_examples(args.data_dir,
                                                        filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(
                {"features": features, "dataset": dataset, "examples": examples},
                cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = args.cache_dir if args.cache_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)
        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If no data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    os.path.join(args.data_dir, args.task),
                    filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    os.path.join(args.data_dir, args.task),
                    filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(
            {"features": features, "dataset": dataset, "examples": examples},
            cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
import numpy as np

from transformers import AutoTokenizer, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

output_dir = "data/SQuAD/SQuAD_dev2"
model_name = "albert-base-v1"
data_dir = "data/SQuAD"
dev_fn = "dev-v2.0.json"

tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = SquadV2Processor()
# get_train_examples is used on the dev file deliberately, so that answer
# start/end positions are produced (is_training=True below).
examples = processor.get_train_examples(data_dir, filename=dev_fn)

features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset="pt",
    threads=2,
)

# Export each tensor of the TensorDataset as a CSV, plus the decoded strings.
for i, filename_base in enumerate(["input_ids", "attention_mask",
                                   "token_type_ids", "start_positions",
                                   "end_positions"]):
    np.savetxt(f"{output_dir}/{filename_base}.csv",
               dataset.tensors[i].numpy(), delimiter=",")

with open(f"{output_dir}/strings.txt", "w") as fio:
    for t in dataset.tensors[0]:
        fio.write(f"{tokenizer.decode(t)}\n")
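# Hedged companion sketch: reading one exported CSV back into an integer tensor
# (the filename mirrors those written above; the dtype cast is needed because
# np.savetxt stores the values as floats).
import torch

input_ids_back = torch.from_numpy(
    np.loadtxt(f"{output_dir}/input_ids.csv", delimiter=",").astype(np.int64))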