def main():
    """Fine-tune a TensorFlow question-answering model on SQuAD from the command line.

    Parses ``ModelArguments``, ``DataTrainingArguments`` and
    ``TFTrainingArguments``, loads the config, tokenizer and model, converts
    the SQuAD examples into ``tf.data`` datasets and hands them to
    ``TFTrainer``. When ``do_train`` is set, the model is trained and the
    model and tokenizer are saved to ``training_args.output_dir``.

    Raises:
        ValueError: if the output directory exists, is not empty, training is
            requested and ``overwrite_output_dir`` is not set.
        ImportError: if ``use_tfds`` is set but ``tensorflow_datasets`` cannot
            be imported.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically"
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError as err:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            ) from err

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = (SquadV2Processor()
                     if data_args.version_2_with_negative else SquadV1Processor())
        train_examples = (processor.get_train_examples(data_args.data_dir)
                          if training_args.do_train else None)
        eval_examples = (processor.get_dev_examples(data_args.data_dir)
                         if training_args.do_eval else None)

    # Each dataset stays None unless its phase was requested; the cardinality
    # assertion is applied only to a dataset that was actually built, so
    # running with only --do_train or only --do_eval no longer calls
    # `.apply` on None.
    # NOTE(review): the asserted cardinality is the number of examples, not of
    # converted features -- confirm the two always match for this data.
    train_dataset = None
    if training_args.do_train:
        train_dataset = squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=True,
            return_dataset="tf",
        )
        train_dataset = train_dataset.apply(
            tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = None
    if training_args.do_eval:
        eval_dataset = squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=False,
            return_dataset="tf",
        )
        eval_dataset = eval_dataset.apply(
            tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    """Fine-tune a question-answering model on SQuAD with Horovod and return eval metrics.

    Args:
        run_name: Name used in checkpoint and TensorBoard log paths.
        fsx_prefix: Root directory holding ``squad_data``, ``checkpoints`` and ``logs``.
        pre_layer_norm: Not read by this function.
        model_size: Suffix used to build ``albert-{model_size}-v2`` when
            ``load_from == "huggingface"``.
        load_from: ``"scratch"``, ``"huggingface"`` or an already-built model.
        load_step: Not read by this function.
        batch_size: Batch size for the training and validation datasets.
        checkpoint_frequency: Steps between checkpoints; falsy means 1000000.
        validate_frequency: Steps between validations; falsy means 1000000.
        learning_rate: Peak learning rate of the warmup/decay schedule.
        warmup_steps: Warmup steps of the schedule.
        total_steps: Number of training steps to run.
        dataset: One of ``"squadv1"``, ``"squadv2"`` or ``"debug"``.
        dummy_eval: If true, use a fixed metrics dict instead of running evaluation.
        config: Model config; replaced by ``load_from.config`` when a Keras
            model is passed.

    Returns:
        On rank 0, the metrics dict from the most recent evaluation. On other
        ranks no evaluation runs and ``None`` is returned.

    Raises:
        ValueError: if ``load_from`` or ``dataset`` is not a supported value.
    """
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupLinearDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        optimizer, loss_scale="dynamic")

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        # Raised explicitly (not asserted) so the check survives `python -O`.
        raise ValueError(
            "--dataset must be one of ['squadv1', 'squadv2', 'debug']")

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    # NOTE(review): the flattened source does not show whether the validation
    # dataset was built on every rank; it is only consumed on rank 0, so it is
    # built there.
    if hvd.rank() == 0:
        print("Starting finetuning")
        # `total` must be passed by keyword: tqdm's first positional argument
        # is the iterable, so a bare int leaves the bar without a total.
        pbar = tqdm.tqdm(total=total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Bound up front so the final `return` is valid on ranks that never evaluate.
    results = None

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        current_learning_rate = schedule(
            step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch)

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}")
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results = get_evaluation_metrics(
                        model=model,
                        data_dir=data_dir,
                        filename=val_filename,
                        batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}")
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", current_learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(summary_writer=summary_writer,
                                             results=results,
                                             step=step)

        if is_final_step:
            break

    # Only rank 0 holds real metrics; the other ranks return None.
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
    return results
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD dataset for distillation, reading or writing an on-disk cache.

    The cache file sits next to the input file and is keyed by split, the last
    non-empty path component of ``args.model_name_or_path`` and
    ``args.max_seq_length``. On a cache miss the examples are read with the
    SQuAD v1/v2 processor, converted to features, and saved by ranks -1 and 0.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        DeprecationWarning: if the cached file lacks one of the expected keys.
    """
    training = not evaluate

    # Make sure only the first process in distributed training process the
    # dataset, and the others will use the cache
    if args.local_rank not in (-1, 0) and training:
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    split_tag = "dev" if evaluate else "train"
    model_tag = [piece for piece in args.model_name_or_path.split("/") if piece][-1]
    cache_name = f"cached_distillation_{split_tag}_{model_tag}_{args.max_seq_length!s}"
    cached_features_file = os.path.join(os.path.dirname(input_file), cache_name)

    use_cache = os.path.exists(cached_features_file) and not args.overwrite_cache
    if use_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles the file; only load caches you trust.
        cached = torch.load(cached_features_file)
        try:
            features = cached["features"]
            dataset = cached["dataset"]
            examples = cached["examples"]
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script please delete the "
                "file %s in order for it to be created again" % cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        if args.version_2_with_negative:
            processor = SquadV2Processor()
        else:
            processor = SquadV1Processor()

        if evaluate:
            examples = processor.get_dev_examples(args.data_dir,
                                                  filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=training,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in (-1, 0):
            logger.info("Saving features into cached file %s", cached_features_file)
            payload = {
                "features": features,
                "dataset": dataset,
                "examples": examples,
            }
            torch.save(payload, cached_features_file)

    # Release the other processes once rank 0 has finished building the cache.
    if args.local_rank == 0 and training:
        torch.distributed.barrier()

    return (dataset, examples, features) if output_examples else dataset
def predict(model_prefix, probes_dir, preds_dir, data_dir, data_file, layers,
            batch_size, hidden_dim, max_seq_length, device):
    """Run one probe per layer over the SQuAD dev set and write per-layer CSVs.

    Args:
        model_prefix: Name or path given to the tokenizer, config and model
            ``from_pretrained`` calls.
        probes_dir: Directory the probes are loaded from.
        preds_dir: Directory the ``layer_<k>.csv`` files are written to
            (created if missing).
        data_dir: Directory holding the dev file.
        data_file: File name of the dev set.
        layers: Number of layers, i.e. probes, to evaluate.
        batch_size: Evaluation batch size.
        hidden_dim: Hidden size passed to each ``Probe``.
        max_seq_length: Maximum sequence length used for featurization.
        device: Device that the batches, the probes and the model are placed on.
    """
    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    dev_examples = processor.get_dev_examples(data_dir=data_dir,
                                              filename=data_file)

    # Extract dev features
    print("Loading dev features")
    dev_features, dev_dataset = squad_convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1)

    # Initialize config and model
    config = AutoConfig.from_pretrained(model_prefix, output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix,
                                                          config=config)

    # The batches are moved to `device` below, so the model has to live there
    # too before the forward pass.
    model.to(device)

    # multi-gpu evaluate
    model = torch.nn.DataParallel(model)
    model.eval()

    # Load probe for each layer
    print("Loading probes")
    probes = []
    for i in range(layers):
        p = Probe(hidden_dim)
        p.load(probes_dir, i + 1, device)
        probes.append(p)

    # Extract IDs
    print("Extracting dev IDs")
    q_ids = [example.qas_id for example in dev_examples]
    n = len(dev_examples)

    # Initialize dev data loader
    eval_sampler = SequentialSampler(dev_dataset)
    eval_dataloader = DataLoader(dev_dataset,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    # Initialize predictions
    predictions = []
    for i in range(layers):
        pred = pd.DataFrame()
        pred['Id'] = q_ids
        pred['Predicted'] = [""] * n
        pred['Question'] = [""] * n
        pred['Score'] = [0] * n
        predictions.append(pred)

    # List to keep track of how many unique questions we've seen in each df, questions with
    # contexts longer than max seq len get split into multiple features based on doc_stride
    # a good alternative we may implement later is recording for all features, then simplifying with groupby and max
    # e.g. something like df.sort_values('Score', ascending=False).drop_duplicates(['Question'])
    question_ids = [0] * layers

    # Evaluation batches
    print("Predicting on dev set")
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # Distil does not use token type ids. The model identifier is the
            # `model_prefix` argument; the original referenced an undefined
            # `model_dir` here.
            if "distil" in model_prefix:
                inputs.pop('token_type_ids')

            # ALBERT/BERT/Distilibert forward pass
            idx = batch[3]
            outputs = model(**inputs)
            attention_hidden_states = outputs[2][1:]

            # Compute prediction on eval indices
            for j in range(len(idx)):
                # Extract tokens for the current batch
                tokens = tokenizer.convert_ids_to_tokens(batch[0][j])

                # Find where context starts and ends, since we want to predict in context
                context_start = int(max_seq_length - torch.argmax(
                    torch.flip(batch[2][j], [0])).item()) - 1
                context_end = int(torch.argmax(batch[2][j]).item())

                # Find the question, starting right after [CLS] and subtracting 1 to chop off the [SEP] token
                question_start = 1
                question_end = context_start
                question = tokenizer.convert_tokens_to_string(
                    tokens[question_start:question_end - 1])

                # For each layer ...
                for i, p in enumerate(probes):
                    # Extract predicted indicies
                    score, start_idx, end_idx = p.predict(
                        attention_hidden_states[i][j].unsqueeze(0),
                        device,
                        threshold=0,
                        context_start=context_start,
                        context_end=context_end)
                    start_idx = int(start_idx[0])
                    end_idx = int(end_idx[0])

                    # Extract predicted answer, converting start tokens to empty strings (no answer)
                    answer = tokenizer.convert_tokens_to_string(
                        tokens[start_idx:end_idx + 1])
                    if answer == '[CLS]':
                        answer = ''

                    # Check if the question is the same as the last one, if it is go back to the last question id and keep the higher score.
                    # If the question is not already in the dataframe, then assign it to the dataframe.
                    # Note we first handle the case where there are no prior questions by storing since we know there are no duplicates
                    if question_ids[i] == 0:
                        predictions[i].loc[question_ids[i], 'Question'] = question
                        predictions[i].loc[question_ids[i], 'Predicted'] = answer
                        predictions[i].loc[question_ids[i], 'Score'] = score
                    elif (predictions[i].loc[int(question_ids[i] - 1),
                                             'Question'] == question):
                        question_ids[i] -= 1
                        old_score = predictions[i].loc[question_ids[i], 'Score']
                        if score > old_score:
                            predictions[i].loc[question_ids[i], 'Predicted'] = answer
                            predictions[i].loc[question_ids[i], 'Score'] = score
                    else:
                        predictions[i].loc[question_ids[i], 'Question'] = question
                        predictions[i].loc[question_ids[i], 'Predicted'] = answer
                        predictions[i].loc[question_ids[i], 'Score'] = score

                    # Increment to new question id (note, for duplicate answers this gets us back to where we were)
                    question_ids[i] += 1

    # Save predictions for each layer
    print("Saving predictions")
    # makedirs also creates missing parent directories and tolerates an
    # already-existing target.
    os.makedirs(preds_dir, exist_ok=True)

    for i, pred in enumerate(predictions):
        pred[['Id', 'Predicted']].to_csv(preds_dir + "/layer_" + str(i + 1) + ".csv",
                                         index=False)
def load_and_cache_examples(data_dir: Path,
                            tokenizer,
                            task,
                            max_seq_length,
                            doc_stride,
                            max_query_length,
                            evaluate=False,
                            model_name=None):
    """Download a SQuAD split if needed, featurize it, and cache the result.

    Args:
        data_dir: Directory for the downloaded JSON and the feature cache
            (created if missing).
        tokenizer: Tokenizer passed to ``squad_convert_examples_to_features``.
        task: ``"SQuAD1.1"`` or ``"SQuAD2.0"``.
        max_seq_length: Passed through to feature conversion.
        doc_stride: Passed through to feature conversion.
        max_query_length: Passed through to feature conversion.
        evaluate: Use the validation split instead of the training split.
        model_name: Included in the cache file name.

    Returns:
        ``(dataset, examples, features)``.

    Raises:
        NameError: if ``task`` is not one of the supported datasets.
    """
    if task == "SQuAD1.1":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif task == "SQuAD2.0":
        train_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/train-v2.0-short.json"
        validation_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/dev-v2.0-short.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    # exist_ok avoids a race between the existence check and the creation.
    data_dir.mkdir(parents=True, exist_ok=True)

    source_url, local_file = ((validation_url, validation_file)
                              if evaluate else (train_url, train_file))

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        str(data_dir.absolute()),
        "cache_{}_{}".format(
            "dev" if evaluate else "train",
            model_name,
        ),
    )

    # Init features and dataset from cache if it exists
    overwrite_cache = False  # Set to True to do a cache wipe (TODO: Make cache wipe configurable)
    if os.path.exists(cached_features_file) and not overwrite_cache:
        # The path is interpolated explicitly: print() does not apply
        # logging-style "%s" arguments.
        print("Loading features from cached file %s" % cached_features_file)
        # NOTE: torch.load unpickles the file; only load caches you trust.
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        # The raw JSON is only needed to build features, so it is fetched only
        # on a cache miss. The response is written as bytes: the processors
        # read it back as UTF-8, and a text-mode write would have used the
        # platform's locale encoding instead.
        with urllib.request.urlopen(source_url) as response:
            payload = response.read()
        (data_dir / local_file).write_bytes(payload)

        if evaluate:
            examples = processor.get_dev_examples(data_dir, filename=validation_file)
        else:
            examples = processor.get_train_examples(data_dir, filename=train_file)
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
        )
        print("Saving features into cached file %s" % cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples},
                   cached_features_file)
    return dataset, examples, features
def _write_language_filtered_train_file(data_dir, train_file, languages, keep, tag):
    """Write a copy of a SQuAD-format file filtered by question language.

    The language of an entry is the prefix (before the first ``-``) of the id
    of its first question. With ``keep`` true, only entries whose language is
    in ``languages`` are written; otherwise those entries are dropped.

    Returns:
        ``(filename, kept_count, left_out_count)``. ``filename`` is relative
        to ``data_dir`` and is built from the stem of ``train_file`` plus
        ``tag + language`` for each language.
    """
    with open(os.path.join(data_dir, train_file), "r", encoding="utf-8") as reader:
        input_data = json.load(reader)

    selected = set(languages)
    # Carry over every top-level key other than the entries themselves.
    filtered = {"data": []}
    filtered.update({key: value for key, value in input_data.items() if key != "data"})

    kept = left_out = 0
    for entry in input_data["data"]:
        # only one paragraph per entry; a single question is sufficient to
        # determine the language
        lang = entry["paragraphs"][0]["qas"][0]["id"].split("-")[0]
        if (lang in selected) == keep:
            filtered["data"].append(entry)
            kept += 1
        else:
            left_out += 1

    filename = (os.path.splitext(train_file)[0]
                + "".join(tag + lang for lang in languages) + ".json")
    with open(os.path.join(data_dir, filename), "w", encoding="utf-8") as writer:
        json.dump(filtered, writer)
    return filename, kept, left_out


def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD-format dataset, reading or writing an on-disk cache.

    On a cache miss the examples come from ``tensorflow_datasets`` (when no
    data directory and no file for the requested split are given) or from the
    SQuAD v1/v2 processor. For training, ``args.leave_out_languages`` and
    ``args.train_on_languages`` (comma-separated) first produce a
    language-filtered copy of the training file, and ``args.train_file`` is
    updated to point at that copy.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: if ``tensorflow_datasets`` is needed but not installed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    # NOTE(review): the cache name does not encode the language filters, so
    # runs with different filters share one cache file -- confirm intended.
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles the file; only load caches you trust.
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples,
                                                                    evaluate=evaluate)
        else:
            processor = (SquadV2Processor()
                         if args.version_2_with_negative else SquadV1Processor())

            if evaluate:
                # Language filtering only concerns the training file, so it is
                # skipped here; running it during evaluation crashed whenever
                # no train file was configured.
                examples = processor.get_dev_examples(args.data_dir,
                                                      filename=args.predict_file)
            else:
                # Tydi specific
                # data_dir may be unset when only file paths are given.
                data_dir = args.data_dir or ""
                source_train_file = args.train_file
                tmp_filename = None

                if args.leave_out_languages is not None:
                    logger.info("Creating temporary training file at %s", args.data_dir)
                    tmp_filename, _, left_out_count = _write_language_filtered_train_file(
                        data_dir,
                        source_train_file,
                        args.leave_out_languages.split(','),
                        keep=False,
                        tag='-',
                    )
                    logger.info("No. of training examples left out %d", left_out_count)

                if args.train_on_languages is not None:
                    logger.info("Creating temporary training file at %s", args.data_dir)
                    tmp_filename, keep_count, left_out_count = _write_language_filtered_train_file(
                        data_dir,
                        source_train_file,
                        args.train_on_languages.split(','),
                        keep=True,
                        tag='-keep-',
                    )
                    logger.info("No. of training examples left out %d", left_out_count)
                    logger.info("No. of training examples kept %d", keep_count)

                if tmp_filename is not None:
                    # Kept from the original: args.train_file is redirected to
                    # the filtered copy (the last one written when both
                    # filters are given).
                    args.train_file = tmp_filename
                examples = processor.get_train_examples(args.data_dir,
                                                        filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples},
                       cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def data_save(args, tokenizer, evaluate=False):
    """Featurize a SQuAD split and push it, batch by batch, through a ``DataSaver``.

    Args:
        args: Namespace providing ``version_2_with_negative``,
            ``tokenizer_name``, ``doc_stride``, ``max_seq_length``,
            ``max_query_length``, ``threads`` and the ``train_file`` /
            ``predict_file`` path.
        tokenizer: Tokenizer passed to ``squad_convert_examples_to_features``.
        evaluate: Save the evaluation split (and the serialized features and
            examples) instead of the training split.
    """
    # NOTE(review): the endpoint and credentials are hard-coded.
    data_config = DataConfig(
        endpoint="127.0.0.1:9000",
        access_key="minio",
        secret_key="miniosecretkey",
        dataset_name="SQuAD1.1" if not args.version_2_with_negative else "SQuAD2.0",
        additional={
            "mode": "train" if not evaluate else "test",
            "framework": "pytorch",
            "version": 1.1 if not args.version_2_with_negative else 2.0,
            "model_name": args.tokenizer_name,
            "doc_stride": args.doc_stride,
            "max_seq_length": args.max_seq_length,
            "max_query_length": args.max_query_length,
        },
        attributes=train_attributes if not evaluate else eval_attributes,
    )

    processor = (SquadV2Processor()
                 if args.version_2_with_negative else SquadV1Processor())
    if evaluate:
        examples = processor.get_dev_examples(None, filename=args.predict_file)
    else:
        examples = processor.get_train_examples(None, filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    # Field names in the order the batch tensors are indexed for each split.
    if not evaluate:
        field_names = (
            "all_input_ids",
            "all_attention_masks",
            "all_token_type_ids",
            "all_start_positions",
            "all_end_positions",
            "all_cls_index",
            "all_p_mask",
            "all_is_impossible",
        )
    else:
        field_names = (
            "all_input_ids",
            "all_attention_masks",
            "all_token_type_ids",
            "all_feature_index",
            "all_cls_index",
            "all_p_mask",
        )

    data_saver = DataSaver(config=data_config)
    try:
        dataloader = DataLoader(dataset, batch_size=8, num_workers=args.threads)
        for batch in tqdm(dataloader):
            data_saver({name: batch[i] for i, name in enumerate(field_names)})

        # NOTE(review): the flattened source does not show whether this upload
        # sat inside the evaluation branch; it is placed there -- confirm
        # against the original layout.
        if evaluate:
            # NamedTemporaryFile creates each file atomically, unlike the
            # deprecated and race-prone tempfile.mktemp. delete=False keeps
            # the files on disk for the saver, as before.
            with tempfile.NamedTemporaryFile(suffix="features",
                                             delete=False) as features_file:
                torch.save(features, features_file)
            with tempfile.NamedTemporaryFile(suffix="examples",
                                             delete=False) as examples_file:
                torch.save(examples, examples_file)
            data_saver({
                "features": features_file.name,
                "examples": examples_file.name
            },
                       filetype=True)
    finally:
        # Always release the connection, even when saving fails midway.
        data_saver.disconnect()
def load_combined_examples(args, evaluate=False):
    """Build a dataset of stacked per-model logits for ensembling. Deprecated sadly.

    Loads the SQuAD examples, then reads ``saved_data_dev.pkl`` or
    ``saved_data_train.pkl`` from ``args.saved_processed_data_dir``. Each
    pickle holds ``[features, all_results, tokenizer]``.

    Returns:
        ``(examples, features, dataset, tokenizer, n_models)``. ``dataset``
        holds the stacked start/end logits plus example indices (evaluation)
        or gold start/end positions (training). ``n_models`` is the number of
        result sets in the pickle.

    Raises:
        ImportError: if ``tensorflow_datasets`` is needed but not installed.
        AssertionError: if any of the hard-coded sanity checks fails.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError as err:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            ) from err

        if args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD.")
        # NOTE(review): the flattened source does not show whether this
        # warning sat inside the `if` above; it is emitted for the whole
        # tensorflow_datasets path -- confirm.
        logger.warning("Something went wrong!")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = (SquadV2Processor()
                     if args.version_2_with_negative else SquadV1Processor())
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir,
                                                  filename=args.predict_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'In what country is Normandy located?', 'Invalid dev file!'
        else:
            # Normal get train examples
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'When did Beyonce start becoming popular?', 'Invalid train file!'

    assert args.saved_processed_data_dir, 'args.saved_processed_data_dir not defined!'
    ensemble_dir = args.saved_processed_data_dir

    saved_name = 'saved_data_dev.pkl' if evaluate else 'saved_data_train.pkl'
    # NOTE: pickle.load executes arbitrary code from the file; only point
    # saved_processed_data_dir at data you produced yourself.
    with open(os.path.join(ensemble_dir, saved_name), 'rb') as f:
        saved_data = pickle.load(f)

    # saved_data: [features, all_results, tokenizer]
    features, combined_all_results, tokenizer = saved_data

    # The message is a plain string: the original passed print(...), which
    # returns None and left the AssertionError without a message.
    assert np.array_equal(
        [f.start_position for f in features[0]],
        [f.start_position for f in features[1]]
    ), "Models are expected to be of the same family and share the same features"

    # Same family same feature and tokenizer, so we pick the first one
    features = features[0]
    tokenizer = tokenizer[0]

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)

    all_predict_start_logits = []
    all_predict_end_logits = []
    for all_results in combined_all_results:
        all_predict_start_logits.append(
            torch.tensor([s.start_logits for s in all_results],
                         dtype=torch.float))
        all_predict_end_logits.append(
            torch.tensor([s.end_logits for s in all_results],
                         dtype=torch.float))

    # Stack along a new leading axis, then swap it with the following axis.
    all_predict_start_logits = torch.stack(all_predict_start_logits).permute(
        1, 0, 2)
    all_predict_end_logits = torch.stack(all_predict_end_logits).permute(
        1, 0, 2)

    if evaluate:
        all_example_indices = torch.arange(all_input_ids.size(0),
                                           dtype=torch.long)
        dataset = TensorDataset(all_predict_start_logits,
                                all_predict_end_logits, all_example_indices)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_predict_start_logits,
                                all_predict_end_logits, all_start_positions,
                                all_end_positions)

    # Sanity check on the hard-coded example count expected for each split.
    if evaluate:
        assert len(examples) == 6078
    else:
        assert len(examples) == 130319

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    return examples, features, dataset, tokenizer, len(combined_all_results)
def __init__(
    self,
    args,
    tokenizer: AutoTokenizer,
    mode: Optional[str] = "train",
    is_language_sensitive: Optional[bool] = False,
    cache_dir: Optional[str] = None,
    dataset_format: Optional[str] = "pt",
    threads: Optional[int] = 8,
    debug: Optional[bool] = False,
):
    """Load SQuAD features for ``mode`` from cache, or build and cache them.

    Args:
        args: Namespace providing ``version_2_with_negative``,
            ``max_seq_length``, ``doc_stride``, ``max_query_length``,
            ``data_dir`` and ``overwrite_cache``.
        tokenizer: Tokenizer used for feature conversion; its class name is
            part of the cache file name.
        mode: ``"dev"`` loads the dev examples; any other value loads the
            training examples.
        is_language_sensitive: Stored on the instance.
        cache_dir: Directory for the cache file; defaults to ``args.data_dir``.
        dataset_format: ``return_dataset`` value for feature conversion.
        threads: Worker count for feature conversion.
        debug: Print the first three examples after loading.

    Raises:
        ValueError: if an existing cache file lacks the ``dataset`` or
            ``examples`` entry.
    """
    self.args = args
    self.tokenizer = tokenizer
    self.is_language_sensitive = is_language_sensitive
    self.processor = (SquadV2Processor()
                      if args.version_2_with_negative else SquadV1Processor())
    self.mode = mode
    self.debug = debug
    self.threads = threads

    self.max_seq_length = self.args.max_seq_length
    self.doc_stride = self.args.doc_stride
    self.max_query_length = self.args.max_query_length

    # dataset format configurations
    self.column_names = ["id", "title", "context", "question", "answers"]
    self.question_column_name = ("question" if "question" in self.column_names
                                 else self.column_names[0])
    self.context_column_name = ("context" if "context" in self.column_names
                                else self.column_names[1])
    self.answer_column_name = ("answers" if "answers" in self.column_names
                               else self.column_names[2])

    # Padding side determines if we do (question|context) or (context|question).
    self.pad_on_right = tokenizer.padding_side == "right"

    # load data features from cache or dataset file
    version_tag = "v2" if args.version_2_with_negative else "v1"
    cache_root = cache_dir if cache_dir is not None else args.data_dir
    cached_features_file = os.path.join(
        cache_root,
        "cached_{}_{}_{}_{}".format(mode, tokenizer.__class__.__name__,
                                    str(args.max_seq_length), version_tag))
    self.cached_data_file = cached_features_file

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        # NOTE: torch.load unpickles the file; only load caches you trust.
        self.old_features = torch.load(cached_features_file)

        # legacy cache files have only features,
        # which new cache files will have dataset and examples also.
        self.features = self.old_features["features"]
        self.dataset = self.old_features.get("dataset", None)
        self.examples = self.old_features.get("examples", None)

        if self.dataset is None or self.examples is None:
            raise ValueError(
                f"Cached file {cached_features_file} is in a legacy format without "
                "'dataset' and 'examples'; delete it or set overwrite_cache to rebuild it."
            )
    else:
        # Create the cache directory up front so a missing directory does not
        # make torch.save fail only after the featurization work is done.
        if cache_root:
            os.makedirs(cache_root, exist_ok=True)

        if self.mode == "dev":
            self.examples = self.processor.get_dev_examples(args.data_dir)
        else:
            self.examples = self.processor.get_train_examples(args.data_dir)

        if self.debug:
            print(f"DEBUG INFO -> already load {self.mode} data ...")
            print("DEBUG INFO -> show 2 EXAMPLES ...")
            for idx, data_examples in enumerate(self.examples):
                # Stop after the examples that are shown instead of walking
                # the whole list.
                if idx > 2:
                    break
                # data_examples should be an object of transformers.data.processors.squad.SquadExample
                print(f"DEBUG INFO -> {idx}, {data_examples}")
                print(f"{idx} qas_id -> {data_examples.qas_id}")
                print(f"{idx} question_text -> {data_examples.question_text}")
                print(f"{idx} context_text -> {data_examples.context_text}")
                print(f"{idx} answer_text -> {data_examples.answer_text}")
                print("-*-" * 10)

        self.features, self.dataset = squad_convert_examples_to_features(
            examples=self.examples,
            tokenizer=tokenizer,
            max_seq_length=self.max_seq_length,
            doc_stride=self.doc_stride,
            max_query_length=self.max_query_length,
            is_training=mode == "train",
            threads=self.threads,
            return_dataset=dataset_format,
        )

        torch.save(
            {
                "features": self.features,
                "dataset": self.dataset,
                "examples": self.examples
            },
            cached_features_file,
        )
def get_evaluation_metrics(
    model,
    data_dir: str,
    filename: str,
    batch_size: int = 32,
    num_batches: int = None,
) -> Dict[str, "Number"]:
    """Score ``model`` on a SQuAD v2 dev file and return the metrics.

    The dev examples, raw features and batched dataset are all built from
    ``data_dir``/``filename``. The model is run through
    ``get_squad_results``, the logits are turned into text predictions, and
    ``squad_evaluate`` produces the final scores.

    The result is an OrderedDict in the format:
    {
        'exact': 0.8169797018445212,
        'f1': 4.4469722448269335,
        'total': 11873,
        'HasAns_exact': 0.15182186234817813,
        'HasAns_f1': 7.422216845956518,
        'HasAns_total': 5928,
        'NoAns_exact': 1.4802354920100924,
        'NoAns_f1': 1.4802354920100924,
        'NoAns_total': 5945,
        'best_exact': 50.07159100480081,
        'best_exact_thresh': 0.0,
        'best_f1': 50.0772059855695,
        'best_f1_thresh': 0.0
    }
    """
    # Processor, examples and raw features are not used for inference; they
    # are only needed for scoring in `compute_predictions_logits()`.
    squad_processor = SquadV2Processor()
    tokenizer = get_tokenizer()
    examples = squad_processor.get_dev_examples(data_dir, filename=filename)

    # Both get_dataset() calls share everything except return_raw_features.
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        processor=squad_processor,
        data_dir=data_dir,
        filename=filename,
        batch_size=batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
    )
    features = get_dataset(return_raw_features=True, **dataset_kwargs)
    # return_raw_features=False yields the dataset instead of the features.
    dataset = get_dataset(return_raw_features=False, **dataset_kwargs)

    squad_results = get_squad_results(
        model=model,
        dataset=dataset,
        features=features,
        batch_size=batch_size,
        num_batches=num_batches,
    )

    # Prediction files are off by default; flip the flag to write them.
    write_prediction_files = False
    output_predictions_file = None
    output_nbest_file = None
    output_null_log_odds_file = None
    if write_prediction_files:
        output_predictions_file = f"/fsx/{args.checkpoint}_predictions.json"
        output_nbest_file = f"/fsx/{args.checkpoint}_nbest_predictions.json"
        output_null_log_odds_file = f"/fsx/{args.checkpoint}_null_odds.json"

    predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=squad_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=output_predictions_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )
    return squad_evaluate(examples, predictions)
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            use_aug_path=False,
                            output_examples=False
                            ) -> torch.utils.data.TensorDataset:
    """Loads SQuAD-like data features from dataset file (or cache)

    Parameters
    ----------
    args : kitanaqa.trainer.arguments.ModelArguments
        A set of arguments related to the model. Specifically, the following
        arguments are used in this function:
        - args.train_file_path : str
            Path to the training data file
        - args.do_aug : bool
            Flag to specify whether to use the augmented training set. If
            True, will be merged with the original training set specified in
            train_file_path. The default value is False.
        - args.aug_file_path : str
            Path for augmented train dataset
        - args.data_dir : str
            Path for data files
        - args.model_name_or_path : str
            Path to pretrained model or model identifier from
            huggingface.co/models
        - args.max_seq_length : Optional[int]
            Max length for the input tokens, specified to the Transformer
            model defined in `model_name_or_path`
        - args.overwrite_cache : Bool
            Overwrite cached data on load
        - args.predict_file_path : Dict[str, str]
            Paths for eval datasets, where the key is the data file tag, and
            the value is the data file path. Multiple file paths may be given
            for evaluation, and each will be cached and loaded separately.
        - args.version_2_with_negative : Bool
            Flag that specifies to use the SQuAD v2.0 preprocessors. The
            default value is False.
        - args.doc_stride : Optional[int]
            Corresponds to the doc_stride input param for some Huggingface
            Transformer models.
        - args.max_query_length : Optional[int]
            Max length for the query segment in the Transformer model input.
    tokenizer :
        The Transformer model tokenizer used to preprocess the data.
    evaluate : Optional(Bool)
        A flag to set the trainer task to either train or evaluate. The
        default value is False.
    use_aug_path : Optional(Bool)
        A flag to define whether to use the aug_file_path or the
        train_file_path. If True, the augmented data path is used when
        loading and caching the data.
    output_examples : Optional(Bool)
        A flag to define whether the examples and features should be returned
        by the data preprocessor. If False, the preprocessor only returns the
        dataset. This is necessary if the Trainer is used for evaluation or
        in a pipeline where training is followed by evaluation.

    Returns
    -------
    torch.utils.data.TensorDataset
        The dataset containing the data to be used for training or
        evaluation.

    Important Notes:
        - If the output_examples is True, examples and features also are
          returned.
        - If evaluate = True, the output will be a dictionary for which the
          keys are the name of the datasets used for evaluation and the
          values are the dataset (and optionally the examples and features).
    """
    if not args.train_file_path and not (args.do_aug and args.aug_file_path):
        # Only reported, not raised: the function still proceeds.
        logging.error(
            'load_and_cache_examples requires one of either \"train_file_path\", \"aug_file_path\"'
        )

    # Use the augmented data or the original training data
    train_or_aug_path = args.train_file_path if not use_aug_path else args.aug_file_path

    input_dir = args.data_dir if args.data_dir else "."
    # NOTE(review): the cache name depends only on train/dev, the model name
    # and max_seq_length, so the original and augmented training sets (and
    # different sets of eval files) share one cache file.
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file_path) or
                                  (not evaluate and not train_or_aug_path)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            # NOTE(review): with evaluate=True this yields a plain sequence of
            # examples, while the evaluation branch below iterates
            # `examples.items()` - confirm this path is ever used for eval.
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            if evaluate:
                # when does it concatenate if eval and train are both true?
                examples = {}
                processor = (AlumSquadV2Processor()
                             if args.version_2_with_negative else
                             AlumSquadV1Processor())
                for predict_sets, predict_paths in args.predict_file_path.items():
                    examples[predict_sets] = processor.alum_get_dev_examples(
                        args.data_dir, filename=predict_paths)
                    logger.info("Evaluation Data is fetched for %s.",
                                predict_sets)
            else:
                processor = (SquadV2Processor()
                             if args.version_2_with_negative else
                             SquadV1Processor())
                examples = processor.get_train_examples(
                    args.data_dir, filename=train_or_aug_path)

        if not evaluate:
            features, dataset = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=not evaluate,
                return_dataset="pt",
            )
        else:
            # TODO: Incremental Cache - The current version will cache all
            # the eval files together.
            features, dataset = {}, {}
            for predict_sets, example in examples.items():
                features[predict_sets], dataset[predict_sets] = (
                    alum_squad_convert_examples_to_features(
                        examples=example,
                        tokenizer=tokenizer,
                        max_seq_length=args.max_seq_length,
                        doc_stride=args.doc_stride,
                        max_query_length=args.max_query_length,
                        return_dataset="pt",
                    ))
                logger.info(
                    "Feature Extraction for Evaluation Data from %s is Finished.",
                    predict_sets)

        # Train and eval results are stored in the same cache layout.
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples,
            },
            cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def run_prediction(model, question_texts, context_text):
    """Answer every question in ``question_texts`` against ``context_text``.

    ``model`` is a wrapper exposing ``.model`` (the QA network) and
    ``.tokenizer``. Each question becomes a ``SquadExample`` whose
    ``qas_id`` is its index as a string. Prediction files are written
    under ``predictions/``.

    Relies on the module-level settings ``n_best_size``,
    ``max_answer_length``, ``do_lower_case`` and
    ``null_score_diff_threshold``.

    Returns:
        Whatever ``compute_predictions_logits`` returns for the examples.
    """
    tokenizer = model.tokenizer
    model = model.model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference only: switch to eval mode once rather than on every batch.
    model.eval()

    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        ) for i, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            # batch[3] holds, per row, the index into `features`.
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Row i of every output tensor; expected to be exactly the
            # start and end logits.
            start_logits, end_logits = [to_list(out[i]) for out in outputs]
            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("predictions", exist_ok=True)
    output_prediction_file = "predictions/predictions.json"
    output_nbest_file = "predictions/nbest_predictions.json"
    output_null_log_odds_file = "predictions/null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    return predictions
def run_prediction(question_texts, context_text, model_path):
    """Load a QA model from ``model_path`` and answer the given questions.

    Each question is paired with the same ``context_text`` and becomes a
    ``SquadExample`` whose ``qas_id`` is its index as a string. No
    prediction files are written.

    Args:
        question_texts: Iterable of question strings.
        context_text: The passage every question is asked against.
        model_path: Path or identifier accepted by ``from_pretrained``.

    Returns:
        Whatever ``compute_predictions_logits`` returns for the examples.
    """
    # Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    # NOTE(review): the tokenizer below is loaded with do_lower_case=True
    # while post-processing uses do_lower_case=False - confirm intent.
    do_lower_case = False
    null_score_diff_threshold = 0.0

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, do_lower_case=True, use_fast=False)
    model = AutoModelForQuestionAnswering.from_pretrained(model_path,
                                                          config=config)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference only: switch to eval mode once rather than on every batch.
    model.eval()

    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        ) for i, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            # batch[3] holds, per row, the index into `features`.
            example_indices = batch[3]
            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Row i of every output tensor; expected to be exactly the
            # start and end logits.
            start_logits, end_logits = [
                to_list(out[i]) for out in outputs.to_tuple()
            ]
            all_results.append(
                SquadResult(unique_id, start_logits, end_logits))

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=tokenizer,
    )
    return final_predictions
def load_and_cache_examples(args,
                            tokenizer,
                            prefix,
                            evaluate=False,
                            output_examples=False,
                            gpt=False):
    """Return the train or dev split carved out of the SQuAD2.0 training file.

    The training file is split 90% / 10% into train / dev. Each split is
    converted to features and a dataset and cached in its own file. When
    the cache for the requested split exists (and ``args.overwrite_cache``
    is unset) it is loaded; otherwise both splits are rebuilt and re-cached.

    ``evaluate`` selects the dev split. With ``output_examples`` the result
    is ``(dataset, examples, features)`` instead of only the dataset.
    """
    input_dir = args.data_dir if args.data_dir else "."
    model_tag = list(filter(None, args.model_name.split("/"))).pop()
    length_tag = str(args.max_seq_length)
    train_cache_path = os.path.join(
        input_dir,
        "cached_{}_{}_{}_train".format(prefix, model_tag, length_tag))
    dev_cache_path = os.path.join(
        input_dir,
        "cached_{}_{}_{}_dev".format(prefix, model_tag, length_tag))

    # Fast path: the split that was asked for is already cached.
    wanted_cache_path = dev_cache_path if evaluate else train_cache_path
    if os.path.exists(wanted_cache_path) and not args.overwrite_cache:
        logger.info(
            "Loading features from cached file: {}".format(wanted_cache_path))
        cached = torch.load(wanted_cache_path)
        split_features = cached["features"]
        split_ds = cached["dataset"]
        split_examples = cached["examples"]
        if output_examples:
            return split_ds, split_examples, split_features
        return split_ds

    # Slow path: preprocess examples into features for both splits.
    logger.info("Creating features from dataset file at %s", input_dir)
    if not args.data_dir and not args.train_file:
        raise ImportError(
            "Please specify --data_dir or {}".format('--train_file'))

    all_examples = SquadV2Processor().get_train_examples(
        args.data_dir, filename=args.train_file)
    boundary = len(all_examples) * 9 // 10

    def featurize_and_cache(split_examples, cache_path, save_message):
        # Convert one split, wrap it as a dataset and store all three parts.
        split_features = squad_convert_examples_to_features(
            examples=split_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True,
            threads=args.threads,
            gpt=gpt)
        split_ds = feats_to_ds(split_features)
        logger.info(save_message, cache_path)
        torch.save(
            {
                "features": split_features,
                "dataset": split_ds,
                "examples": split_examples,
            }, cache_path)
        return split_ds, split_examples, split_features

    train_bundle = featurize_and_cache(
        all_examples[:boundary], train_cache_path,
        "Saving train features into cached file %s")
    dev_bundle = featurize_and_cache(
        all_examples[boundary:], dev_cache_path,
        "Saving dev features into cached file %s")

    chosen_bundle = dev_bundle if evaluate else train_bundle
    if output_examples:
        return chosen_bundle
    return chosen_bundle[0]
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Load SQuAD features from the cache, or build and cache them.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
            ``predict_file``, ``train_file``, ``version_2_with_negative``,
            ``doc_stride``, ``max_query_length`` and ``threads``.
        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
        evaluate: Use the dev/predict data instead of the training data.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If the data has to come from ``tensorflow_datasets``
            and that package is not installed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process
        # the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = (SquadV2Processor() if args.version_2_with_negative
                         else SquadV1Processor())
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        # Only the first (or the only) process writes the cache file.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples,
                }, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process
        # the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
reporthook=t.update_to) else: # Simple download with no progress bar urllib.request.urlretrieve(url, output_path) for d in ('train', 'dev', 'test'): url = 'https://github.com/chrischute/squad/data/{}-v2.0.json'.format(d) output_path = url_to_data_path(url) if not os.path.exists(output_path): print(f'Downloading {d}...') download_url(url, output_path) print("Creating features from dataset file at {}".format(input_dir)) processor = SquadV2Processor( ) if version_2_with_negative else SquadV1Processor() for d in ('train', 'dev', 'test'): evaluate = d != 'train' cached_features_file = '.data/cached_{}'.format(d) examples = processor.get_dev_examples(data_dir, filename='{}-v2.0.json'.format(d)) features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=not evaluate, return_dataset="pt", threads=threads,
from transformers import (WEIGHTS_NAME, BertConfig, BertForQuestionAnswering, BertTokenizer) from torch.utils.data import (DataLoader, SequentialSampler) # Load pretrained model and tokenizer config_class, model_class, tokenizer_class = (BertConfig, BertForQuestionAnswering, BertTokenizer) config = config_class.from_pretrained(model_name_or_path, cache_dir=cache_dir) tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=cache_dir) model = model_class.from_pretrained(model_name_or_path, from_tf=False, config=config, cache_dir=cache_dir) # load_and_cache_examples from transformers.data.processors.squad import SquadV2Processor processor = SquadV2Processor() examples = processor.get_dev_examples(None, filename=predict_file) from transformers import squad_convert_examples_to_features features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, is_training=False, return_dataset='pt' ) cached_features_file = os.path.join(cache_dir, 'cached_{}_{}_{}'.format( 'dev',
def run_squad_and_get_results(
    model: tf.keras.Model,  # Must be QuestionAnswering model, not PreTraining
    tokenizer: PreTrainedTokenizer,
    run_name: str,
    filesystem_prefix: str,
    per_gpu_batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    evaluate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
) -> Dict:
    """Finetune ``model`` on SQuAD and return the last evaluation metrics.

    Training runs for ``total_steps`` steps. On Horovod rank 0 the function
    also validates, evaluates, checkpoints and writes TensorBoard summaries
    at the given frequencies, and always on the final step.

    Args:
        model: Question-answering Keras model to finetune.
        tokenizer: Tokenizer used to build the datasets.
        run_name: Name used in checkpoint and log paths.
        filesystem_prefix: Root under which ``squad_data``, ``checkpoints``
            and ``logs`` live; a ``/opt/ml`` prefix disables the progress bar.
        per_gpu_batch_size: Batch size passed to ``get_dataset``.
        checkpoint_frequency: Steps between checkpoints; falsy means 1000000.
        validate_frequency: Steps between validations; falsy means 1000000.
        evaluate_frequency: Steps between evaluations; falsy means 1000000.
        learning_rate: Peak learning rate of the warmup/decay schedule.
        warmup_steps: Warmup steps of the schedule.
        total_steps: Total number of training steps.
        dataset: One of ``"squadv1"``, ``"squadv2"``, ``"debug"``.
        dummy_eval: Use fixed placeholder metrics instead of evaluating.

    Returns:
        The metrics dict of the last evaluation on rank 0; other ranks
        return ``None``.

    Raises:
        AssertionError: If ``dataset`` is not one of the supported names.
    """
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000
    evaluate_frequency = evaluate_frequency or 1000000
    is_sagemaker = filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        # Raised explicitly (same exception type as the former `assert`) so
        # the check is not stripped when Python runs with -O.
        raise AssertionError("--dataset must be one of ['squadv1', 'squadv2', 'debug']")

    data_dir = os.path.join(filesystem_prefix, "squad_data")

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        logger.info(f"Starting finetuning on {dataset}")
        # `total` must be given by keyword: the first positional argument of
        # tqdm is the iterable, which left the bar without a total.
        pbar = tqdm.tqdm(total=total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            per_gpu_batch_size=per_gpu_batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    global train_step
    train_step = rewrap_tf_function(train_step)

    for step, batch in enumerate(train_dataset):
        # Rebinds the `learning_rate` parameter to the current scheduled
        # value; it is only used for the summary below.
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = ((step > 0) and step % checkpoint_frequency == 0) or is_final_step
            do_validate = ((step > 0) and step % validate_frequency == 0) or is_final_step
            do_evaluate = ((step > 0) and step % evaluate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                logger.info("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                logger.info(description)

            if do_evaluate:
                logger.info("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model,
                        tokenizer=tokenizer,
                        data_dir=data_dir,
                        filename=val_filename,
                        per_gpu_batch_size=32,
                    )
                print_eval_metrics(results=results, step=step, dataset=dataset)

            if do_checkpoint:
                # TODO: Abstract out to specify any checkpoint path
                checkpoint_path = os.path.join(
                    filesystem_prefix, f"checkpoints/squad/{run_name}-step{step}.ckpt"
                )
                logger.info(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                # TODO: Abstract out to specify any logs path
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(filesystem_prefix, f"logs/squad/{run_name}")
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                # Eval metrics are written only on steps where evaluation
                # actually ran, so `results` is always fresh and defined.
                if do_evaluate:
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step, dataset=dataset
                    )

        if is_final_step:
            break

    del train_dataset

    # Only rank 0 evaluates, so only rank 0 has results to return.
    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished finetuning, job name {run_name}")
        return results
import json

import torch
from distilbert_squad import DISTILBERT_SQUAD
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadResult, SquadV2Processor
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer

# Training setup for the DistilBERT SQuAD model: device, TensorBoard writer,
# tokenizer, SQuAD v2 training features, data loaders, model and optimizer.

# Fall back to CPU so the script also starts on hosts without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Despite its name, `logger` is a TensorBoard SummaryWriter.
logger = SummaryWriter('logs/distilbert_model')

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
feature_processor = SquadV2Processor()
examples = feature_processor.get_train_examples('../data')
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=512,
    doc_stride=128,
    max_query_length=128,
    is_training=True,
    return_dataset="pt",
    threads=1,
)

train_loader = DataLoader(dataset=dataset, batch_size=6, shuffle=True)
# NOTE(review): dev_loader iterates the same training dataset as
# train_loader - confirm whether a separate dev set was intended.
dev_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

dbs = DISTILBERT_SQUAD().to(device)
num_epochs = 2
optimizer = torch.optim.Adam(dbs.parameters(), lr=.00003)
def train(model_prefix, model_dir, data_dir, data_file, epochs, layers,
          batch_size, hidden_dim, max_seq_length, device):
    """Train one probe per layer on the hidden states of a frozen QA model.

    The QA model is only run in eval mode under ``torch.no_grad()``; the
    probes are the only things updated. After every epoch each probe is
    saved under ``<model_dir>/epoch_<n>/probes``.

    Args:
        model_prefix: Name or path passed to ``from_pretrained`` for the
            tokenizer, config and model.
        model_dir: Output root for the probes. Its name also decides whether
            ``token_type_ids`` are dropped (when it contains "distil").
        data_dir: Directory holding the SQuAD training file.
        data_file: File name of the training data.
        epochs: Number of passes over the training features.
        layers: Number of probes to create, one per hidden layer.
        batch_size: Batch size of the training data loader.
        hidden_dim: Hidden size handed to each ``Probe``.
        max_seq_length: Maximum sequence length for feature conversion.
        device: Device the model and the batches are moved to.
    """
    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    train_examples = processor.get_train_examples(data_dir=data_dir,
                                                  filename=data_file)

    # Extract train features
    print("Loading train features")
    train_features, train_dataset = squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=True,
        return_dataset="pt",
        threads=1,
    )

    # Initialize model
    config = AutoConfig.from_pretrained(model_prefix,
                                        output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix,
                                                          config=config)

    # DataParallel requires the wrapped module's parameters to already be on
    # its source device, and the batches are moved to `device` below.
    model.to(device)

    # multi-gpu evaluate
    model = torch.nn.DataParallel(model)
    # The QA model is never trained here: set eval mode once.
    model.eval()

    # Initialize probes
    print("Initializing probes")
    probes = [Probe(hidden_dim) for _ in range(layers)]

    # Training epochs
    for epoch in range(epochs):
        print("Training epoch: {}".format(epoch + 1))

        # Initialize train data loader
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=batch_size)

        # Training batches
        for batch in tqdm(train_dataloader, desc="Iteration"):
            # Get batch on the right device and prepare input dict
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            # Distil does not use token type ids
            # NOTE(review): keyed on model_dir, not model_prefix - confirm.
            if "distil" in model_dir:
                inputs.pop('token_type_ids')

            # ALBERT/BERT/DistilBERT forward pass
            with torch.no_grad():
                outputs = model(**inputs)

            # Extract hidden states, skipping the first entry
            # presumably (layers, batch_size, max_seq_len, hidden_size)
            all_layer_hidden_states = outputs[3][1:]

            # Get labels, and update probes for batch
            start = batch[3]  # (batch_size)
            end = batch[4]  # (batch_size)
            for i, p in enumerate(probes):
                # presumably (batch_size, max_seq_len, hidden_size)
                hiddens = all_layer_hidden_states[i]
                p.train(hiddens, start, end, device)

        # Save probes after each epoch
        print("Epoch complete, saving probes")
        epoch_dir = model_dir + "/epoch_" + str(epoch + 1)
        probes_dir = epoch_dir + "/probes"
        # Creates epoch_dir as well; exist_ok avoids the exists()/mkdir race.
        os.makedirs(probes_dir, exist_ok=True)

        # Save probes for each layer, both start and end index
        for i, p in enumerate(probes):
            p.save(probes_dir, i + 1)