def test_early_stopping_callback(self):
    # early stopping stops training before num_training_epochs
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=tmp_dir,
            num_train_epochs=20,
            gradient_accumulation_steps=1,
            per_device_train_batch_size=16,
            load_best_model_at_end=True,
            evaluation_strategy=EvaluationStrategy.EPOCH,
            compute_metrics=AlmostAccuracy(),
            metric_for_best_model="accuracy",
        )
        trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
        train_output = trainer.train()
        self.assertLess(train_output.global_step, 20 * 64 / 16)

    # Invalid inputs to trainer with early stopping callback result in assertion error
    with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = get_regression_trainer(
            output_dir=tmp_dir,
            num_train_epochs=20,
            gradient_accumulation_steps=1,
            per_device_train_batch_size=16,
            evaluation_strategy=EvaluationStrategy.EPOCH,
            compute_metrics=AlmostAccuracy(),
            metric_for_best_model="accuracy",
        )
        trainer.add_callback(EarlyStoppingCallback(1))
        self.assertEqual(trainer.state.global_step, 0)
        try:
            trainer.train()
        except AssertionError:
            self.assertEqual(trainer.state.global_step, 0)
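# The positional call EarlyStoppingCallback(1, 0.0001) above is equivalent to the
# keyword form below. A minimal sketch of the wiring, assuming only the standard
# transformers API; `my_trainer` is a placeholder Trainer built elsewhere:
from transformers import EarlyStoppingCallback

def add_early_stopping(my_trainer, patience=1, threshold=0.0001):
    # EarlyStoppingCallback only takes effect when the TrainingArguments set
    # load_best_model_at_end=True, an evaluation strategy other than "no",
    # and metric_for_best_model -- exactly the pieces the second block in the
    # test above leaves out in order to trigger the error path.
    my_trainer.add_callback(
        EarlyStoppingCallback(
            early_stopping_patience=patience,
            early_stopping_threshold=threshold,
        )
    )
    return my_trainer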
def __init__(self,
             model,
             dataset,
             train_range: float = 0.95,
             output_dir: str = "results",
             num_train_epochs: int = 100,
             per_device_train_batch_size: int = 4,
             per_device_eval_batch_size: int = 4,
             warmup_steps: int = 500,
             weight_decay: float = 0.01,
             logging_dir: str = "logs",
             early_stopping_patience: int = 20,
             early_stopping_threshold: float = 1e-5):
    """
    Create DIETTrainer class
    :param model: model to train
    :param dataset: dataset (including train and eval)
    :param train_range: fraction of the dataset used for training
    :param output_dir: model output directory
    :param num_train_epochs: number of training epochs
    :param per_device_train_batch_size: batch size of the training stage
    :param per_device_eval_batch_size: batch size of the evaluation stage
    :param warmup_steps: warmup steps
    :param weight_decay: weight decay
    :param logging_dir: logging directory
    :param early_stopping_patience: number of evaluations without improvement before training stops
    :param early_stopping_threshold: minimum metric improvement that resets the patience counter
    """
    self.training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        warmup_steps=warmup_steps,
        weight_decay=weight_decay,
        logging_dir=logging_dir,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        evaluation_strategy="epoch",
        label_names=["entities_labels", "intent_labels"],
        save_total_limit=1)

    train_dataset, eval_dataset = random_split(
        dataset,
        [
            int(len(dataset) * train_range),
            len(dataset) - int(len(dataset) * train_range)
        ],
        generator=torch.Generator().manual_seed(42))

    self.trainer = Trainer(
        model=model,
        args=self.training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=early_stopping_patience,
                early_stopping_threshold=early_stopping_threshold),
            TensorBoardCallback()
        ])
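# Hypothetical usage sketch for the constructor above. `DIETTrainer` is assumed
# to be the enclosing class named in the docstring, and `model` / `dataset` are
# placeholders supplied by the caller:
#
#     diet = DIETTrainer(model=model, dataset=dataset, train_range=0.9,
#                        num_train_epochs=50, early_stopping_patience=10)
#     diet.trainer.train()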
def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)

    SRC, TRG, dataset = get_dataset(config.dataset_path, False)
    train_data, valid_data, test_data = split_data(
        dataset, **config.split_ration.__dict__)
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )

    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    # Build a BERT2BERT encoder-decoder configuration sized to the two vocabularies.
    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=enc_dec_config)

    # Mark the decoder part as a decoder with cross-attention and rebuild the model.
    config_encoder = model.config.encoder
    config_decoder = model.config.decoder
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    enc_dec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=enc_dec_config)

    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()
    model.save_pretrained("bert2bert")
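# `_len_sort_key` is referenced above but not defined in this snippet. A minimal
# sketch, assuming torchtext-style examples with a `src` field:
def _len_sort_key(example):
    # Bucket batches by source-sentence length so BucketIterator groups
    # similarly sized sequences together.
    return len(example.src)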
def _train_transformer(self, epochs=1, early_stopping=True):
    """Train on parsed dialogs with Transformer"""
    self._prepare_labels()
    output_dim = self.target_labels.shape[1]
    sentences = [" ".join(words) for words in self.utterances]

    with open(f"corpora/dataset_{self.train_on}.tsv", "w") as trainset_file:
        print("sentence\tlabel", file=trainset_file)
        for sentence, label in zip(sentences, self.emotion_labels):
            print(f"{sentence}\t{label}", file=trainset_file)

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        sentences, self.target_labels, test_size=0.2)

    self.tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
    train_encodings = self.tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = self.tokenizer(val_texts, truncation=True, padding=True)
    train_dataset = EmotionsDataset(train_encodings, train_labels)
    val_dataset = EmotionsDataset(val_encodings, val_labels)

    callbacks = []
    if early_stopping:
        callbacks.append(EarlyStoppingCallback(early_stopping_patience=3))

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=epochs,
        # per_device_train_batch_size=16,
        # per_device_eval_batch_size=64,
        # warmup_steps=500,
        # weight_decay=0.01,
        # logging_dir='./logs',
        logging_steps=100,
        save_steps=100,
        eval_steps=100,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
    )

    self.model = AutoModelForSequenceClassification.from_pretrained(
        "allegro/herbert-base-cased", num_labels=output_dim)
    if self.use_cuda:
        self.model.cuda()

    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=callbacks
    )
    trainer.train()
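# `EmotionsDataset` is used above but not defined in this snippet. A minimal
# sketch, assuming the usual pattern of wrapping the tokenizer output and label
# array in a torch Dataset:
import torch

class EmotionsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return one example as tensors, in the form the Trainer expects.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)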
def main(cfg):
    dataset = load_from_disk(TDATA_PATH)
    dataset.set_format(type="torch",
                       columns=['input_ids', 'attention_mask', 'label'])
    train_ds, test_ds = dataset["train"], dataset['test']
    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path)

    trainConfig = cfg.train
    output_dir = os.path.join(trainConfig["output_dir"])
    train_args = TrainingArguments(
        # module pred/ckpt
        output_dir=output_dir,
        # tensorboard logs
        logging_dir="./logs",
        num_train_epochs=trainConfig["epoch"],
        per_device_train_batch_size=trainConfig["train_batch_size"],
        per_device_eval_batch_size=trainConfig["eval_batch_size"],
        # x (logging / eval / save) every acc * x_steps
        gradient_accumulation_steps=trainConfig["acc_batch"],
        evaluation_strategy=IntervalStrategy.EPOCH,
        label_smoothing_factor=trainConfig["label_smooth"],
        # AdamW
        learning_rate=trainConfig["lr"],
        warmup_steps=trainConfig["warmup"],
        # apply to all layers but bias / LayerNorm
        weight_decay=trainConfig["wd"],
        save_total_limit=2,
        # if True, ignore param save_strategy / save_steps / save_total_limit
        load_best_model_at_end=True,
        # report_to=["none"],
        report_to=["wandb"],
        seed=cfg.seed,
        logging_strategy=IntervalStrategy.STEPS,
        metric_for_best_model=trainConfig["metric"])

    trainer = Trainer(
        model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=trainConfig["early_stopping_patience"]
            ),
        ],
        compute_metrics=compute_metrics,
    )

    y_pred_tuple = trainer.predict(test_ds)
    logits, y_true, metrics = y_pred_tuple
    y_pred = logits.argmax(-1)
    with open("LF.pl", "wb") as f:
        import pickle
        pickle.dump([y_pred, y_true], f)
    print(metrics)
    acc = accuracy_score(y_true, y_pred)
    print(acc)
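# `compute_metrics` is passed to the Trainer above but defined elsewhere. A
# minimal sketch, assuming plain accuracy on argmaxed logits:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Keyed as "accuracy" so it can be referenced by metric_for_best_model.
    return {"accuracy": accuracy_score(labels, preds)}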
def run_hyperp(train_dataset,
               eval_dataset,
               config,
               model_args,
               labels,
               num_labels,
               label_map,
               tokenizer,
               xargs={}):
    wandb.log({"params": params})
    wandb.log({"xargs": xargs})
    training_args_dict = {
        'output_dir': params["OUTPUT_DIR"],
        'num_train_epochs': params["EPOCH_TOP"],
        'train_batch_size': params["BATCH_SIZE"],
        "save_strategy": "epoch",
        "evaluation_strategy": "steps",
        "eval_steps": max(10, len(train_dataset) // params["BATCH_SIZE"]),
        "logging_steps": max(10, len(train_dataset) // params["BATCH_SIZE"]),
        "do_train": True,
        "load_best_model_at_end": params["LOAD_BEST_MODEL"],
        "learning_rate": params["lr"],
        "weight_decay": params["weight_decay"],
        "save_total_limit": 2
    }
    print(training_args_dict)
    with open(params["TRAIN_ARGS_FILE"], 'w') as fp:
        json.dump(training_args_dict, fp)
    parser = HfArgumentParser(TrainingArguments)
    training_args = parser.parse_json_file(
        json_file=params["TRAIN_ARGS_FILE"])[0]

    # Initialize the Trainer
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=params["patience"]),
            LogCallback(params["OUTPUT_DIR"] + "/train_log.json")
        ])

    best_t = trainer.hyperparameter_search(
        backend="ray",
        # Choose among many libraries:
        # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html
        n_trials=10)
    print(best_t)
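# `model_init` is referenced above but not defined in this snippet.
# Trainer.hyperparameter_search re-creates the model for every trial, so it must
# be a zero-argument factory. A minimal sketch under the assumption that this is
# a token-classification (NER) setup, not the author's actual implementation:
def make_model_init(model_args, config):
    from transformers import AutoModelForTokenClassification

    def model_init():
        # Fresh weights for each hyperparameter trial.
        return AutoModelForTokenClassification.from_pretrained(
            model_args["model_name_or_path"],
            config=config,
            cache_dir=model_args["cache_dir"],
        )

    return model_init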
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments, MultiLingAdapterArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args, adapter_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses( ) if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) # Setup adapters if adapter_args.train_adapter: task_name = data_args.dataset_name or "summarization" # check if adapter already exists, otherwise add it if task_name not in model.config.adapters: # resolve the adapter config adapter_config = AdapterConfig.load( adapter_args.adapter_config, non_linearity=adapter_args.adapter_non_linearity, reduction_factor=adapter_args.adapter_reduction_factor, ) # load a pre-trained from Hub if specified if adapter_args.load_adapter: model.load_adapter( adapter_args.load_adapter, config=adapter_config, load_as=task_name, ) # otherwise, add a fresh adapter else: model.add_adapter(task_name, config=adapter_config) # optionally load a pre-trained language adapter if adapter_args.load_lang_adapter: # resolve the language adapter config lang_adapter_config = AdapterConfig.load( adapter_args.lang_adapter_config, non_linearity=adapter_args.lang_adapter_non_linearity, reduction_factor=adapter_args.lang_adapter_reduction_factor, ) # load the language adapter from Hub lang_adapter_name = model.load_adapter( adapter_args.load_lang_adapter, config=lang_adapter_config, load_as=adapter_args.language, ) else: lang_adapter_name = None # Freeze all model weights except of those of this adapter model.train_adapter([task_name]) # Set the adapters to be used in every forward pass if lang_adapter_name: model.set_active_adapters([lang_adapter_name, task_name]) else: model.set_active_adapters([task_name]) else: if adapter_args.load_adapter or adapter_args.load_lang_adapter: raise ValueError( "Adapters can only be loaded in adapters training mode." 
"Use --train_adapter to enable adapter training") prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names elif training_args.do_predict: column_names = datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warn( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. 
if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: train_dataset = datasets["train"] if "train" not in datasets: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_dataset = datasets["test"] if data_args.max_test_samples is not None: test_dataset = test_dataset.select( range(data_args.max_test_samples)) test_dataset = test_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric = load_metric("rouge") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. 
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE result = { key: value.mid.fmeasure * 100 for key, value in result.items() } prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Early stopping if data_args.patience and data_args.patience > 0: training_args.load_best_model_at_end = True # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, do_save_full_model=not adapter_args.train_adapter, do_save_adapters=adapter_args.train_adapter, ) if data_args.patience and data_args.patience > 0: callback = EarlyStoppingCallback( early_stopping_patience=data_args.patience) trainer.add_callback(callback) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval") max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Test ***") test_results = trainer.predict( test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, ) metrics = test_results.metrics max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len( test_dataset) metrics["test_samples"] = min(max_test_samples, len(test_dataset)) trainer.log_metrics("test", metrics) trainer.save_metrics("test", metrics) if trainer.is_world_process_zero(): if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = [pred.strip() for pred in test_preds] output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt") with open(output_test_preds_file, "w") as writer: writer.write("\n".join(test_preds)) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments, MultiLingAdapterArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args, adapter_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with " "`--source_prefix 'translate English to German: ' `") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For translation, only JSON files are supported, with one field named "translation" containing two keys for the # source and target languages (unless you adapt what follows). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance( tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( data_args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) # Setup adapters if adapter_args.train_adapter: task_name = data_args.source_lang.split( "_")[0] + "_" + data_args.target_lang.split("_")[0] # check if adapter already exists, otherwise add it if task_name not in model.config.adapters: # resolve the adapter config adapter_config = AdapterConfig.load( adapter_args.adapter_config, non_linearity=adapter_args.adapter_non_linearity, reduction_factor=adapter_args.adapter_reduction_factor, ) # load a pre-trained from Hub if specified if adapter_args.load_adapter: model.load_adapter( adapter_args.load_adapter, config=adapter_config, load_as=task_name, ) # otherwise, add a fresh adapter else: model.add_adapter(task_name, config=adapter_config) # optionally load a pre-trained language adapter if adapter_args.load_lang_adapter: # resolve the language adapter config lang_adapter_config = AdapterConfig.load( adapter_args.lang_adapter_config, non_linearity=adapter_args.lang_adapter_non_linearity, reduction_factor=adapter_args.lang_adapter_reduction_factor, ) # load the language adapter from Hub lang_adapter_name = model.load_adapter( adapter_args.load_lang_adapter, config=lang_adapter_config, load_as=adapter_args.language, 
) else: lang_adapter_name = None # Freeze all model weights except of those of this adapter model.train_adapter([task_name]) # Set the adapters to be used in every forward pass if lang_adapter_name: model.set_active_adapters(ac.Stack(lang_adapter_name, task_name)) else: model.set_active_adapters([task_name]) else: if adapter_args.load_adapter or adapter_args.load_lang_adapter: raise ValueError( "Adapters can only be loaded in adapters training mode." "Use --train_adapter to enable adapter training") prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names elif training_args.do_predict: column_names = raw_datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): assert data_args.target_lang is not None and data_args.source_lang is not None, ( f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and " "--target_lang arguments.") tokenizer.src_lang = data_args.source_lang tokenizer.tgt_lang = data_args.target_lang # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. forced_bos_token_id = ( tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None) model.config.forced_bos_token_id = forced_bos_token_id # Get the language codes for input/target. source_lang = data_args.source_lang.split("_")[0] target_lang = data_args.target_lang.split("_")[0] # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. 
if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric = load_metric("sacrebleu") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [[label.strip()] for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. 
labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels) result = {"bleu": result["score"]} prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Early stopping if data_args.patience and data_args.patience > 0: training_args.load_best_model_at_end = True # Initialize our Trainer trainer_class = Seq2SeqAdapterTrainer if adapter_args.train_adapter else Seq2SeqTrainer trainer = trainer_class( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) if data_args.patience and data_args.patience > 0: callback = EarlyStoppingCallback( early_stopping_patience=data_args.patience) trainer.add_callback(callback) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} max_length = (training_args.generation_max_length if training_args.generation_max_length is not None else data_args.val_max_target_length) num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") predict_results = trainer.predict(predict_dataset, metric_key_prefix="predict", max_length=max_length, num_beams=num_beams) metrics = predict_results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if trainer.is_world_process_zero(): if training_args.predict_with_generate: predictions = tokenizer.batch_decode( predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join( training_args.output_dir, "generated_predictions.txt") with open(output_prediction_file, "w", encoding="utf-8") as writer: 
writer.write("\n".join(predictions)) kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "translation" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name languages = [ l for l in [data_args.source_lang, data_args.target_lang] if l is not None ] if len(languages) > 0: kwargs["language"] = languages if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_pos
    if args.task_name == "en_ewt" else compute_metrics_ner,
)

# Early stop
if args.inoculation_patience_count != -1:
    trainer.add_callback(
        EarlyStoppingCallback(args.inoculation_patience_count))

# Training
if training_args.do_train:
    checkpoint = None
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    metrics["train_samples"] = len(train_dataset)
    # trainer.log_metrics("train", metrics)
    # trainer.save_metrics("train", metrics)
    # trainer.save_state()

# Evaluation
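# `compute_metrics_pos` / `compute_metrics_ner` are referenced above but defined
# elsewhere. A minimal sketch of a token-classification metric, assuming -100
# marks ignored (sub-word / special-token) positions:
import numpy as np

def compute_metrics_ner(eval_pred):
    preds = np.argmax(eval_pred.predictions, axis=-1)
    labels = eval_pred.label_ids
    mask = labels != -100  # keep only positions with a real label
    correct = (preds[mask] == labels[mask]).sum()
    return {"accuracy": float(correct) / float(mask.sum())}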
test_dataset = test_dataset.map(tokenize_fn,
                                batched=True,
                                batch_size=len(test_dataset))
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format('torch',
                        columns=['input_ids', 'attention_mask', 'labels'])

#############################################################
## Callbacks + Collator #####################################
#############################################################

callbacks = []
tmcb = None

escb = EarlyStoppingCallback(early_stopping_patience=10)
callbacks.append(escb)

transform = None
num_sampled_INV = 0
num_sampled_SIB = 0
label_type = "soft"
keep_original = True

if t == "ORIG":
    label_type = "hard"
elif t == "INV":
    num_sampled_INV = 2
    label_type = "hard"
elif t == "SIB":
    num_sampled_SIB = 2
def train(args):  # + the inference step has been added as well
    assert sum([args.use_kfold, args.use_simple_fold, args.no_valid]) == 1
    assert (args.concat_exp_p == 0 or args.concat_log_p == 0)
    # assert args.eval_steps == args.logging_steps
    if args.use_kfold == True:
        assert (args.num_fold_k >= 2)
    seed_everything(args.seed)
    USE_KFOLD = args.use_kfold

    # load model and tokenizer
    model_type_getattr = args.model_type  # ELECTRA # BERT
    model_name_from_pretrained = args.pretrained_model  # e.g. "monologg/koelectra-small-discriminator"
    tokenizer = AutoTokenizer.from_pretrained(model_name_from_pretrained)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load dataset
    # load_data_module = getattr(import_module("load_data"), f'load_data')
    # dataset = load_tr_val_data("../input/data/train/train.tsv", args)
    dataset = load_tr_val_data("../input/data/train/final_train_ner.tsv", args)

    # setting model hyperparameters
    # config_module = getattr(import_module("transformers"), f'{args.model_type}Config')
    # model_config = config_module.from_pretrained(model_name_from_pretrained)
    model_module = getattr(import_module("transformers"),
                           f'{model_type_getattr}ForSequenceClassification')
    model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42)
    model.parameters
    model.to(device)

    # model_saved_dir = increment_output_dir(args.model_output_dir)
    model_saved_dir = increment_output_dir(
        model_name_from_pretrained.replace('/', '_'))  # f'./results/{output_path}'
    neptune.append_tag(f"{model_saved_dir.split('/')[-1]}")
    neptune.append_tag(f"{args.name}")

    with open('../input/data/label_type.pkl', 'rb') as f:
        label_type = pickle.load(f)

    # Simple Train Valid Split
    # Not KFOLD
    # => StratifiedShuffleSplit
    #################################################################################################
    #################################################################################################
    elif args.use_kfold == True:  # KFOLD
        if not os.path.isdir('./kfold_results'):  # parent folder for saving the models
            os.makedirs('./kfold_results')
        kfold = StratifiedKFold(n_splits=args.num_fold_k,
                                random_state=args.seed,
                                shuffle=True)
        label = dataset['label']

        # check whether k-fold has already been run and saved for this model
        model_name_from_pretrained_used_for_save = model_name_from_pretrained.replace('/', '_')
        check_upper_dir = f'./kfold_results/{model_name_from_pretrained_used_for_save}'
        if not os.path.isdir(check_upper_dir + '0'):  # if it does not exist yet, use it as is
            upper_dir = check_upper_dir + '0'
        else:  # if it exists, find the largest existing number and use max + 1
            all_directories = glob.glob(f'./kfold_results/*')
            max_num = max(
                int(re.search(rf"{model_name_from_pretrained_used_for_save}[0-9]+", ad)
                    .group().replace(model_name_from_pretrained_used_for_save, ''))
                for ad in all_directories
                if re.search(rf"{model_name_from_pretrained_used_for_save}[0-9]+", ad))
            upper_dir = check_upper_dir + str(max_num + 1)
        neptune.log_text('Model_Name_Number', f"{upper_dir.split('/')[-1]}")

        kfold_train_acc_score = []
        kfold_val_acc_score = []
        k = 0
        for train_idx, val_idx in kfold.split(dataset, label):
            # model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification')
            # model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42)
            config_module = getattr(import_module("transformers"),
                                    f'{model_type_getattr}Config')
            model_config = config_module.from_pretrained(model_name_from_pretrained)
            # model_config = ElectraConfig.from_pretrained(model_name_from_pretrained)
            model_config.num_labels = 42
            model_config.hidden_dropout_prob = args.hidden_dropout_prob
            model_module = getattr(import_module("transformers"),
                                   f'{model_type_getattr}ForSequenceClassification')
            model = model_module.from_pretrained(model_name_from_pretrained,
                                                 config=model_config)
            model.parameters
            model.to(device)

            print('=' * 50)
            print('=' * 15 + f'{k}-th Fold Cross Validation Started ({k+1}/{args.num_fold_k})' + '=' * 15)
            train_dataset = dataset.iloc[train_idx]
            val_dataset = dataset.iloc[val_idx]

            # optionally add external data to the training split
            if args.concat_external_data == True:
                train_dataset = concat_external_data(train_dataset, label_type, args)

            train_label = train_dataset['label'].values
            val_label = val_dataset['label'].values

            # tokenizing dataset
            tokenized_train = tokenized_dataset(train_dataset, tokenizer, args)
            tokenized_val = tokenized_dataset(val_dataset, tokenizer, args)

            # make dataset for pytorch.
            RE_train_dataset = RE_Dataset(tokenized_train, train_label)
            RE_val_dataset = RE_Dataset(tokenized_val, val_label)

            print('=' * 50)
            print('Train & Valid Loaded Successfully!!')
            print(f'len(RE_train_dataset) : {len(RE_train_dataset)}, len(RE_val_dataset) : {len(RE_val_dataset)}')

            model_saved_dir = upper_dir + f'/{k}fold'  # f'./kfold_results/{model_name_from_pretrained_used_for_save}' + f'/{k}fold'
            neptune.log_text(f'{k}-th model_saved_dir', model_saved_dir)
            neptune.log_text(f'Num_Data : {k}-th len(RE_train_dataset)', str(len(RE_train_dataset)))
            neptune.log_text(f'Num_Data : {k}-th len(RE_val_dataset)', str(len(RE_val_dataset)))

            # Besides the options used here, many more are available; see
            # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
            # https://huggingface.co/transformers/main_classes/trainer.html?highlight=trainingarguments#trainingarguments
            total_num_epochs = (len(RE_train_dataset) // args.batch_size + 1) * args.epochs
            if args.use_warmup_ratio:
                warmup_steps = total_num_epochs * args.warmup_ratio
            else:
                warmup_steps = 0

            wandb_run_name = model_saved_dir.replace('./kfold_results/', f'Total {args.num_fold_k}fold :')
            training_args = TrainingArguments(
                report_to='wandb',  # 'all'
                run_name=f"{args.name + wandb_run_name.replace('/', '_')}",
                output_dir=model_saved_dir,  # output directory
                # overwrite_output_dir=False,  # whether to overwrite when saving the model
                save_total_limit=args.save_total_limit,  # number of total save model.
                save_steps=args.model_save_steps,  # model saving step.
                num_train_epochs=args.epochs,  # total number of training epochs
                learning_rate=args.lr,  # learning_rate
                per_device_train_batch_size=args.batch_size,  # batch size per device during training
                per_device_eval_batch_size=args.val_batch_size,  # batch size for evaluation
                warmup_steps=warmup_steps,  # number of warmup steps for learning rate scheduler
                weight_decay=args.weight_decay,  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
                logging_steps=args.logging_steps,  # log saving step.
                evaluation_strategy='steps',  # evaluation strategy to adopt during training
                eval_steps=args.eval_steps,  # evaluation step.
                # max_grad_norm=1,
                label_smoothing_factor=args.label_smoothing_factor,
                load_best_model_at_end=args.load_best_model_at_end,  # default => False
                # greater_is_better=True,
                metric_for_best_model=args.metric_for_best_model,  # metric_for_best_model: Optional[str] = None
                # fp16=True,  # Whether to use 16-bit (mixed) precision training instead of 32-bit training.
                # dataloader_num_workers=2,
            )

            # EarlyStopping: patience here is not counted in whole epochs but in
            # evaluation steps; with patience=1 the check covers eval_steps * 1 steps
            # (e.g. with eval_steps=25, patience spans 25 steps).
            early_stopping = EarlyStoppingCallback(
                early_stopping_patience=args.early_stopping_patience,
                early_stopping_threshold=1e-4)

            ## Optimizer
            if args.optimizer_name == "Adam":
                optimizer = Adam(model.parameters(), lr=args.min_lr)
            elif args.optimizer_name == "AdamW":
                optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
            elif args.optimizer_name == "SGD":
                optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)

            # https://arxiv.org/pdf/1608.03983.pdf
            ## Scheduler
            T_0 = int(np.ceil(total_num_epochs * args.first_cycle_ratio))
            if args.scheduler_name == "Custom":
                scheduler = CustomizedCosineAnnealingWarmRestarts(
                    optimizer, T_0=T_0, T_mult=2, eta_max=args.lr,
                    T_up=int(T_0 * args.first_warmup_ratio),
                    gamma=args.scheduler_gamma, last_epoch=-1)
            elif args.scheduler_name == "Original":
                scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=2, eta_min=args.min_lr)

            # https://huggingface.co/transformers/main_classes/trainer.html?highlight=trainer#id1
            trainer = Trainer(
                model=model,                        # Transformers model to be trained
                args=training_args,                 # training arguments, defined above
                train_dataset=RE_train_dataset,     # training dataset
                eval_dataset=RE_val_dataset,        # evaluation dataset
                compute_metrics=compute_metrics,    # define metrics function
                optimizers=(optimizer, scheduler),  # optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None)
                callbacks=[early_stopping],         # callbacks: Optional[List[TrainerCallback]] = None
                # model_init=
            )

            # train model
            trainer.train()

            print(f'Neptune Saving {k}-th Model Logs Plot')
            # Get Log from Model
            # Neptune Save Plot (train_eval_loss, learning_rate, eval_accuracy)
            train_eval_loss_plot, learning_rate_plot, eval_accuracy_plot = k_th_plot_from_logs(trainer.state.log_history)
            neptune.log_image(f'Logs : {k}-th train_eval_loss_plot', train_eval_loss_plot)
            neptune.log_image(f'Logs : {k}-th learning_rate_plot', learning_rate_plot)
            neptune.log_image(f'Logs : {k}-th eval_accuracy_plot', eval_accuracy_plot)
            print(f'{k}-th train finished!!')

            state_log_history = trainer.state.log_history
            eval_log_dict = [log_dict for log_dict in state_log_history
                             if 'eval_loss' in log_dict.keys()]
            k_th_val_logs_dict = defaultdict(list)
            for dict_per_step in eval_log_dict:
                for key, value in dict_per_step.items():
                    k_th_val_logs_dict[key].append(value)
            best_val_acc_score = max(k_th_val_logs_dict['eval_accuracy'])

            # neptune.log_metric(f'{k}-th train_acc_score', best_train_acc_score)
            neptune.log_metric(f'{k}-th val_acc_score', best_val_acc_score)
            kfold_val_acc_score.append(best_val_acc_score)

            k = int(k)
            k += 1

        # neptune.log_text(f"{args.num_fold_k}-fold train best acc list", f"{kfold_train_acc_score}")
        neptune.log_text(f"{args.num_fold_k}-fold val best acc list", f"{kfold_val_acc_score}")
        # neptune.log_metric(f"Result ACC : {args.num_fold_k}-fold train Total Average acc", np.mean(kfold_train_acc_score))
        neptune.log_metric(f"Result ACC : {args.num_fold_k}-fold val Total Average acc", np.mean(kfold_val_acc_score))
def train(cfg): SEED = cfg.values.seed MODEL_NAME = cfg.values.model_name USE_KFOLD = cfg.values.val_args.use_kfold TRAIN_ONLY = cfg.values.train_only seed_everything(SEED) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config') model_config = AutoConfig.from_pretrained(MODEL_NAME) model_config.num_labels = 42 whole_df = load_data("/opt/ml/input/data/train/train.tsv") additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv") whole_label = whole_df['label'].values # additional_label = additional_df['label'].values if cfg.values.tokenizer_arc: tokenizer_module = getattr(import_module('transformers'), cfg.values.tokenizer_arc) tokenizer = tokenizer_module.from_pretrained(MODEL_NAME) else: tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999, early_stopping_threshold=0.001) training_args = TrainingArguments( output_dir=cfg.values.train_args.output_dir, # output directory save_total_limit=cfg.values.train_args. save_total_limit, # number of total save model. save_steps=cfg.values.train_args.save_steps, # model saving step. num_train_epochs=cfg.values.train_args. num_epochs, # total number of training epochs learning_rate=cfg.values.train_args.lr, # learning_rate per_device_train_batch_size=cfg.values.train_args. train_batch_size, # batch size per device during training per_device_eval_batch_size=cfg.values.train_args. eval_batch_size, # batch size for evaluation warmup_steps=cfg.values.train_args. warmup_steps, # number of warmup steps for learning rate scheduler weight_decay=cfg.values.train_args. weight_decay, # strength of weight decay max_grad_norm=cfg.values.train_args.max_grad_norm, logging_dir=cfg.values.train_args. logging_dir, # directory for storing logs logging_steps=cfg.values.train_args.logging_steps, # log saving step. evaluation_strategy=cfg.values.train_args. evaluation_strategy, # evaluation strategy to adopt during training # `no`: No evaluation during training. # `steps`: Evaluate every `eval_steps`. # `epoch`: Evaluate every end of epoch. eval_steps=cfg.values.train_args.eval_steps, # evaluation step. 
dataloader_num_workers=4, seed=SEED, label_smoothing_factor=cfg.values.train_args.label_smoothing_factor, load_best_model_at_end=True, # metric_for_best_model='accuracy' ) if USE_KFOLD: kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k) k = 1 for train_idx, val_idx in kfold.split(whole_df, whole_label): print('\n') cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15) train_df = whole_df.iloc[train_idx] # train_df = pd.concat((train_df, additional_df)) val_df = whole_df.iloc[val_idx] if cfg.values.model_arc == 'Roberta': tokenized_train = roberta_tokenized_dataset( train_df, tokenizer) tokenized_val = roberta_tokenized_dataset(val_df, tokenizer) else: tokenized_train = tokenized_dataset(train_df, tokenizer) tokenized_val = tokenized_dataset(val_df, tokenizer) RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values) RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values) try: if cfg.values.model_name == 'Bert': model = BertForSequenceClassification.from_pretrained( MODEL_NAME, config=model_config) else: model = AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, config=model_config) except: # model_module = getattr(import_module('transformers'), cfg.values.model_arc) model_module = getattr( import_module('transformers'), cfg.values.model_arc + 'ForSequenceClassification') model = model_module.from_pretrained(MODEL_NAME, config=model_config) model.parameters model.to(device) training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold' training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold' optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate) total_step = len( RE_train_dataset ) / training_args.per_device_train_batch_size * training_args.num_train_epochs scheduler = transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step) optimizers = optimizer, scheduler trainer = Trainer( model= model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_val_dataset, # evaluation dataset compute_metrics=compute_metrics, # define metrics function optimizers=optimizers, # callbacks=[early_stopping] ) k += 1 # train model trainer.train() else: cpprint('=' * 20 + f'START TRAINING' + '=' * 20) if not TRAIN_ONLY: train_df, val_df = train_test_split( whole_df, test_size=cfg.values.val_args.test_size, random_state=SEED) # train_df = pd.concat((train_df, additional_df)) if cfg.values.model_arc == 'Roberta': tokenized_train = roberta_tokenized_dataset( train_df, tokenizer) tokenized_val = roberta_tokenized_dataset(val_df, tokenizer) else: tokenized_train = tokenized_dataset(train_df, tokenizer) tokenized_val = tokenized_dataset(val_df, tokenizer) RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values) RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values) try: if cfg.values.model_name == 'Bert': model = BertForSequenceClassification.from_pretrained( MODEL_NAME, config=model_config) else: model = AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, config=model_config) except: # model_module = getattr(import_module('transformers'), cfg.values.model_arc) model_module = getattr( import_module('transformers'), cfg.values.model_arc + 'ForSequenceClassification') model = model_module.from_pretrained(MODEL_NAME, config=model_config) model.parameters model.to(device) optimizer = 
transformers.AdamW(model.parameters(), lr=training_args.learning_rate) total_step = len( RE_train_dataset ) / training_args.per_device_train_batch_size * training_args.num_train_epochs # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step) scheduler = transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step) optimizers = optimizer, scheduler trainer = Trainer( model= model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_val_dataset, # evaluation dataset compute_metrics=compute_metrics, # define metrics function optimizers=optimizers, callbacks=[early_stopping]) # train model trainer.train() else: training_args.evaluation_strategy = 'no' if cfg.values.model_arc == 'Roberta': print('Roberta') tokenized_train = roberta_tokenized_dataset( whole_df, tokenizer) else: tokenized_train = tokenized_dataset(whole_df, tokenizer) RE_train_dataset = RE_Dataset(tokenized_train, whole_df['label'].values) try: model = AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, config=model_config) except: # model_module = getattr(import_module('transformers'), cfg.values.model_arc) model_module = getattr( import_module('transformers'), cfg.values.model_arc + 'ForSequenceClassification') model = model_module.from_pretrained(MODEL_NAME, config=model_config) model.parameters model.to(device) training_args.output_dir = cfg.values.train_args.output_dir + '/only_train' training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train' optimizer = AdamP(model.parameters(), lr=training_args.learning_rate) total_step = len( RE_train_dataset ) / training_args.per_device_train_batch_size * training_args.num_train_epochs scheduler = transformers.get_linear_schedule_with_warmup( optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step) optimizers = optimizer, scheduler trainer = Trainer( model= model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset optimizers=optimizers, # callbacks=[early_stopping] ) # train model trainer.train()
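Both branches of train(cfg) above derive total_step with float division (len(dataset) / batch_size * epochs), which get_linear_schedule_with_warmup tolerates but which slightly misstates the horizon whenever the dataset size is not a multiple of the batch size. A minimal sketch (not from the original script; gradient accumulation assumed to be 1) of an integer-step version:

import math

import transformers


def linear_warmup_scheduler(optimizer, dataset_size, batch_size, epochs, warmup_steps):
    # Round partial batches up so the schedule decays to zero at the true last step.
    steps_per_epoch = math.ceil(dataset_size / batch_size)
    total_steps = steps_per_epoch * epochs
    return transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )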
def run_train(train_dataset, eval_dataset, config, model_args, labels, num_labels, label_map, tokenizer, xargs={}): # First freeze bert weights and train # log_params = copy.copy(params) # log_params['model_type']= params['model_type'].name # wandb.log({"params":log_params}) # wandb.log({"xargs":xargs}) wb_run = wandb.init(project="NER", name=params['exp_name'] + "_top_model", reinit=True) xargs['tf'] = params.get('tf', False) model = get_model(model_path=model_args["model_name_or_path"], cache_dir=model_args['cache_dir'], config=config, model_type=params['model_type'], xargs=xargs) if not params['grad_e2e']: for param in model.base_model.parameters(): param.requires_grad = False else: freeze_model(model) if 'add_vocab' in params.keys(): model.resize_token_embeddings(len(tokenizer)) for param in model.bert.embeddings.parameters(): param.requires_grad = True # Change from default eval mode to train mode model.train() print(model) training_args_dict = { 'output_dir': params["OUTPUT_DIR"], 'num_train_epochs': params["EPOCH_TOP"], 'train_batch_size': params["BATCH_SIZE"], "save_strategy": "epoch", "evaluation_strategy": "steps", "eval_steps": max(10, train_dataset.__len__() // params["BATCH_SIZE"]), "logging_steps": max(10, train_dataset.__len__() // params["BATCH_SIZE"]), "do_train": True, "load_best_model_at_end": params["LOAD_BEST_MODEL"], "learning_rate": params["lr"], "weight_decay": params["weight_decay"], "save_total_limit": 2, "report_to": "wandb", # enable logging to W&B "run_name": params['exp_name'] + "_top_model" } print(training_args_dict) with open(params["TRAIN_ARGS_FILE"], 'w') as fp: json.dump(training_args_dict, fp) parser = HfArgumentParser(TrainingArguments) training_args = parser.parse_json_file( json_file=params["TRAIN_ARGS_FILE"])[0] # Initialize the Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, callbacks=[ EarlyStoppingCallback(early_stopping_patience=params["patience"]), LogCallback(params["OUTPUT_DIR"] + "/train_log.json") ]) # Start training trainOutput = trainer.train() trainer.save_model(params["OUTPUT_DIR"]) plot_loss_log(params["OUTPUT_DIR"] + "/train_log.json") best_model = trainer.state.best_model_checkpoint print("top_model_path is at ...", best_model) wb_run.finish() if params['grad_finetune']: # Now reload the model from best model we have found # Reading from file wb_run = wandb.init(project="NER", name=params['exp_name'] + "_full_model", reinit=True) print("The file is loaded from ---------------------------> ", params["OUTPUT_DIR"] + 'config.json') data = json.loads( open(params["OUTPUT_DIR"] + 'config.json', "r").read()) top_model_path = best_model checkpoint = top_model_path.split("/")[-1] print("checkpoint is at ... 
", checkpoint) print("top_model_path is at ...", top_model_path) # Config # config = BertConfig.from_pretrained( top_model_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args['cache_dir']) # Model # xargs['tf'] = False reloaded_model = get_model(model_path=top_model_path + "/", cache_dir=model_args['cache_dir'], config=None, model_type=params['model_type'], xargs=xargs) print("Reloaded", reloaded_model.bert.embeddings) adam_beta1 = 0.9 if params.get('xargs') and params.get('xargs').get('beta1_finetune'): adam_beta1 = params.get('xargs').get('beta1_finetune') # Training args # training_args_dict = { 'output_dir': params["OUTPUT_DIR"], 'num_train_epochs': params["EPOCH_TOP"] + params["EPOCH_END2END"], 'train_batch_size': params["BATCH_SIZE"], "evaluation_strategy": "steps", "eval_steps": max(10, train_dataset.__len__() // params["BATCH_SIZE"]), "logging_steps": max(10, train_dataset.__len__() // params["BATCH_SIZE"]), "do_train": True, "load_best_model_at_end": params["LOAD_BEST_MODEL"], "save_total_limit": 2, "learning_rate": params["lr_finetune"], "weight_decay": params["wd_finetune"] if "wd_finetune" in params.keys() else 0, "ignore_data_skip": True, "report_to": "wandb", # enable logging to W&B "run_name": params['exp_name'] + "_full_model", "adam_beta1": adam_beta1 } with open(params["TRAIN_ARGS_FILE"], 'w') as fp: json.dump(training_args_dict, fp) parser = HfArgumentParser(TrainingArguments) training_args = parser.parse_json_file( json_file=params["TRAIN_ARGS_FILE"])[0] # Then unfreeze the bert weights and fine tune end-to-end model = reloaded_model freeze_model(model) model.to('cuda') # Set to train mode. model.train() # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, callbacks=[ EarlyStoppingCallback( early_stopping_patience=params["patience"], early_stopping_threshold=params.get('esth', 0)), LogCallback(params["OUTPUT_DIR"] + "/train_finetune_log.json") ]) # checkpiont is here. trainer.train() plot_loss_log(params["OUTPUT_DIR"] + "/train_finetune_log.json") wb_run.finish() return trainer, model
def run_mrc(data_args, training_args, model_args, datasets, tokenizer, model): # wandb 사용할 경우 # wandb.login() question_column_name = "question" if "question" in column_names else column_names[0] context_column_name = "context" if "context" in column_names else column_names[1] answer_column_name = "answers" if "answers" in column_names else column_names[2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" # check if there is an error last_checkpoint, max_seq_length = check_no_error(training_args, data_args, tokenizer, datasets) # train 데이터 가공 과정 def prepare_train_features(examples): # 토크나이징 + 정답 start, end 토큰 위치 모두 겸비한 input data 만들기! tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # truncation으로 분리된 data가 원래 어느 context에 소속되었는지 저장. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # 모든 token에 대해, 각 토큰의 원래 문장에서의 위치값(시작, 끝) 정보 저장. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). # sequence_ids -> cls, 질문, cls, 문단, end 구분 기능 -> [None, 0, 0, ..., 0, None, 1, 1, ...., 1, None] sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. # 지금 보고있는 i 번째 tokenized 결과가 어느 context로 만들어졌는지 그 번호 찾음. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # print(answers) # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not ( offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char ): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). 
while ( token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char ): token_start_index += 1 tokenized_examples["start_positions"].append(token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append(token_end_index + 1) return tokenized_examples if "train" not in datasets: raise ValueError("--do_train requires a train dataset") column_names = datasets["train"].column_names train_dataset = datasets["train"] train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) ## validation 데이터 가공 과정 def prepare_validation_features(examples): tokenized_examples = tokenizer( examples[question_column_name if pad_on_right else context_column_name], examples[context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 sample_index = sample_mapping[i] tokenized_examples["example_id"].append(examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. # offset_mapping : 각 토큰의 (시작위치, 끝 위치) 정보를 담고 있는데, # query 토큰들의 (시작위치, 끝 위치) 정보를 None으로 바꾸는 과정 # 왜? validation 할 때, output 으로 start logit과 end logit을 받게 된다. # 이때 해당 인덱스를 query 가 아닌 passage에서 찾기 위함. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples eval_dataset = datasets["validation"] column_names = datasets["validation"].column_names eval_dataset = eval_dataset.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data collator. # pad_to_multiple_of : mixed precision을 사용할 때, 텐서 사이즈가 8의 배수일때 더 효과적이다.' # 따라서,(Funnel Transformer? 뭔지 모르겠지만 이건 32로 세팅) 8로 세팅해서 max_length을 조절 하게 된다. # 근데 이미 tokeneizer가 max_length를 384로 처리하고 있어서 작동 안할 듯. data_collator = ( DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None ) ) # Post-processing : def post_processing_function(examples, features, predictions, training_args): # Post-processing: we match the start logits and end logits to answers in the original context. predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, max_answer_length=data_args.max_answer_length, output_dir=training_args.output_dir, ) # Format the result to the format the metric expects. 
        formatted_predictions = [
            {"id": k, "prediction_text": v} for k, v in predictions.items()
        ]
        if training_args.do_predict:
            return formatted_predictions
        else:
            references = [
                {"id": ex["id"], "answers": ex[answer_column_name]}
                for ex in datasets["validation"]
            ]
            return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad")
    metric_key_prefix = "eval"

    def compute_metrics(p: EvalPrediction):
        before_prefix_metrics = metric.compute(predictions=p.predictions, references=p.label_ids)
        metrics = {f"{metric_key_prefix}_{k}": v for k, v in before_prefix_metrics.items()}
        return metrics

    # Early stopping condition
    early_stopping = EarlyStoppingCallback(early_stopping_patience=50,
                                           early_stopping_threshold=0.2)

    # QuestionAnsweringTrainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    # Training
    if last_checkpoint is not None:
        checkpoint = last_checkpoint
    elif os.path.isdir(model_args.model_name_or_path):
        checkpoint = model_args.model_name_or_path
    else:
        checkpoint = None
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)
    trainer.log_metrics("train", metrics)

    output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
    with open(output_train_file, "w") as writer:
        logger.info("***** Train results *****")
        for key, value in sorted(train_result.metrics.items()):
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")
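EarlyStoppingCallback only fires when the surrounding TrainingArguments enable periodic evaluation, best-model tracking, and a metric to compare; run_mrc above assumes the caller set those up. A hedged sketch of the fields the callback depends on (field names per the Transformers docs, values illustrative only):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./outputs",
    evaluation_strategy="steps",      # evaluate regularly so the callback receives metrics
    eval_steps=500,
    save_strategy="steps",            # must line up with the evaluation schedule
    save_steps=500,
    load_best_model_at_end=True,      # required by EarlyStoppingCallback
    metric_for_best_model="eval_exact_match",  # e.g. a SQuAD metric; "eval_loss" also works
    greater_is_better=True,
)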
def run_train(train_dataset, eval_dataset, config, model_args, labels, num_labels, label_map): # First freeze bert weights and train model = get_model(model_path=model_args["model_name_or_path"], cache_dir=model_args['cache_dir'], config=config, model_type=params['model_type']) if not params['grad_e2e']: for param in model.base_model.parameters(): param.requires_grad = False # Change from default eval mode to train mode model.train() training_args_dict = { 'output_dir': params['OUTPUT_DIR'], 'num_train_epochs': params['EPOCH_TOP'], 'train_batch_size': params['BATCH_SIZE'], "save_strategy": "epoch", "evaluation_strategy": "epoch", "load_best_model_at_end": params['LOAD_BEST_MODEL'], "learning_rate": params["lr"], "weight_decay": params["weight_decay"] } with open(params['TRAIN_ARGS_FILE'], 'w') as fp: json.dump(training_args_dict, fp) parser = HfArgumentParser(TrainingArguments) training_args = parser.parse_json_file( json_file=params['TRAIN_ARGS_FILE'])[0] # Initialize the Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]) # Start training trainOutput = trainer.train() trainer.save_model(params['OUTPUT_DIR']) if params['grad_finetune']: # Now reload the model from best model we have found # Reading from file print("The file is loaded from ---------------------------> ", params['OUTPUT_DIR'] + 'config.json') data = json.loads( open(params['OUTPUT_DIR'] + 'config.json', "r").read()) top_model_path = data['_name_or_path'] checkpoint = top_model_path.split("/")[-1] print("checkpoint is at ... ", checkpoint) print("top_model_path is at ...", params['LOAD_BEST_MODEL']) # Config # config = BertConfig.from_pretrained( top_model_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args['cache_dir']) # Model # reloaded_model = get_model(model_path=top_model_path, cache_dir=model_args['cache_dir'], config=config, model_type=params['model_type']) # Training args # training_args_dict = { 'output_dir': params['OUTPUT_DIR'], 'num_train_epochs': params['EPOCH_END2END'], 'train_batch_size': params['BATCH_SIZE'], "evaluation_strategy": "epoch", "load_best_model_at_end": params['LOAD_BEST_MODEL'], "learning_rate": params["lr"], "weight_decay": params["weight_decay"] } with open(params['TRAIN_ARGS_FILE'], 'w') as fp: json.dump(training_args_dict, fp) parser = HfArgumentParser(TrainingArguments) training_args = parser.parse_json_file( json_file=params['TRAIN_ARGS_FILE'])[0] # Then unfreeze the bert weights and fine tune end-to-end model = reloaded_model COUNT = 1 for param in model.base_model.parameters(): if COUNT >= params['grad_finetune_layers']: param.requires_grad = True COUNT += 1 model.to('cuda') # Set to train mode. model.train() # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]) # checkpiont is here. trainer.train(checkpoint) return trainer, model
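The COUNT loop above re-enables gradients per parameter tensor, so grad_finetune_layers effectively counts tensors (weights, biases, LayerNorm parameters) rather than encoder layers. A hedged sketch, assuming a BERT-style base_model.encoder.layer stack, that unfreezes the last N transformer blocks instead:

def unfreeze_last_encoder_layers(model, n_layers):
    # Freeze the whole backbone first, then re-enable the last n_layers blocks.
    for param in model.base_model.parameters():
        param.requires_grad = False
    for block in model.base_model.encoder.layer[-n_layers:]:
        for param in block.parameters():
            param.requires_grad = True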
def fine_tune(): print("=== Fine-tune ===") args = get_args() print(args) if args.task == "imdb": data_dir = "./../../asset/imdb/" train_labels, train_texts = read_imdb_train(data_dir) elif args.task == "twitter_semeval": data_dir = "./../../asset/twitter_semeval/" train_labels, train_texts = read_twitter_train(data_dir) elif args.task == "twitter_s140": data_dir = "./../../asset/twitter_s140/" train_labels, train_texts = read_twitter_train(data_dir) # check_data() train_texts, val_texts, train_labels, val_labels = train_test_split( train_texts, train_labels, test_size=args.test_size) ## IF HAVE MUCH TIME, try to increase test size because the fine-tuning run fast train_texts = list(train_texts) val_texts = list(val_texts) train_labels = list(train_labels) val_labels = list(val_labels) model_name = args.model # model_name = "bert-base-cased" # model_name = "roberta-base" # model_name = "microsoft/deberta-large-mnli" # model_name = "bert-base-uncased" tokenizer = AutoTokenizer.from_pretrained(model_name) # check_data() train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512) val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512) train_dataset = CustomDataset(train_encodings, train_labels) val_dataset = CustomDataset(val_encodings, val_labels) model = AutoModelForSequenceClassification.from_pretrained(model_name) training_args = TrainingArguments( # output directory output_dir=f'./models/{args.task}/{model_name}/', num_train_epochs=args.epochs, # total number of training epochs per_device_train_batch_size=args. train_bs, # batch size per device during training per_device_eval_batch_size=64, # batch size for evaluation warmup_steps=args. warmup_steps, # number of warmup steps for learning rate scheduler weight_decay=args.weight_decay, # strength of weight decay # directory for storing logs logging_dir=f'./logs/{args.task}/{model_name}/', logging_steps=args.logging_steps, learning_rate=args.learning_rate, seed=0, evaluation_strategy="steps", eval_steps=args.eval_steps, save_total_limit=5, save_steps=args.save_steps, load_best_model_at_end=True) # trainer = Trainer( # # the instantiated 🤗 Transformers model to be trained # model=model, # args=training_args, # training arguments, defined above # train_dataset=train_dataset, # training dataset # eval_dataset=val_dataset, # evaluation dataset # compute_metrics=compute_metrics, # ) trainer = Trainer( # the instantiated 🤗 Transformers model to be trained model=model, args=training_args, # training arguments, defined above train_dataset=train_dataset, # training dataset eval_dataset=val_dataset, # evaluation dataset compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=7)], ) trainer.train()
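compute_metrics is referenced in fine_tune() but not defined in this snippet; a minimal sketch of what it might look like for binary sentiment classification (an assumption, not the original implementation):

import numpy as np
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds),
    }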
def train(args): wandb.login() seed_everything(args.seed) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') MODEL_NAME = args.model_name tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load dataset train_dataset_dir = "/opt/ml/input/data/train/train.tsv" dataset = load_data(train_dataset_dir) label = dataset['label'].values # k-fold cross validation cv = StratifiedKFold(n_splits=5, random_state=args.seed, shuffle=True) for idx, (train_idx, val_idx) in enumerate(cv.split(dataset, label)): # prepare tokenized datasets and labels each fold train_dataset = tokenized_dataset(dataset.iloc[train_idx], tokenizer) val_dataset = tokenized_dataset(dataset.iloc[val_idx], tokenizer) train_y = label[train_idx] val_y = label[val_idx] # make dataset for pytorch RE_train_dataset = RE_Dataset(train_dataset, train_y) RE_valid_dataset = RE_Dataset(val_dataset, val_y) # instantiate pretrained language model model = AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, num_labels=42) model.to(device) # callbacks early_stopping = EarlyStoppingCallback( early_stopping_patience=args.early_stopping_patience, early_stopping_threshold=0.00005) # set training arguemnts output_dir = './result' + str(idx) training_args = TrainingArguments( output_dir=output_dir, logging_dir='./logs', logging_steps=100, save_total_limit=1, evaluation_strategy='steps', eval_steps=100, load_best_model_at_end=True, metric_for_best_model='accuracy', greater_is_better=True, dataloader_num_workers=args.num_workers, fp16=True, seed=args.seed, run_name=args.run_name, num_train_epochs=args.epochs, per_device_train_batch_size=args.train_batch_size, per_device_eval_batch_size=args.eval_batch_size, label_smoothing_factor=args.label_smoothing_factor, learning_rate=args.lr, warmup_steps=args.warmup_steps, weight_decay=args.weight_decay, ) # traniner trainer = Trainer( model= model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_valid_dataset, # evaluation dataset tokenizer=tokenizer, compute_metrics=compute_metrics, # define metrics function callbacks=[early_stopping]) # train model trainer.train() # del model model.cpu() del model gc.collect() torch.cuda.empty_cache() # del cache path = glob(f"/opt/ml/code/result{idx}/*")[0] for filename in os.listdir(path): if filename not in [ 'config.json', 'pytorch_model.bin', '.ipynb_checkpoints' ]: rm_filename = os.path.join(path, filename) os.remove(rm_filename) wandb.finish()
def bert(train_path, val_path, INPUT_EPOCH, EVAL_STEPS, test_count, \ hyper_count, fold_count, predict): # define pretrained tokenizer and model model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained(model_name) model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2) # read data train_data = pd.read_csv(train_path, sep='\t', encoding="utf-8", names=["y", "X"]) val_data = pd.read_csv(val_path, sep='\t', encoding="utf-8", names=["y", "X"]) # preprocess data X_train = list(train_data["X"]) y_train = list(train_data["y"]) X_val = list(val_data["X"]) y_val = list(val_data["y"]) X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=128) X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=128) train_dataset = Dataset(X_train_tokenized, y_train) val_dataset = Dataset(X_val_tokenized, y_val) # define trainer args = TrainingArguments( output_dir="output", evaluation_strategy="steps", eval_steps=EVAL_STEPS, per_device_train_batch_size=32, per_device_eval_batch_size=32, num_train_epochs=INPUT_EPOCH, save_steps=3000, seed=0, load_best_model_at_end=True, ) # train pre-trained BERT model trainer = Trainer( model=model, args=args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=5)], ) trainer.train() # find the final stored model result = os.listdir("./output")[-1] with open('./output/{}/trainer_state.json'.format(result)) as f: data = json.load(f) # not predict (test) mode if predict == False: with open( f'./result/final_trainer_state_{test_count}_{hyper_count}_{fold_count}_.json', 'w') as output_file: json.dump(data, output_file) else: with open(f'./result/final_test_state_{test_count}.json', 'w') as output_file: json.dump(data, output_file) # retrieve best training loss, eval loss and accuracy best = data['best_model_checkpoint'].split("-")[-1] history = {} history['train_acc'] = 0 history['train_loss'] = 0 print(data) print(data['log_history']) for i in data['log_history']: print(i) if i['step'] == int(best): if 'loss' in i: print("training loss:\t", i['loss']) history['train_loss'] = i['loss'] if 'eval_accuracy' in i: print("eval loss:\t", i['eval_loss']) print("eval accuracy:\t", i['eval_accuracy']) print("eval f1:\t", i['eval_f1']) print("eval precision:\t", i['eval_precision']) print("eval recall:\t", i['eval_recall']) history['val_loss'] = i['eval_loss'] history['val_acc'] = i['eval_accuracy'] history['val_f1'] = i['eval_f1'] history['val_precision'] = i['eval_precision'] history['val_recall'] = i['eval_recall'] raw_pred_train, _, _ = trainer.predict(train_dataset) y_pred_train = np.argmax(raw_pred_train, axis=1) accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train) history['train_acc'] = accuracy if predict == True: raw_pred, _, _ = trainer.predict(val_dataset) # preprocess raw predictions y_pred = np.argmax(raw_pred, axis=1) report = classification_report(y_val, y_pred, target_names=class_names, digits=4) report_path = "./result/report_{}.txt".format(test_count) text_file = open(report_path, "w") text_file.write(report) # copy the best trained model for current test fold copytree(f'./output/checkpoint-{best}', f'./result/best_test_{test_count}') # clearn the output dictory shutil.rmtree('./output', ignore_errors=True) return history
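Rather than taking the last folder returned by os.listdir("./output") and parsing its trainer_state.json by hand, the best checkpoint path is also recorded on the trainer itself once training ran with load_best_model_at_end=True. A hedged alternative to the lookup above:

import json
import os


def best_checkpoint_state(trainer):
    # After training, the Trainer records the best checkpoint directory,
    # e.g. "output/checkpoint-1500", in its state.
    best_ckpt = trainer.state.best_model_checkpoint
    with open(os.path.join(best_ckpt, "trainer_state.json")) as f:
        return best_ckpt.split("-")[-1], json.load(f)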
def train(self, inoculation_train_df, eval_df, model_path, training_args, max_length=128, inoculation_patience_count=5, pd_format=True, scramble_proportion=0.0, eval_with_scramble=False): if pd_format: datasets = {} datasets["train"] = Dataset.from_pandas(inoculation_train_df) datasets["validation"] = Dataset.from_pandas(eval_df) else: datasets = {} datasets["train"] = inoculation_train_df datasets["validation"] = eval_df logger.info(f"***** Train Sample Count (Verify): %s *****"%(len(datasets["train"]))) logger.info(f"***** Valid Sample Count (Verify): %s *****"%(len(datasets["validation"]))) label_list = datasets["validation"].unique("label") label_list.sort() # Let's sort it for determinism sentence1_key, sentence2_key = self.task_config # we will scramble out input sentence here # TODO: we scramble both train and eval sets if self.task_name == "sst3" or self.task_name == "cola": def scramble_inputs(proportion, example): original_text = example[sentence1_key] original_sentence = basic_tokenizer.tokenize(original_text) max_length = len(original_sentence) scramble_length = int(max_length*proportion) scramble_start = random.randint(0, len(original_sentence)-scramble_length) scramble_end = scramble_start + scramble_length scramble_sentence = original_sentence[scramble_start:scramble_end] random.shuffle(scramble_sentence) scramble_text = original_sentence[:scramble_start] + scramble_sentence + original_sentence[scramble_end:] out_string = " ".join(scramble_text).replace(" ##", "").strip() example[sentence1_key] = out_string return example elif self.task_name == "snli" or self.task_name == "mrpc" or self.task_name == "qnli": def scramble_inputs(proportion, example): original_premise = example[sentence1_key] original_hypothesis = example[sentence2_key] if original_hypothesis == None: original_hypothesis = "" try: original_premise_tokens = basic_tokenizer.tokenize(original_premise) original_hypothesis_tokens = basic_tokenizer.tokenize(original_hypothesis) except: print("Please debug these sequence...") print(original_premise) print(original_hypothesis) max_length = len(original_premise_tokens) scramble_length = int(max_length*proportion) scramble_start = random.randint(0, max_length-scramble_length) scramble_end = scramble_start + scramble_length scramble_sentence = original_premise_tokens[scramble_start:scramble_end] random.shuffle(scramble_sentence) scramble_text_premise = original_premise_tokens[:scramble_start] + scramble_sentence + original_premise_tokens[scramble_end:] max_length = len(original_hypothesis_tokens) scramble_length = int(max_length*proportion) scramble_start = random.randint(0, max_length-scramble_length) scramble_end = scramble_start + scramble_length scramble_sentence = original_hypothesis_tokens[scramble_start:scramble_end] random.shuffle(scramble_sentence) scramble_text_hypothesis = original_hypothesis_tokens[:scramble_start] + scramble_sentence + original_hypothesis_tokens[scramble_end:] out_string_premise = " ".join(scramble_text_premise).replace(" ##", "").strip() out_string_hypothesis = " ".join(scramble_text_hypothesis).replace(" ##", "").strip() example[sentence1_key] = out_string_premise example[sentence2_key] = out_string_hypothesis return example if scramble_proportion > 0.0: logger.info(f"You are scrambling the inputs to test syntactic feature importance!") datasets["train"] = datasets["train"].map(partial(scramble_inputs, scramble_proportion)) if eval_with_scramble: logger.info(f"You are scrambling the evaluation data as well!") datasets["validation"] = 
datasets["validation"].map(partial(scramble_inputs, scramble_proportion)) padding = "max_length" sentence1_key, sentence2_key = self.task_config label_to_id = None def preprocess_function(examples): # Tokenize the texts args = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = self.tokenizer(*args, padding=padding, max_length=max_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [label_to_id[l] for l in examples["label"]] return result datasets["train"] = datasets["train"].map(preprocess_function, batched=True) datasets["validation"] = datasets["validation"].map(preprocess_function, batched=True) train_dataset = datasets["train"] eval_dataset = datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") metric = load_metric("glue", "sst2") # any glue task will do the job, just for eval loss def asenti_compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) result_to_print = classification_report(p.label_ids, preds, digits=5, output_dict=True) print(classification_report(p.label_ids, preds, digits=5)) mcc_scores = matthews_corrcoef(p.label_ids, preds) logger.info(f"MCC scores: {mcc_scores}.") result_to_return = metric.compute(predictions=preds, references=p.label_ids) result_to_return["Macro-F1"] = result_to_print["macro avg"]["f1-score"] result_to_return["MCC"] = mcc_scores return result_to_return # Initialize our Trainer. We are only intersted in evaluations trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=asenti_compute_metrics, tokenizer=self.tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. data_collator=default_data_collator ) # Early stop if inoculation_patience_count != -1: trainer.add_callback(EarlyStoppingCallback(inoculation_patience_count)) # Training if training_args.do_train: logger.info("*** Training our model ***") trainer.train( # we don't need this now. # model_path=model_path ) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") tasks = [self.task_name] eval_datasets = [eval_dataset] for eval_dataset, task in zip(eval_datasets, tasks): eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info(f"***** Eval results {task} *****") for key, value in eval_result.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") eval_results.update(eval_result)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # override default run name and log all args wandb.init(project="wav2vec4humans", config=parser.parse_args()) # Detecting last checkpoint. last_checkpoint = None if (os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]' def remove_special_characters(batch, train=True): batch["text"] = (re.sub(chars_to_ignore_regex, "", unidecode(batch["sentence"])).lower().strip()) if train: batch["text"] += " " return batch def extract_all_chars(batch): all_text = " ".join(batch["text"]) vocab = list(set(all_text)) return {"vocab": [vocab], "all_text": [all_text]} resampler = dict() def get_resampler(sampling_rate): if sampling_rate in resampler.keys(): return resampler[sampling_rate] else: logger.info(f"Creating new resampler for {sampling_rate}") resampler[sampling_rate] = torchaudio.transforms.Resample( sampling_rate, 16_000) return resampler[sampling_rate] # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. 
def speech_file_to_array_fn(batch): speech_array, sampling_rate = torchaudio.load(batch["path"]) batch["speech"] = get_resampler(sampling_rate)( speech_array).squeeze().numpy() batch["sampling_rate"] = 16_000 batch["target_text"] = batch["text"] batch["duration"] = len(speech_array.squeeze()) / sampling_rate return batch def filter_by_duration(batch): return (batch["duration"] <= 10 and batch["duration"] >= 1 and len(batch["target_text"]) > 5) # about 98% of samples def prepare_dataset(batch): # check that all files have the correct sampling rate assert ( len(set(batch["sampling_rate"])) == 1 ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." batch["input_values"] = processor( batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values # Setup the processor for targets with processor.as_target_processor(): batch["labels"] = processor(batch["target_text"]).input_ids return batch def get_length(item): # speeds up grouping by length in pre-loaded dataset item["length"] = len(item["input_values"]) return item # Pre-processed datasets dataset_path = Path(os.getenv("HF_HOME", ".")) / "datasets" dataset_train_path = f"{dataset_path}/{data_args.dataset_config_name}/train/{data_args.train_split_name}" dataset_eval_path = f"{dataset_path}/{data_args.dataset_config_name}/eval" dataset_test_path = f"{dataset_path}/{data_args.dataset_config_name}/test" vocab_path = f"{dataset_path}/{data_args.dataset_config_name}/vocab/vocab_test_{data_args.train_split_name}.json" train_dataset = None eval_dataset = None if training_args.do_eval else False log_timestamp() if Path(dataset_train_path).exists() and Path(vocab_path).exists(): train_dataset = datasets.load_from_disk(dataset_train_path) log_timestamp("load pre-processed data") else: train_dataset = datasets.load_dataset( "common_voice", data_args.dataset_config_name, split=data_args.train_split_name, ) log_timestamp("load data") train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"]) log_timestamp("remove special characters") if training_args.do_eval: if Path(dataset_eval_path).exists(): eval_dataset = datasets.load_from_disk(dataset_eval_path) else: eval_dataset = datasets.load_dataset("common_voice", data_args.dataset_config_name, split="test") eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"]) log_timestamp() if Path(dataset_test_path).exists() and Path(vocab_path).exists(): test_dataset = datasets.load_from_disk(dataset_test_path) else: test_dataset = datasets.load_dataset("common_voice", data_args.dataset_config_name, split="test") test_dataset = test_dataset.map( lambda x: remove_special_characters(x, train=False), remove_columns=["sentence"], ) log_timestamp() if not Path(vocab_path).exists(): # create vocab vocab_train = train_dataset.map( extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=train_dataset.column_names, ) vocab_test = test_dataset.map( extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=test_dataset.column_names, ) vocab_list = list( set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0])) vocab_dict = {v: k for k, v in enumerate(vocab_list)} vocab_dict["|"] = vocab_dict[" "] del vocab_dict[" "] vocab_dict["[UNK]"] = len(vocab_dict) vocab_dict["[PAD]"] = len(vocab_dict) Path(vocab_path).parent.mkdir(parents=True, exist_ok=True) with open(vocab_path, "w") as vocab_file: json.dump(vocab_dict, vocab_file) log_timestamp("create vocab") # Load 
pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. tokenizer = Wav2Vec2CTCTokenizer( vocab_path, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", ) feature_extractor = Wav2Vec2FeatureExtractor( feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True, ) processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) model = Wav2Vec2ForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, activation_dropout=model_args.activation_dropout, attention_dropout=model_args.attention_dropout, hidden_dropout=model_args.hidden_dropout, feat_proj_dropout=model_args.feat_proj_dropout, mask_time_prob=model_args.mask_time_prob, gradient_checkpointing=model_args.gradient_checkpointing, layerdrop=model_args.layerdrop, ctc_loss_reduction="mean", pad_token_id=processor.tokenizer.pad_token_id, vocab_size=len(processor.tokenizer), ) log_timestamp("load model") if not Path(dataset_train_path).exists(): train_dataset = train_dataset.map( speech_file_to_array_fn, remove_columns=train_dataset.column_names, num_proc=data_args.preprocessing_num_workers, ) log_timestamp("load audio") train_dataset = train_dataset.filter( filter_by_duration, remove_columns=["duration"], num_proc=data_args.preprocessing_num_workers, ) log_timestamp("filter data") train_dataset = train_dataset.map( prepare_dataset, remove_columns=train_dataset.column_names, batch_size=training_args.per_device_train_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) log_timestamp("process data") train_dataset = train_dataset.map( get_length, num_proc=data_args.preprocessing_num_workers, ) log_timestamp("add input length") train_dataset.save_to_disk(dataset_train_path) log_timestamp("save to disk") if not Path(dataset_eval_path).exists() and training_args.do_eval: eval_dataset = eval_dataset.map( speech_file_to_array_fn, remove_columns=eval_dataset.column_names, num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.filter( filter_by_duration, remove_columns=["duration"], num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.map( prepare_dataset, remove_columns=eval_dataset.column_names, batch_size=training_args.per_device_eval_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.map( get_length, num_proc=data_args.preprocessing_num_workers, ) eval_dataset.save_to_disk(dataset_eval_path) log_timestamp() if not Path(dataset_test_path).exists(): test_dataset = test_dataset.map( speech_file_to_array_fn, num_proc=data_args.preprocessing_num_workers, ) test_dataset = test_dataset.filter(filter_by_duration, remove_columns=["duration"]) test_dataset.save_to_disk(dataset_test_path) log_timestamp() # Metric cer_metric = datasets.load_metric("cer") # we use a custom WER that considers punctuation wer_metric = datasets.load_metric("metrics/wer_punctuation.py") def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id pred_str = processor.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = processor.batch_decode(pred.label_ids, group_tokens=False) cer = cer_metric.compute(predictions=pred_str, references=label_str) wer = wer_metric.compute(predictions=pred_str, 
references=label_str) return {"cer": cer, "wer": wer} log_timestamp() if model_args.freeze_feature_extractor: model.freeze_feature_extractor() log_timestamp("freeze feature extractor") # Data collator data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True) log_timestamp("create data collator") # Initialize our Trainer trainer = CTCTrainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) loss_nan_stopping_callback = LossNaNStoppingCallback() early_stopping_callback = EarlyStoppingCallback() timing_callback = TimingCallback() trainer.add_callback(loss_nan_stopping_callback) trainer.add_callback(early_stopping_callback) trainer.add_callback(timing_callback) # Training log_timestamp("setup trainer") if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None log_timestamp() train_result = trainer.train(resume_from_checkpoint=checkpoint) log_timestamp("train model") trainer.save_model() # save the feature_extractor and the tokenizer if is_main_process(training_args.local_rank): processor.save_pretrained(training_args.output_dir) metrics = train_result.metrics metrics["train_samples"] = len(train_dataset) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Final test metrics logger.info("*** Test ***") log_timestamp() if loss_nan_stopping_callback.stopped: test_cer, test_wer = 1.0, 2.0 logger.info( "Loss NaN detected, typically resulting in bad WER & CER so we won't calculate them." ) else: def evaluate(batch): inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True) with torch.no_grad(): logits = model( inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda"), ).logits pred_ids = torch.argmax(logits, dim=-1) batch["pred_strings"] = processor.batch_decode(pred_ids) return batch model.to("cuda") # no need to cache mapped test_dataset datasets.set_caching_enabled(False) result = test_dataset.map( evaluate, batched=True, batch_size=training_args.per_device_eval_batch_size) log_timestamp("get test predictions") test_cer = cer_metric.compute(predictions=result["pred_strings"], references=result["text"]) test_wer = wer_metric.compute(predictions=result["pred_strings"], references=result["text"]) log_timestamp("compute test metrics") metrics = {"cer": test_cer, "wer": test_wer} wandb.log({f"test/{k}": v for k, v in metrics.items()}) trainer.save_metrics("test", metrics) logger.info(metrics) # save model files log_timestamp() if not loss_nan_stopping_callback.stopped: artifact = wandb.Artifact(name=f"model-{wandb.run.id}", type="model", metadata={"cer": test_cer}) for f in Path(training_args.output_dir).iterdir(): if f.is_file(): artifact.add_file(str(f)) wandb.run.log_artifact(artifact) log_timestamp("log artifacts")
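LossNaNStoppingCallback and TimingCallback used above are project-specific and not shown in this excerpt. Purely as an illustration of the TrainerCallback API such a NaN-stopping callback would build on (not the project's implementation), a minimal sketch:

import math

from transformers import TrainerCallback


class LossNaNStoppingCallback(TrainerCallback):
    """Stop training as soon as a NaN training loss is logged."""

    def __init__(self):
        self.stopped = False

    def on_log(self, args, state, control, logs=None, **kwargs):
        loss = (logs or {}).get("loss")
        if loss is not None and math.isnan(loss):
            self.stopped = True
            control.should_training_stop = True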
def train(args): wandb.login() seed_everything(args.seed) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') MODEL_NAME = args.model_name tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) # load dataset train_dataset_dir = "/opt/ml/code2/data/train_new_tag.tsv" train_data = load_dataset(train_dataset_dir) train_x = list(train_data.iloc[:, 0]) train_y = list(train_data.iloc[:, -1]) valid_dataset_dir = "/opt/ml/code2/data/valid_tag.tsv" valid_data = load_dataset(valid_dataset_dir) val_x = list(valid_data.iloc[:, 0]) val_y = list(valid_data.iloc[:, -1]) # tokenize datasets tokenized_train = tokenized_dataset(train_x, tokenizer) tokenized_val = tokenized_dataset(val_x, tokenizer) # make dataset for pytorch RE_train_dataset = RE_Dataset(tokenized_train, train_y) RE_valid_dataset = RE_Dataset(tokenized_val, val_y) # instantiate pretrained language model model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=8) model.resize_token_embeddings(len(tokenizer)) model.to(device) # optimizer and scheduler # optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) # scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=300*args.epochs) # callbacks early_stopping = EarlyStoppingCallback( early_stopping_patience=args.early_stopping_patience, early_stopping_threshold=0.00005) training_args = TrainingArguments( output_dir='./results', logging_dir='./logs', logging_steps=100, save_total_limit=1, evaluation_strategy='steps', eval_steps=100, load_best_model_at_end=True, metric_for_best_model='accuracy', greater_is_better=True, dataloader_num_workers=args.num_workers, fp16=True, seed=args.seed, run_name=args.run_name, num_train_epochs=args.epochs, per_device_train_batch_size=args.train_batch_size, per_device_eval_batch_size=args.eval_batch_size, label_smoothing_factor=args.label_smoothing_factor, learning_rate=args.lr, warmup_steps=args.warmup_steps, weight_decay=args.weight_decay, ) trainer = Trainer( model=model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_valid_dataset, # evaluation dataset tokenizer=tokenizer, compute_metrics=compute_metrics, # define metrics function # optimizers=[optimizer, scheduler], callbacks=[early_stopping]) # train model trainer.train() wandb.finish()
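If the commented-out optimizer and scheduler in train(args) above were re-enabled, they would be handed to the Trainer through its optimizers tuple. A sketch of that wiring, using torch.optim.AdamW and keeping the original comment's assumption of roughly 300 optimisation steps per epoch:

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=args.warmup_steps,
    num_training_steps=300 * args.epochs,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=RE_train_dataset,
    eval_dataset=RE_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping],
)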
        output_dir=args.checkpoint_dir,       # output directory
        num_train_epochs=args.epochs,         # total number of training epochs
        per_device_train_batch_size=args.bs,  # batch size per device during training
        per_device_eval_batch_size=args.bs,   # batch size for evaluation
        # warmup_steps=warmup_steps,          # number of warmup steps for learning rate scheduler
        weight_decay=args.wd,                 # strength of weight decay
        evaluation_strategy="epoch",          # evaluation interval
        logging_dir=args.checkpoint_dir,      # directory for storing logs
        save_strategy="epoch",                # checkpoint save interval
        logging_steps=500,
        metric_for_best_model=args.criterion,
        load_best_model_at_end=True)

    collator = get_collator(tokenizer)
    es_callback = EarlyStoppingCallback(early_stopping_patience=5)
    print(f"- Training args: {training_args}")

    trainer = Trainer(model,
                      args=training_args,
                      train_dataset=train_ds,
                      eval_dataset=test_ds,
                      compute_metrics=compute_metrics,
                      optimizers=(optimizer, scheduler),
                      data_collator=collator)
    trainer.add_callback(es_callback)
    trainer.train()

    print(f"- Label encoder mapping:")
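get_collator is not shown in this excerpt; a plausible minimal stand-in (hypothetical, not the original helper) would simply wrap DataCollatorWithPadding:

from transformers import DataCollatorWithPadding


def get_collator(tokenizer):
    # Hypothetical helper: dynamic padding to the longest sequence in each batch.
    return DataCollatorWithPadding(tokenizer=tokenizer)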
"recall": recall, "f1": f1 } # Define Trainer args = TrainingArguments( output_dir= "E:\Projects\Emotion_detection_gihan\\finbert_experiments\models\emotion_lines_500_steps", evaluation_strategy="steps", eval_steps=500, per_device_train_batch_size=8, per_device_eval_batch_size=8, num_train_epochs=3, save_steps=3000, seed=0, load_best_model_at_end=True, ) trainer = Trainer( model=model, args=args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], ) # Train pre-trained model trainer.train() # 500 is the best
def fine_tune(cfg: DictConfig) -> float: """fine tune bert module""" init_wandb(cfg) train_ds, test_ds = getDataset(cfg) config = AutoConfig.from_pretrained(cfg.model.arch, num_labels=cfg.model.num_labels) model = AutoModelForSequenceClassification.from_pretrained(cfg.model.arch, config=config) id = wandb.run.name.rsplit("-", 1)[1] trainConfig = cfg.train output_dir = os.path.join(trainConfig["output_dir"], id) print("module output dir = ", output_dir) train_args = TrainingArguments( # module pred/ckpt output_dir=output_dir, # tensorboard logs logging_dir="./logs", num_train_epochs=trainConfig["epoch"], per_device_train_batch_size=trainConfig["train_batch_size"], per_device_eval_batch_size=trainConfig["eval_batch_size"], # x (logging / eval /save) every acc * x_steps gradient_accumulation_steps=trainConfig["acc_batch"], evaluation_strategy=IntervalStrategy.EPOCH, label_smoothing_factor=trainConfig["label_smooth"], # AdamW learning_rate=trainConfig["lr"], warmup_steps=trainConfig["warmup"], # apply to all layers but bias / LayerNorm weight_decay=trainConfig["wd"], # save_total_limit=2, # if True, ignore param save_strategy / save_steps / save_total_limit load_best_model_at_end=True, # report_to=["none"], report_to=["wandb"], seed=cfg.seed, logging_strategy=IntervalStrategy.STEPS, metric_for_best_model=trainConfig["metric"]) trainer = Trainer( model, args=train_args, train_dataset=train_ds, eval_dataset=test_ds, callbacks=[ EarlyStoppingCallback( early_stopping_patience=trainConfig["early_stopping_patience"] ), ], compute_metrics=compute_metrics, ) print("logs in dir", os.getcwd()) print("gpu count = ", trainer.args.n_gpu, "is_fp16 =", trainer.args.fp16) trainer.train() trainer.evaluate() # best module trainer.model.save_pretrained(os.path.join(output_dir, "best")) y_pred_tuple = trainer.predict(test_ds) logits, y_true, metrics = y_pred_tuple y_pred = logits.argmax(-1) plot_heat_map(y_true, y_pred, cfg.model.num_labels) acc = accuracy_score(y_true, y_pred) print(acc) wandb.finish() return acc
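plot_heat_map is defined elsewhere in that project; purely as an illustration (a hypothetical stand-in, not the original), a confusion-matrix heat map can be produced with scikit-learn and matplotlib:

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay


def plot_heat_map(y_true, y_pred, num_labels):
    # Hypothetical stand-in: draw and save a confusion matrix for num_labels classes.
    fig, ax = plt.subplots(figsize=(max(6, num_labels), max(6, num_labels)))
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, ax=ax, colorbar=False)
    fig.tight_layout()
    fig.savefig("confusion_matrix.png")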