                 vocab_size=30_000, min_freq=2, max_len=256, block_size=64,
                 mlm_probability=0.15, num_attention_heads=6, num_hidden_layers=3,
                 epochs=5, batch_size=30, val_batch_size=60, eval_steps=50,
                 **kwargs):
    # instantiate a byte-level BPE tokenizer
    bpe_tokenizer = ByteLevelBPETokenizer()

    # train the tokenizer on the train and validation text files
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train([input_path, input_path_val],
                        vocab_size=vocab_size,
                        min_frequency=min_freq,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])

    # save the trained tokenizer files
    tok_path = os.path.join(output_path, "tokenizer")
    os.makedirs(tok_path, exist_ok=True)
    bpe_tokenizer.save_model(tok_path)
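# ---------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the original source): `_pretty_print`
# is called throughout these scripts but is not defined in this excerpt. It is
# assumed to print a visually separated status banner; a minimal version could
# look like the hypothetical helper below, and the project's actual helper may
# differ.
def _pretty_print_example(message, width=80):
    """Print a message framed by separator lines so pipeline stages stand out in logs."""
    print("=" * width)
    print(message)
    print("=" * width)
# ---------------------------------------------------------------------------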
    # create a timestamped run directory
    output_path = os.path.join(output_path, f"{flag}-{c_time}")
    os.makedirs(output_path, exist_ok=True)

    # read training configuration
    test_size = train_config.get("test_size", 0.2)
    list_procs = train_config["list_procs"]
    text_col = train_config["text_col"]
    label_col = train_config["label_col"]
    tokenizer_path = train_config["tokenizer_path"]
    max_length = train_config["max_length"]
    train_batch_size = train_config["train_batch_size"]
    val_batch_size = train_config["val_batch_size"]
    num_workers = train_config["num_workers"]
    pin_memory = train_config.get("pin_memory", False)
    balanced_classes = train_config.get("balanced_classes", False)

    _pretty_print("Loading data")
    df = pd.read_csv(input_path)
    df_test = pd.read_csv(test_path)

    # stratified train/validation split
    df_train, df_val = train_test_split(
        df, test_size=test_size, random_state=123, stratify=df[label_col])

    tokenizer = RobertaTokenizerFast.from_pretrained(
        tokenizer_path, max_len=max_length)
    train_loader, val_loader = get_roberta_dataloaders(
        df_train, df_val, list_procs, text_col, label_col, tokenizer, max_length,
        train_batch_size, val_batch_size, num_workers, pin_memory)

    # prepare class weights
    classes_weights = None
    if balanced_classes:
        val_counts = df_train[label_col].value_counts()
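# ---------------------------------------------------------------------------
# NOTE (assumed sketch, not the original code): the class-weight computation is
# truncated in this excerpt right after `value_counts()`. A common choice is
# inverse class frequency (the sklearn `class_weight="balanced"` convention),
# which the hypothetical helper below demonstrates for use with
# nn.CrossEntropyLoss(weight=...); the project's actual formula may differ.
import pandas as pd
import torch


def inverse_frequency_weights(labels: pd.Series) -> torch.Tensor:
    """Return one weight per class, proportional to 1 / class frequency,
    ordered by sorted class label (assuming labels are integer-encoded 0..C-1)."""
    counts = labels.value_counts().sort_index()
    weights = len(labels) / (len(counts) * counts)
    return torch.tensor(weights.values, dtype=torch.float)
# ---------------------------------------------------------------------------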
def run_training(model, optimizer, scheduler, output_path, train_loader, val_loader,
                 epochs, patience, epochs_pretrain, mixed_precision, classes_weights):
    # trainer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if classes_weights is not None:
        classes_weights = classes_weights.to(device)

    crit = nn.CrossEntropyLoss(weight=classes_weights)
    metrics = {"accuracy": Accuracy(), "loss": Loss(crit)}

    trainer = create_supervised_trainer_with_pretraining(
        model, optimizer, crit, device=device, epochs_pretrain=epochs_pretrain,
        mixed_precision=mixed_precision)
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    # output paths
    path_ckpt = os.path.join(output_path, "model_ckpt")
    log_dir = os.path.join(output_path, "log_dir")
    os.makedirs(log_dir, exist_ok=True)

    # tensorboard logging of train/validation metrics at the end of each epoch
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(train_evaluator,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["accuracy", "loss"],
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(val_evaluator,
                     log_handler=OutputHandler(tag="validation",
                                               metric_names=["accuracy", "loss"],
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # training progress bar
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names="all")

    # @trainer.on(Events.EPOCH_COMPLETED)
    def log_training_results(engine):
        train_evaluator.run(train_loader)
        val_evaluator.run(val_loader)
        train_loss = train_evaluator.state.metrics["loss"]
        val_loss = val_evaluator.state.metrics["loss"]
        train_acc = train_evaluator.state.metrics["accuracy"]
        val_acc = val_evaluator.state.metrics["accuracy"]
        pbar.log_message(
            "Training Results - Epoch: {} Loss: {:.6f} Accuracy: {:.6f}".format(
                engine.state.epoch, train_loss, train_acc))
        pbar.log_message(
            "Validation Results - Epoch: {} Loss: {:.6f} Accuracy: {:.6f}".format(
                engine.state.epoch, val_loss, val_acc))
        # reset the progress bar counters
        pbar.n = pbar.last_print_n = 0

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)

    # def get_val_loss(engine):
    #     return -engine.state.metrics['loss']

    def get_val_acc(engine):
        return engine.state.metrics['accuracy']

    # checkpointing and early stopping on validation accuracy
    checkpointer = ModelCheckpoint(
        path_ckpt, "model", score_function=get_val_acc, score_name="accuracy",
        require_empty=False)
    early_stopper = EarlyStopping(patience, get_val_acc, trainer)

    to_save = {'optimizer': optimizer, 'model': model}
    if scheduler is not None:
        to_save["scheduler"] = scheduler
    val_evaluator.add_event_handler(Events.COMPLETED, checkpointer, to_save)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stopper)
    if scheduler is not None:
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # free resources after every iteration
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    train_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    val_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())

    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()

    # evaluation with the best checkpoint
    model.load_state_dict(torch.load(
        glob.glob(os.path.join(path_ckpt, "*.pth"))[0])["model"])
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    train_evaluator.run(train_loader)
    val_evaluator.run(val_loader)

    _pretty_print("Evaluating best model")
    pbar.log_message(
        "Best model on training set - Loss: {:.6f} Accuracy: {:.6f}".format(
            train_evaluator.state.metrics["loss"],
            train_evaluator.state.metrics["accuracy"]))
    pbar.log_message(
        "Best model on validation set - Loss: {:.6f} Accuracy: {:.6f}".format(
            val_evaluator.state.metrics["loss"],
            val_evaluator.state.metrics["accuracy"]))

    return model, train_evaluator.state.metrics, val_evaluator.state.metrics
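# ---------------------------------------------------------------------------
# NOTE (assumed sketch, not part of the original source): `_empty_cache` is
# attached above as an ITERATION_COMPLETED handler but is not defined in this
# excerpt. It presumably frees cached GPU memory between iterations; a minimal
# version could look like the hypothetical helper below.
import gc

import torch


def _empty_cache_example():
    """Collect Python garbage and release cached CUDA memory, if a GPU is present."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
# ---------------------------------------------------------------------------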
    # create a timestamped run directory
    output_path = os.path.join(output_path, f"{flag}-{c_time}")
    os.makedirs(output_path, exist_ok=True)

    # read training configuration
    list_procs = train_config["list_procs"]
    text_col = train_config["text_col"]
    label_col = train_config["label_col"]
    with open(train_config["dict_label_path"]) as f:
        dict_label = json.load(f)
    tokenizer_path = train_config["tokenizer_path"]
    max_length = train_config["max_length"]
    train_batch_size = train_config["train_batch_size"]
    val_batch_size = train_config["val_batch_size"]
    num_workers = train_config.get("num_workers", 1)
    pin_memory = train_config.get("pin_memory", False)
    balanced_classes = train_config.get("balanced_classes", False)

    _pretty_print("Loading data")
    df = pd.read_csv(input_path).reset_index(drop=True)
    df_test = pd.read_csv(test_path).reset_index(drop=True)
    df_sub = pd.DataFrame({"ID": df_test["ID"]})
    list_all_preds = []

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_path,
        max_length=max_length,
        lowercase=True,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",