                                  vocab_size=30_000,
                                  min_freq=2,
                                  max_len=256,
                                  block_size=64,
                                  mlm_probability=0.15,
                                  num_attention_heads=6,
                                  num_hidden_layers=3,
                                  epochs=5,
                                  batch_size=30,
                                  val_batch_size=60,
                                  eval_steps=50,
                                  **kwargs):
    # instantiate tokenizer
    bpe_tokenizer = ByteLevelBPETokenizer()
    # train tokenizer
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train([input_path, input_path_val],
                        vocab_size=vocab_size,
                        min_frequency=min_freq,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])
    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    os.makedirs(tok_path, exist_ok=True)
    bpe_tokenizer.save_model(tok_path)
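    # save_model writes vocab.json and merges.txt; note (not in the original
    # listing) that RobertaTokenizerFast.from_pretrained(tok_path) can reload
    # them later, as the fine-tuning code further below does.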
    output_path = os.path.join(output_path, f"{flag}-{c_time}")
    os.makedirs(output_path, exist_ok=True)
    test_size = train_config.get("test_size", 0.2)
    list_procs = train_config["list_procs"]
    text_col = train_config["text_col"]
    label_col = train_config["label_col"]
    tokenizer_path = train_config["tokenizer_path"]
    max_length = train_config["max_length"]
    train_batch_size = train_config["train_batch_size"]
    val_batch_size = train_config["val_batch_size"]
    num_workers = train_config["num_workers"]
    pin_memory = train_config.get("pin_memory", False)
    balanced_classes = train_config.get("balanced_classes", False)

    _pretty_print("Loading data")
    df = pd.read_csv(input_path)
    df_test = pd.read_csv(test_path)
    df_train, df_val = train_test_split(
        df, test_size=test_size, random_state=123, stratify=df[label_col])

    tokenizer = RobertaTokenizerFast.from_pretrained(
        tokenizer_path, model_max_length=max_length)  # `max_len` is deprecated in recent transformers
    train_loader, val_loader = get_roberta_dataloaders(df_train, df_val, list_procs,
                                                       text_col, label_col, tokenizer, max_length,
                                                       train_batch_size, val_batch_size, num_workers, pin_memory)

    # prepare class weights
    classes_weights = None
    if balanced_classes:
        val_counts = df_train[label_col].value_counts()
        # plausible completion (the listing is truncated here): inverse-frequency
        # weights, ordered by class index to match CrossEntropyLoss
        classes_weights = torch.tensor(
            (1.0 / val_counts.sort_index()).values, dtype=torch.float)

# Imports inferred for run_training (an assumption: the original listing omits
# them and appears to target a pre-0.3 release of PyTorch Ignite):
import glob
import os

import torch
import torch.nn as nn
from ignite.contrib.handlers import ProgressBar
from ignite.contrib.handlers.tensorboard_logger import (OutputHandler,
                                                        TensorboardLogger)
from ignite.engine import Events, create_supervised_evaluator
from ignite.handlers import EarlyStopping, ModelCheckpoint
from ignite.metrics import Accuracy, Loss


def run_training(model, optimizer, scheduler, output_path,
                 train_loader, val_loader, epochs, patience,
                 epochs_pretrain, mixed_precision, classes_weights):

    # trainer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if classes_weights is not None:
        classes_weights = classes_weights.to(device)
    crit = nn.CrossEntropyLoss(weight=classes_weights)
    metrics = {"accuracy": Accuracy(), "loss": Loss(crit)}
    trainer = create_supervised_trainer_with_pretraining(
        model, optimizer, crit, device=device, epochs_pretrain=epochs_pretrain,
        mixed_precision=mixed_precision)
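    # `create_supervised_trainer_with_pretraining`, like `_pretty_print` and
    # `_empty_cache` used below, is a project-specific helper defined elsewhere
    # in the source module, not part of Ignite itself.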
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    # Out paths
    path_ckpt = os.path.join(output_path, "model_ckpt")
    log_dir = os.path.join(output_path, "log_dir")
    os.makedirs(log_dir, exist_ok=True)

    # tensorboard (pre-0.3 Ignite API; newer releases replace `another_engine`
    # with `global_step_transform=global_step_from_engine(trainer)`)
    tb_logger = TensorboardLogger(log_dir=log_dir)
    tb_logger.attach(train_evaluator,
                     log_handler=OutputHandler(tag="training",
                                               metric_names=["accuracy", "loss"],
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)
    tb_logger.attach(val_evaluator,
                     log_handler=OutputHandler(tag="validation",
                                               metric_names=["accuracy", "loss"],
                                               another_engine=trainer),
                     event_name=Events.EPOCH_COMPLETED)

    # training progress
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names="all")

    def log_training_results(engine):
        train_evaluator.run(train_loader)
        val_evaluator.run(val_loader)
        train_loss = train_evaluator.state.metrics["loss"]
        val_loss = val_evaluator.state.metrics["loss"]
        train_acc = train_evaluator.state.metrics["accuracy"]
        val_acc = val_evaluator.state.metrics["accuracy"]
        pbar.log_message(
            "Training Results - Epoch: {}  Loss: {:.6f}  Accuracy: {:.6f}".format(engine.state.epoch, train_loss, train_acc))
        pbar.log_message(
            "Validation Results - Epoch: {}  Loss: {:.6f}  Accuracy: {:.6f}".format(engine.state.epoch, val_loss, val_acc))

        pbar.n = pbar.last_print_n = 0  # reset the progress bar between epochs

    trainer.add_event_handler(Events.EPOCH_COMPLETED, log_training_results)

    def get_val_acc(engine):
        # score used both for checkpoint selection and for early stopping
        return engine.state.metrics['accuracy']

    # checkpoint and early stopping
    checkpointer = ModelCheckpoint(
        path_ckpt, "model", score_function=get_val_acc, score_name="accuracy", require_empty=False)
    early_stopper = EarlyStopping(patience, get_val_acc, trainer)
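    # Both handlers score on validation accuracy: the checkpointer keeps the
    # best-scoring weights, and the early stopper halts training once
    # `patience` evaluations pass without improvement.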

    to_save = {'optimizer': optimizer, 'model': model}
    if scheduler is not None:
        to_save["scheduler"] = scheduler
    val_evaluator.add_event_handler(Events.COMPLETED, checkpointer, to_save)
    val_evaluator.add_event_handler(Events.COMPLETED, early_stopper)
    if scheduler is not None:
        trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # free resources
    trainer.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    train_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())
    val_evaluator.add_event_handler(
        Events.ITERATION_COMPLETED, lambda _: _empty_cache())

    trainer.run(train_loader, max_epochs=epochs)
    tb_logger.close()

    # Evaluation with best model (ModelCheckpoint keeps n_saved=1 by default,
    # so the single *.pth file is the best-scoring checkpoint)
    model.load_state_dict(torch.load(
        glob.glob(os.path.join(path_ckpt, "*.pth"))[0])["model"])
    train_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)
    val_evaluator = create_supervised_evaluator(
        model, metrics=metrics, device=device)

    train_evaluator.run(train_loader)
    val_evaluator.run(val_loader)

    _pretty_print("Evaluating best model")
    pbar.log_message(
        "Best model on training set - Loss: {:.6f}  Accuracy: {:.6f}"
        .format(train_evaluator.state.metrics["loss"], train_evaluator.state.metrics["accuracy"]))
    pbar.log_message(
        "Best model on validation set - Loss: {:.6f}  Accuracy: {:.6f}"
        .format(val_evaluator.state.metrics["loss"], val_evaluator.state.metrics["accuracy"]))

    return model, train_evaluator.state.metrics, val_evaluator.state.metrics
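
# Usage sketch (an assumption, not part of the original example): the model
# choice, hyper-parameters, and output path below are illustrative only.
from transformers import RobertaForSequenceClassification


def _demo_run_training(train_loader, val_loader):
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base", num_labels=2)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    return run_training(model, optimizer, scheduler=None,
                        output_path="runs/demo",
                        train_loader=train_loader, val_loader=val_loader,
                        epochs=5, patience=3, epochs_pretrain=0,
                        mixed_precision=False, classes_weights=None)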
Example #4
    output_path = os.path.join(output_path, f"{flag}-{c_time}")
    os.makedirs(output_path, exist_ok=True)
    list_procs = train_config["list_procs"]
    text_col = train_config["text_col"]
    label_col = train_config["label_col"]
    with open(train_config["dict_label_path"]) as f:
        dict_label = json.load(f)
    tokenizer_path = train_config["tokenizer_path"]
    max_length = train_config["max_length"]
    train_batch_size = train_config["train_batch_size"]
    val_batch_size = train_config["val_batch_size"]
    num_workers = train_config.get("num_workers", 1)
    pin_memory = train_config.get("pin_memory", False)
    balanced_classes = train_config.get("balanced_classes", False)

    _pretty_print("Loading data")
    df = pd.read_csv(input_path).reset_index(drop=True)
    df_test = pd.read_csv(test_path).reset_index(drop=True)
    df_sub = pd.DataFrame({"ID": df_test["ID"]})
    list_all_preds = []

    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tokenizer_path,
        # `model_max_length` is the init kwarg PreTrainedTokenizerFast honours;
        # bare `max_length`/`lowercase` kwargs would be stored but never applied
        model_max_length=max_length,
        unk_token="<unk>",
        sep_token="<sep>",
        pad_token="<pad>",
        cls_token="<cls>",
        mask_token="<mask>",
        bos_token="<s>",