Example #1
import json

import torch

# BiGRU, MODEL_PARAMS and save_corpus are assumed to be defined elsewhere in the project.


class Predictor:
    def __init__(self):
        # Load vocabulary/tagset parameters, rebuilding them if the file is missing.
        try:
            with open("params.json", "r", encoding='utf8') as f:
                self.params = json.load(f)
        except FileNotFoundError:
            self.params = save_corpus()
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        # The zero matrix is only a correctly-shaped placeholder for the embeddings;
        # the real weights are restored from the checkpoint below.
        self.model = BiGRU(torch.zeros((len(self.params['tbl']) + 1, 300)),
                           MODEL_PARAMS['gru_hidden_dim'],
                           MODEL_PARAMS['gru_num_layers'],
                           len(self.params['tagset']),
                           MODEL_PARAMS['concat']).to(device)
        self.model.load_state_dict(
            torch.load('trained_model.pt',
                       map_location=lambda storage, loc: storage))
        self.model.eval()

    def predict(self, sentence):
        words = sentence.split()

        lis = []
        new_words = []
        for word in words:
            # Split off a trailing punctuation mark so it is tagged separately.
            symbol = None
            if not word[-1].isalnum():
                symbol = word[-1]
                word = word[:-1]
            # Map each token to its vocabulary index, falling back to 0 for unknown tokens.
            lis.append(self.params['tbl'].get(word.lower(), 0))
            new_words.append(word)
            if symbol is not None:
                lis.append(self.params['tbl'].get(symbol, 0))
                new_words.append(symbol)

        # Run the model without tracking gradients; only the predictions are needed.
        x = torch.LongTensor(lis).unsqueeze(0).to(self.device)
        with torch.no_grad():
            y_raw = self.model(x)
        y_pred = torch.argmax(y_raw, dim=2).view(-1)
        # Interleave each token with its predicted tag.
        pieces = []
        for word, tag_id in zip(new_words, y_pred.tolist()):
            pieces.append(word)
            pieces.append(self.params['reverse_tagset'][tag_id])
        tagged_sent = ' '.join(pieces)
        print(tagged_sent)

    def tag_lookup(self, tag):
        try:
            print('TAG:', tag)
            print('Definition:', self.params['tag_definition'][tag][0])
        except KeyError:
            print('Error: Tag not found.')
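
A minimal usage sketch for this example, assuming params.json and trained_model.pt are present in the working directory; the sentence and the looked-up tag below are placeholders:

if __name__ == '__main__':
    predictor = Predictor()
    # Tags each token of the sentence and prints the result.
    predictor.predict("The cat sat on the mat .")
    # Prints the definition of a tag from the loaded tagset (tag name is hypothetical).
    predictor.tag_lookup("NOUN")
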
Example #2
import os
from sys import platform

import pandas as pd
import torch
from torch.utils.data import DataLoader

# BiGRU, My_Dataset, load_embeddings and test() are assumed to be defined
# elsewhere in the project.


def model_load_test(test_df,
                    vocab_file,
                    embeddings_file,
                    pretrained_file,
                    test_prediction_dir,
                    test_prediction_name,
                    mode,
                    num_labels=2,
                    max_length=50,
                    gpu_index=0,
                    batch_size=128):

    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for testing ", 20 * "=")
    if platform == "linux" or platform == "linux2":
        checkpoint = torch.load(pretrained_file)
    else:
        checkpoint = torch.load(pretrained_file, map_location=device)
    # Retrieving model parameters from checkpoint.
    embeddings = load_embeddings(embeddings_file)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    print("\t* Building model...")
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    model.load_state_dict(checkpoint["model"])
    print(20 * "=", " Testing BiGRU model on device: {} ".format(device),
          20 * "=")
    batch_time, total_time, accuracy, predictions = test(model, test_loader)
    print(
        "\n-> Average batch processing time: {:.4f}s, total test time: {:.4f}s, accuracy: {:.4f}%\n"
        .format(batch_time, total_time, (accuracy * 100)))
    test_prediction = pd.DataFrame({'prediction': predictions})
    if not os.path.exists(test_prediction_dir):
        os.makedirs(test_prediction_dir)
    test_prediction.to_csv(os.path.join(test_prediction_dir,
                                        test_prediction_name),
                           index=False)
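
A minimal call sketch for this example (reusing the imports above), assuming a test DataFrame in the format My_Dataset expects; all paths, file names, and the mode value are placeholders:

test_df = pd.read_csv("data/test.csv")
model_load_test(test_df,
                vocab_file="data/vocab.txt",
                embeddings_file="data/token_vec_300.bin",
                pretrained_file="models/best.pth.tar",
                test_prediction_dir="output",
                test_prediction_name="test_prediction.csv",
                mode="word",
                batch_size=64)
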
Example #3
import collections
import logging
import os
import random

import torch
import tqdm
import transformers

import textattack

# BiGRU, BiLSTM, device, logger and the project helpers (_make_directories,
# _save_args, _save_model, _save_model_checkpoint, _make_dataloader,
# _get_eval_score, _is_writable_type, _generate_adversarial_examples,
# model_from_args, write_readme) are assumed to be defined elsewhere in the
# module.


def train_model(args,
                train_text=None,
                train_labels=None,
                eval_text=None,
                eval_labels=None,
                tokenizer=None):
    textattack.shared.utils.set_seed(args.random_seed)

    _make_directories(args.output_dir)

    num_gpus = torch.cuda.device_count()

    # Save logger writes to file
    log_txt_path = os.path.join(args.output_dir, "log.txt")
    fh = logging.FileHandler(log_txt_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info(f"Writing logs to {log_txt_path}.")

    train_examples_len = len(train_text)

    # label_id_len = len(train_labels)
    label_set = set(train_labels)
    args.num_labels = len(label_set)
    logger.info(
        f"Loaded dataset. Found: {args.num_labels} labels: {sorted(label_set)}"
    )

    if len(train_labels) != len(train_text):
        raise ValueError(
            f"Number of train examples ({len(train_text)}) does not match number of labels ({len(train_labels)})"
        )
    if len(eval_labels) != len(eval_text):
        raise ValueError(
            f"Number of eval examples ({len(eval_text)}) does not match number of labels ({len(eval_labels)})"
        )

    if args.model == "gru":
        textattack.shared.logger.info(
            "Loading textattack model: GRUForClassification")
        model = BiGRU()
        model.to(device)
    elif args.model == "lstm":
        textattack.shared.logger.info(
            "Loading textattack model: LSTMForClassification")
        model = BiLSTM()
        model.to(device)

    # attack_class = attack_from_args(args)
    # We are adversarial training if the user specified an attack along with
    # the training args.
    # adversarial_training = (attack_class is not None) and (not args.check_robustness)

    # multi-gpu training
    if num_gpus > 1:
        model = torch.nn.DataParallel(model)
        logger.info("Using torch.nn.DataParallel.")
    logger.info(f"Training model across {num_gpus} GPUs")

    num_train_optimization_steps = (
        int(train_examples_len / args.batch_size / args.grad_accum_steps) *
        args.num_train_epochs)

    if args.model == "lstm" or args.model == "cnn" or args.model == "gru":

        def need_grad(x):
            return x.requires_grad

        optimizer = torch.optim.Adam(filter(need_grad, model.parameters()),
                                     lr=args.learning_rate)
        scheduler = None
    else:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]

        optimizer = transformers.optimization.AdamW(
            optimizer_grouped_parameters, lr=args.learning_rate)

        scheduler = transformers.optimization.get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_proportion,
            num_training_steps=num_train_optimization_steps,
        )

    # Start Tensorboard and log hyperparams.
    from torch.utils.tensorboard import SummaryWriter

    tb_writer = SummaryWriter(args.output_dir)

    # Use Weights & Biases, if enabled.
    if args.enable_wandb:
        global wandb
        wandb = textattack.shared.utils.LazyLoader("wandb", globals(), "wandb")
        wandb.init(sync_tensorboard=True)

    # Save original args to file
    args_save_path = os.path.join(args.output_dir, "train_args.json")
    _save_args(args, args_save_path)
    logger.info(f"Wrote original training args to {args_save_path}.")

    tb_writer.add_hparams(
        {k: v for k, v in vars(args).items() if _is_writable_type(v)}, {})

    # Start training
    logger.info("***** Running training *****")
    # if augmenter:
    #     logger.info(f"\tNum original examples = {train_examples_len}")
    #     logger.info(f"\tNum examples after augmentation = {len(train_text)}")
    # else:
    #     logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tNum examples = {train_examples_len}")
    logger.info(f"\tBatch size = {args.batch_size}")
    logger.info(f"\tMax sequence length = {args.max_length}")
    logger.info(f"\tNum steps = {num_train_optimization_steps}")
    logger.info(f"\tNum epochs = {args.num_train_epochs}")
    logger.info(f"\tLearning rate = {args.learning_rate}")

    eval_dataloader = _make_dataloader(tokenizer, eval_text, eval_labels,
                                       args.batch_size)
    train_dataloader = _make_dataloader(tokenizer, train_text, train_labels,
                                        args.batch_size)

    global_step = 0
    tr_loss = 0

    model.train()
    args.best_eval_score = 0
    args.best_eval_score_epoch = 0
    args.epochs_since_best_eval_score = 0

    def loss_backward(loss):
        if num_gpus > 1:
            # Average the per-GPU losses when training with DataParallel.
            loss = loss.mean()
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()
        return loss

    # if args.do_regression:
    #     # TODO integrate with textattack `metrics` package
    #     loss_fct = torch.nn.MSELoss()
    # else:
    #     loss_fct = torch.nn.CrossEntropyLoss()
    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs),
                             desc="Epoch",
                             position=0,
                             leave=True):
        # if adversarial_training:
        #     if epoch >= args.num_clean_epochs:
        #         if (epoch - args.num_clean_epochs) % args.attack_period == 0:
        #             # only generate a new adversarial training set every args.attack_period epochs
        #             # after the clean epochs
        #             logger.info("Attacking model to generate new training set...")

        #             adv_attack_results = _generate_adversarial_examples(
        #                 model_wrapper, attack_class, list(zip(train_text, train_labels))
        #             )
        #             adv_train_text = [r.perturbed_text() for r in adv_attack_results]
        #             train_dataloader = _make_dataloader(
        #                 tokenizer, adv_train_text, train_labels, args.batch_size
        #             )
        #     else:
        #         logger.info(f"Running clean epoch {epoch+1}/{args.num_clean_epochs}")

        prog_bar = tqdm.tqdm(train_dataloader,
                             desc="Iteration",
                             position=0,
                             leave=True)

        # Use these variables to track training accuracy during classification.
        correct_predictions = 0
        total_predictions = 0
        for step, batch in enumerate(prog_bar):
            ids1, ids2, msk1, msk2, labels = batch
            # input_ids, labels = batch
            labels = labels.to(device)
            # if isinstance(input_ids, dict):
            #     ## dataloader collates dict backwards. This is a workaround to get
            #     # ids in the right shape for HuggingFace models
            #     input_ids = {
            #         k: torch.stack(v).T.to(device) for k, v in input_ids.items()
            #     }
            #     logits = model(**input_ids)[0]
            # else:

            ids1 = ids1.to(device)
            ids2 = ids2.to(device)
            msk1 = msk1.to(device)
            msk2 = msk2.to(device)
            logits = model(ids1, ids2, msk1, msk2)

            # if args.do_regression:
            #     # TODO integrate with textattack `metrics` package
            #     loss = loss_fct(logits.squeeze(), labels.squeeze())
            # else:
            loss = loss_fct(logits, labels)
            pred_labels = logits.argmax(dim=-1)
            correct_predictions += (pred_labels == labels).sum().item()
            total_predictions += len(pred_labels)

            loss = loss_backward(loss)
            tr_loss += loss.item()

            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar("loss", loss.item(), global_step)
                if scheduler is not None:
                    tb_writer.add_scalar("lr",
                                         scheduler.get_last_lr()[0],
                                         global_step)
                else:
                    tb_writer.add_scalar("lr", args.learning_rate, global_step)
            if global_step > 0:
                prog_bar.set_description(f"Loss {tr_loss/global_step}")
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
            # Save model checkpoint to file.
            if (global_step > 0 and (args.checkpoint_steps > 0)
                    and (global_step % args.checkpoint_steps) == 0):
                _save_model_checkpoint(model, args.output_dir, global_step)

            # Inc step counter.
            global_step += 1

        # Print training accuracy, if we're tracking it.
        if total_predictions > 0:
            train_acc = correct_predictions / total_predictions
            logger.info(f"Train accuracy: {train_acc*100}%")
            tb_writer.add_scalar("epoch_train_score", train_acc, epoch)

        # Check accuracy after each epoch.
        # skip args.num_clean_epochs during adversarial training
        # if (not adversarial_training) or (epoch >= args.num_clean_epochs):
        if (epoch >= args.num_clean_epochs):
            eval_score = _get_eval_score(model, eval_dataloader, False)
            tb_writer.add_scalar("epoch_eval_score", eval_score, epoch)

            if args.checkpoint_every_epoch:
                _save_model_checkpoint(model, args.output_dir, global_step)

            logger.info(
                f"Eval {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
            )
            if eval_score > args.best_eval_score:
                args.best_eval_score = eval_score
                args.best_eval_score_epoch = epoch
                args.epochs_since_best_eval_score = 0
                _save_model(model, args.output_dir, args.weights_name,
                            args.config_name)
                logger.info(
                    f"Best acc found. Saved model to {args.output_dir}.")
                _save_args(args, args_save_path)
                logger.info(f"Saved updated args to {args_save_path}")
            else:
                args.epochs_since_best_eval_score += 1
                if (args.early_stopping_epochs >
                        0) and (args.epochs_since_best_eval_score >
                                args.early_stopping_epochs):
                    logger.info(
                        f"Stopping early since it's been {args.early_stopping_epochs} epochs since validation accuracy last improved"
                    )
                    break

        if args.check_robustness:
            # Note: model_wrapper and attack_class come from the adversarial-training
            # setup commented out above; this branch assumes they are defined.
            samples_to_attack = list(zip(eval_text, eval_labels))
            samples_to_attack = random.sample(samples_to_attack, 1000)
            adv_attack_results = _generate_adversarial_examples(
                model_wrapper, attack_class, samples_to_attack)
            attack_types = [r.__class__.__name__ for r in adv_attack_results]
            attack_types = collections.Counter(attack_types)

            adv_acc = 1 - (attack_types["SkippedAttackResult"] /
                           len(adv_attack_results))
            total_attacks = (attack_types["SuccessfulAttackResult"] +
                             attack_types["FailedAttackResult"])
            adv_succ_rate = attack_types["SuccessfulAttackResult"] / total_attacks
            after_attack_acc = attack_types["FailedAttackResult"] / len(adv_attack_results)

            tb_writer.add_scalar("robustness_test_acc", adv_acc, global_step)
            tb_writer.add_scalar("robustness_total_attacks", total_attacks,
                                 global_step)
            tb_writer.add_scalar("robustness_attack_succ_rate", adv_succ_rate,
                                 global_step)
            tb_writer.add_scalar("robustness_after_attack_acc",
                                 after_attack_acc, global_step)

            logger.info(f"Eval after-attack accuracy: {100*after_attack_acc}%")

    # read the saved model and report its eval performance
    logger.info(
        "Finished training. Re-loading and evaluating model from disk.")
    model_wrapper = model_from_args(args, args.num_labels)
    model = model_wrapper.model
    model.load_state_dict(
        torch.load(os.path.join(args.output_dir, args.weights_name)))
    eval_score = _get_eval_score(model, eval_dataloader, args.do_regression)
    logger.info(
        f"Saved model {'pearson correlation' if args.do_regression else 'accuracy'}: {eval_score*100}%"
    )

    if args.save_last:
        _save_model(model, args.output_dir, args.weights_name,
                    args.config_name)

    # end of training, save tokenizer
    try:
        tokenizer.save_pretrained(args.output_dir)
        logger.info(f"Saved tokenizer {tokenizer} to {args.output_dir}.")
    except AttributeError:
        logger.warning(
            f"Error: could not save tokenizer {tokenizer} to {args.output_dir}."
        )

    # Save a little readme with model info
    write_readme(args, args.best_eval_score, args.best_eval_score_epoch)

    _save_args(args, args_save_path)
    tb_writer.close()
    logger.info(f"Wrote final training args to {args_save_path}.")
Example #4
import os

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# BiGRU, My_Dataset, load_embeddings, train() and validate() are assumed to be
# defined elsewhere in the project.


def model_train_validate_test(train_df,
                              dev_df,
                              test_df,
                              embeddings_file,
                              vocab_file,
                              target_dir,
                              mode,
                              num_labels=2,
                              max_length=50,
                              epochs=50,
                              batch_size=128,
                              lr=0.0005,
                              patience=5,
                              max_grad_norm=10.0,
                              gpu_index=0,
                              if_save_model=False,
                              checkpoint=None):
    device = torch.device(
        "cuda:{}".format(gpu_index) if torch.cuda.is_available() else "cpu")
    print(20 * "=", " Preparing for training ", 20 * "=")
    # Directory where the trained model is saved.
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # -------------------- Data loading ------------------- #
    print("\t* Loading training data...")
    train_data = My_Dataset(train_df, vocab_file, max_length, mode)
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading validation data...")
    dev_data = My_Dataset(dev_df, vocab_file, max_length, mode)
    dev_loader = DataLoader(dev_data, shuffle=True, batch_size=batch_size)
    print("\t* Loading test data...")
    test_data = My_Dataset(test_df, vocab_file, max_length, mode)
    test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    # -------------------- Model definition ------------------- #
    print("\t* Building model...")
    if (embeddings_file is not None):
        embeddings = load_embeddings(embeddings_file)
    else:
        embeddings = None
    model = BiGRU(embeddings, num_labels=num_labels, device=device).to(device)
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    total_trainable_params = sum(p.numel() for p in model.parameters()
                                 if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')
    # -------------------- Preparation for training  ------------------- #
    criterion = nn.CrossEntropyLoss()
    # Keep only the parameters that require gradient updates.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    # optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    optimizer = torch.optim.Adam(parameters, lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="max",
                                                           factor=0.85,
                                                           patience=0)
    best_score = 0.0
    start_epoch = 1
    # Data for loss curves plot
    epochs_count = []
    train_losses = []
    valid_losses = []
    # Continuing training from a checkpoint if one was given as argument
    if checkpoint:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint["epoch"] + 1
        best_score = checkpoint["best_score"]
        print("\t* Training will continue on existing model from epoch {}...".
              format(start_epoch))
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        epochs_count = checkpoint["epochs_count"]
        train_losses = checkpoint["train_losses"]
        valid_losses = checkpoint["valid_losses"]
    # Compute loss and accuracy before starting (or resuming) training.
    _, valid_loss, valid_accuracy, _ = validate(model, dev_loader, criterion)
    print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%".
          format(valid_loss, (valid_accuracy * 100)))
    # -------------------- Training epochs ------------------- #
    print("\n", 20 * "=", "Training BiGRU model on device: {}".format(device),
          20 * "=")
    patience_counter = 0
    for epoch in range(start_epoch, epochs + 1):
        epochs_count.append(epoch)
        print("* Training epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy = train(model, train_loader,
                                                       optimizer, criterion,
                                                       epoch, max_grad_norm)
        train_losses.append(epoch_loss)
        print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        print("* Validation for epoch {}:".format(epoch))
        epoch_time, epoch_loss, epoch_accuracy, _ = validate(
            model, dev_loader, criterion)
        valid_losses.append(epoch_loss)
        print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n".
              format(epoch_time, epoch_loss, (epoch_accuracy * 100)))
        # Update the optimizer's learning rate with the scheduler.
        scheduler.step(epoch_accuracy)
        # Early stopping on validation accuracy.
        if epoch_accuracy < best_score:
            patience_counter += 1
        else:
            best_score = epoch_accuracy
            patience_counter = 0

            if (if_save_model):
                torch.save(
                    {
                        "epoch": epoch,
                        "model": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses
                    }, os.path.join(target_dir, "best.pth.tar"))

                print("save model succesfully!\n")

            print("* Test for epoch {}:".format(epoch))
            _, _, test_accuracy, predictions = validate(
                model, test_loader, criterion)
            print("Test accuracy: {:.4f}%\n".format(test_accuracy))
            test_prediction = pd.DataFrame({'prediction': predictions})
            test_prediction.to_csv(os.path.join(target_dir,
                                                "test_prediction.csv"),
                                   index=False)

        if patience_counter >= patience:
            print("-> Early stopping: patience limit reached, stopping...")
            break
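
A minimal call sketch for this example (reusing the imports above); all paths, file names, and the mode value are placeholders:

train_df = pd.read_csv("data/train.csv")
dev_df = pd.read_csv("data/dev.csv")
test_df = pd.read_csv("data/test.csv")
model_train_validate_test(train_df, dev_df, test_df,
                          embeddings_file="data/token_vec_300.bin",
                          vocab_file="data/vocab.txt",
                          target_dir="models/bigru",
                          mode="word",
                          epochs=30,
                          batch_size=64,
                          if_save_model=True)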