Example #1
    def train_model(self, num_epochs, train_data, do_eval):
        batched_data = batching_list_instances(self.config, train_data)
        self.optimizer = get_optimizer(self.config, self.model, 'sgd')
        for epoch in range(num_epochs):
            epoch_loss = 0
            self.model.zero_grad()
            for index in tqdm(np.random.permutation(len(batched_data))):
                self.model.train()
                sequence_loss = self.model(*batched_data[index][0:5],
                                           batched_data[index][-2],
                                           batched_data[index][-3])
                loss = sequence_loss
                epoch_loss += loss.item()  # accumulate a float, not a tensor
                loss.backward(retain_graph=True)
                self.optimizer.step()
                self.model.zero_grad()
            print(epoch_loss)
            if do_eval:
                self.model.eval()
                dev_batches = batching_list_instances(self.config, self.dev)
                test_batches = batching_list_instances(self.config, self.test)
                dev_metrics = self.evaluate_model_top(dev_batches, "dev",
                                                      self.dev, self.triggers)
                test_metrics = self.evaluate_model_top(test_batches, "test",
                                                       self.test,
                                                       self.triggers)
                self.model.zero_grad()
        return self.model
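Most trainers on this page share the same inner loop: batch the data once up front, then visit the batches in a fresh random order each epoch via np.random.permutation. A self-contained sketch of that pattern (the linear model and random batches are stand-ins, not code from the examples):

import numpy as np
import torch
import torch.nn as nn

# Toy stand-ins: a linear classifier and a list of pre-built (x, y) batches.
model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
batches = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(10)]

for epoch in range(3):
    epoch_loss = 0.0
    # Visit every batch exactly once per epoch, in a new random order.
    for index in np.random.permutation(len(batches)):
        model.train()
        x, y = batches[index]
        loss = criterion(model(x), y)
        epoch_loss += loss.item()  # detach to a plain float before accumulating
        loss.backward()
        optimizer.step()
        model.zero_grad()
    print(epoch, epoch_loss)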
Example #2
    def self_training(self, num_epochs, train_data, unlabeled_data):
        self.optimizer = get_optimizer(self.config, self.model, 'sgd')
        merged_data = train_data
        unlabels = unlabeled_data
        for epoch in range(num_epochs):
            batched_data = batching_list_instances(self.config, merged_data)
            epoch_loss = 0
            self.model.zero_grad()
            for index in tqdm(np.random.permutation(len(batched_data))):
                self.model.train()
                sequence_loss = self.model(*batched_data[index][0:5],
                                           batched_data[index][-2],
                                           batched_data[index][-3])
                loss = sequence_loss
                epoch_loss += loss.item()  # accumulate a float, not a tensor
                loss.backward(retain_graph=True)
                self.optimizer.step()
                self.model.zero_grad()
            print(epoch_loss)

            self.model.eval()
            dev_batches = batching_list_instances(self.config, self.dev)
            test_batches = batching_list_instances(self.config, self.test)
            dev_metrics = self.evaluate_model_top(dev_batches, "dev", self.dev,
                                                  self.triggers)
            test_metrics = self.evaluate_model_top(test_batches, "test",
                                                   self.test, self.triggers)
            self.model.zero_grad()

            weaklabel, unlabel = self.weak_label_selftrain(
                unlabels, self.triggers)
            merged_data = merged_data + weaklabel
            unlabels = unlabel
            print(len(merged_data), len(weaklabel), len(unlabels))
        return self.model
Example #3
def train_one(config: Config,
              train_insts: List[Instance],
              dev_insts: List[Instance],
              model_name: str,
              test_insts: List[Instance] = None,
              config_name: str = None,
              result_filename: str = None) -> NNCRF:
    train_batches = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    if test_insts:
        test_batches = simple_batching(config, test_insts)
    else:
        test_batches = None
    model = NNCRF(config)
    model.train()
    optimizer = get_optimizer(config, model)
    epoch = config.num_epochs
    best_dev_f1 = -1
    saved_test_metrics = None
    for i in range(1, epoch + 1):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        for index in np.random.permutation(len(train_batches)):
            model.train()
            loss = model(*train_batches[index])
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            model.zero_grad()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        # metric is [precision, recall, f_score]
        dev_metrics = evaluate_model(config, model, "dev", dev_insts)
        if test_insts is not None:
            test_metrics = evaluate_model(config, model, "test", test_insts)
        if dev_metrics[2] > best_dev_f1:
            print("saving the best model...")
            best_dev_f1 = dev_metrics[2]
            if test_insts is not None:
                saved_test_metrics = test_metrics
            torch.save(model.state_dict(), model_name)
            # Save the corresponding config as well.
            if config_name:
                with open(config_name, 'wb') as f:
                    pickle.dump(config, f)
            if result_filename:
                write_results(result_filename, test_insts)
        model.zero_grad()
    if test_insts is not None:
        print(f"The best dev F1: {best_dev_f1}")
        print(f"The corresponding test: {saved_test_metrics}")
    return model
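train_one keeps whichever checkpoint scores best on dev and reloads it at the end. A compact sketch of that save-on-improvement idiom with torch.save and load_state_dict (the model and the dev F1 sequence are placeholders):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)  # stand-in for NNCRF(config)
best_dev_f1, model_path = -1.0, "best_model.pt"

for dev_f1 in [61.2, 64.8, 64.1, 65.3]:  # made-up dev F1 per epoch
    if dev_f1 > best_dev_f1:  # strictly better on dev -> snapshot the weights
        best_dev_f1 = dev_f1
        torch.save(model.state_dict(), model_path)

# Restore the best snapshot before the final evaluation.
model.load_state_dict(torch.load(model_path))
model.eval()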
Example #4
def train_model(config: Config, train_insts: List[List[Instance]],
                dev_insts: List[Instance]):
    train_num = len(train_insts)
    logging.info(("[Training Info] number of instances: %d" % (train_num)))
    dev_batches = batching_list_instances(config, dev_insts)  # the dev set never changes

    model_folder = config.model_folder
    logging.info("[Training Info] The model will be saved to: %s" %
                 (model_folder))
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)

    logging.info("-" * 20 +
                 f" [Training Info] Running for {iter}th large iterations. " +
                 "-" * 20)
    train_batches = batching_list_instances(config, train_insts)

    logging.info(
        "\n" +
        f"-------- [Training Info] Training fold {0}. Initialized from pre-trained Model -------"
    )
    model_name = model_folder + f"/bert_crf_simple"
    train_one(
        config=config,
        train_batches=train_batches,  # Initialize bert model
        dev_insts=dev_insts,
        dev_batches=dev_batches,
        model_name=model_name)
Example #5
def train_model(config: Config, train_insts: List[List[Instance]], dev_insts: List[Instance]):
    train_num = sum([len(insts) for insts in train_insts])
    logging.info(("[Training Info] number of instances: %d" % (train_num)))
    dev_batches = batching_list_instances(config, dev_insts)   # the dev set never changes

    model_folder = config.model_folder
    logging.info("[Training Info] The model will be saved to: %s" % (model_folder))
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)

    num_outer_iterations = config.num_outer_iterations
    for iter in range(num_outer_iterations):

        logging.info("-" * 20 + f" [Training Info] Running for {iter}th large iterations. " + "-" * 20)
        train_batches = [batching_list_instances(config, insts) for insts in train_insts]

        for fold_id in range(2):  # train 2 models in 2 folds
            logging.info("\n" + f"-------- [Training Info] Training fold {fold_id}. Initialized from pre-trained Model -------")
            model_name = model_folder + f"/bert_crf_{fold_id}"
            train_one(config=config, train_batches=train_batches[fold_id],  # Initialize bert model
                      dev_insts=dev_insts, dev_batches=dev_batches, model_name=model_name)

        logging.info("\n\n[Data Info] Assigning labels")
        # model 0 relabels training data 1, and model 1 relabels training data 0
        for fold_id in range(2):
            model = load_model(config)
            model_name = model_folder + f"/bert_crf_{fold_id}"

            utils.load_checkpoint(os.path.join(model_name, 'best.pth.tar'), model)
            train_metrics = evaluate_model(config, model, train_batches[fold_id], train_insts[fold_id])
            logging.info(str(fold_id) + "  self [train set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (train_metrics[0], train_metrics[1], train_metrics[2]))
            train_metrics = evaluate_model(config, model, train_batches[1-fold_id], train_insts[1-fold_id])
            logging.info(str(fold_id) + " other [train set] Precision: %.2f, Recall: %.2f, F1: %.2f" % (train_metrics[0], train_metrics[1], train_metrics[2]))
            hard_constraint_predict(config=config, model=model,
                                    fold_batches=train_batches[1 - fold_id],
                                    folded_insts=train_insts[1 - fold_id])  # set a new label id, k is set to 2, so 1 - fold_id can be used

        # train the final model
        logging.info("\n\n")
        logging.info("-------- [Training Info] Training the final model-------- ")

        # merge the result data to training the final model
        all_train_insts = list(itertools.chain.from_iterable(train_insts))

        logging.info("Initialized from pre-trained Model")
        model_name = model_folder + "/final_bert_crf"
        config_name = model_folder + "/config.conf"
        all_train_batches = batching_list_instances(config=config, insts=all_train_insts)
        train_one(config=config, train_batches=all_train_batches, dev_insts=dev_insts, dev_batches=dev_batches,
                  model_name=model_name, config_name=config_name)
        # load the best final model
        # utils.load_checkpoint(os.path.join(model_name, 'best.pth.tar'), model)
        # model.eval()
        # logging.info("\n")
        # result = evaluate_model(config, model, dev_batches, "dev", dev_insts)
        logging.info("\n\n")
Example #6
    def train_model(self, num_epochs, train_data):
        batched_data = batching_list_instances(self.config, train_data)
        self.optimizer = get_optimizer(self.config, self.model, 'adam')
        criterion = nn.NLLLoss()
        for epoch in range(num_epochs):
            epoch_loss = 0
            self.model.zero_grad()
            for index in tqdm(np.random.permutation(len(batched_data))):
                self.model.train()
                trig_rep, trig_type_probas, match_trig, match_sent = self.model(
                    *batched_data[index][0:5], batched_data[index][-2])
                trigger_loss = criterion(trig_type_probas,
                                         batched_data[index][-1])
                soft_matching_loss = self.contrastive_loss(
                    match_trig, match_sent,
                    torch.stack([torch.tensor(1)] * trig_rep.size(0) +
                                [torch.tensor(0)] * trig_rep.size(0)))
                loss = trigger_loss + soft_matching_loss
                epoch_loss += loss.item()  # accumulate a float, not a tensor
                loss.backward(retain_graph=True)
                self.optimizer.step()
                self.model.zero_grad()
            print(epoch_loss)
            self.test_model(train_data)
            self.model.zero_grad()

        return self.model
Example #7
    def test_model(self, test_data):
        batched_data = batching_list_instances(self.config, test_data)
        self.model.eval()
        predicted_list = []
        target_list = []
        match_target_list = []
        matched_list = []
        with torch.no_grad():  # no gradients needed at evaluation time
            for index in tqdm(np.random.permutation(len(batched_data))):
                trig_rep, trig_type_probas, match_trig, match_sent = self.model(
                    *batched_data[index][0:5], batched_data[index][-2])
                trig_type_value, trig_type_predicted = torch.max(
                    trig_type_probas, 1)
                target = batched_data[index][-1]
                target_list.extend(target.tolist())
                predicted_list.extend(trig_type_predicted.tolist())

                match_target_list.extend([1] * trig_rep.size(0) +
                                         [0] * trig_rep.size(0))
                distances = (match_trig - match_sent).pow(2).sum(1)
                distances = torch.sqrt(distances)
                matched_list.extend((distances < 1.0).long().tolist())

        # sklearn's accuracy_score expects (y_true, y_pred)
        print("trigger classification accuracy ",
              accuracy_score(target_list, predicted_list))
        print("soft matching accuracy ",
              accuracy_score(match_target_list, matched_list))
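The soft-matching check above is a plain Euclidean distance compared against a fixed threshold; the same computation in isolation (random tensors as stand-ins for the embeddings):

import torch

match_trig = torch.randn(6, 32)  # stand-in trigger embeddings
match_sent = torch.randn(6, 32)  # stand-in sentence embeddings
distances = (match_trig - match_sent).pow(2).sum(1).sqrt()
# equivalently: torch.norm(match_trig - match_sent, dim=1)
matched = (distances < 1.0).long()  # 1 = "matched" under the threshold
print(matched.tolist())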
Example #8
    def get_triggervec(self, data):
        batched_data = batching_list_instances(self.config, data)
        self.model.eval()
        logits_list = []
        predicted_list = []
        trigger_list = []
        for index in tqdm(range(len(batched_data))):
            trig_rep, trig_type_probas, match_trig, match_sent = self.model(
                *batched_data[index][0:5], batched_data[index][-2])
            trig_type_value, trig_type_predicted = torch.max(
                trig_type_probas, 1)
            ne_batch_insts = data[index * self.config.batch_size:(index + 1) *
                                  self.config.batch_size]
            for idx in range(len(trig_rep)):
                ne_batch_insts[idx].trigger_vec = trig_rep[idx]
            logits_list.extend(trig_rep)
            predicted_list.extend(trig_type_predicted)
            word_seq = batched_data[index][0]
            trigger_positions = batched_data[index][-2]

            # "pos" avoids shadowing the batch-level "index" variable above
            for ws, tp in zip(word_seq, trigger_positions):
                trigger_list.append(" ".join(self.config.idx2word[ws[pos]]
                                             for pos in tp))

        return logits_list, predicted_list, trigger_list
Example #9
def evaluate_model(config: Config, model: NNCRF, name: str,
                   insts: List[Instance]):
    ## evaluation
    batch_insts_ids = batching_list_instances(config, insts)
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]
        with torch.no_grad():
            batch_max_scores, batch_max_ids = model.decode(batch)
        metrics += evaluate_batch_insts(batch_insts=one_batch_insts,
                                        batch_pred_ids=batch_max_ids,
                                        batch_gold_ids=batch[-1],
                                        word_seq_lens=batch[1],
                                        idx2label=config.idx2labels)
        batch_id += 1
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0
    print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision, recall, fscore),
          flush=True)
    return [precision, recall, fscore]
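The tail of evaluate_model converts the three accumulated counts into percentages. A quick worked check of that arithmetic with invented counts:

# p = correctly predicted entities, total_predict = predicted entities,
# total_entity = gold entities (hypothetical counts).
p, total_predict, total_entity = 80, 100, 120

precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0  # 80.00
recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0       # 66.67
fscore = (2.0 * precision * recall / (precision + recall)
          if precision != 0 or recall != 0 else 0)                      # 72.73
print("P %.2f R %.2f F1 %.2f" % (precision, recall, fscore))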
Example #10
    def weak_label_selftrain(self, unlabeled_data, triggers):
        batched_data = batching_list_instances(self.config, unlabeled_data, is_soft=False, is_naive=True)
        weakly_labeled, unlabeled, confidence = self.weakly_labeling(batched_data, unlabeled_data, triggers)

        # instance indices sorted by ascending confidence score
        confidence_order = [i[0] for i in sorted(enumerate(confidence), key=lambda x: x[1])]
        # keep the top 1% as weak labels; the rest goes back to the unlabeled pool
        threshold = int(len(confidence_order) * 0.01)
        high_confidence = confidence_order[:threshold]
        low_confidence = confidence_order[threshold:]

        final_weakly_labeled = [weakly_labeled[i] for i in high_confidence]
        unlabeled = unlabeled + [weakly_labeled[i] for i in low_confidence]

        return final_weakly_labeled, unlabeled
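The selection above is an argsort-and-slice over per-instance confidence scores; a standalone sketch of the same idiom (the scores and the 1% cutoff are illustrative; note that int(len * 0.01) can be zero for small pools, hence the guard here):

confidence = [0.9, 0.1, 0.4, 0.7, 0.2]  # hypothetical per-instance scores
order = sorted(range(len(confidence)), key=lambda i: confidence[i])
threshold = max(1, int(len(order) * 0.01))  # guard against an empty slice
selected, remainder = order[:threshold], order[threshold:]
print(selected, remainder)  # [1] [4, 2, 3, 0]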
Example #11
def update_train_insts(config: Config, train_insts: List[List[Instance]],
                       model_names):
    # assign hard prediction to other folds
    if config.variant == "hard":
        print("\n\n[Data Info] Assigning labels for the HARD approach")
    else:
        print(
            "\n\n[Data Info] Performing marginal decoding to assign the marginals"
        )
    train_batches = [
        batching_list_instances(config, insts) for insts in train_insts
    ]
    for fold_id, folded_train_insts in enumerate(train_insts):
        model = NNCRF(config)
        model_name = model_names[fold_id]
        model.load_state_dict(torch.load(model_name))
        predict_with_constraints(
            config=config,
            model=model,
            fold_batches=train_batches[1 - fold_id],
            folded_insts=train_insts[1 - fold_id])  ## set a new label id

    print("\n\n")
    return train_insts
Example #12
def train_model(config: Config, epoch: int, train_insts: List[Instance],
                dev_insts: List[Instance], test_insts: List[Instance]):
    model = NNCRF(config)
    optimizer = get_optimizer(config, model)
    train_num = len(train_insts)
    print("number of instances: %d" % (train_num))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists("model_files/" + model_folder):
        raise FileExistsError(
            f"The folder model_files/{model_folder} exists. Please either delete it or create a new one "
            f"to avoid override.")
    model_path = f"model_files/{model_folder}/lstm_crf.m"
    config_path = f"model_files/{model_folder}/config.conf"
    res_path = f"{res_folder}/{model_folder}.results"
    print("[Info] The model will be saved to: %s.tar.gz" % (model_folder))
    os.makedirs(f"model_files/{model_folder}",
                exist_ok=True)  ## create model files. not raise error if exist
    os.makedirs(res_folder, exist_ok=True)
    no_incre_dev = 0
    for i in tqdm(range(1, epoch + 1), desc="Epoch"):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        for index in tqdm(np.random.permutation(len(batched_data)),
                          desc="--training batch",
                          total=len(batched_data)):
            model.train()
            loss = model(*batched_data[index])
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            model.zero_grad()

        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            no_incre_dev = 0
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_path)
            # Save the corresponding config as well.
            with open(config_path, 'wb') as f:
                pickle.dump(config, f)
            write_results(res_path, test_insts)
        else:
            no_incre_dev += 1
        model.zero_grad()
        if no_incre_dev >= config.max_no_incre:
            print(
                "early stopping: dev F1 has not improved for %d epochs"
                % no_incre_dev)
            break

    print("Archiving the best Model...")
    with tarfile.open(f"model_files/{model_folder}/{model_folder}.tar.gz",
                      "w:gz") as tar:
        tar.add(f"model_files/{model_folder}",
                arcname=os.path.basename(model_folder))

    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_path))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_path, test_insts)
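The no_incre_dev counter above implements patience-based early stopping; the same logic in isolation (the dev F1 sequence is made up):

patience = 3  # stand-in for config.max_no_incre
best, no_incre = -1.0, 0
for epoch, dev_f1 in enumerate([0.70, 0.74, 0.73, 0.74, 0.72, 0.71]):
    if dev_f1 > best:  # improvement: reset the counter
        best, no_incre = dev_f1, 0
    else:              # no improvement: burn one unit of patience
        no_incre += 1
    if no_incre >= patience:
        print("early stop at epoch", epoch)
        break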
Example #13
    def train_model(self,
                    num_epochs,
                    train_data,
                    output_count="",
                    is_paint=True):
        batched_data, batch_insts = batching_list_instances(
            self.config, train_data)
        size = max(1, len(batched_data) // 10)  # sample the loss ~10x per epoch; avoid modulo by zero
        self.optimizer = get_optimizer(self.config, self.model,
                                       self.config.optimizer)
        start = time.gmtime()
        losses = []
        train_precisions = []
        train_recalls = []
        train_fscores = []
        test_precisions = []
        test_recalls = []
        test_fscores = []
        for epoch in range(num_epochs):
            epoch_loss = 0
            self.model.zero_grad()
            print(f"------------------epoch: {(epoch+1)}------------------")
            for index in tqdm(np.random.permutation(len(batched_data))):
                self.model.train()
                sequence_loss = self.model(*batched_data[index][0:5],
                                           batched_data[index][-1],
                                           batch_insts[index])
                loss = sequence_loss
                if index % size == 0:
                    losses.append(loss.item())
                epoch_loss += loss.item()
                loss.backward(retain_graph=True)
                self.optimizer.step()
                self.model.zero_grad()
            print(epoch_loss)
            self.model.eval()
            # train_batches, train_insts = batching_list_instances(self.config, train_data)
            # train_metrics = self.evaluate_model(train_batches, "train", train_data, train_insts)

            # train_precisions.append(train_metrics[0])
            # train_recalls.append(train_metrics[1])
            # train_fscores.append(train_metrics[2])

            test_batches, test_insts = batching_list_instances(
                self.config, self.test)
            test_metrics = self.evaluate_model(test_batches, "test", self.test,
                                               test_insts)

            test_precisions.append(test_metrics[0])
            test_recalls.append(test_metrics[1])
            test_fscores.append(test_metrics[2])
            self.model.zero_grad()

        end = time.gmtime()
        # shift the displayed UTC hour to UTC+8
        start = time.strftime("%H:%M:%S", start).split(":")
        start = [str((int(start[0]) + 8) % 24)] + start[1:]
        end = time.strftime("%H:%M:%S", end).split(":")
        end = [str((int(end[0]) + 8) % 24)] + end[1:]
        print(f"startTime: {start}")
        print(f"endTime: {end}")
        # print("Train")
        # print("precisions", train_precisions)
        # print("recalls", train_recalls)
        # print("fscores:", train_fscores)
        # print("Test")
        print("precisions", test_precisions)
        print("recalls", test_recalls)
        print("fscores:", test_fscores)
        x = list(range(1, num_epochs + 1))
        # map each sampled-loss index back onto the epoch axis
        x_list = [i / (len(losses) / num_epochs) for i in range(1, len(losses) + 1)]
        # for i, v in enumerate(epoch_list):
        #     if ((i + 1) % train_plt_size) == 0:
        #         epoch_list[i] = (i // train_plt_size) + 1
        if is_paint:
            plt.figure()
            plt.grid(linestyle="--")  # dashed background grid
            ax = plt.gca()
            ax.spines['top'].set_visible(False)  # hide the top border
            ax.spines['right'].set_visible(False)  # hide the right border
            plt.plot(x, test_precisions, marker='o', color="red",
                     label="precision", linewidth=1.5)
            plt.plot(x, test_recalls, marker='o', color="green",
                     label="recall", linewidth=1.5)
            plt.plot(x, test_fscores, marker='o', color="blue",
                     label="fscore", linewidth=1.5)
            plt.xlabel('epoch')
            plt.ylabel('Performance Percentile')
            plt.legend(loc=0, numpoints=1)
            ltext = plt.gca().get_legend().get_texts()
            plt.setp(ltext, fontsize=12, fontweight='bold')  # legend font size and weight
            plt.savefig(
                f'per-{self.config.dataset}-{self.config.optimizer}-{num_epochs}-{self.config.learning_rate}-{output_count}.pdf',
                format='pdf')
            plt.savefig(
                f'per-{self.config.dataset}-{self.config.optimizer}-{num_epochs}-{self.config.learning_rate}-{output_count}.svg',
                format='svg')
            # plt.show()

        # The loss curve is saved whether or not is_paint is set.
        plt.figure()
        plt.grid(linestyle="--")  # dashed background grid
        ax = plt.gca()
        ax.spines['top'].set_visible(False)  # hide the top border
        ax.spines['right'].set_visible(False)  # hide the right border
        plt.plot(x_list, losses)
        plt.xlabel('epoch')
        plt.ylabel('Train Loss')
        plt.savefig(
            f'loss-{self.config.dataset}-{self.config.optimizer}-{num_epochs}-{self.config.learning_rate}-{output_count}.pdf',
            format='pdf')
        plt.savefig(
            f'loss-{self.config.dataset}-{self.config.optimizer}-{num_epochs}-{self.config.learning_rate}-{output_count}.svg',
            format='svg')
        # plt.show()
        return self.model
Example #14
def train_model(config: Config, train_insts: List[List[Instance]],
                dev_insts: List[Instance]):
    train_num = sum([len(insts) for insts in train_insts])
    logging.info(("[Training Info] number of instances: %d" % (train_num)))
    # get the batched data
    dev_batches = batching_list_instances(config, dev_insts)

    model_folder = config.model_folder

    logging.info("[Training Info] The model will be saved to: %s" %
                 (model_folder))
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)

    num_outer_iterations = config.num_outer_iterations

    for iter in range(num_outer_iterations):

        logging.info(f"[Training Info] Running for {iter}th large iterations.")

        model_names = []  # model names for each fold

        train_batches = [
            batching_list_instances(config, insts) for insts in train_insts
        ]

        logging.info("length of train_insts:%d" % len(train_insts))

        # train 2 models in 2 folds
        for fold_id, folded_train_insts in enumerate(train_insts):
            logging.info(f"[Training Info] Training fold {fold_id}.")
            # Initialize bert model
            logging.info("Initialized from pre-trained Model")

            model_name = model_folder + f"/bert_crf_{fold_id}"
            model_names.append(model_name)
            train_one(config=config,
                      train_batches=train_batches[fold_id],
                      dev_insts=dev_insts,
                      dev_batches=dev_batches,
                      model_name=model_name)

        # assign prediction to other folds
        logging.info("\n\n")
        logging.info("[Data Info] Assigning labels")

        # using the model trained in one fold to predict the result of another fold's data
        # and update the label of another fold with the predict result
        for fold_id, folded_train_insts in enumerate(train_insts):

            cfig_path = os.path.join(config.bert_model_dir, 'bert_config.json')
            cfig = BertConfig.from_json_file(cfig_path)
            cfig.device = config.device
            cfig.label2idx = config.label2idx
            cfig.label_size = config.label_size
            cfig.idx2labels = config.idx2labels

            model_name = model_folder + f"/bert_crf_{fold_id}"
            model = BertCRF(cfig=cfig)
            model.to(cfig.device)
            utils.load_checkpoint(os.path.join(model_name, 'best.pth.tar'),
                                  model)

            hard_constraint_predict(
                config=config,
                model=model,
                fold_batches=train_batches[1 - fold_id],
                folded_insts=train_insts[1 - fold_id]
            )  # set a new label id, k is set to 2, so 1 - fold_id can be used
        logging.info("\n\n")

        logging.info("[Training Info] Training the final model")

        # merge the result data to training the final model
        all_train_insts = list(itertools.chain.from_iterable(train_insts))

        logging.info("Initialized from pre-trained Model")

        model_name = model_folder + "/final_bert_crf"
        config_name = model_folder + "/config.conf"

        all_train_batches = batching_list_instances(config=config,
                                                    insts=all_train_insts)
        # train the final model
        model = train_one(config=config,
                          train_batches=all_train_batches,
                          dev_insts=dev_insts,
                          dev_batches=dev_batches,
                          model_name=model_name,
                          config_name=config_name)
        # load the best final model
        utils.load_checkpoint(os.path.join(model_name, 'best.pth.tar'), model)
        model.eval()
        logging.info("\n")
        result = evaluate_model(config, model, dev_batches, "dev", dev_insts)
        logging.info("\n\n")
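Several variants on this page (Examples #5, #14, #15 and #18) expect train_insts already split into folds, i.e. a List[List[Instance]]. A minimal sketch of producing such a split; the shuffle-then-ceil-sized-slice mirrors the re-splitting step inside Example #15 below, but the data here is a stand-in:

import math
import random

insts = list(range(10))  # stand-in for a list of Instance objects
num_folds = 2
random.shuffle(insts)
fold_size = math.ceil(len(insts) / num_folds)
folds = [insts[i * fold_size:(i + 1) * fold_size] for i in range(num_folds)]
print([len(f) for f in folds])  # [5, 5]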
Example #15
def train_model(config: Config, train_insts: List[List[Instance]],
                dev_insts: List[Instance], test_insts: List[Instance]):
    train_num = sum([len(insts) for insts in train_insts])
    print("[Training Info] number of instances: %d" % (train_num))

    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = config.res_folder
    if os.path.exists(model_folder):
        raise FileExistsError(
            f"The folder {model_folder} exists. Please either delete it or create a new one "
            f"to avoid override.")

    print("[Training Info] The model will be saved to: %s.tar.gz" %
          (model_folder))
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not os.path.exists(res_folder):
        os.makedirs(res_folder)

    num_outer_iterations = config.num_outer_iterations

    SL_warmup = 2  # defined but not used in this snippet

    for iter in range(num_outer_iterations):
        print(f"[Training Info] Running for {iter}th large iterations.")

        # re-shuffle and re-split the folds every two outer iterations

        if (iter > 0 and iter // 2 != (iter - 1) // 2):
            train_insts = train_insts[0] + train_insts[1]
            random.shuffle(train_insts)
            num_insts_in_fold = math.ceil(len(train_insts) / config.num_folds)
            train_insts = [
                train_insts[i * num_insts_in_fold:(i + 1) * num_insts_in_fold]
                for i in range(config.num_folds)
            ]

        model_names = []  # model names for each fold
        train_batches = [
            batching_list_instances(config, insts) for insts in train_insts
        ]

        neg_noise_rate_gold, pos_noise_rate_gold = ratio_estimation(
            config, train_insts)
        if (config.neg_noise_rate >= 0):
            neg_noise_rate = config.neg_noise_rate
        else:
            neg_noise_rate = neg_noise_rate_gold
        if (config.pos_noise_rate >= 0):
            pos_noise_rate = config.pos_noise_rate
        else:
            pos_noise_rate = pos_noise_rate_gold

        if (iter > 0):
            neg_noise_rate = 0.005
            pos_noise_rate = 0.15

        print('negative noise rate: ' + str(neg_noise_rate))
        print('positive noise rate: ' + str(pos_noise_rate))

        if (config.warm_up_num == 0):
            rate_schedule_neg, rate_schedule_pos = gen_forget_rate(
                config.num_epochs, neg_noise_rate, pos_noise_rate,
                config.num_gradual_neg, config.num_gradual_pos)
        else:
            rate_schedule_neg, rate_schedule_pos = gen_forget_rate_warmup(
                config.num_epochs, neg_noise_rate, pos_noise_rate,
                config.warm_up_num, config.num_gradual_neg,
                config.num_gradual_pos)

        for fold_id, folded_train_insts in enumerate(train_insts):
            print(f"[Training Info] Training fold {fold_id}.")
            model_name = model_folder + f"/lstm_crf_{fold_id}.m"
            model_names.append(model_name)
            train_one(config=config,
                      train_batches=train_batches[fold_id],
                      dev_insts=dev_insts,
                      dev_batches=dev_batches,
                      model_name=model_name,
                      rate_schedule_neg=rate_schedule_neg,
                      rate_schedule_pos=rate_schedule_pos)

        # assign hard prediction to other folds
        print("\n\n[Data Info] Assigning labels for the HARD approach")

        for fold_id, folded_train_insts in enumerate(train_insts):
            model = NNCRF_sl(config)
            model_name = model_names[fold_id]
            model.load_state_dict(torch.load(model_name))
            hard_constraint_predict(
                config=config,
                model=model,
                fold_batches=train_batches[1 - fold_id],
                folded_insts=train_insts[1 - fold_id])  ## set a new label id
        print("\n\n")

        print("[Training Info] Training the final model")
        all_train_insts = list(itertools.chain.from_iterable(train_insts))
        model_name = model_folder + "/num_outer_iterations_final_lstm_crf.m"
        config_name = model_folder + "/num_outer_iterations_config.conf"
        res_name = res_folder + "/num_outer_iterations_lstm_crf.results".format(
        )
        all_train_batches = batching_list_instances(config=config,
                                                    insts=all_train_insts)

        neg_noise_rate, pos_noise_rate = ratio_estimation(config, train_insts)

        rate_schedule_neg = np.zeros(config.num_epochs)
        rate_schedule_pos = np.zeros(config.num_epochs)

        model = train_one(config=config,
                          train_batches=all_train_batches,
                          dev_insts=dev_insts,
                          dev_batches=dev_batches,
                          model_name=model_name,
                          config_name=config_name,
                          test_insts=test_insts,
                          test_batches=test_batches,
                          result_filename=res_name,
                          rate_schedule_neg=rate_schedule_neg,
                          rate_schedule_pos=rate_schedule_pos)
        print("Archiving the best Model...")
        with tarfile.open(
                model_folder + "/" + str(num_outer_iterations) + model_folder +
                ".tar.gz", "w:gz") as tar:
            tar.add(model_folder, arcname=os.path.basename(model_folder))

        model.load_state_dict(torch.load(model_name))
        model.eval()
        evaluate_model(config, model, test_batches, "test", test_insts)
        write_results(res_name, test_insts)
Example #16
def train_model(config: Config, epoch: int, train_insts: List[Instance],
                dev_insts: List[Instance], test_insts: List[Instance]):
    model = NNCRF(config)
    optimizer = get_optimizer(config, model)
    train_num = len(train_insts)
    print("number of instances: %d" % (train_num))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists(model_folder):
        raise FileExistsError(
            f"The folder {model_folder} exists. Please either delete it or create a new one "
            f"to avoid override.")
    model_name = model_folder + "/lstm_crf.m".format()
    config_name = model_folder + "/config.conf"
    res_name = res_folder + "/lstm_crf.results".format()
    print("[Info] The model will be saved to: %s.tar.gz" % (model_folder))
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not os.path.exists(res_folder):
        os.makedirs(res_folder)

    for i in range(1, epoch + 1):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        for index in np.random.permutation(len(batched_data)):
            model.train()
            loss = model(*batched_data[index])
            epoch_loss += loss.item()
            loss.backward()
            optimizer.step()
            model.zero_grad()

        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        # note: this variant selects the best checkpoint by test F1, not dev F1
        if test_metrics[1][2] > best_test[0]:
            print("saving the best model...")
            best_dev[0] = dev_metrics[1][2]
            best_dev[1] = i
            best_test[0] = test_metrics[1][2]
            best_test[1] = i
            torch.save(model.state_dict(), model_name)
            # Save the corresponding config as well.
            with open(config_name, 'wb') as f:
                pickle.dump(config, f)
            print('Exact\n')
            print_report(test_metrics[-2])
            print('Overlap\n')
            print_report(test_metrics[-1])
            write_results(res_name, test_insts)
            print("Archiving the best Model...")
            with tarfile.open(model_folder + "/" + model_folder + ".tar.gz",
                              "w:gz") as tar:
                tar.add(model_folder, arcname=os.path.basename(model_folder))
        model.zero_grad()

    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_name))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_name, test_insts)
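Examples #12, #15, #16 and #18 all archive the model folder with tarfile once training finishes; the idiom in isolation (the folder path is illustrative):

import os
import tarfile

model_folder = "model_files/demo"  # hypothetical folder containing saved checkpoints
os.makedirs(model_folder, exist_ok=True)
with tarfile.open(model_folder + ".tar.gz", "w:gz") as tar:
    tar.add(model_folder, arcname=os.path.basename(model_folder))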
Example #17
def train_model(config: Config, epoch: int, train_insts: List[Instance],
                dev_insts: List[Instance], test_insts: List[Instance]):
    ### Data Processing Info
    train_num = len(train_insts)
    print("number of instances: %d" % (train_num))
    print(colored("[Shuffled] Shuffle the training instance ids", "red"))
    random.shuffle(train_insts)

    batched_data = batching_list_instances(config, train_insts)
    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    if config.embedder_type == "normal":
        model = NNCRF(config)
        optimizer = get_optimizer(config, model)
        scheduler = None
    else:
        print(
            colored(
                f"[Model Info]: Working with transformers package from huggingface with {config.embedder_type}",
                'red'))
        print(
            colored(
                "[Optimizer Info]: You should be aware that you are using the optimizer from huggingface.",
                'red'))
        print(
            colored(
                "[Optimizer Info]: Change the optimizer in transformers_util.py if you want to make some modifications.",
                'red'))
        model = TransformersCRF(config)
        optimizer, scheduler = get_huggingface_optimizer_and_scheduler(
            config,
            model,
            num_training_steps=len(batched_data) * epoch,
            weight_decay=0.0,
            eps=1e-8,
            warmup_step=0)
        print(
            colored("[Optimizer Info] Modify the optimizer info as you need.",
                    'red'))
        print(optimizer)

    model.to(config.device)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists("model_files/" + model_folder):
        raise FileExistsError(
            f"The folder model_files/{model_folder} exists. Please either delete it or create a new one "
            f"to avoid override.")
    model_path = f"model_files/{model_folder}/lstm_crf.m"
    config_path = f"model_files/{model_folder}/config.conf"
    res_path = f"{res_folder}/{model_folder}.results"
    print("[Info] The model will be saved to: %s.tar.gz" % (model_folder))
    os.makedirs(f"model_files/{model_folder}",
                exist_ok=True)  ## create model files. not raise error if exist
    os.makedirs(res_folder, exist_ok=True)
    no_incre_dev = 0
    print(
        colored(
            f"[Train Info] Start training; training stops early if dev performance does not increase for {config.max_no_incre} epochs",
            'red'))
    for i in tqdm(range(1, epoch + 1), desc="Epoch"):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        if config.optimizer.lower() == "sgd":
            optimizer = lr_decay(config, optimizer, i)
        for index in tqdm(np.random.permutation(len(batched_data)),
                          desc="--training batch",
                          total=len(batched_data)):
            model.train()
            loss = model(**batched_data[index])
            epoch_loss += loss.item()
            loss.backward()
            if config.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()
            if scheduler is not None:
                scheduler.step()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" %
              (i, epoch_loss, end_time - start_time),
              flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_batches, "dev",
                                     dev_insts)
        test_metrics = evaluate_model(config, model, test_batches, "test",
                                      test_insts)
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            no_incre_dev = 0
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_path)
            # Save the corresponding config as well.
            with open(config_path, 'wb') as f:
                pickle.dump(config, f)
            write_results(res_path, test_insts)
        else:
            no_incre_dev += 1
        model.zero_grad()
        if no_incre_dev >= config.max_no_incre:
            print(
                "early stopping: dev F1 has not improved for %d epochs"
                % no_incre_dev)
            break

    print("Archiving the best Model...")
    with tarfile.open(f"model_files/{model_folder}/{model_folder}.tar.gz",
                      "w:gz") as tar:
        tar.add(f"model_files/{model_folder}",
                arcname=os.path.basename(model_folder))

    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_path))
    model.eval()
    evaluate_model(config, model, test_batches, "test", test_insts)
    write_results(res_path, test_insts)
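Example #17 adds two steps the earlier loops omit: gradient-norm clipping before optimizer.step() and a per-batch scheduler.step() afterwards. A minimal sketch of that ordering (toy model; a simple LambdaLR stands in for the huggingface warmup scheduler):

import torch
import torch.nn as nn

model = nn.Linear(8, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 0.99 ** step)

x, y = torch.randn(4, 8), torch.randint(0, 2, (4,))
loss = nn.functional.cross_entropy(model(x), y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip before stepping
optimizer.step()
optimizer.zero_grad()
scheduler.step()  # transformer-style schedulers advance once per batch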
Example #18
def train_model(config: Config, train_insts: List[List[Instance]], dev_insts: List[Instance],
                test_insts: List[Instance]):
    train_num = sum([len(insts) for insts in train_insts])
    print(f"[Training Info] number of instances: {train_num:d}")

    dev_batches = batching_list_instances(config, dev_insts)
    test_batches = batching_list_instances(config, test_insts)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    # if os.path.exists(model_folder):
    #     raise FileExistsError(f"The folder {model_folder} exists. Please either delete it or create a new one "
    #                           f"to avoid override.")

    print(f"[Training Info] The model will be saved to: {model_folder}.tar.gz")
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    if not os.path.exists(res_folder):
        os.makedirs(res_folder)

    num_outer_iterations = config.num_outer_iterations
    for iter in range(num_outer_iterations):
        print(f"[Training Info] Running for {iter}th large iterations.")
        model_names = []  # model names for each fold
        train_batches = [batching_list_instances(config, insts) for insts in train_insts]
        for fold_id, folded_train_insts in enumerate(train_insts):
            print(f"[Training Info] Training fold {fold_id}.")
            model_name = model_folder + f"/lstm_crf_{fold_id}.m"
            model_names.append(model_name)
            train_one(config=config, train_batches=train_batches[fold_id],
                      dev_insts=dev_insts, dev_batches=dev_batches, model_name=model_name)

        # assign hard prediction to other folds
        print("\n\n[Data Info] Assigning labels for the HARD approach")

        for fold_id, folded_train_insts in enumerate(train_insts):
            model = NNCRF(config)
            model_name = model_names[fold_id]
            model.load_state_dict(torch.load(model_name))
            hard_constraint_predict(config=config, model=model,
                                    fold_batches=train_batches[1 - fold_id],
                                    folded_insts=train_insts[1 - fold_id])  # set a new label id
        print("\n\n")

        print("[Training Info] Training the final model")
        all_train_insts = list(itertools.chain.from_iterable(train_insts))
        model_name = model_folder + "/final_lstm_crf.m"
        config_name = model_folder + "/config.conf"
        res_name = res_folder + "/lstm_crf.results".format()
        all_train_batches = batching_list_instances(config=config, insts=all_train_insts)
        model = train_one(config=config, train_batches=all_train_batches, dev_insts=dev_insts, dev_batches=dev_batches,
                          model_name=model_name, config_name=config_name, test_insts=test_insts,
                          test_batches=test_batches, result_filename=res_name)
        print("Archiving the best Model...")
        with tarfile.open(model_folder + "/" + model_folder + ".tar.gz", "w:gz") as tar:
            tar.add(model_folder, arcname=os.path.basename(model_folder))
        # print("The best dev: %.2f" % (best_dev[0]))
        # print("The corresponding test: %.2f" % (best_test[0]))
        # print("Final testing.")
        model.load_state_dict(torch.load(model_name))
        model.eval()
        evaluate_model(config, model, test_batches, "test", test_insts)
        write_results(res_name, test_insts)