def _eval(data):
        model.eval()  # eval mode: BatchNormalization and Dropout disabled
        # data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, dev_f1 = get_score(y_true, y_pred)
        return score, dev_f1
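
The scoring helpers get_score / scores.get_score and reformat are not included in these snippets. A minimal sketch of what they could look like, assuming get_score returns an (accuracy, macro-F1) pair computed with scikit-learn and reformat rounds to a fixed number of decimals; this is an assumption, not the original implementation:

from sklearn.metrics import accuracy_score, f1_score


def reformat(num, n_digits):
    # Round a float to n_digits decimal places (assumed behaviour).
    return round(float(num), n_digits)


def get_score(y_true, y_pred):
    # Accuracy and macro-averaged F1 over the collected predictions (assumed).
    score = reformat(accuracy_score(y_true, y_pred), 4)
    f1 = reformat(f1_score(y_true, y_pred, average="macro"), 4)
    return score, f1
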
Example #2
def run(mtd="fold_split"):
    def _eval(data):
        model.eval()  # eval mode: BatchNormalization and Dropout disabled
        # data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in dataset_processer.data_iter(
                    data, config['test_batch_size'], shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, dev_f1 = scores.get_score(y_true, y_pred)
        return score, dev_f1

    if mtd == "fold_split":
        demo_preprocess.split_dataset(raw_path, train_path, dev_path,
                                      test_path)
    elif mtd == "process_data":
        demo_preprocess.process_data(config, train_path, dev_path)
    elif mtd == "train":
        Train_data = file_utils.read_json(config["train_set"])
        Dev_data = file_utils.read_json(config["dev_set"])
        # convert the raw examples into the format the model expects
        train_data = dataset_processer.get_examples(Train_data, label_encoder)
        dev_data = dataset_processer.get_examples(Dev_data, label_encoder)
        del Train_data, Dev_data
        # number of batches per epoch
        batch_num = int(
            np.ceil(len(train_data) / float(config["train_batch_size"])))
        print("batch_num:{}".format(batch_num))
        # model = BertSoftmaxModel(cfg.bert_path, label_encoder)
        optimizer = Optimizer(model.all_parameters,
                              steps=batch_num * config["epochs"])  # optimizer

        # loss
        # criterion = nn.CrossEntropyLoss()  # obj
        criterion = loss_factory.focal_loss()
        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 10  # stop early if the dev metric shows no improvement for this many epochs
        # train
        print("start train")
        for epoch in range(cfg.RESUME_EPOCH + 1, config["epochs"] + 1):
            optimizer.zero_grad()
            model.train()  # train mode: BatchNormalization and Dropout enabled
            overall_losses = 0
            losses = 0
            # batch_idx = 1
            y_pred = []
            y_true = []
            step = 0
            for batch_data in dataset_processer.data_iter(
                    train_data, config["train_batch_size"], shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                # print(batch_outputs.shape)  # debug
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                # nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=config["clip"])  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims,
                                                optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1
                # print(step, time.time())
            overall_losses /= batch_num
            overall_losses = scores.reformat(overall_losses, 4)
            score, train_f1 = scores.get_score(y_true, y_pred)
            print("epoch:{},train_score:{}, train_f1:{}, overall_loss:{} ".
                  format(epoch, train_f1, score, overall_losses))
            # if set(y_true) == set(y_pred):
            #     print("report")
            #     report = classification_report(y_true, y_pred, digits=4, target_names=label_encoder.target_names)
            #     # logging.info('\n' + report)
            #     print(report)

            # eval
            _, dev_f1 = _eval(data=dev_data)

            if best_dev_f1 < dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model,
                    epoch,
                    save_folder=os.path.join(cfg.proj_path, "data/bert_nn"))
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # no improvement for EarlyStopEpochs epochs: stop training
                    break
            print(
                "early_stop:{}, score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}"
                .format(early_stop, score, dev_f1, best_train_f1, best_dev_f1))
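
dataset_processer.data_iter and batch2tensor are also not defined in these snippets. data_iter is evidently a simple batch generator over the example list; a minimal sketch under that assumption (batch2tensor is model-specific, so its padding and tensorisation logic is omitted here):

import random


def data_iter(data, batch_size, shuffle=True):
    # Yield successive batches (lists of examples) of at most batch_size items.
    indices = list(range(len(data)))
    if shuffle:
        random.shuffle(indices)
    for start in range(0, len(indices), batch_size):
        yield [data[i] for i in indices[start:start + batch_size]]
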
Example #3

def run(method="train", save_path=None, infer_texts=[]):
    shuffle_slicer = ShuffleSlicer()
    # start_time = time.time()

    raw_data_path = "/home/wujinjie/kesci_question_multilabel_classification/data/raw_data/baidu/nlp_db.baidu_text.csv"
    texts = pd.read_csv(raw_data_path)
    train_df, dev_df, test_df = shuffle_slicer.split(texts, dev=True)

    # Test_data = {'label': [0] * len(texts), 'text': test_texts}

    clip = 5.0
    epochs = 100
    # log_interval = 50
    test_batch_size = 128
    train_batch_size = 128
    train_texts, train_labels = process_corpus_dl(train_df)
    Train_data = {'label': train_labels, 'text': train_texts}

    dev_texts, dev_labels = process_corpus_dl(dev_df)
    Dev_data = {'label': dev_labels, 'text': dev_texts}
    vocab = Vocab(Train_data)
    step = 0

    def _eval(data):
        model.eval()  # eval mode: BatchNormalization and Dropout disabled
        # data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

            score, dev_f1 = get_score(y_true, y_pred)
        return score, dev_f1

    def _infer(data):
        model.eval()
        # data = dev_data
        y_pred = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
                print(label_encoder.label2name(y_pred))

    if method == "train":
        model = Model(vocab, label_encoder)
        # loss
        criterion = nn.CrossEntropyLoss()  # obj

        # convert the raw examples into the format the model expects
        train_data = get_examples_bert(Train_data, model.word_encoder, vocab, label_encoder)
        dev_data = get_examples_bert(Dev_data, model.word_encoder, vocab, label_encoder)
        # number of batches per epoch
        batch_num = int(np.ceil(len(train_data) / float(train_batch_size)))
        optimizer = Optimizer(model.all_parameters, steps=batch_num * epochs)  # optimizer
        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 3  # stop early if the dev metric shows no improvement for this many epochs
        # train
        print("start train")
        for epoch in range(1, epochs + 1):
            optimizer.zero_grad()
            model.train()  # train mode: BatchNormalization and Dropout enabled
            overall_losses = 0
            losses = 0
            # batch_idx = 1
            y_pred = []
            y_true = []
            for batch_data in data_iter(train_data, train_batch_size, shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = batch2tensor(batch_data)
                batch_outputs = model(batch_inputs)
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(torch.max(batch_outputs, dim=1)
                              [1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                nn.utils.clip_grad_norm_(
                    optimizer.all_params, max_norm=clip)  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1
                # print(step)
            print(epoch)
            overall_losses /= batch_num
            overall_losses = reformat(overall_losses, 4)
            score, train_f1 = get_score(y_true, y_pred)
            print("score:{}, train_f1:{}".format(train_f1, score))
            # if set(y_true) == set(y_pred):
            #     print("report")
            #     report = classification_report(y_true, y_pred, digits=4, target_names=label_encoder.target_names)
            #     # logging.info('\n' + report)
            #     print(report)

            # eval
            _, dev_f1 = _eval(data=dev_data)

            if best_dev_f1 <= dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model, epoch, save_folder="/home/wujinjie/kesci_question_multilabel_classification/data/textbert")
                print("save_path:{}".format(save_path))
                # torch.save(model.state_dict(), save_model)
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:  # no improvement for EarlyStopEpochs epochs: stop training
                    break

            print("score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format(
                dev_f1, score, best_train_f1, best_dev_f1))
    else:
        model = model_utils.load_checkpoint(save_path)
        if method == "test":
            test_texts, test_labels = process_corpus_dl(test_df)  # use the held-out test split
            Test_data = {'label': test_labels, 'text': test_texts}
            test_data = get_examples_bert(Test_data, model.word_encoder, vocab, label_encoder)
            # model.load_state_dict(torch.load(save_model))
            _, dev_f1 = _eval(data=test_data)
            print(dev_f1)

        elif method == "infer":
            infer_texts = list(map(segment, infer_texts))
            # print(infer_texts)
            Infer_data = {'label': [0] * len(infer_texts), 'text': infer_texts}
            infer_data = get_examples_bert(Infer_data, model.word_encoder, vocab, label_encoder)
            _infer(data=infer_data)
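
A hedged usage sketch for this run function; the checkpoint path and the input text below are placeholders, not values taken from the original project:

if __name__ == "__main__":
    # Train from scratch; the best checkpoint path is printed during training.
    run(method="train")

    # Re-load a saved checkpoint for evaluation or inference (placeholder path).
    # run(method="test", save_path="/path/to/textbert/checkpoint.pth")
    # run(method="infer",
    #     save_path="/path/to/textbert/checkpoint.pth",
    #     infer_texts=["raw text to classify"])
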
Example #4
            overall_losses += loss_value

            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())

            nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=clip)  # gradient clipping
            for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers):
                cur_optim.step()
                scheduler.step()
            optimizer.zero_grad()
            step += 1
            # print(step)
        print(epoch)
        overall_losses /= batch_num
        overall_losses = reformat(overall_losses, 4)
        score, train_f1 = get_score(y_true, y_pred)
        print("score:{}, train_f1:{}".format(train_f1, score))
        if set(y_true) == set(y_pred):
            print("report")
            report = classification_report(y_true, y_pred, digits=4, target_names=vocab.target_names)
            # logging.info('\n' + report)
            print(report)

        # eval
        model.eval()  # eval mode: BatchNormalization and Dropout disabled
        data = dev_data
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in data_iter(data, test_batch_size, shuffle=False):
                torch.cuda.empty_cache()