Example #1
def start():
    # produce_data()
    model = Bert_CRF()
    model.load_state_dict(load_model(args.output_dir))
    # ------------------ Determine CUDA mode ----------------------
    device = torch.device(args.device if torch.cuda.is_available()
                          and not args.no_cuda else "cpu")
    model.to(device)
    print('create_iter')
    eval_iter = create_batch_iter("valid")
    print('create_iter finished')

    # ----------------------- Validation ----------------------------
    model.eval()
    count = 0
    y_predicts, y_labels = [], []
    eval_loss, eval_acc, eval_f1 = 0, 0, 0
    with torch.no_grad():
        for step, batch in enumerate(eval_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids, output_mask = batch
            bert_encode = model(input_ids, segment_ids, input_mask).cpu()
            batch_loss = model.loss_fn(bert_encode=bert_encode,
                                       tags=label_ids,
                                       output_mask=output_mask)
            eval_loss += batch_loss
            count += 1
            predicts = model.predict(bert_encode, output_mask)
            predict_tensor = predicts.cpu()
            label_tensor = label_ids.cpu()
            y_predicts.append(predict_tensor)
            y_labels.append(label_tensor)
            entity_precision, entity_recall, entity_f1 = score_predict(
                label_tensor, predict_tensor)
            print(
                '\n step :%d - eval_loss: %.4f - ent_p:%.4f - ent_r:%.4f - ent_f1:%.4f\n'
                % (step, eval_loss.item() / count, entity_precision,
                   entity_recall, entity_f1))

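            # Sanity check: after dropping the -1 padding positions, the gold and
            # predicted tag sequences must be the same length.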
            label_ids = label_ids.view(1, -1).squeeze()
            predicts = predicts.view(1, -1).squeeze()
            label_ids = label_ids[label_ids != -1]
            predicts = predicts[predicts != -1]
            assert len(label_ids) == len(predicts)

        eval_predicted = torch.cat(y_predicts, dim=0).cpu()
        eval_labeled = torch.cat(y_labels, dim=0).cpu()
        entity_precision, entity_recall, entity_f1 = score_predict(
            eval_labeled, eval_predicted)
        print(
            '\n\n- eval_loss: %.4f - eval_acc:%.4f - eval_f1:%.4f - ent_p:%.4f - ent_r:%.4f - ent_f1:%.4f\n'
            % (eval_loss.item() / count, eval_acc, eval_f1, entity_precision,
               entity_recall, entity_f1))
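score_predict() is project code that is not shown in these examples. A minimal sketch of what such an entity-level scorer could look like is given below; it reuses the get_tags_BIESO() span helper from Example #8 and treats each (row, start, end) span as one entity. This is an illustration under those assumptions, not the project's actual implementation.

def score_predict(label_tensor, predict_tensor):
    # Both tensors are assumed to be (batch, seq_len) tag-id tensors padded with -1.
    gold_spans, pred_spans = set(), set()
    for i, (gold_row, pred_row) in enumerate(zip(label_tensor, predict_tensor)):
        gold_row = gold_row[gold_row != -1]
        pred_row = pred_row[pred_row != -1]
        gold_spans |= {(i, tag[0], tag[1]) for tag in get_tags_BIESO(gold_row.tolist())}
        pred_spans |= {(i, tag[0], tag[1]) for tag in get_tags_BIESO(pred_row.tolist())}
    tp = len(gold_spans & pred_spans)
    precision = tp / len(pred_spans) if pred_spans else 0.0
    recall = tp / len(gold_spans) if gold_spans else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1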
Example #2
def __init__(self):
    print('[INFO] Loading tokenizer')
    self.processor, self.bertTokenizer = init_params()
    label_list = self.processor.get_labels()
    self.label_map = {label: i for i, label in enumerate(label_list)}
    self.tokenizer = BasicTokenizer()
    print('[INFO] Tokenizer loaded')
    print('[INFO] Loading model')
    self.model = Bert_CRF()
    self.model.load_state_dict(load_model(args.output_dir))
    self.device = torch.device(args.device if torch.cuda.is_available()
                               and not args.no_cuda else "cpu")
    self.model.to(self.device)
    self.model.eval()
    print('[INFO] Model loaded')
Example #3
def start():
    # Prefer cached iterators; regenerate the data files only if they are missing
    if not os.path.exists(args.TRAIN) or not os.path.exists(args.VALID):
        produce_data(user_define=USER_DEFINE)

    if os.path.exists(args.TRAIN_CACHE):
        train_iter, num_train_steps = torch.load(args.TRAIN_CACHE)
    else:
        train_iter, num_train_steps = create_batch_iter("train")

    if os.path.exists(args.VALID_CACHE):
        eval_iter = torch.load(args.VALID_CACHE)
    else:
        eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
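The epoch_size expression above is easier to read with concrete numbers. The sketch below assumes num_train_steps was computed the way most BERT fine-tuning scripts do it; that formula is an assumption and is not shown in these examples.

# Hypothetical numbers for illustration only.
train_examples, batch_size, grad_accum, num_epochs = 10000, 32, 2, 4

# Assumed definition of num_train_steps (the usual BERT fine-tuning pattern,
# not shown in these examples):
num_train_steps = int(train_examples / batch_size / grad_accum) * num_epochs  # 624

# The expression in start() then recovers roughly the number of training
# examples processed per epoch, which is what ProgressBar needs:
epoch_size = num_train_steps * batch_size * grad_accum / num_epochs  # 9984.0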
Example #4
def load_model(output_dir):
    # Load a trained model that you have fine-tuned
    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
    model_state_dict = torch.load(output_model_file)
    model = Bert_CRF.from_pretrained(args.bert_model,
                                     state_dict=model_state_dict)
    return model
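For completeness, here is a minimal sketch of the save step that load_model() expects to have produced pytorch_model.bin earlier. The save_model helper and its body follow the usual pytorch-pretrained-bert convention; they are assumptions, not part of the examples above.

import os
import torch

def save_model(model, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Unwrap DataParallel, if used, so the state_dict keys match on reload.
    model_to_save = model.module if hasattr(model, "module") else model
    torch.save(model_to_save.state_dict(),
               os.path.join(output_dir, "pytorch_model.bin"))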
Example #5
def start():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--do_not_train_ernie",
        default=False,
        action='store_true',
    )
    parser.add_argument(
        "--do_CRF",
        default=False,
        action='store_true',
    )
    arg = parser.parse_args()
    args.do_not_train_ernie = arg.do_not_train_ernie
    args.do_CRF = arg.do_CRF

    produce_data()
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
    if args.load_weight:
        model = load_model(args.output_dir)
    else:
        model = Bert_CRF.from_pretrained(args.bert_model,
                                         num_tag=len(args.labels))

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #6
def start():
    produce_data()
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("dev")

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #7
def start():
    # produce_data()
    model = Bert_CRF()
    print('create_iter')
    train_iter, num_train_steps = create_batch_iter("train")
    eval_iter = create_batch_iter("valid")
    print('create_iter finished')

    epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs

    pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)

    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name)
    print('fit')

    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        num_epoch=args.num_train_epochs,
        pbar=pbar,
        num_train_steps=num_train_steps,
        verbose=1)
Example #8
class entity_extractor:
    def __init__(self):
        print('[INFO] Loading tokenizer')
        self.processor, self.bertTokenizer = init_params()
        label_list = self.processor.get_labels()
        self.label_map = {label: i for i, label in enumerate(label_list)}
        self.tokenizer = BasicTokenizer()
        print('[INFO] Tokenizer loaded')
        print('[INFO] Loading model')
        self.model = Bert_CRF()
        self.model.load_state_dict(load_model(args.output_dir))
        self.device = torch.device(args.device if torch.cuda.is_available()
                                   and not args.no_cuda else "cpu")
        self.model.to(self.device)
        self.model.eval()
        print('[INFO] Model loaded')

    def extract(self, text):
        text = list(text)
        if len(text) > args.max_seq_length - 2:
            text = text[:(args.max_seq_length - 2)]
        tokens = ["[CLS]"] + text + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = convert_text_to_ids(tokens, self.bertTokenizer)
        input_mask = [1] * len(input_ids)
        padding = [0] * (args.max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == args.max_seq_length
        assert len(input_mask) == args.max_seq_length
        assert len(segment_ids) == args.max_seq_length
        # output_mask filters sub-word pieces out of the BERT output, keeping only
        # the first piece of each word (as recommended by Jacob Devlin in the BERT
        # paper); it also keeps the sequence compatible with the CRF layer.
        output_mask = [1 for t in text]
        output_mask = [0] + output_mask + [0]
        output_mask += padding
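        # For example, with text = ['北', '京'] and args.max_seq_length = 8:
        #   tokens      = [CLS] 北 京 [SEP] (then 4 padding positions)
        #   output_mask = [0, 1, 1, 0, 0, 0, 0, 0]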
        text = ''.join(text)
        all_input_ids = torch.tensor([input_ids], dtype=torch.long)
        all_input_mask = torch.tensor([input_mask], dtype=torch.long)
        all_segment_ids = torch.tensor([segment_ids], dtype=torch.long)
        all_output_mask = torch.tensor([output_mask], dtype=torch.long)
        data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                             all_output_mask)
        sampler = SequentialSampler(data)
        iterator = DataLoader(data, sampler=sampler, batch_size=1)

        pre_entities = []
        with torch.no_grad():
            for step, batch in enumerate(iterator):
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, output_mask = batch
                bert_encode = self.model(input_ids, segment_ids,
                                         input_mask).cpu()
                predicts = self.model.predict(bert_encode, output_mask)
                for predict in predicts:
                    predict = predict[predict != -1]
                    pre_tags = get_tags_BIESO(predict.cpu().numpy().tolist())
                    pre_entities.extend(text[tag[0]:tag[1] + 1]
                                        for tag in pre_tags)
        return set(pre_entities)
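A hypothetical usage of the extractor above; the sentence is made up, and the returned set depends entirely on the fine-tuned weights found in args.output_dir.

if __name__ == "__main__":
    extractor = entity_extractor()
    # Extract entity strings from a raw Chinese sentence.
    print(extractor.extract("李明在北京大学读书"))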