示例#1
0
            # NOTE(review): fragment of a BIO-tag decoder — the matching `if`
            # branch and the loop header are above this chunk; the comments
            # below are inferred from the visible code and should be confirmed
            # against the full method.
            else:
                # Tag looks like "B-PER" / "I-LOC": split into prefix and type.
                xindex, xlabel = x.split("-")
                if xindex == "B":
                    # "B" opens a new entity; first flush any entity already
                    # in progress as (start, end, label, surface_text).
                    if start is not None:
                        extract_ner.append((start, i, label, input_x[start:i]))
                    start = i
                    label = xlabel
                else:
                    # Presumably an "I"/continuation tag: a type mismatch
                    # breaks the current span, so the partial entity is
                    # dropped (never emitted).
                    if label != xlabel:
                        start = None
                        label = None
        return extract_ner


if __name__ == "__main__":
    # Load the token-level MSRA NER corpus.  Every backslash is doubled so the
    # Windows path contains no invalid escape sequences: the original "\d"
    # raises a SyntaxWarning on modern CPython while silently keeping the
    # backslash — the string value itself is unchanged.
    msra_data = LoadMsraDataV2("D:\\data\\nlp\\命名实体识别\\msra_ner_token_level\\")

    print(msra_data.train_tag_list[0])

    # Convert each sentence into CRF feature dicts and its parallel tag list.
    X_train = [sent2features(s) for s in msra_data.train_sentence_list]
    y_train = [sent2labels(s) for s in msra_data.train_tag_list]

    X_test = [sent2features(s) for s in msra_data.test_sentence_list]
    y_test = [sent2labels(s) for s in msra_data.test_tag_list]

    print(len(y_train))

    # Train the CRF-based NER model (uncomment load_model() instead to reuse
    # a previously saved model).
    crf_model = CRFNerModel()
    # crf_model.load_model()
    crf_model.fit(X_train, y_train)
示例#2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) ***
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Optimizer
from transformers import BertModel
from pytorch.layers.crf import CRF
import torch.autograd as autograd
import torch.optim as optim
from pytorch.layers.bert_optimization import BertAdam
from transformers import BertTokenizer
from nlp_applications.data_loader import LoadMsraDataV2

# Module-level setup: load the token-level MSRA NER corpus and derive the
# label-set size for the model head.  Backslashes are doubled so the Windows
# path has no invalid escape sequence — the original "\d" triggers a
# SyntaxWarning on modern CPython; the resulting string value is unchanged.
msra_data = LoadMsraDataV2("D:\\data\\ner\\msra_ner_token_level\\")
bert_model_name = "bert-base-chinese"
class_num = len(msra_data.label2id)


def sequence_padding(inputs, length=None, padding=0, is_float=False):
    """Pad (or truncate) a batch of sequences to a common length (NumPy).

    NOTE(review): this chunk ends mid-function — the ``return`` statement
    (and any use of ``is_float``, presumably a dtype cast) lies below the
    visible lines; confirm against the full file.
    """
    # Default target length: the longest sequence in the batch.
    if length is None:
        length = max([len(x) for x in inputs])

    # Right-pad short sequences with `padding`; hard-truncate longer ones.
    outputs = np.array([
        np.concatenate([x, [padding] *
                        (length - len(x))]) if len(x) < length else x[:length]
        for x in inputs
    ])