Example #1
from tqdm import tqdm
from kashgari.corpus import DataReader


def clean_data(source_file, target_file, ner_model):

    data_x, data_y = DataReader().read_conll_format_file(source_file)

    with tqdm(total=len(data_x)) as pbar:
        for idx, text_array in enumerate(data_x):
            if len(text_array) <= 100:
                # Short sentence: predict its tag sequence in one pass.
                ners = ner_model.predict([text_array])
                ner = ners[0]
            else:
                # Sentence too long for the model: split it into chunks,
                # predict each chunk, and concatenate the tag sequences.
                texts = cut_text(''.join(text_array), 100)
                ners = []
                for text in texts:
                    ner = ner_model.predict([list(text)])
                    ners.extend(ner[0])
                ner = ners

            # Add tags the original annotation missed: only overwrite 'O' labels.
            for jdx, t in enumerate(text_array):
                if ner[jdx].startswith('B') or ner[jdx].startswith('I'):
                    if data_y[idx][jdx] == 'O':
                        data_y[idx][jdx] = ner[jdx]

            pbar.update(1)

    with open(target_file, 'a', encoding='utf-8') as f:
        # Write back in CoNLL format: one "token tag" pair per line,
        # with a blank line between sentences.
        for idx, text_array in enumerate(data_x):
            if idx != 0:
                f.write('\n')
            for jdx, t in enumerate(text_array):
                text = t + ' ' + data_y[idx][jdx]
                if not (idx == 0 and jdx == 0):
                    text = '\n' + text
                f.write(text)

    data_x2, data_y2 = DataReader().read_conll_format_file(source_file)
    print(data_x == data_x2, len(data_y) == len(data_y2), 'data cleaning complete')
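
clean_data relies on a cut_text helper that is not shown in this example. A minimal sketch of what it presumably does, assuming it simply splits a string into consecutive chunks of at most max_len characters:

def cut_text(text, max_len):
    # Split text into consecutive chunks of at most max_len characters,
    # so each chunk fits within the model's sequence length.
    return [text[i:i + max_len] for i in range(0, len(text), max_len)]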
Example #2
from kashgari.corpus import DataReader
from kashgari import utils


def load_data(subset_name='train', shuffle=True):
    """
    Load the dataset in sequence-labeling format, tokenized at the character level.

    Args:
        subset_name: one of {train, test, valid}
        shuffle: whether to shuffle the data; default True.

    Returns:
        dataset features and dataset labels
    """

    if subset_name == 'train':
        file_path = '../../data/ChineseDailyNerCorpus/example.train'
    elif subset_name == 'test':
        file_path = '../../data/ChineseDailyNerCorpus/example.test'
    else:
        file_path = '../../data/ChineseDailyNerCorpus/example.dev'

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)

    return x_data, y_data
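
A quick usage sketch for load_data (assuming the ChineseDailyNerCorpus files exist at the relative paths above):

x_train, y_train = load_data('train')
x_valid, y_valid = load_data('valid', shuffle=False)
print(len(x_train), x_train[0][:5], y_train[0][:5])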
Example #3

# -*- coding: utf-8 -*-
# time: 2019-08-09 16:47
# place: Zhichunlu Beijing

import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=16, epochs=10)

model.save('time_ner.h5')

model.evaluate(test_x, test_y)
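
Once saved, the model can be reloaded for inference with kashgari's utils.load_model, the same call Example #4 below uses; a minimal sketch (the sample sentence is illustrative):

from kashgari import utils

loaded_model = utils.load_model('time_ner.h5')
sample = list('2019年8月9日下午三点')  # char-level tokenization, as in training
print(loaded_model.predict([sample]))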
Example #4
from kashgari.corpus import DataReader

from keras.models import load_model
from keras.backend.tensorflow_backend import set_session
import os
import tensorflow as tf
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
config = tf.ConfigProto()

config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
set_session(sess)

test_x, test_y, _ = DataReader.read_conll_format_file_word(
    "/home/y182235017/law/predict.txt")
print(f"test data count: {len(test_x)}")

from kashgari import utils
model = utils.load_model("/home/y182235017/law/model/Word_BiLSTM_CRF_Model")
# model = load_model("/home/y182235017/law/model/Word_BiLSTM_CRF_Attention_Model_test1/my_model.h5")
import codecs
# result=model.evaluate(test_x,test_y,batch_size=128)
result = model.predict_entities_all(test_x)
with codecs.open("/home/y182235017/law/2.txt", "w", "utf-8") as file_obj:
    file_obj.write(result)
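
predict_entities_all appears to come from a custom fork; stock kashgari 1.x exposes predict_entities instead, whose output is a list and needs serializing before writing. A hedged alternative sketch under that assumption:

import json

entities = model.predict_entities(test_x)
with codecs.open("/home/y182235017/law/2.txt", "w", "utf-8") as file_obj:
    file_obj.write(json.dumps(entities, ensure_ascii=False, indent=2))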
Example #5

# -*- coding: utf-8 -*-
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '2'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
set_session(sess)

import kashgari

from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.corpus import DataReader

train_x, train_y = DataReader.read_conll_format_file("/home/y182235017/law/trainwithseg.txt")
test_x, test_y = DataReader.read_conll_format_file("/home/y182235017/law/testwithseg.txt")

# print(f"train data count: {len(train_x)}")
from kashgari.embeddings import WordEmbedding
from kashgari.embeddings import BareEmbedding
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model_Attention
from kashgari.tasks.labeling import Bert_Position_BiLSTM_Attention_CRF_LSTMDecoder_Model
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model_Attenetion
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model_Position
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model
from kashgari.tasks.labeling import BiLSTM_LSTMDecoder_Model
from kashgari.tasks.labeling import Bert_BiLSTM_CRF_Model
from kashgari.tasks.labeling import BiLSTM_CRF_Model_Position
Example #6
# -*- coding: utf-8 -*-
'''
Train a Chinese NER model covering the ORG, LOC, PER, and TIME entity types
'''

import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

kashgari.config.use_cudnn_cell = False

train_x, train_y = DataReader().read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader().read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader().read_conll_format_file('data/data_all/example.test')

train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=512, epochs=20)
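
The snippet stops after fit; following the same pattern as Example #3, a hedged continuation that evaluates on the held-out set and saves the model (the save path is illustrative):

model.evaluate(test_x, test_y)
model.save('ner_all_model')  # illustrative path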