from tqdm import tqdm
from kashgari.corpus import DataReader


def clean_data(source_file, target_file, ner_model):
    """Re-label 'O' tokens in a CoNLL-format file with entities predicted by ner_model."""
    data_x, data_y = DataReader().read_conll_format_file(source_file)
    with tqdm(total=len(data_x)) as pbar:
        for idx, text_array in enumerate(data_x):
            if len(text_array) <= 100:
                ner = ner_model.predict([text_array])[0]
            else:
                # Sequences longer than 100 chars are split into chunks,
                # predicted separately, and the chunk tags are concatenated.
                texts = cut_text(''.join(text_array), 100)
                ner = []
                for text in texts:
                    ner += ner_model.predict([list(text)])[0]
            # Only overwrite tokens that the gold data labels as 'O'.
            for jdx, t in enumerate(text_array):
                if ner[jdx].startswith(('B', 'I')) and data_y[idx][jdx] == 'O':
                    data_y[idx][jdx] = ner[jdx]
            pbar.update(1)

    # Write the merged labels back out in CoNLL format:
    # one 'char label' pair per line, blank line between sentences.
    with open(target_file, 'a', encoding='utf-8') as f:
        for idx, text_array in enumerate(data_x):
            if idx != 0:
                f.write('\n')
            for jdx, t in enumerate(text_array):
                line = t + ' ' + data_y[idx][jdx]
                if not (idx == 0 and jdx == 0):
                    line = '\n' + line
                f.write(line)

    # Sanity check: re-reading the source file should give identical inputs.
    data_x2, data_y2 = DataReader().read_conll_format_file(source_file)
    print(data_x == data_x2, len(data_y) == len(data_y2), 'data cleaning finished')
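# cut_text() is used above but not defined in this file. A minimal sketch of
# what it is assumed to do (split a string into consecutive chunks of at most
# `length` characters); swap in the project's own implementation if it differs:
def cut_text(text, length):
    """Split `text` into consecutive chunks of at most `length` characters."""
    return [text[i:i + length] for i in range(0, len(text), length)]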
from kashgari import utils
from kashgari.corpus import DataReader


def load_data(subset_name='train', shuffle=True):
    """
    Load the dataset in sequence-labeling format, tokenized at char level.

    Args:
        subset_name: one of {train, test, valid}
        shuffle: whether to shuffle the data, default True.

    Returns:
        dataset features and dataset labels
    """
    if subset_name == 'train':
        file_path = '../../data/ChineseDailyNerCorpus/example.train'
    elif subset_name == 'test':
        file_path = '../../data/ChineseDailyNerCorpus/example.test'
    else:
        file_path = '../../data/ChineseDailyNerCorpus/example.dev'

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    return x_data, y_data
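# Usage sketch, assuming the ChineseDailyNerCorpus files exist at the
# relative paths above:
train_x, train_y = load_data('train')
valid_x, valid_y = load_data('valid')
print(f"train samples: {len(train_x)}, valid samples: {len(valid_x)}")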
# -*- coding: utf-8 -*-
# time: 2019-08-09 16:47
# place: Zhichunlu Beijing
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

# BERT embedding (Chinese whole-word-masking checkpoint) feeding a BiLSTM-CRF tagger.
bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)
model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=16, epochs=10)

model.save('time_ner.h5')
model.evaluate(test_x, test_y)
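# Inference sketch: reload the saved model and tag a raw sentence, using the
# stock Kashgari 1.x loader (kashgari.utils.load_model). The sample sentence
# is illustrative only:
from kashgari import utils

loaded_model = utils.load_model('time_ner.h5')
tags = loaded_model.predict([list('2019年8月9日下午开会')])
print(tags[0])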
import os
import codecs

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
from keras.models import load_model  # used by the commented-out .h5 loading path below

from kashgari import utils
from kashgari.corpus import DataReader

# Pin the job to one GPU and let TensorFlow grow memory on demand.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
set_session(sess)

test_x, test_y, _ = DataReader.read_conll_format_file_word(
    "/home/y182235017/law/predict.txt")
print(f"test data count: {len(test_x)}")

model = utils.load_model("/home/y182235017/law/model/Word_BiLSTM_CRF_Model")
# model = load_model("/home/y182235017/law/model/Word_BiLSTM_CRF_Attention_Model_test1/my_model.h5")

# result = model.evaluate(test_x, test_y, batch_size=128)
result = model.predict_entities_all(test_x)
with codecs.open("/home/y182235017/law/2.txt", "w", "utf-8") as file_obj:
    # str() guards against predict_entities_all (a fork-specific method)
    # returning a list rather than a string.
    file_obj.write(str(result))
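# Alternative sketch, assuming the stock Kashgari 1.x labeling API exposes
# predict_entities(), which returns per-sentence entity dicts that serialize
# cleanly as JSON; the output path 2.json is hypothetical:
import json

entities = model.predict_entities(test_x)
with codecs.open("/home/y182235017/law/2.json", "w", "utf-8") as f:
    f.write(json.dumps(entities, ensure_ascii=False, indent=2))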
# -*- coding: utf-8 -*-
import os

import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Pin the job to one GPU and let TensorFlow grow memory on demand.
os.environ["CUDA_VISIBLE_DEVICES"] = '2'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
set_session(sess)

import kashgari
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.corpus import DataReader

train_x, train_y = DataReader.read_conll_format_file("/home/y182235017/law/trainwithseg.txt")
test_x, test_y = DataReader.read_conll_format_file("/home/y182235017/law/testwithseg.txt")
# print(f"train data count: {len(train_x)}")

from kashgari.embeddings import WordEmbedding
from kashgari.embeddings import BareEmbedding
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model_Attention
from kashgari.tasks.labeling import Bert_Position_BiLSTM_Attention_CRF_LSTMDecoder_Model
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model_Attenetion
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model_Position
from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model
from kashgari.tasks.labeling import BiLSTM_LSTMDecoder_Model
from kashgari.tasks.labeling import Bert_BiLSTM_CRF_Model
from kashgari.tasks.labeling import BiLSTM_CRF_Model_Position
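# The script ends after the imports. A minimal continuation sketch, assuming
# BareEmbedding and BiLSTM_CRF_Model behave as in stock Kashgari 1.x; the
# fork-specific *_Attention/*_Position classes presumably accept the same
# embedding argument, but their constructors are not shown in the source:
embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=128)
model = BiLSTM_CRF_Model(embedding)
model.fit(train_x, train_y, batch_size=64, epochs=5)
model.evaluate(test_x, test_y)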
# -*- coding: utf-8 -*-
'''
Train a Chinese NER model covering the ORG, LOC, PER and TIME entity types.
'''
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

# Use plain LSTM cells instead of CuDNN cells so the trained model stays portable.
kashgari.config.use_cudnn_cell = False

train_x, train_y = DataReader().read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader().read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader().read_conll_format_file('data/data_all/example.test')

train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)
model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=512, epochs=20)
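# The script stops after fit(). A minimal follow-up sketch using the stock
# Kashgari 1.x API; the output directory name 'ner_all_model' is hypothetical:
model.evaluate(test_x, test_y)
model.save('ner_all_model')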