import kashgari
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model


def main():
    train_x, train_y = ChineseDailyNerCorpus.load_data("train")
    valid_x, valid_y = ChineseDailyNerCorpus.load_data("validate")
    test_x, test_y = ChineseDailyNerCorpus.load_data("test")
    print(f"train data count: {len(train_x)}")
    print(f"validate data count: {len(valid_x)}")
    print(f"test data count: {len(test_x)}")

    bert_embed = BERTEmbedding("models/chinese_L-12_H-768_A-12",
                               task=kashgari.LABELING,
                               sequence_length=100)
    model = BiLSTM_CRF_Model(bert_embed)
    model.fit(train_x,
              train_y,
              x_validate=valid_x,
              y_validate=valid_y,
              epochs=1,
              batch_size=512)
    model.save("models/ner.h5")
    model.evaluate(test_x, test_y)
    # predict token labels for the test set
    predictions = model.predict(test_x)
    print(predictions)
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model


def train_BERT_BiLSTM_CRF(train_test_divide=0.9,
                          epoch=20,
                          path='/home/peitian_zhang/data/corpus/labeled_train.txt'):
    # getTrain is defined elsewhere in the project
    train_x, train_y = getTrain(path)
    split = int(len(train_x) * train_test_divide) + 1
    x = train_x[:split]
    y = train_y[:split]
    bert = BERTEmbedding(
        model_folder='/home/peitian_zhang/data/chinese_L-12_H-768_A-12',
        sequence_length=400,
        task=kashgari.LABELING)
    model = BiLSTM_CRF_Model(bert)
    # Note: the model validates on its own training split here
    model.fit(x, y, x, y, epochs=epoch, batch_size=64)
    print('---------evaluate on train---------\n{}'.format(
        model.evaluate(train_x, train_y)))
    print('---------evaluate on test----------\n{}'.format(
        model.evaluate(train_x[split:], train_y[split:])))
    try:
        model.save('/home/peitian_zhang/models/bert_epoch_{}'.format(epoch))
        print('Success in saving!')
    except Exception as exc:
        print('Saving failed: {}'.format(exc))
    return model
import kashgari
from kashgari.callbacks import EvalCallBack
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model


def train_ner(x_train, y_train, x_valid, y_valid, x_test, y_test,
              sequence_length, epoch, batch_size, bert_model_path,
              model_save_path):
    """Train the BERT-BiLSTM-CRF model to extract features inside symptom mentions."""
    bert_embedding = BERTEmbedding(bert_model_path,
                                   task=kashgari.LABELING,
                                   sequence_length=sequence_length)
    model = BiLSTM_CRF_Model(bert_embedding)
    eval_callback_val = EvalCallBack(kash_model=model,
                                     valid_x=x_valid,
                                     valid_y=y_valid,
                                     step=1)
    eval_callback_test = EvalCallBack(kash_model=model,
                                      valid_x=x_test,
                                      valid_y=y_test,
                                      step=1)
    model.fit(x_train,
              y_train,
              x_validate=x_valid,
              y_validate=y_valid,
              epochs=epoch,
              batch_size=batch_size,
              callbacks=[eval_callback_val, eval_callback_test])
    model.save(model_save_path)
    model.evaluate(x_test, y_test)
    return model
def build(self):
    embed = BERTEmbedding(model_folder=self.folder,
                          task=kashgari.LABELING,
                          trainable=self.fine_tune,
                          sequence_length=self.seq_len)
    model = BiLSTM_CRF_Model(embed)
    return model
# Excerpt: argparse, os, pickle, the keras callbacks, RAdam, KashModel,
# EvalCallBack and ROOT_DIR are imported/defined at the module's top level.
def train():
    parser = argparse.ArgumentParser()
    parser.add_argument('model_dir', help='model directory')
    args = parser.parse_args()
    model_dir = args.model_dir

    hdf_dir = os.path.join(model_dir, "hdf5")
    os.makedirs(hdf_dir, exist_ok=True)
    bert_model_path = os.path.join(ROOT_DIR, 'BERT-baseline')
    data_path = os.path.join(model_dir, "feature.pkl")
    with open(data_path, 'rb') as fr:
        train_data, train_label, test_data, test_label = pickle.load(fr)
    print("load {}/{} train/dev items ".format(len(train_data), len(test_data)))

    bert_embed = BERTEmbedding(bert_model_path,
                               task=kashgari.LABELING,
                               sequence_length=50)
    model = KashModel(bert_embed)
    model.build_model(x_train=train_data, y_train=train_label,
                      x_validate=test_data, y_validate=test_label)

    from src.get_model_path import get_model_path
    model_path, init_epoch = get_model_path(hdf_dir)
    if init_epoch > 0:
        print("load epoch from {}".format(model_path))
        model.tf_model.load_weights(model_path)

    optimizer = RAdam(learning_rate=0.0001)
    model.compile_model(optimizer=optimizer)

    hdf5_path = os.path.join(hdf_dir, "crf-{epoch:03d}-{val_accuracy:.3f}.hdf5")
    checkpoint = ModelCheckpoint(hdf5_path,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 mode='auto',
                                 period=1)
    tensorboard = TensorBoard(log_dir=os.path.join(model_dir, "log"))
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=test_data,
                                 valid_y=test_label,
                                 step=1,
                                 log_path=os.path.join(model_dir, "acc.txt"))
    callbacks = [checkpoint, tensorboard, eval_callback]

    model.fit(train_data,
              train_label,
              x_validate=test_data,
              y_validate=test_label,
              epochs=100,
              batch_size=256,
              callbacks=callbacks)
    return
def train(self, tokens, tags):
    x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)
    text_embedding = BareEmbedding(task=kashgari.LABELING,
                                   sequence_length=self.chunk_size)
    first_of_p_embedding = NumericFeaturesEmbedding(
        feature_count=2,
        feature_name='first_of_p',
        sequence_length=self.chunk_size)
    stack_embedding = StackedEmbedding(
        [text_embedding, first_of_p_embedding])
    stack_embedding.analyze_corpus(x, y)

    from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model
    self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
    self.model.fit(x, y, batch_size=1, epochs=20)
def initial_model(self, bert_model_path, psd_model_path):
    print('=============init bert model=========================')
    print("bert model path:", bert_model_path)
    print("crf model path:", psd_model_path)
    self.sess = tf.Session()
    set_session(self.sess)
    self.model_dir = os.path.dirname(os.path.dirname(psd_model_path))
    self.model_path = psd_model_path
    data_path = os.path.join(self.model_dir, "feature_psd.pkl")
    train_data, train_label, test_data, test_label = \
        pickle.load(open(data_path, 'rb'))
    bert_embed = BERTEmbedding(bert_model_path,
                               task=kashgari.LABELING,
                               sequence_length=50)
    self.model = BiLSTM_CRF_Model(bert_embed)
    self.model.build_model(x_train=train_data,
                           y_train=train_label,
                           x_validate=test_data,
                           y_validate=test_label)
    self.model.compile_model()
    self.model.tf_model.load_weights(psd_model_path)
    print('=============bert model loaded=========================')
    return
def train_it2(train_path, checkpoint_filepath, model_path, start, span):
    data_generator = BIODataGenerator(train_path, 100000000)
    Xs, ys = data_generator.forfit().__next__()

    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)
    k = 0
    for x, y in zip(Xs, ys):
        # x = [str(i, 'utf-8') for i in x]
        # y = [str(i, 'utf-8') for i in y]
        rnum = rng.rand()
        k += 1
        if rnum < start or rnum >= start + span:
            train_x += [x]
            train_y += [y]
        else:
            valid_x += [x]
            valid_y += [y]

    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))

    if not os.path.exists(os.path.dirname(checkpoint_filepath)):
        os.mkdir(os.path.dirname(checkpoint_filepath))

    # `bert_embed` is a module-level BERTEmbedding defined elsewhere in this project
    model = BiLSTM_CRF_Model(bert_embed, sequence_length=128)
    eval_callback = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    reduce_lr_callback = keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                           patience=5)
    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[early_stop, eval_callback, reduce_lr_callback])
    model.save(model_path)
from kashgari.tasks.labeling import BiLSTM_CRF_Model


def train_BiLSTM_CRF(train_test_divide=0.9,
                     epoch=100,
                     path='/home/peitian_zhang/data/corpus/labeled_train.txt'):
    # getTrain is defined elsewhere in the project
    train_x, train_y = getTrain(path)
    # no embedding is passed, so the model falls back to a plain BareEmbedding
    model = BiLSTM_CRF_Model()
    split = int(len(train_x) * train_test_divide) + 1
    x = train_x[:split]
    y = train_y[:split]
    model.fit(x, y, x, y, epochs=epoch, batch_size=64)
    print('---------evaluate on train---------\n{}'.format(
        model.evaluate(train_x, train_y)))
    print('---------evaluate on test----------\n{}'.format(
        model.evaluate(train_x[split:], train_y[split:])))
    try:
        model.save('/home/peitian_zhang/models/bert_epoch_{}'.format(epoch))
        print('Success in saving!')
    except Exception as exc:
        print('Saving failed: {}'.format(exc))
    return model
def train_it(train_path, checkpoint_filepath, model_path, start, span):
    dataset = build_dataset(train_path)

    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)
    k = 0
    for x, y in dataset.as_numpy_iterator():
        x = [str(i, 'utf-8') for i in x]
        y = [str(i, 'utf-8') for i in y]
        rnum = rng.rand()
        k += 1
        if rnum < start or rnum >= start + span:
            train_x += [x]
            train_y += [y]
        else:
            valid_x += [x]
            valid_y += [y]

    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))

    if not os.path.exists(os.path.dirname(checkpoint_filepath)):
        os.mkdir(os.path.dirname(checkpoint_filepath))

    # `bert_embed` is a module-level BERTEmbedding defined elsewhere in this project
    model = BiLSTM_CRF_Model(bert_embed, sequence_length=100)
    evaluator = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[evaluator])
    model.save(model_path)
text = [[0.9, 0.1, 0.1],
        [0.9, 0.1, 0.1],
        [0.1, 0.8, 0.1],
        [0.1, 0.8, 0.1],
        [0.1, 0.8, 0.1]]
label = ['B-Category', 'I-Category',
         'B-ProjectName', 'I-ProjectName', 'I-ProjectName']

text_list = [text] * 100
label_list = [label] * 100

SEQUENCE_LEN = 80

# You can use WordEmbedding or BERTEmbedding for your text embedding;
# here each token is already a raw feature vector, so DirectEmbedding is used.
bare_embedding = DirectEmbedding(task=kashgari.RAW_LABELING,
                                 sequence_length=SEQUENCE_LEN,
                                 embedding_size=3)
# bare_embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=SEQUENCE_LEN)

x = text_list
y = label_list
bare_embedding.analyze_corpus(x, y)

# Now we can build any labeling model with this embedding
from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(embedding=bare_embedding)
model.fit(x, y, batch_size=1, epochs=3)
print(model.predict(x))
# print(model.predict_entities(x))
test_x = list(test_x)
test_y = list(test_y)

# BERT Embedding
# embedding = BERTEmbedding('./chinese_L-12_H-768_A-12',
#                           task=kashgari.LABELING,
#                           sequence_length=150)

# Word2Vec Embedding
word2vec_embedding = kashgari.embeddings.WordEmbedding(
    w2v_path="word2vec.model",
    task=kashgari.LABELING,
    w2v_kwargs={'binary': True, 'unicode_errors': 'ignore'},
    sequence_length='auto')

model = BiLSTM_CRF_Model(word2vec_embedding)
# model = BiLSTM_CRF_Model(embedding)

tf_board_callback = keras.callbacks_v1.TensorBoard(log_dir='.\\logs',
                                                   update_freq=1000)
eval_callback = EvalCallBack(kash_model=model,
                             valid_x=test_x,
                             valid_y=test_y,
                             step=4)
# `train_x` and `train_y` are loaded earlier in the original script
model.fit(train_x,
          train_y,
          test_x,
          test_y,
          batch_size=20,
          epochs=4,
          callbacks=[eval_callback, tf_board_callback])
class BertPolyPhone:
    """Main class for pinyin (polyphone) prediction."""

    def __init__(self):
        super().__init__()
        self.poly_dict = dict()
        poly_dict_path = "/data1/liufeng/synthesis/frontend/data/simple_poly_dict"
        for line in read_lines(poly_dict_path):
            line = line.replace(" ", "").replace("*", "")
            key = line.split(":")[0]
            value = line.split(":")[1].split(",")
            self.poly_dict[key] = value
        self.model, self.model_dir = None, None
        self.sess = None

    def inialize_model(self, bert_model_path, poly_model_path):
        print('=============init phone model=========================')
        print("bert model path:", bert_model_path)
        print("crf model path:", poly_model_path)
        # The dictionaries are rebuilt from the training-data path
        self.sess = tf.Session()
        set_session(self.sess)
        self.model_dir = os.path.dirname(os.path.dirname(poly_model_path))
        data_path = os.path.join(self.model_dir, "feature.pkl")
        train_data, train_label, test_data, test_label = \
            pickle.load(open(data_path, 'rb'))
        bert_embed = BERTEmbedding(bert_model_path,
                                   task=kashgari.LABELING,
                                   sequence_length=50)
        self.model = BiLSTM_CRF_Model(bert_embed)
        self.model.build_model(x_train=train_data,
                               y_train=train_label,
                               x_validate=test_data,
                               y_validate=test_label)
        self.model.compile_model()
        self.model.tf_model.load_weights(poly_model_path)
        print('=============successful loaded=========================')

    def _lookup_dict(self, bert_result, pred_ph_pairs):
        """Correct the predicted pinyin with a dictionary lookup."""
        # TODO: if the word is in the dictionary, skip the BERT result.
        bert_phone_result = []
        for index_c, (char, ph, _) in enumerate(pred_ph_pairs):
            if char in self.poly_dict.keys():
                # If the BERT prediction is not in the polyphone dictionary,
                # the prediction went off track; fall back to the default.
                if bert_result[index_c] not in self.poly_dict[char]:
                    bert_phone_result.append((char, ph))
                else:
                    bert_result[index_c] = split_phone_format(bert_result[index_c])
                    bert_phone_result.append((char, bert_result[index_c]))
                    if ph != bert_result[index_c]:
                        print("using bert result {}:{} instead of {}".format(
                            char, bert_result[index_c], ph))
            else:
                bert_phone_result.append((char, ph))
        return bert_phone_result

    def predict(self, sentence_list):
        """Predict from sentences; the input is split on punctuation."""
        bert_input = []
        for sent in sentence_list:
            assert len(sent) < 50
            bert_input.append([c for c in sent])
        print("bert-input:", bert_input)
        prosody = self.model.predict(bert_input)
        return prosody

    def save_pb(self):
        self._write_dict()
        pb_dir = os.path.join(self.model_dir, "pb")
        os.makedirs(pb_dir, exist_ok=True)
        h5_to_pb(self.model.tf_model, pb_dir, self.sess, "model_phone.pb",
                 ["output_phone"])
        return

    def _write_dict(self):
        label_path = os.path.join(self.model_dir, "pb/phone_idx2label.txt")
        with open(label_path, "w", encoding="utf-8") as fr:
            for key, value in self.model.embedding.label2idx.items():
                fr.write("{} {}\n".format(value, key))
        print("write {}".format(label_path))

        token_path = os.path.join(self.model_dir, "pb/phone_token2idx.txt")
        with open(token_path, "w", encoding="utf-8") as fr:
            for key, value in self.model.embedding.token2idx.items():
                if len(key) > 0:
                    fr.write("{} {}\n".format(key, value))
        print("write {}".format(token_path))
        return

    def compute_embed(self, sentence_list):
        import numpy as np
        bert_input = [[c for c in sent] for sent in sentence_list]
        print("bert-input:", bert_input)
        tensor = self.model.embedding.process_x_dataset(bert_input)
        print("debug:", np.shape(tensor), tensor)
        res = self.model.tf_model.predict(tensor)
        print("debug:", np.shape(res), res[0][0:len(sentence_list[0]) + 1])
        return tensor

    @staticmethod
    def _merge_eng_char(bert_phone_result, dict_phone_pairs):
        from src.utils import check_all_chinese
        index = 0
        new_bert_phone = []
        for word, _, _ in dict_phone_pairs:
            if (not check_all_chinese(word)) and len(word) > 1:
                new_bert_phone.append(bert_phone_result[index])
                index += len(word)
            else:
                new_bert_phone.append(bert_phone_result[index])
                index += 1
        return new_bert_phone

    def modify_result(self, bert_result, dict_phone_pairs):
        bert_result = self._merge_eng_char(bert_result, dict_phone_pairs)
        bert_phone_pairs = self._lookup_dict(bert_result, dict_phone_pairs)
        phone_pairs = bert_phone_pairs
        # phone_pairs = change_yi(phone_pairs)
        # phone_pairs = change_bu(phone_pairs)
        phone_pairs = sandhi(phone_pairs)
        bert_result = [ph for _, ph in phone_pairs]
        chars = "".join([c for c, _ in phone_pairs])
        bert_result = change_qingyin(bert_result, chars)
        return bert_result
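# A hypothetical usage sketch for the class above (the checkpoint paths and the
# sample sentence are placeholders, not from the original source): initialize
# from saved weights, predict, then apply the dictionary-based correction.
poly = BertPolyPhone()
poly.inialize_model(bert_model_path='models/chinese_L-12_H-768_A-12',
                    poly_model_path='models/poly/checkpoint/model.h5')
bert_result = poly.predict(["今天天气真好"])[0]
# `dict_phone_pairs` would come from the project's dictionary front-end:
# corrected = poly.modify_result(bert_result, dict_phone_pairs)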
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model

train_x, train_y = ChineseDailyNerCorpus.load_data('./data/train.txt')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('./data/dev.txt')
test_x, test_y = ChineseDailyNerCorpus.load_data('./data/test.txt')

bert_embed = BERTEmbedding('./chinese_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

# You can also choose `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
model.save('saved_ner_model')
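# A minimal follow-up sketch (assumed, not from the original source): reload
# the directory saved above and run inference with kashgari's `load_model`.
# The sample sentence is made up for illustration.
from kashgari.utils import load_model

loaded_model = load_model('saved_ner_model')
sample = [list('我去北京')]  # one sentence as a list of characters
print(loaded_model.predict(sample))           # one label sequence per sentence
print(loaded_model.predict_entities(sample))  # grouped entity spans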
# -*- coding: utf-8 -*-
# time: 2019-09-12
# place: Huangcun Beijing
import kashgari
from kashgari import utils
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Model training
train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)
model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=16, epochs=1)

# Save model in the SavedModel format for serving
utils.convert_to_saved_model(model,
                             model_path='saved_model/time_entity',
                             version=1)
class BertProsody:
    """Currently only supports length 50: 49 input characters plus the end token."""

    def __init__(self):
        self.model, self.model_dir, self.model_path = None, None, None
        self.sess = None
        return

    def initial_model(self, bert_model_path, psd_model_path):
        print('=============init bert model=========================')
        print("bert model path:", bert_model_path)
        print("crf model path:", psd_model_path)
        self.sess = tf.Session()
        set_session(self.sess)
        self.model_dir = os.path.dirname(os.path.dirname(psd_model_path))
        self.model_path = psd_model_path
        data_path = os.path.join(self.model_dir, "feature_psd.pkl")
        train_data, train_label, test_data, test_label = \
            pickle.load(open(data_path, 'rb'))
        bert_embed = BERTEmbedding(bert_model_path,
                                   task=kashgari.LABELING,
                                   sequence_length=50)
        self.model = BiLSTM_CRF_Model(bert_embed)
        self.model.build_model(x_train=train_data,
                               y_train=train_label,
                               x_validate=test_data,
                               y_validate=test_label)
        self.model.compile_model()
        self.model.tf_model.load_weights(psd_model_path)
        print('=============bert model loaded=========================')
        return

    def _write_dict(self):
        label_path = os.path.join(self.model_dir, "idx2label.txt")
        with open(label_path, "w", encoding="utf-8") as fr:
            for key, value in self.model.embedding.label2idx.items():
                fr.write("{} {}\n".format(value, key))
        token_path = os.path.join(self.model_dir, "token2idx.txt")
        with open(token_path, "w", encoding="utf-8") as fr:
            for key, value in self.model.embedding.token2idx.items():
                if len(key) > 0:
                    fr.write("{} {}\n".format(key, value))

    def predict(self, sentence_list):
        """Predict prosody from sentences; the input is split on punctuation."""
        bert_input = []
        for sent in sentence_list:
            assert len(sent) < 50
            bert_input.append([c for c in sent])
        print("bert-input:", bert_input)
        prosody = self.model.predict(bert_input)
        return prosody

    def compute_embed(self, sentence_list):
        bert_input = [[c for c in sent] for sent in sentence_list]
        print("bert-input:", bert_input)
        tensor = self.model.embedding.process_x_dataset(bert_input)
        res = self.model.tf_model.predict(tensor)
        import numpy as np
        print("debug:", np.shape(res), res[0])
        return tensor

    def save_pb(self):
        self._write_dict()
        pb_dir = os.path.join(self.model_dir, "pb")
        os.makedirs(pb_dir, exist_ok=True)
        # [print(n.name) for n in tf.get_default_graph().as_graph_def().node]
        h5_to_pb(self.model.tf_model, pb_dir, self.sess, "model_psd.pb",
                 ["output_psd"])
        return

    @staticmethod
    def change_by_rules(old_pairs):
        """Forced rules:
        1. #3 before a comma, #4 before a full stop.
        2. Elsewhere, #3 -> #2.
        """
        new_pairs = []
        for i, (char, ph, psd) in enumerate(old_pairs[0:-1]):
            next_char, _, _ = old_pairs[i + 1]
            if next_char == ",":
                new_pairs.append((char, ph, "3"))
            elif next_char in ["。", "?", "!"]:
                new_pairs.append((char, ph, "4"))
            else:
                if psd == "3":
                    new_pairs.append((char, ph, "2"))
                else:
                    new_pairs.append((char, ph, psd))
        new_pairs.append(old_pairs[-1])
        return new_pairs
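# A small made-up demonstration of `change_by_rules` above: "#3" is forced
# before a comma, "#4" before a full stop, and any other "#3" drops to "#2".
pairs = [("今", "jin1", "1"), ("天", "tian1", "3"), (",", None, "0"),
         ("好", "hao3", "3"), ("。", None, "0")]
print(BertProsody.change_by_rules(pairs))
# -> [('今', 'jin1', '1'), ('天', 'tian1', '3'), (',', None, '0'),
#     ('好', 'hao3', '4'), ('。', None, '0')]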
title_cut_all = pickle.load(ipt)
tag_all = pickle.load(ipt)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(title_cut_all,
                                                    tag_all,
                                                    test_size=0.2,
                                                    random_state=43)
x_train, x_valid, y_train, y_valid = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=43)

import kashgari
from kashgari.embeddings import BERTEmbedding

bert_embed = BERTEmbedding(
    '/root/meicloud/majk1/NLP/BERT/chinese_L-12_H-768_A-12',
    task=kashgari.LABELING,
    sequence_length=100)

from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(bert_embed)
model.fit(x_train,
          y_train,
          x_validate=x_valid,
          y_validate=y_valid,
          epochs=10,
          batch_size=512)
bert_embed = BERTEmbedding('drive/My Drive/rbt3',
                           task=kashgari.LABELING,
                           sequence_length=128)

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.python import keras
from kashgari.callbacks import EvalCallBack

# patience is counted in epochs
stop_callback = EarlyStopping(patience=5, restore_best_weights=True)
# save_callback = ModelCheckpoint("530test1.h5", save_best_only=True, save_weights_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                              factor=0.1,
                              patience=3,
                              verbose=1,
                              min_lr=1e-6)
# tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', update_freq=1000)

model = BiLSTM_CRF_Model(bert_embed)
eval_callback = EvalCallBack(kash_model=model,
                             valid_x=valid_x,
                             valid_y=valid_y,
                             step=3)
# optimizer = RAdam()
# model.compile_model(optimizer=optimizer)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          callbacks=[stop_callback, reduce_lr, eval_callback])
import pickle

import kashgari
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
import tensorflow as tf

with open('data.pickle', 'rb') as f:
    data_dic = pickle.load(f)

x_train = data_dic[0]
x_validation = data_dic[1]
y_train = data_dic[2]
y_validation = data_dic[3]

embedding = BertEmbedding('bert-base-chinese', sequence_length=128)
model = BiLSTM_CRF_Model(embedding)
model.fit(x_train=x_train,
          x_validate=x_validation,
          y_train=y_train,
          y_validate=y_validation,
          epochs=5,
          batch_size=32)
model.save('Model')
model.evaluate(x_data=x_validation, y_data=y_validation)
# Below we implement a named entity recognition task with the BiLSTM-CRF model:
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model

# Load the built-in dataset; you can swap in your own data
# as long as it keeps the same format.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

model = BiLSTM_CRF_Model()
model.fit(train_x, train_y, valid_x, valid_y, epochs=1)
model.save("BiLSTM_CRF_Model")
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.corpus import ChineseDailyNerCorpus

train_x, train_y = ChineseDailyNerCorpus.load_data('train')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')

# You can also choose `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`.
# The embedding must be built for the labeling task; a classification embedding
# does not match a sequence-labeling model.
bert = BERTEmbedding('wwm', task=kashgari.LABELING, sequence_length=300)
model = BiLSTM_CRF_Model(bert)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

kashgari.config.use_cudnn_cell = False

train_x, train_y = DataReader().read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader().read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader().read_conll_format_file('data/data_all/example.test')

train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)
model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=512, epochs=20)
model.save('models/all_ner.h5')
model.evaluate(test_x, test_y)
bert_embed = BERTEmbedding('electra',
                           task=kashgari.LABELING,
                           sequence_length=128)

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.python import keras
from kashgari.callbacks import EvalCallBack

# patience is counted in epochs
stop_callback = EarlyStopping(patience=3, restore_best_weights=True)
save_callback = ModelCheckpoint("5_29_1", save_best_only=True)

model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          callbacks=[stop_callback, save_callback],
          batch_size=250,
          epochs=25)

# Evaluate the model; this prints a detailed evaluation report
model.evaluate(test_x, test_y)

# Save the model to the `5_29_1` directory
model.save('5_29_1')
# (tail of the data-loading helper; the start of the function is not shown)
        else:
            x.append(rows[0])
            y.append(rows[1])
    return data_x, data_y


train_x, train_y = get_sequence_tagging_data(train_path)
dev_x, dev_y = get_sequence_tagging_data(dev_path)
test_x, test_y = get_sequence_tagging_data(test_path)
print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(dev_x)}")
print(f"test data count: {len(test_x)}")

bert_embed = BERTEmbedding(bert_path,
                           task=kashgari.LABELING,
                           sequence_length=100)

# Build the model and train
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=dev_x,
          y_validate=dev_y,
          epochs=20,
          batch_size=512)
model.save(model_path)

# Evaluate the model
model.evaluate(test_x, test_y)
class Kashgari:
    def __init__(self):
        self.model = None
        self.chunk_size = 100
        self.set_features_numeric = dict()
        self.set_features_text = dict()

    def prepare_data_fit(self, tokens, tags, chunk_size, overlap=10):
        text_list = []
        first_of_p_list = []
        tag_list = []

        buffer_text = []
        buffer_first_of_p = []
        buffer_tag = []

        text_features = {"token"}
        numeric_features = {"first_of_p"}
        self.set_features_numeric = dict()

        for doc, doc_tags in zip(tokens, tags):
            for token, tag in zip(doc, doc_tags):
                features = agregado(token, simple_features=True)
                buffer_text.append(features['token'])
                buffer_first_of_p.append('2' if features['first_of_p'] else '1')
                buffer_tag.append(tag)
                if len(buffer_text) > chunk_size:
                    text_list.append(buffer_text)
                    first_of_p_list.append(buffer_first_of_p)
                    tag_list.append(buffer_tag)
                    # Reset the buffers
                    buffer_text = []
                    buffer_first_of_p = []
                    buffer_tag = []
            print("Processed doc")

        # Flush the remaining partial chunk
        if len(buffer_text) > 0:
            text_list.append(buffer_text)
            first_of_p_list.append(buffer_first_of_p)
            tag_list.append(buffer_tag)

        results = (text_list, first_of_p_list)
        return results, tag_list

    def prepare_data_predict(self, tokens, chunk_size):
        text_list = []
        first_of_p_list = []

        buffer_text = []
        buffer_first_of_p = []

        for token in tokens:
            features = agregado(token, simple_features=True)
            buffer_text.append(features['token'])
            buffer_first_of_p.append('2' if features['first_of_p'] else '1')
            if len(buffer_text) >= chunk_size:
                text_list.append(buffer_text)
                first_of_p_list.append(buffer_first_of_p)
                # Reset the buffers
                buffer_text = []
                buffer_first_of_p = []

        if len(buffer_text) > 0:
            text_list.append(buffer_text)
            first_of_p_list.append(buffer_first_of_p)

        results = (text_list, first_of_p_list)
        return results

    def train(self, tokens, tags):
        x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)
        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=self.chunk_size)
        first_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='first_of_p',
            sequence_length=self.chunk_size)
        stack_embedding = StackedEmbedding(
            [text_embedding, first_of_p_embedding])
        stack_embedding.analyze_corpus(x, y)

        from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model
        self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
        self.model.fit(x, y, batch_size=1, epochs=20)

    def predict(self, tokens):
        import itertools
        results = []
        for doc in tokens:
            x = self.prepare_data_predict(doc, chunk_size=self.chunk_size)
            predicted = self.model.predict(x)
            x_list = list(itertools.chain.from_iterable(x[0]))
            predicted_unified = list(itertools.chain.from_iterable(predicted))
            predicted_truncated = predicted_unified[:len(doc)]
            print(f"len doc {len(doc)} | x_list {len(x_list)} | "
                  f"len predicted_unified {len(predicted_unified)} | "
                  f"len predicted_truncated {len(predicted_truncated)} |")
            results.append(predicted_unified[:len(doc)])
        return results
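# A hypothetical usage sketch for the wrapper above (the documents and tags are
# invented, and the project's `agregado` feature extractor must be importable):
tagger = Kashgari()
docs = [["John", "lives", "in", "Berlin"]]   # one tokenized document
doc_tags = [["B-PER", "O", "O", "B-LOC"]]    # aligned tag sequences
tagger.train(docs, doc_tags)
print(tagger.predict(docs))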
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

import kashgari
from kashgari.embeddings import BERTEmbedding

bert_embed = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

from kashgari.tasks.labeling import BiLSTM_CRF_Model

# You can also choose `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
model.save('models/org_loc_per_ner.h5')
model.evaluate(test_x, test_y)
words, labels = [], []
count = 0
for data, label in zip(datafile, labelfile):
    count += 1
    s1 = data.strip().split(' ')
    s2 = label.strip().split(' ')
    words.append(s1)
    labels.append(s2)

train_x, test_x, train_y, test_y = train_test_split(words,
                                                    labels,
                                                    test_size=0.5,
                                                    random_state=50)

bert_embed = BERTEmbedding('uncased_L-12_H-768_A-12',
                           trainable=False,
                           task=kashgari.LABELING,
                           sequence_length=20)

model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=test_x,
          y_validate=test_y,
          epochs=35,
          batch_size=256)
model.save('model_bilstm_crf_35_256_64')
model.evaluate(x_data=test_x, y_data=test_y, batch_size=64, debug_info=True)
# or GloVe-300 from http://nilc.icmc.usp.br/embeddings if that does not work out
# 2 - Figure out how to run predict. The sentence must be preprocessed the same
#     way as theirs: they use a PunktSentenceTokenizer with an abbrev_list.
#     Those scripts are in the leNer-dataset folder.
# 3 - Figure out how to integrate this code with the current webstruct code.
# 4 - It would be nice to have a Broka-like interface with the file list, so a
#     model could be reopened for retraining with Ramon's plugin. One idea is to
#     convert their dataset to today's Broka HTML format (something simple, such
#     as wrapping each paragraph in a <p>).
# 5 - Add persistence (kashgari has save/load methods).
# 2 - Increase epochs for training.

# You can use WordEmbedding or BERTEmbedding for your text embedding
text_embedding = BareEmbedding(task=kashgari.LABELING)
text_embedding.analyze_corpus(tokens, labels)

# We can build any labeling model with this embedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(embedding=text_embedding)
model.fit(tokens, labels, batch_size=8, epochs=10)
print(model.predict(tokens))
# print(model.predict_entities(x))
# -*- coding: utf-8 -*-
# time: 2019-08-09 16:47
# place: Zhichunlu Beijing
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)
model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=16, epochs=10)

model.save('time_ner.h5')
model.evaluate(test_x, test_y)
with open("data_test.pkl", "rb") as f: x_test, y_test = pickle.load(f) x_train, y_train = list(map(list, x_train)), list(map(list, y_train)) x_valid, y_valid = list(map(list, x_valid)), list(map(list, y_valid)) x_test, y_test = list(map(list, x_test)), list(map(list, y_test)) # Skip testing for now x_train, y_train = x_train + x_test, y_train + y_test model_dir = 'bert_tagger' log_dir = os.path.join(model_dir, 'logs') weights_path = os.path.join(log_dir, 'weights.h5') BERT_PATH = '/mnt/DATA/data/embeddings/uncased_L-12_H-768_A-12' EARLY_STOP = 10 bert_embed = BERTEmbedding(BERT_PATH, task=kashgari.LABELING) model = BiLSTM_CRF_Model(bert_embed) model.fit(x_train, y_train, x_valid, y_valid, epochs=10, batch_size=64, callbacks=[ TensorBoard(log_dir=log_dir, write_graph=False), ModelCheckpoint(weights_path, save_weights_only=True), ReduceLROnPlateau() ]) print('Saving the model...') model.save(model_dir) from kashgari.utils import load_model