def build_embedding(self):
    """Download the sample BERT checkpoint and wrap it in a BertEmbedding."""
    model_dir = get_file(
        'bert_sample_model',
        "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
        cache_dir=DATA_PATH,
        untar=True)
    return BertEmbedding(model_folder=model_dir)
def main():
    """Train a BERT + BiGRU-CRF Chinese word-segmentation model and save it."""
    # NOTE: adjust these dataset paths for your own environment.
    train_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_train.utf8'
    dev_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_dev.utf8'
    test_path = '/home/qianlang/WordSeg-master/Data/test/data_generate_test.utf8'

    x_train, y_train = load_dataset(train_path)
    x_dev, y_dev = load_dataset(dev_path)
    x_test, y_test = load_dataset(test_path)

    embedding = BertEmbedding('chinese_wwm_ext_L-12_H-768_A-12')
    model = BiGRU_CRF_Model(embedding, sequence_length=128)

    tensorboard_cb = keras.callbacks.TensorBoard(
        log_dir='./logs/bert_bigru_crf1',
        histogram_freq=1,
        write_graph=True,
        write_images=False,
        embeddings_freq=1,
        embeddings_layer_names=None,
        embeddings_metadata=None,
        update_freq=1000)

    # Built-in callback that reports precision, recall and F1 on the dev
    # set at the end of every epoch.
    eval_cb = EvalCallBack(kash_model=model,
                           x_data=x_dev,
                           y_data=y_dev,
                           truncating=True,
                           step=1)

    model.fit(x_train, y_train, x_dev, y_dev,
              batch_size=128,
              epochs=20,
              callbacks=[eval_cb, tensorboard_cb])

    model.evaluate(x_test, y_test)
    model.save('cws_wwm_bert_bigru_crf.h5')
def run_with_model_class(self, model_class: Type[ABCLabelingModel], epochs: int):
    """Train `model_class` on the ChineseDaily NER corpus and return its evaluation report."""
    x_train, y_train = ChineseDailyNerCorpus.load_data('train')
    x_valid, y_valid = ChineseDailyNerCorpus.load_data('valid')
    x_test, y_test = ChineseDailyNerCorpus.load_data('test')

    embedding = BertEmbedding(get_bert_path())
    model = model_class(embedding)

    # One TensorBoard metrics writer per model class, under a shared log root.
    log_path = os.path.join(log_root, model_class.__name__)
    writer = tf.summary.create_file_writer(log_path + "/metrics")
    writer.set_as_default()

    # Evaluate on the test split after every epoch (truncating long sequences).
    callbacks = [EvalCallBack(model, x_test, y_test, step=1, truncating=True)]

    model.fit(x_train, y_train, x_valid, y_valid,
              epochs=epochs,
              callbacks=callbacks)
    report = model.evaluate(x_test, y_test)

    # Release the model and embedding before the next model class is trained.
    del model
    del embedding
    return report
def predict_it(test_path, model_path, output_path):
    """Run a saved NER model over a test set and write predicted tag sequences.

    Args:
        test_path: path passed to `build_dataset` to build the eval dataset.
        model_path: weights file loaded into the restored model.
        output_path: destination file; one tab-joined tag sequence per line.
    """
    # BUGFIX: removed `bert_embed = BertEmbedding(bert_path)` — `bert_path`
    # is not defined in this function (NameError at runtime) and the
    # embedding was never used.
    dataset = build_dataset(test_path)
    test_x, test_y = [], []
    for x, y in dataset.as_numpy_iterator():
        # The dataset yields byte strings; decode them back to unicode tokens.
        test_x.append([str(i, 'utf-8') for i in x])
        test_y.append([str(i, 'utf-8') for i in y])

    # Restore the model structure from the saved dir, then load the
    # trained weights from `model_path`.
    loaded_model = kashgari.utils.load_model('saved_ner_model')
    loaded_model.tf_model.load_weights(model_path)

    # Keep predictions in their own variable so the ground-truth labels in
    # `test_y` are not clobbered (the original overwrote them).
    pred_y = loaded_model.predict(test_x)

    # Write UTF-8 explicitly; the corpus is UTF-8 and the default encoding
    # is platform-dependent.
    with open(output_path, 'w', encoding='utf-8') as f:
        for y in pred_y:
            f.write('\t'.join(y) + '\n')
    print('predict_it done {} {} {}'.format(test_path, model_path, output_path))
def test_with_bert(self):
    """End-to-end smoke test: fit, evaluate and predict with the sample BERT model."""
    model_dir = get_file(
        'bert_sample_model',
        "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
        cache_dir=DATA_PATH,
        untar=True)
    model = self.TASK_MODEL_CLASS(embedding=BertEmbedding(model_folder=model_dir))

    x_train, y_train = TestMacros.load_labeling_corpus()
    # Re-use the training corpus as the validation split for this smoke test.
    x_valid, y_valid = x_train, y_train

    model.fit(x_train, y_train,
              x_validate=x_valid,
              y_validate=y_valid,
              epochs=self.EPOCH_COUNT)

    # Exercise both the default and the truncating code paths.
    model.evaluate(x_valid, y_valid)
    model.evaluate(x_valid, y_valid, truncating=True)
    model.predict(x_valid)
    model.predict(x_valid, truncating=True)
import kashgari
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

from bert_model.utils import load_data

# Load the three corpus splits.
train_x, train_y = load_data('train')
valid_x, valid_y = load_data('validate')
test_x, test_y = load_data('test')

# Local checkpoint of Google's Chinese BERT base model.
BERT_DIR = '/Users/mesie/python/nlp/chinese_L-12_H-768_A-12'
embedding = BertEmbedding(BERT_DIR)

# BiLSTM-CRF sequence labeler on top of frozen BERT features.
model = BiLSTM_CRF_Model(embedding)
model.fit(train_x, train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
from kashgari.corpus import ChineseDailyNerCorpus import kashgari from kashgari.tasks.labeling import BiLSTM_CRF_Model import os import keras from kashgari.embeddings import BertEmbedding from kashgari.callbacks import EvalCallBack import datetime import numpy as np import sys import argparse import tensorflow as tf ## roberta bert_path = '/root/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12' bert_embed = BertEmbedding(bert_path, trainable=True) class BIODataGenerator: def __init__(self, data_path, batch_size): self.data_path = data_path self.batch_size = batch_size pass def forfit(self): while True: batch_X = [] batch_y = [] with open(self.data_path, 'r') as f: X, Y = [], [] for line in f:
seps, strips = u'\n。!?!?;;,, ', u';;,, ' x_data = [] y_data = [] for d in corpus_data: for p in d['passages']: if p['answer']: x = tokenizer.tokenize( d['question']) + ['[SEP]'] + tokenizer.tokenize(p['passage']) x_data.append(x) y_data.append(tokenizer.tokenize(p['answer'])) print(x_data[:3]) print(y_data[:3]) bert = BertEmbedding(bert_path) model = Seq2Seq(encoder_seq_length=256) class CustomCallback(tf.keras.callbacks.Callback): def __init__(self, model): self.model = model self.sample_count = 5 def on_epoch_end(self, epoch, logs=None): if epoch % 4 != 0: return import random samples = random.sample(x_data, self.sample_count) translates, _ = self.model.predict(samples) print()
import pickle

import kashgari
import tensorflow as tf
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Restore the pre-split corpus; the pickle holds
# [x_train, x_validation, y_train, y_validation] in that order.
with open('data.pickle', 'rb') as f:
    corpus = pickle.load(f)

x_train = corpus[0]
x_validation = corpus[1]
y_train = corpus[2]
y_validation = corpus[3]

embedding = BertEmbedding('bert-base-chinese', sequence_length=128)
model = BiLSTM_CRF_Model(embedding)

model.fit(
    x_train=x_train,
    y_train=y_train,
    x_validate=x_validation,
    y_validate=y_validation,
    epochs=5,
    batch_size=32,
)

model.save('Model')
model.evaluate(x_data=x_validation, y_data=y_validation)