Example #1
0
 def build_embedding(self):
     """Download the sample BERT checkpoint and wrap it in a BertEmbedding."""
     # Fetch (and untar) the archive into the shared data cache directory.
     model_dir = get_file('bert_sample_model',
                          "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                          cache_dir=DATA_PATH,
                          untar=True)
     return BertEmbedding(model_folder=model_dir)
def main():
    """Train a BERT + BiGRU-CRF Chinese word-segmentation model and save it."""
    # NOTE: adjust these paths for your environment.
    train_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_train.utf8'
    dev_path = '/home/qianlang/WordSeg-master/Data/train/data_generate_dev.utf8'
    test_path = '/home/qianlang/WordSeg-master/Data/test/data_generate_test.utf8'

    x_train, y_train = load_dataset(train_path)
    x_dev, y_dev = load_dataset(dev_path)
    x_test, y_test = load_dataset(test_path)

    # Pre-trained whole-word-masking Chinese BERT as the encoder.
    embedding = BertEmbedding('chinese_wwm_ext_L-12_H-768_A-12')
    seg_model = BiGRU_CRF_Model(embedding, sequence_length=128)

    tensorboard_cb = keras.callbacks.TensorBoard(log_dir='./logs/bert_bigru_crf1',
                                                 histogram_freq=1,
                                                 write_graph=True,
                                                 write_images=False,
                                                 embeddings_freq=1,
                                                 embeddings_layer_names=None,
                                                 embeddings_metadata=None,
                                                 update_freq=1000)

    # Built-in callback printing precision, recall and F1 on the dev set
    # at every epoch.
    eval_cb = EvalCallBack(kash_model=seg_model,
                           x_data=x_dev,
                           y_data=y_dev,
                           truncating=True,
                           step=1)

    seg_model.fit(x_train, y_train, x_dev, y_dev,
                  batch_size=128,
                  epochs=20,
                  callbacks=[eval_cb, tensorboard_cb])

    seg_model.evaluate(x_test, y_test)

    seg_model.save('cws_wwm_bert_bigru_crf.h5')
Example #3
0
    def run_with_model_class(self, model_class: Type[ABCLabelingModel], epochs: int):
        """Train ``model_class`` on the Chinese Daily NER corpus and return its test report."""
        bert_path = get_bert_path()

        x_train, y_train = ChineseDailyNerCorpus.load_data('train')
        x_valid, y_valid = ChineseDailyNerCorpus.load_data('valid')
        x_test, y_test = ChineseDailyNerCorpus.load_data('test')

        embedding = BertEmbedding(bert_path)
        model = model_class(embedding)

        # Route tf.summary metrics into a per-model-class log directory.
        log_path = os.path.join(log_root, model_class.__name__)
        writer = tf.summary.create_file_writer(log_path + "/metrics")
        writer.set_as_default()

        callbacks = [EvalCallBack(model, x_test, y_test, step=1, truncating=True)]
        model.fit(x_train, y_train, x_valid, y_valid, epochs=epochs, callbacks=callbacks)

        report = model.evaluate(x_test, y_test)
        # Release the model and embedding before the next run in the sweep.
        del model
        del embedding
        return report
Example #4
0
def predict_it(test_path, model_path, output_path):
    """Run the saved NER model over a test set and write one tag line per sample.

    Args:
        test_path: path consumed by ``build_dataset`` to produce (tokens, labels) pairs.
        model_path: checkpoint whose weights are loaded into the model.
        output_path: output file; receives one tab-joined label sequence per input.
    """
    # NOTE(review): the original built a BertEmbedding from an undefined
    # ``bert_path`` here and never used it — removed.
    dataset = build_dataset(test_path)
    test_x, test_y = [], []
    for x, y in dataset.as_numpy_iterator():
        # The dataset yields bytes; decode every token back to str.
        test_x.append([str(i, 'utf-8') for i in x])
        test_y.append([str(i, 'utf-8') for i in y])

    # Load the saved model, then swap in the requested checkpoint weights.
    loaded_model = kashgari.utils.load_model('saved_ner_model')
    loaded_model.tf_model.load_weights(model_path)

    # Predict into a separate name: the original clobbered the gold labels
    # in ``test_y`` with the model output.
    pred_y = loaded_model.predict(test_x)
    with open(output_path, 'w', encoding='utf-8') as f:
        for y in pred_y:
            f.write('\t'.join(y) + '\n')
    print('predict_it done {} {} {}'.format(test_path, model_path,
                                            output_path))
Example #5
0
    def test_with_bert(self):
        """Smoke-test the task model end to end on the downloadable sample BERT."""
        # Download and unpack the small sample BERT checkpoint.
        bert_path = get_file('bert_sample_model',
                             "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                             cache_dir=DATA_PATH,
                             untar=True)
        embedding = BertEmbedding(model_folder=bert_path)
        model = self.TASK_MODEL_CLASS(embedding=embedding)

        train_x, train_y = TestMacros.load_labeling_corpus()
        # Reuse the training corpus as validation data for the smoke test.
        valid_x, valid_y = train_x, train_y

        model.fit(train_x, train_y,
                  x_validate=valid_x,
                  y_validate=valid_y,
                  epochs=self.EPOCH_COUNT)

        # Exercise evaluate/predict both with and without sequence truncation.
        model.evaluate(valid_x, valid_y)
        model.evaluate(valid_x, valid_y, truncating=True)
        model.predict(valid_x)
        model.predict(valid_x, truncating=True)
from bert_model.utils import load_data
import kashgari
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Load the three corpus splits via the project helper.
train_x, train_y = load_data('train')
valid_x, valid_y = load_data('validate')
test_x, test_y = load_data('test')

# Local path of the pre-trained Chinese BERT checkpoint.
model_folder = '/Users/mesie/python/nlp/chinese_L-12_H-768_A-12'
bert_embed = BertEmbedding(model_folder)

model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
Example #7
0
from kashgari.corpus import ChineseDailyNerCorpus
import kashgari
from kashgari.tasks.labeling import BiLSTM_CRF_Model
import os
import keras
from kashgari.embeddings import BertEmbedding
from kashgari.callbacks import EvalCallBack
import datetime
import numpy as np
import sys
import argparse
import tensorflow as tf

## RoBERTa: whole-word-masking Chinese checkpoint, fine-tuned during training.
bert_path = '/root/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12'
bert_embed = BertEmbedding(bert_path, trainable=True)


class BIODataGenerator:
    """Streams (X, y) batches parsed from a BIO-tagged corpus file."""

    def __init__(self, data_path, batch_size):
        # Path of the BIO-formatted corpus and number of samples per batch.
        # (Removed the redundant trailing ``pass``.)
        self.data_path = data_path
        self.batch_size = batch_size

    def forfit(self):
        while True:
            batch_X = []
            batch_y = []
            with open(self.data_path, 'r') as f:
                X, Y = [], []
                for line in f:
Example #8
0
# Sentence/clause separator characters (kept for downstream splitting).
seps, strips = u'\n。!?!?;;,, ', u';;,, '

x_data = []
y_data = []
for d in corpus_data:
    for p in d['passages']:
        if not p['answer']:
            continue
        # Encoder input: question [SEP] passage; decoder target: the answer.
        question_tokens = tokenizer.tokenize(d['question'])
        passage_tokens = tokenizer.tokenize(p['passage'])
        x_data.append(question_tokens + ['[SEP]'] + passage_tokens)
        y_data.append(tokenizer.tokenize(p['answer']))

# Peek at a few converted samples.
print(x_data[:3])
print(y_data[:3])

bert = BertEmbedding(bert_path)
model = Seq2Seq(encoder_seq_length=256)


class CustomCallback(tf.keras.callbacks.Callback):
    """Decodes a few random training samples every 4th epoch to eyeball progress."""

    def __init__(self, model):
        super().__init__()  # original skipped the base-class init
        # Store the kashgari model under its own name: Keras assigns the raw
        # tf.keras model to ``self.model`` via ``set_model()`` when the
        # callback is registered, which would silently replace the wrapper
        # and break ``predict`` on token-list samples.
        self.kash_model = model
        self.sample_count = 5

    def on_epoch_end(self, epoch, logs=None):
        # Only sample every 4th epoch to keep training fast.
        if epoch % 4 != 0:
            return
        import random
        samples = random.sample(x_data, self.sample_count)
        translates, _ = self.kash_model.predict(samples)
        print()
Example #9
0
import pickle
import kashgari
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
import tensorflow as tf

# Load the pre-split corpus: [x_train, x_validation, y_train, y_validation].
with open('data.pickle', 'rb') as f:
    data_dic = pickle.load(f)

x_train = data_dic[0]
x_validation = data_dic[1]
y_train = data_dic[2]
y_validation = data_dic[3]

# Chinese BERT encoder with sequences padded/cut to 128 tokens.
embedding = BertEmbedding('bert-base-chinese', sequence_length=128)
model = BiLSTM_CRF_Model(embedding)

model.fit(x_train=x_train,
          y_train=y_train,
          x_validate=x_validation,
          y_validate=y_validation,
          epochs=5,
          batch_size=32)

model.save('Model')
model.evaluate(x_data=x_validation, y_data=y_validation)