def main():
    """Train a BERT-backed BiLSTM-CRF tagger on ChineseDailyNerCorpus.

    Loads the three corpus splits, prints their sizes, fits the model for a
    single epoch, saves it to ``models/ner.h5``, evaluates it on the test
    split and finally prints what ``predict_classes`` returns for that split.
    """
    load_split = ChineseDailyNerCorpus.load_data
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = (
        load_split("train"),
        load_split("validate"),
        load_split("test"),
    )

    print(f"train data count: {len(train_x)}")
    print(f"validate data count: {len(valid_x)}")
    print(f"test data count: {len(test_x)}")

    embedding = BERTEmbedding(
        "models/chinese_L-12_H-768_A-12",
        task=kashgari.LABELING,
        sequence_length=100,
    )
    tagger = BiLSTM_CRF_Model(embedding)

    fit_options = dict(
        x_validate=valid_x,
        y_validate=valid_y,
        epochs=1,
        batch_size=512,
    )
    tagger.fit(train_x, train_y, **fit_options)

    tagger.save("models/ner.h5")
    tagger.evaluate(test_x, test_y)
    print(tagger.predict_classes(test_x))
Пример #2
0
def train_BERT_BiLSTM_CRF(
        train_test_devide=0.9,
        epoch=20,
        path='/home/peitian_zhang/data/corpus/labeled_train.txt'):
    """Train a BERT + BiLSTM-CRF labeling model on a labeled corpus file.

    Args:
        train_test_devide: Fraction of the samples used for fitting; the
            remainder is used for the "test" evaluation.
        epoch: Number of training epochs (also embedded in the save path).
        path: Corpus file handed to ``getTrain``.

    Returns:
        The fitted ``BiLSTM_CRF_Model``. It is returned even when saving
        fails; a save failure is reported on stdout instead of being raised.
    """
    train_x, train_y = getTrain(path)

    # Compute the split point once; the first ``split`` samples are the
    # training portion, everything after it is held out.
    split = int(len(train_x) * train_test_devide) + 1
    x, y = train_x[:split], train_y[:split]

    bert = BERTEmbedding(
        model_folder='/home/peitian_zhang/data/chinese_L-12_H-768_A-12',
        sequence_length=400,
        task=kashgari.LABELING)
    model = BiLSTM_CRF_Model(bert)

    # NOTE(review): the training portion is also passed as validation data,
    # so validation metrics printed during fit are not held-out metrics.
    model.fit(x, y, x, y, epochs=epoch, batch_size=64)

    # NOTE(review): "train" evaluation runs on the full data set, including
    # the held-out tail.
    print('---------evaluate on train---------\n{}'.format(
        model.evaluate(train_x, train_y)))
    print('---------evaluate on test----------\n{}'.format(
        model.evaluate(train_x[split:], train_y[split:])))

    save_path = '/home/peitian_zhang/models/bert_epoch_{}'.format(epoch)
    try:
        model.save(save_path)
    except Exception as err:
        # Saving stays best-effort, but the failure is no longer hidden.
        print('Failed to save model to {}: {!r}'.format(save_path, err))
    else:
        print('Success in saving!')
    return model
Пример #3
0
def train_ner(x_train, y_train, x_valid, y_valid, x_test, y_test,
              sequence_length, epoch, batch_size, bert_model_path,
              model_save_path):
    """
    Train a BERT-BiLSTM-CRF model to extract the internal features of symptoms.

    The model is fitted on the training data with the validation data passed
    to ``fit``; two ``EvalCallBack`` instances (validation set first, test set
    second) run with ``step=1``. After fitting, the model is saved to
    ``model_save_path``, evaluated on the test data and returned.
    """
    embedding = BERTEmbedding(bert_model_path,
                              task=kashgari.LABELING,
                              sequence_length=sequence_length)
    tagger = BiLSTM_CRF_Model(embedding)

    # One evaluation callback per data set, in the same order as before:
    # validation first, then test.
    eval_callbacks = [
        EvalCallBack(kash_model=tagger, valid_x=data_x, valid_y=data_y, step=1)
        for data_x, data_y in ((x_valid, y_valid), (x_test, y_test))
    ]

    tagger.fit(x_train,
               y_train,
               x_validate=x_valid,
               y_validate=y_valid,
               epochs=epoch,
               batch_size=batch_size,
               callbacks=eval_callbacks)

    tagger.save(model_save_path)
    tagger.evaluate(x_test, y_test)
    return tagger
Пример #4
0
 def build(self):
     """Return a BiLSTM-CRF model on top of a BERT embedding.

     The embedding is configured from ``self.folder``, ``self.fine_tune``
     and ``self.seq_len``.
     """
     bert_embedding = BERTEmbedding(
         model_folder=self.folder,
         task=kashgari.LABELING,
         trainable=self.fine_tune,
         sequence_length=self.seq_len,
     )
     return BiLSTM_CRF_Model(bert_embedding)
Пример #5
0
def train():
    """Train a ``KashModel`` on pickled features found in a model directory.

    The model directory is taken from the command line. Features are read
    from ``<model_dir>/feature.pkl``; checkpoints go to ``<model_dir>/hdf5``,
    TensorBoard logs to ``<model_dir>/log`` and the evaluation callback log to
    ``<model_dir>/acc.txt``. If ``get_model_path`` reports a positive epoch,
    the weights at the returned path are loaded before compiling.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('model_dir', default='model dir')
    model_dir = arg_parser.parse_args().model_dir

    hdf_dir = os.path.join(model_dir, "hdf5")
    os.makedirs(hdf_dir, exist_ok=True)

    bert_model_path = os.path.join(ROOT_DIR, 'BERT-baseline')
    with open(os.path.join(model_dir, "feature.pkl"), 'rb') as feature_file:
        train_data, train_label, test_data, test_label = pickle.load(
            feature_file)
    print("load {}/{} train/dev items ".format(len(train_data),
                                               len(test_data)))

    model = KashModel(
        BERTEmbedding(bert_model_path,
                      task=kashgari.LABELING,
                      sequence_length=50))
    model.build_model(x_train=train_data,
                      y_train=train_label,
                      x_validate=test_data,
                      y_validate=test_label)

    # Resume from the checkpoint reported by get_model_path, if there is one.
    from src.get_model_path import get_model_path
    model_path, init_epoch = get_model_path(hdf_dir)
    if init_epoch > 0:
        print("load epoch from {}".format(model_path))
        model.tf_model.load_weights(model_path)

    model.compile_model(optimizer=RAdam(learning_rate=0.0001))

    checkpoint_template = os.path.join(
        hdf_dir, "crf-{epoch:03d}-{val_accuracy:.3f}.hdf5")
    training_callbacks = [
        ModelCheckpoint(checkpoint_template,
                        monitor='val_accuracy',
                        verbose=1,
                        save_best_only=True,
                        save_weights_only=False,
                        mode='auto',
                        period=1),
        TensorBoard(log_dir=os.path.join(model_dir, "log")),
        EvalCallBack(kash_model=model,
                     valid_x=test_data,
                     valid_y=test_label,
                     step=1,
                     log_path=os.path.join(model_dir, "acc.txt")),
    ]

    model.fit(train_data,
              train_label,
              x_validate=test_data,
              y_validate=test_label,
              epochs=100,
              batch_size=256,
              callbacks=training_callbacks)
Пример #6
0
    def train(self, tokens, tags):
        """Fit a BiLSTM-CRF model on a stacked text + numeric embedding.

        ``tokens`` and ``tags`` are converted by ``prepare_data_fit``; the
        fitted model is stored on ``self.model``.
        """
        features, labels = self.prepare_data_fit(
            tokens, tags, chunk_size=self.chunk_size)

        # Token embedding first, then the numeric ``first_of_p`` feature.
        embedding_parts = [
            BareEmbedding(task=kashgari.LABELING,
                          sequence_length=self.chunk_size),
            NumericFeaturesEmbedding(feature_count=2,
                                     feature_name='first_of_p',
                                     sequence_length=self.chunk_size),
        ]
        stacked = StackedEmbedding(embedding_parts)
        stacked.analyze_corpus(features, labels)

        from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model
        self.model = BiLSTM_CRF_Model(embedding=stacked)
        self.model.fit(features, labels, batch_size=1, epochs=20)
Пример #7
0
  def initial_model(self, bert_model_path, psd_model_path):
    """Build the BERT + BiLSTM-CRF model and load its trained weights.

    Args:
      bert_model_path: Path passed to ``BERTEmbedding``.
      psd_model_path: Weights file; its grandparent directory is used as
        ``self.model_dir`` and must contain ``feature_psd.pkl``.
    """
    print('=============init bert model=========================')
    print("bert model path:", bert_model_path)
    print("crf model path:", psd_model_path)
    self.sess = tf.Session()
    set_session(self.sess)
    self.model_dir = os.path.dirname(os.path.dirname(psd_model_path))
    self.model_path = psd_model_path
    data_path = os.path.join(self.model_dir, "feature_psd.pkl")
    # Close the feature file deterministically instead of leaking the handle.
    # NOTE(review): pickle must only be used on trusted files.
    with open(data_path, 'rb') as feature_file:
      train_data, train_label, test_data, test_label = \
          pickle.load(feature_file)

    bert_embed = BERTEmbedding(bert_model_path, task=kashgari.LABELING,
                               sequence_length=50)
    self.model = BiLSTM_CRF_Model(bert_embed)
    # The model is built from the pickled data before the weights are loaded.
    self.model.build_model(x_train=train_data, y_train=train_label,
                           x_validate=test_data, y_validate=test_label)
    self.model.compile_model()
    self.model.tf_model.load_weights(psd_model_path)
    print('=============bert model loaded=========================')
    return
Пример #8
0
def train_it2(train_path, checkpoint_filepath, model_path, start, span):
    """Train a BiLSTM-CRF model on BIO data with a random train/valid split.

    Each sample draws a number from a fixed-seed generator; samples whose
    draw falls inside ``[start, start + span)`` become validation data, all
    others training data.

    Args:
        train_path: Input handed to ``BIODataGenerator``.
        checkpoint_filepath: Path passed to ``Evaluator``; its parent
            directory is created if missing.
        model_path: Where the fitted model is saved.
        start: Lower bound of the validation window.
        span: Width of the validation window.
    """
    data_generator = BIODataGenerator(train_path, 100000000)
    Xs, ys = next(data_generator.forfit())

    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)  # fixed seed keeps the split reproducible
    k = 0
    for x, y in zip(Xs, ys):
        rnum = rng.rand()
        k += 1
        if rnum < start or rnum >= start + span:
            train_x.append(x)
            train_y.append(y)
        else:
            valid_x.append(x)
            valid_y.append(y)

    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))

    # makedirs handles nested paths and an already-existing directory; a bare
    # file name has no directory component, so there is nothing to create.
    checkpoint_dir = os.path.dirname(checkpoint_filepath)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # ``bert_embed`` is not defined in this function and must be provided by
    # the enclosing module.
    model = BiLSTM_CRF_Model(bert_embed, sequence_length=128)
    eval_callback = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    early_stop = keras.callbacks.EarlyStopping(patience=10)
    reduse_lr_callback = keras.callbacks.ReduceLROnPlateau(factor=0.1,
                                                           patience=5)

    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[early_stop, eval_callback, reduse_lr_callback])
    model.save(model_path)
Пример #9
0
def train_BiLSTM_CRF(train_test_devide=0.9,
                     epoch=100,
                     path='/home/peitian_zhang/data/corpus/labeled_train.txt'):
    """Train a BiLSTM-CRF labeling model with the default embedding.

    Args:
        train_test_devide: Fraction of the samples used for fitting; the
            remainder is used for the "test" evaluation.
        epoch: Number of training epochs (also embedded in the save path).
        path: Corpus file handed to ``getTrain``.

    Returns:
        The fitted ``BiLSTM_CRF_Model``. It is returned even when saving
        fails; a save failure is reported on stdout instead of being raised.
    """
    train_x, train_y = getTrain(path)
    model = BiLSTM_CRF_Model()

    # Compute the split point once; everything after it is held out.
    split = int(len(train_x) * train_test_devide) + 1
    x, y = train_x[:split], train_y[:split]

    # NOTE(review): the training portion is also passed as validation data,
    # so validation metrics printed during fit are not held-out metrics.
    model.fit(x, y, x, y, epochs=epoch, batch_size=64)
    print('---------evaluate on train---------\n{}'.format(
        model.evaluate(train_x, train_y)))
    print('---------evaluate on test----------\n{}'.format(
        model.evaluate(train_x[split:], train_y[split:])))

    # NOTE(review): the file name says "bert" although no BERT embedding is
    # used here; kept unchanged so existing consumers of the path still work.
    save_path = '/home/peitian_zhang/models/bert_epoch_{}'.format(epoch)
    try:
        model.save(save_path)
    except Exception as err:
        # Saving stays best-effort, but the failure is no longer hidden.
        print('Failed to save model to {}: {!r}'.format(save_path, err))
    else:
        print('Success in saving!')
    return model
Пример #10
0
def train_it(train_path, checkpoint_filepath, model_path, start, span):
    """Train a BiLSTM-CRF model on a dataset with a random train/valid split.

    Each sample draws a number from a fixed-seed generator; samples whose
    draw falls inside ``[start, start + span)`` become validation data, all
    others training data.

    Args:
        train_path: Input handed to ``build_dataset``.
        checkpoint_filepath: Path passed to ``Evaluator``; its parent
            directory is created if missing.
        model_path: Where the fitted model is saved.
        start: Lower bound of the validation window.
        span: Width of the validation window.
    """
    dataset = build_dataset(train_path)
    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)  # fixed seed keeps the split reproducible
    k = 0
    for x, y in dataset.as_numpy_iterator():
        # The iterator yields byte strings; decode them to text.
        x = [str(i, 'utf-8') for i in x]
        y = [str(i, 'utf-8') for i in y]
        rnum = rng.rand()
        k += 1
        if rnum < start or rnum >= start + span:
            train_x.append(x)
            train_y.append(y)
        else:
            valid_x.append(x)
            valid_y.append(y)

    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))

    # makedirs handles nested paths and an already-existing directory; a bare
    # file name has no directory component, so there is nothing to create.
    checkpoint_dir = os.path.dirname(checkpoint_filepath)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # ``bert_embed`` is not defined in this function and must be provided by
    # the enclosing module.
    model = BiLSTM_CRF_Model(bert_embed, sequence_length=100)
    evaluator = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[evaluator])
    model.save(model_path)
Пример #11
0
    # One sample: five time steps, each a 3-value feature vector.
    text = [[0.9, 0.1, 0.1], [0.9, 0.1, 0.1], [0.1, 0.8, 0.1], [0.1, 0.8, 0.1],
            [0.1, 0.8, 0.1]]
    # One tag per time step of the sample above.
    label = [
        'B-Category', 'I-Category', 'B-ProjectName', 'I-ProjectName',
        'I-ProjectName'
    ]

    # Repeat the single sample 100 times to form a toy corpus
    # (every entry refers to the same list object).
    text_list = [text] * 100
    label_list = [label] * 100

    SEQUENCE_LEN = 80

    # You can use WordEmbedding or BERTEmbedding for your text embedding
    # embedding_size matches the 3 values per time step in ``text``.
    bare_embedding = DirectEmbedding(task=kashgari.RAW_LABELING,
                                     sequence_length=SEQUENCE_LEN,
                                     embedding_size=3)
    #bare_embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=SEQUENCE_LEN)

    x = (text_list)
    y = label_list
    bare_embedding.analyze_corpus(x, y)

    # Now we can embed with this embedding layer
    # We can build any labeling model with this embedding

    # NOTE(review): BiLSTM_Model is imported but not used in the visible code.
    from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model
    model = BiLSTM_CRF_Model(embedding=bare_embedding)
    model.fit(x, y, batch_size=1, epochs=3)

    # Predictions are made on the same data the model was fitted on.
    print(model.predict(x))
    #print(model.predict_entities(x))
Пример #12
0
    # Materialize the test split as lists.
    test_x = list(test_x)
    test_y = list(test_y)
    ''' BERT Embedding '''
    #embedding = BERTEmbedding('./chinese_L-12_H-768_A-12',
    #                             task = kashgari.LABELING,
    #                             sequence_length = 150)
    ''' Word2Vec Embeddings '''
    # w2v_kwargs are supplied alongside the vector file path;
    # presumably forwarded to the word2vec loader — TODO confirm.
    word2vec_embedding = kashgari.embeddings.WordEmbedding(
        w2v_path="word2vec.model",
        task=kashgari.LABELING,
        w2v_kwargs={
            'binary': True,
            'unicode_errors': 'ignore'
        },
        sequence_length='auto')
    model = BiLSTM_CRF_Model(word2vec_embedding)
    #model = BiLSTM_CRF_Model(embedding)
    # NOTE(review): the log directory uses a Windows-style relative path.
    tf_board_callback = keras.callbacks_v1.TensorBoard(log_dir='.\\logs',
                                                       update_freq=1000)
    eval_callback = EvalCallBack(kash_model=model,
                                 valid_x=test_x,
                                 valid_y=test_y,
                                 step=4)

    # NOTE(review): the test split is passed in the validation positions
    # and is also the evaluation callback's data.
    model.fit(train_x,
              train_y,
              test_x,
              test_y,
              batch_size=20,
              epochs=4,
              callbacks=[eval_callback, tf_board_callback])
Пример #13
0
class BertPolyPhone:
  """Main class for pinyin prediction of polyphonic characters.

  A BERT + BiLSTM-CRF model proposes a reading per character; the result is
  then checked against a polyphone dictionary and post-processed with tone
  rules (see ``modify_result``).
  """

  def __init__(self):
    """Load the polyphone dictionary; the model is loaded separately."""
    super().__init__()
    self.poly_dict = dict()
    poly_dict_path = "/data1/liufeng/synthesis/frontend/data/simple_poly_dict"
    for line in read_lines(poly_dict_path):
      # After removing blanks and "*", a line is "<key>:<v1>,<v2>,...".
      fields = line.replace(" ", "").replace("*", "").split(":")
      self.poly_dict[fields[0]] = fields[1].split(",")
    self.model, self.model_dir = None, None
    self.sess = None

  def inialize_model(self, bert_model_path, poly_model_path):
    """Build the model and load trained weights from ``poly_model_path``.

    The grandparent directory of ``poly_model_path`` becomes
    ``self.model_dir`` and must contain ``feature.pkl``; the pickled training
    data is used to build the model before the weights are loaded.
    """
    print('=============init phone model=========================')
    print("bert model path:", bert_model_path)
    print("crf model path:", poly_model_path)
    # The training data path is needed to build the dictionaries.
    self.sess = tf.Session()
    set_session(self.sess)
    self.model_dir = os.path.dirname(os.path.dirname(poly_model_path))
    data_path = os.path.join(self.model_dir, "feature.pkl")

    # Close the feature file deterministically instead of leaking the handle.
    # NOTE(review): pickle must only be used on trusted files.
    with open(data_path, 'rb') as feature_file:
      train_data, train_label, test_data, test_label = \
          pickle.load(feature_file)

    bert_embed = BERTEmbedding(bert_model_path, task=kashgari.LABELING,
                               sequence_length=50)
    self.model = BiLSTM_CRF_Model(bert_embed)

    self.model.build_model(x_train=train_data, y_train=train_label,
                           x_validate=test_data, y_validate=test_label)
    self.model.compile_model()
    self.model.tf_model.load_weights(poly_model_path)
    print('=============successful loaded=========================')

  def _lookup_dict(self, bert_result, pred_ph_pairs):
    """Correct the pinyin by looking it up in the polyphone dictionary.

    For a character in the dictionary, the model's reading is used only when
    the dictionary lists it; otherwise the reading from ``pred_ph_pairs`` is
    kept. Accepted entries of ``bert_result`` are replaced in place by their
    ``split_phone_format`` form.

    Returns:
      A list of ``(char, phone)`` tuples.
    """
    # todo: if the word is in the lexicon, do not use the bert result.
    bert_phone_result = []
    for index_c, (char, ph, _) in enumerate(pred_ph_pairs):
      # A prediction that is not among the dictionary readings is treated as
      # the model having drifted, so the original reading wins.
      if (char not in self.poly_dict
          or bert_result[index_c] not in self.poly_dict[char]):
        bert_phone_result.append((char, ph))
        continue
      bert_result[index_c] = split_phone_format(bert_result[index_c])
      bert_phone_result.append((char, bert_result[index_c]))
      if ph != bert_result[index_c]:
        print("using bert result {}:{} instead of {}".format(
          char, bert_result[index_c], ph))
    return bert_phone_result

  def predict(self, sentence_list):
    """Run the model on sentences, each split into single characters.

    Every sentence must be shorter than 50 characters.
    """
    bert_input = []
    for sent in sentence_list:
      assert len(sent) < 50
      bert_input.append(list(sent))
    print("bert-input:", bert_input)
    prosody = self.model.predict(bert_input)
    return prosody

  def save_pb(self):
    """Write the vocabulary files and export the model as a ``.pb`` graph."""
    pb_dir = os.path.join(self.model_dir, "pb")
    # The dictionary files live inside pb/, so the directory must exist
    # before they are written.
    os.makedirs(pb_dir, exist_ok=True)
    self._write_dict()
    h5_to_pb(self.model.tf_model, pb_dir, self.sess, "model_phone.pb",
             ["output_phone"])
    return

  def _write_dict(self):
    """Dump the label and token vocabularies into ``<model_dir>/pb``."""
    # Make the method usable on its own by ensuring the target directory.
    os.makedirs(os.path.join(self.model_dir, "pb"), exist_ok=True)

    label_path = os.path.join(self.model_dir, "pb/phone_idx2label.txt")
    with open(label_path, "w", encoding="utf-8") as fw:
      for key, value in self.model.embedding.label2idx.items():
        # Written as "<index> <label>".
        fw.write("{} {}\n".format(value, key))
    print("write {}".format(label_path))

    token_path = os.path.join(self.model_dir, "pb/phone_token2idx.txt")
    with open(token_path, "w", encoding="utf-8") as fw:
      for key, value in self.model.embedding.token2idx.items():
        # Empty tokens are skipped.
        if len(key) > 0:
          fw.write("{} {}\n".format(key, value))
    print("write {}".format(token_path))
    return

  def compute_embed(self, sentence_list):
    """Return the embedding input tensor for ``sentence_list``.

    The raw model output is also computed and printed for debugging.
    """
    import numpy as np
    bert_input = [list(sent) for sent in sentence_list]
    print("bert-input:", bert_input)
    tensor = self.model.embedding.process_x_dataset(bert_input)
    print("debug:", np.shape(tensor), tensor)
    res = self.model.tf_model.predict(tensor)
    # Show one position more than the first sentence has characters. The
    # former ``len(sentence_list[0]+1)`` added an int to a str and raised
    # TypeError.
    print("debug:", np.shape(res), res[0][0: len(sentence_list[0]) + 1])
    return tensor

  @staticmethod
  def _merge_eng_char(bert_phone_result, dict_phone_pairs):
    """Keep one model result per entry of ``dict_phone_pairs``.

    A multi-character entry that is not all Chinese consumes as many model
    results as it has characters; only the first of them is kept.
    """
    from src.utils import check_all_chinese
    index = 0
    new_bert_phone = []
    for word, _, _ in dict_phone_pairs:
      new_bert_phone.append(bert_phone_result[index])
      if (not check_all_chinese(word)) and len(word) > 1:
        index += len(word)
      else:
        index += 1
    return new_bert_phone

  def modify_result(self, bert_result, dict_phone_pairs):
    """Post-process the model output into the final phone sequence.

    Steps: align results with ``dict_phone_pairs``, apply the dictionary
    correction, then ``sandhi`` and ``change_qingyin``.
    """
    bert_result = self._merge_eng_char(bert_result, dict_phone_pairs)
    phone_pairs = self._lookup_dict(bert_result, dict_phone_pairs)
    # phone_pairs = change_yi(phone_pairs)
    # phone_pairs = change_bu(phone_pairs)
    phone_pairs = sandhi(phone_pairs)
    bert_result = [ph for _, ph in phone_pairs]
    chars = "".join([c for c, _ in phone_pairs])
    bert_result = change_qingyin(bert_result, chars)
    return bert_result
Пример #14
0
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model
# Load the train / dev / test splits.
# NOTE(review): ChineseDailyNerCorpus.load_data is normally called with a
# subset name (other examples use 'train', 'validate', 'test'); confirm that
# file paths select the intended data.
train_x, train_y = ChineseDailyNerCorpus.load_data('./data/train.txt')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('./data/dev.txt')
test_x, test_y  = ChineseDailyNerCorpus.load_data('./data/test.txt')

# BERT embedding configured for sequence labeling.
bert_embed = BERTEmbedding('./chinese_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

# You can also choose `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)

# Persist the trained model; the test split loaded above is not used here.
model.save('saved_ner_model')
# -*- coding: utf-8 -*-
# time: 2019-09-12
# place: Huangcun Beijing

import kashgari
from kashgari import utils
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Model training

# Read the CoNLL-format train / dev / test files.
train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

# BERT embedding configured for sequence labeling.
bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)

model = BiLSTM_CRF_Model(bert_embedding)

# Fit for one epoch with the dev split as validation data.
model.fit(train_x, train_y, valid_x, valid_y, batch_size=16, epochs=1)

# Save model
# Exported through utils.convert_to_saved_model with version=1.
utils.convert_to_saved_model(model,
                             model_path='saved_model/time_entity',
                             version=1)
Пример #16
0
class BertProsody:
  """Prosody predictor based on BERT + BiLSTM-CRF.

  Currently only length 50 is supported: 49 input characters plus a
  terminator.
  """

  def __init__(self):
    """Create an empty predictor; call ``initial_model`` to load a model."""
    self.model, self.model_dir, self.model_path = None, None, None
    self.sess = None
    return

  def initial_model(self, bert_model_path, psd_model_path):
    """Build the model and load trained weights from ``psd_model_path``.

    The grandparent directory of ``psd_model_path`` becomes
    ``self.model_dir`` and must contain ``feature_psd.pkl``; the pickled
    training data is used to build the model before the weights are loaded.
    """
    print('=============init bert model=========================')
    print("bert model path:", bert_model_path)
    print("crf model path:", psd_model_path)
    self.sess = tf.Session()
    set_session(self.sess)
    self.model_dir = os.path.dirname(os.path.dirname(psd_model_path))
    self.model_path = psd_model_path
    data_path = os.path.join(self.model_dir, "feature_psd.pkl")
    # Close the feature file deterministically instead of leaking the handle.
    # NOTE(review): pickle must only be used on trusted files.
    with open(data_path, 'rb') as feature_file:
      train_data, train_label, test_data, test_label = \
          pickle.load(feature_file)

    bert_embed = BERTEmbedding(bert_model_path, task=kashgari.LABELING,
                               sequence_length=50)
    self.model = BiLSTM_CRF_Model(bert_embed)
    self.model.build_model(x_train=train_data, y_train=train_label,
                           x_validate=test_data, y_validate=test_label)
    self.model.compile_model()
    self.model.tf_model.load_weights(psd_model_path)
    print('=============bert model loaded=========================')
    return

  def _write_dict(self):
    """Dump the label and token vocabularies into ``self.model_dir``."""
    label_path = os.path.join(self.model_dir, "idx2label.txt")
    with open(label_path, "w", encoding="utf-8") as fw:
      for key, value in self.model.embedding.label2idx.items():
        # Written as "<index> <label>".
        fw.write("{} {}\n".format(value, key))

    token_path = os.path.join(self.model_dir, "token2idx.txt")
    with open(token_path, "w", encoding="utf-8") as fw:
      for key, value in self.model.embedding.token2idx.items():
        # Empty tokens are skipped.
        if len(key) > 0:
          fw.write("{} {}\n".format(key, value))

  def predict(self, sentence_list):
    """Predict prosody for sentences, each split into single characters.

    Every sentence must be shorter than 50 characters.
    """
    bert_input = []
    for sent in sentence_list:
      assert len(sent) < 50
      bert_input.append(list(sent))
    print("bert-input:", bert_input)
    prosody = self.model.predict(bert_input)
    return prosody

  def compute_embed(self, sentence_list):
    """Return the embedding input tensor for ``sentence_list``.

    The raw model output is also computed and printed for debugging.
    """
    import numpy as np
    bert_input = [list(sent) for sent in sentence_list]
    print("bert-input:", bert_input)
    tensor = self.model.embedding.process_x_dataset(bert_input)
    res = self.model.tf_model.predict(tensor)
    print("debug:", np.shape(res), res[0])
    return tensor

  def save_pb(self):
    """Write the vocabulary files and export the model as a ``.pb`` graph."""
    self._write_dict()
    pb_dir = os.path.join(self.model_dir, "pb")
    os.makedirs(pb_dir, exist_ok=True)
    # [print(n.name) for n in tf.get_default_graph().as_graph_def().node]
    h5_to_pb(self.model.tf_model, pb_dir, self.sess, "model_psd.pb",
             ["output_psd"])
    return

  @staticmethod
  def change_by_rules(old_pairs):
    """Apply the mandatory prosody rules to ``(char, ph, psd)`` triples.

    Rules:
    1. The item before a comma gets "3"; the item before a sentence-ending
       mark gets "4".
    2. Everywhere else, "3" is lowered to "2".

    The last triple is copied unchanged. An empty input yields an empty list.
    """
    if not old_pairs:
      # Nothing to rewrite; previously this raised IndexError.
      return []
    new_pairs = []
    for i, (char, ph, psd) in enumerate(old_pairs[0:-1]):
      next_char, _, _ = old_pairs[i+1]
      if next_char == ",":
        new_pairs.append((char, ph, "3"))
      elif next_char in ["。", "?", "!"]:
        new_pairs.append((char, ph, "4"))
      elif psd == "3":
        new_pairs.append((char, ph, "2"))
      else:
        new_pairs.append((char, ph, psd))
    new_pairs.append(old_pairs[-1])
    return new_pairs
Пример #17
0
    title_cut_all = pickle.load(ipt)
    tag_all = pickle.load(ipt)

from sklearn.model_selection import train_test_split
# First split: hold out 20% of all samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(title_cut_all,
                                                    tag_all,
                                                    test_size=0.2,
                                                    random_state=43)
# Second split: hold out 20% of the remaining training data for validation.
x_train, x_valid, y_train, y_valid = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=43)

import kashgari
from kashgari.embeddings import BERTEmbedding

# BERT embedding configured for sequence labeling.
bert_embed = BERTEmbedding(
    '/root/meicloud/majk1/NLP/BERT/chinese_L-12_H-768_A-12',
    task=kashgari.LABELING,
    sequence_length=100)

from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(bert_embed)
# The test split is not used in the visible code.
model.fit(x_train,
          y_train,
          x_validate=x_valid,
          y_validate=y_valid,
          epochs=10,
          batch_size=512)
Пример #18
0
# BERT embedding configured for sequence labeling.
bert_embed = BERTEmbedding('drive/My Drive/rbt3',
                           task=kashgari.LABELING,
                           sequence_length=128)


from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.python import keras
from kashgari.callbacks import EvalCallBack

# patience is counted in epochs (the original note mentioned patience=3; the value used here is 5)
stop_callback = EarlyStopping(patience=5, restore_best_weights=True)
# save_callback = ModelCheckpoint("530test1.h5",save_best_only=True,save_weights_only=True)
# Multiply the learning rate by 0.1 after 3 epochs without val_loss improvement, down to 1e-6.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_lr=1e-6)
# tf_board_callback = keras.callbacks.TensorBoard(log_dir='./logs', update_freq=1000)
model = BiLSTM_CRF_Model(bert_embed)
# Evaluation callback on the validation data, configured with step=3.
eval_callback = EvalCallBack(kash_model=model,
                             valid_x=valid_x,
                             valid_y=valid_y,
                             step=3)


# optimizer = RAdam()
# model.compile_model(optimizer=optimizer)


model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          callbacks=[stop_callback,reduce_lr,eval_callback],
Пример #19
0
import pickle
import kashgari
from kashgari.embeddings import BertEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
import tensorflow as tf

# Load the pre-split data set.
# NOTE(review): pickle must only be used on trusted files.
with open('data.pickle', 'rb') as f:
    data_dic = pickle.load(f)

# The pickle is indexed positionally:
# 0 = train inputs, 1 = validation inputs, 2 = train labels, 3 = validation labels.
x_train = data_dic[0]
x_validation = data_dic[1]
y_train = data_dic[2]
y_validation = data_dic[3]

embedding = BertEmbedding('bert-base-chinese',
                            sequence_length = 128)
model = BiLSTM_CRF_Model(embedding)

model.fit(  x_train = x_train,
            x_validate = x_validation,
            y_train = y_train,
            y_validate = y_validation,
            epochs=5,
            batch_size=32,
            )
model.save('Model')
# Evaluation reuses the validation data; no separate test split is loaded here.
model.evaluate(x_data=x_validation,y_data=y_validation)
Пример #20
0
# Below we use a Bi_LSTM model to implement a named entity recognition task:

from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_Model, BiLSTM_CRF_Model

# Load the built-in data set; it can be replaced with your own data set as long as the format is the same
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

# No embedding is passed, so the model falls back to its own default.
model = BiLSTM_CRF_Model()
model.fit(train_x, train_y, valid_x, valid_y, epochs=1)

model.save("BiLSTM_CRF_Model")
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.corpus import ChineseDailyNerCorpus

# Load the built-in train / validation / test splits.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
# You can also choose `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`
# The embedding task must match the model: BiLSTM_CRF_Model is a sequence
# labeling model, so the embedding uses the labeling task (the string value
# of ``kashgari.LABELING``) rather than "classification".
bert = BERTEmbedding('wwm', task="labeling", sequence_length=300)

model = BiLSTM_CRF_Model(bert)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
Пример #22
0
import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari import utils

# Turn off the cuDNN cell option in the Kashgari configuration.
kashgari.config.use_cudnn_cell = False

# Read the CoNLL-format train / dev / test files.
train_x, train_y = DataReader().read_conll_format_file('data/data_all/example.train')
valid_x, valid_y = DataReader().read_conll_format_file('data/data_all/example.dev')
test_x, test_y = DataReader().read_conll_format_file('data/data_all/example.test')

# Shuffle each split with its inputs and labels kept aligned.
train_x, train_y = utils.unison_shuffled_copies(train_x, train_y)
valid_x, valid_y = utils.unison_shuffled_copies(valid_x, valid_y)
test_x, test_y = utils.unison_shuffled_copies(test_x, test_y)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
# Also prints the first test sample and its labels.
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

# BERT embedding configured for sequence labeling.
bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=100)

model = BiLSTM_CRF_Model(bert_embedding)
model.fit(train_x, train_y, valid_x, valid_y, batch_size=512, epochs=20)

model.save('models/all_ner.h5')

model.evaluate(test_x, test_y)
Пример #23
0
# Embedding loaded from the 'electra' folder, configured for sequence labeling.
bert_embed = BERTEmbedding('electra',
                           task=kashgari.LABELING,
                           sequence_length=128)


from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.python import keras
from kashgari.callbacks import EvalCallBack
# patience=3 is counted in epochs
stop_callback = EarlyStopping(patience=3, restore_best_weights=True)
# NOTE(review): the checkpoint path "5_29_1" is the same as the directory
# passed to model.save() below — confirm the two do not clash.
save_callback = ModelCheckpoint("5_29_1", save_best_only=True)



model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          callbacks=[stop_callback, save_callback],
          batch_size=250,
          epochs=25)


# Evaluate the model; this method prints a detailed evaluation report
model.evaluate(test_x, test_y)

# Save the model into the `model_name` directory
model.save('5_29_1')
Пример #24
0
            else:
                x.append(rows[0])
                y.append(rows[1])
    return data_x, data_y


# Load the three splits with get_sequenct_tagging_data.
train_x, train_y = get_sequenct_tagging_data(train_path)
dev_x, dev_y = get_sequenct_tagging_data(dev_path)
test_x, test_y = get_sequenct_tagging_data(test_path)

print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(dev_x)}")
print(f"test data count: {len(test_x)}")

# BERT embedding configured for sequence labeling.
bert_embed = BERTEmbedding(bert_path,
                           task=kashgari.LABELING,
                           sequence_length=100)

# Build the model and train it
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=dev_x,
          y_validate=dev_y,
          epochs=20,
          batch_size=512)

model.save(model_path)

# Evaluate the model
model.evaluate(test_x, test_y)
Пример #25
0
class Kashgari:
    """Train and apply a kashgari BiLSTM-CRF tagger over fixed-size chunks.

    Every token is passed through ``agregado(token, simple_features=True)``
    and turned into two parallel feature streams: the token text
    (``features['token']``) and a ``first_of_p`` flag encoded as the string
    ``'2'`` when truthy and ``'1'`` otherwise. The streams are cut into chunks
    of at most ``chunk_size`` items, which is the same value handed to the
    embeddings as ``sequence_length`` in :meth:`train`.
    """

    def __init__(self):
        # Trained model; stays None until train() is called.
        self.model = None
        # Maximum number of tokens per chunk, also used as sequence_length.
        self.chunk_size = 100
        self.set_features_numeric = dict()
        self.set_features_text = dict()

    def prepare_data_fit(self, tokens, tags, chunk_size, overlap=10):
        """Build chunked training features and their aligned tag chunks.

        Args:
            tokens: Iterable of documents, each an iterable of tokens accepted
                by ``agregado``.
            tags: Iterable of per-document tag sequences, paired with
                ``tokens`` through ``zip`` (extra items on either side are
                ignored).
            chunk_size: Maximum number of tokens per emitted chunk.
            overlap: Accepted for backward compatibility; currently unused.

        Returns:
            ``((text_chunks, first_of_p_chunks), tag_chunks)`` where the three
            lists are aligned chunk by chunk. All three are empty when there
            are no tokens.
        """
        text_list = []
        first_of_p_list = []
        tag_list = []

        # The buffers are deliberately not reset between documents, so a
        # chunk may span a document boundary.
        buffer_text = []
        buffer_first_of_p = []
        buffer_tag = []

        self.set_features_numeric = dict()

        for doc, doc_tags in zip(tokens, tags):
            for token, tag in zip(doc, doc_tags):
                features = agregado(token, simple_features=True)
                buffer_text.append(features['token'])
                buffer_first_of_p.append(
                    '2' if features['first_of_p'] else '1')
                buffer_tag.append(tag)

                # Flush at exactly chunk_size items. The previous `>` test
                # produced chunks of chunk_size + 1, one more than the
                # sequence_length configured in train() and one more than
                # prepare_data_predict emits.
                if len(buffer_text) >= chunk_size:
                    text_list.append(buffer_text)
                    first_of_p_list.append(buffer_first_of_p)
                    tag_list.append(buffer_tag)
                    # Start fresh buffers for the next chunk.
                    buffer_text = []
                    buffer_first_of_p = []
                    buffer_tag = []

            print("Processed doc")

        # Emit the trailing partial chunk only when it holds data; the old
        # `>= 0` test was always true and appended an empty chunk.
        if buffer_text:
            text_list.append(buffer_text)
            first_of_p_list.append(buffer_first_of_p)
            tag_list.append(buffer_tag)

        results = (text_list, first_of_p_list)
        return results, tag_list

    def prepare_data_predict(self, tokens, chunk_size):
        """Build chunked prediction features for a single document.

        Args:
            tokens: Iterable of tokens accepted by ``agregado``.
            chunk_size: Maximum number of tokens per emitted chunk.

        Returns:
            ``(text_chunks, first_of_p_chunks)``, two lists aligned chunk by
            chunk; both are empty when ``tokens`` is empty.
        """
        text_list = []
        first_of_p_list = []

        buffer_text = []
        buffer_first_of_p = []

        for token in tokens:
            features = agregado(token, simple_features=True)
            buffer_text.append(features['token'])
            buffer_first_of_p.append('2' if features['first_of_p'] else '1')

            if len(buffer_text) >= chunk_size:
                text_list.append(buffer_text)
                first_of_p_list.append(buffer_first_of_p)
                # Start fresh buffers for the next chunk.
                buffer_text = []
                buffer_first_of_p = []

        # Keep the trailing partial chunk, if any.
        if buffer_text:
            text_list.append(buffer_text)
            first_of_p_list.append(buffer_first_of_p)

        return (text_list, first_of_p_list)

    def train(self, tokens, tags, batch_size=1, epochs=20):
        """Fit a BiLSTM-CRF model on the chunked token and tag sequences.

        Args:
            tokens: Documents as accepted by :meth:`prepare_data_fit`.
            tags: Per-document tag sequences aligned with ``tokens``.
            batch_size: Batch size forwarded to ``fit`` (default 1).
            epochs: Number of epochs forwarded to ``fit`` (default 20).

        The fitted model is stored on ``self.model``.
        """
        x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=self.chunk_size)
        first_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='first_of_p',
            sequence_length=self.chunk_size)

        stack_embedding = StackedEmbedding(
            [text_embedding, first_of_p_embedding])

        stack_embedding.analyze_corpus(x, y)

        from kashgari.tasks.labeling import BiLSTM_CRF_Model
        self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
        self.model.fit(x, y, batch_size=batch_size, epochs=epochs)

    def predict(self, tokens):
        """Predict one flat tag sequence per document.

        Args:
            tokens: Iterable of documents; each document must support
                ``len()`` and iteration over its tokens.

        Returns:
            A list with one entry per document: the chunk predictions joined
            in order and truncated to ``len(doc)`` items.
        """
        import itertools
        results = []
        for doc in tokens:
            x = self.prepare_data_predict(doc, chunk_size=self.chunk_size)

            predicted = self.model.predict(x)
            # Flatten the per-chunk outputs back into one sequence per doc.
            x_list = list(itertools.chain.from_iterable(x[0]))
            predicted_unified = list(itertools.chain.from_iterable(predicted))
            predicted_truncated = predicted_unified[:len(doc)]

            print(
                f"len doc{len(doc)} | x_list{len(x_list)} |len predicted_unified{len(predicted_unified)} |len predicted_truncated{len(predicted_truncated)} |"
            )
            results.append(predicted_truncated)

        return results
# Train a BiLSTM-CRF labeling model on the ChineseDailyNerCorpus splits using
# a BERT embedding, then save and evaluate it.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')

# Report split sizes; the last print also shows the first test sample and its
# labels.
print(f"train data count: {len(train_x)}")
print(f"validate data count: {len(valid_x)}")
print(f"test data count: {len(test_x)}", test_x[0], test_y[0])

import kashgari
from kashgari.embeddings import BERTEmbedding

# Embedding loaded from the 'chinese_wwm_ext_L-12_H-768_A-12' path for the
# labeling task, sequence_length=100.
bert_embed = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

from kashgari.tasks.labeling import BiLSTM_CRF_Model

# You can also choose `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`

model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)

# Persist the trained model before evaluating it.
model.save('models/org_loc_per_ner.h5')

# Evaluate on the test split.
model.evaluate(test_x, test_y)
Пример #27
0
# Read parallel token/label lines, split them 50/50 into train and test, then
# train, save and evaluate a BiLSTM-CRF model on a frozen BERT embedding.
# datafile and labelfile are expected to be defined earlier in the script.
words, labels = [], []

# Number of line pairs read; not referenced again in the visible part of the
# script.
count = 0
for data, label in zip(datafile, labelfile):
    count += 1
    # Each line is stripped and split on single spaces into items.
    s1 = data.strip().split(' ')
    s2 = label.strip().split(' ')

    words.append(s1)
    labels.append(s2)

# Fixed random_state makes the 50/50 split reproducible.
train_x, test_x, train_y, test_y = train_test_split(words, labels, test_size=0.5, random_state=50)


# Embedding loaded from the 'uncased_L-12_H-768_A-12' path with
# trainable=False, for the labeling task, sequence_length=20.
bert_embed = BERTEmbedding('uncased_L-12_H-768_A-12',
                           trainable=False,
                           task=kashgari.LABELING,
                           sequence_length=20,
                           )
model = BiLSTM_CRF_Model(bert_embed)
# NOTE(review): test_x/test_y are used as the validation data here and again
# for the final evaluate() call below, so no separate held-out split is used.
model.fit(train_x,
          train_y,
          x_validate=test_x,
          y_validate=test_y,
          epochs=35,
          batch_size=256)

# Persist the trained model before evaluating it.
model.save('model_bilstm_crf_35_256_64')

# Evaluate on the test split with batch_size=64 and debug_info enabled.
model.evaluate(x_data=test_x,y_data=test_y,batch_size=64,debug_info=True)
Пример #28
0
    # ou o GloVE-300 do http://nilc.icmc.usp.br/embeddings se não der certo

    # 2 - Ver como fazer o Predict. Temos que processar a frase para ficar igual a deles.
    # Eles usam um PunktSentenceTokenizer com um abbrev_list. Esses scripts estao na pasta leNer-dataset.

    # 3 - Ver como integrar esse codigo com o webstruct atual
    # 4 - Seria uma boa ideia ter uma interface tipo o Broka. Para que existesse a lista de arquivos, e que
    # pudesse abrir para re-treinar, abrindo com o plugin de Ramon.
    # Uma ideia seria ate converter o dataset deles atual para o formato do broka hoje em Html ( pode ser algo simples, como colocar cada paragrafo como um p)

    # 5 - Fazer a persistencia ( O kashgari tem um metodo save/load)


    # 2 - Aumentar epochs para treinar

    # You can use WordEmbedding or BERTEmbedding for your text embedding
    text_embedding = BareEmbedding(task=kashgari.LABELING)

    text_embedding.analyze_corpus(tokens, labels)

    # Now we can embed with this stacked embedding layer
    # We can build any labeling model with this embedding

    from kashgari.tasks.labeling import BiLSTM_CRF_Model

    model = BiLSTM_CRF_Model(embedding=text_embedding)
    model.fit(tokens, labels, batch_size=8, epochs=10)

    print(model.predict(tokens))
    # print(model.predict_entities(x))
# -*- coding: utf-8 -*-
# time: 2019-08-09 16:47
# place: Zhichunlu Beijing

import kashgari
from kashgari.corpus import DataReader
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Train a BiLSTM-CRF labeling model on the time.* data files using a BERT
# embedding, then save and evaluate it.

# Each split is read with its own DataReader via read_conll_format_file.
train_x, train_y = DataReader().read_conll_format_file('./data/time.train')
valid_x, valid_y = DataReader().read_conll_format_file('./data/time.dev')
test_x, test_y = DataReader().read_conll_format_file('./data/time.test')

# Embedding loaded from the 'chinese_wwm_ext_L-12_H-768_A-12' path for the
# labeling task, sequence_length=128.
bert_embedding = BERTEmbedding('chinese_wwm_ext_L-12_H-768_A-12',
                               task=kashgari.LABELING,
                               sequence_length=128)

model = BiLSTM_CRF_Model(bert_embedding)
# valid_x / valid_y are passed positionally as the third and fourth arguments.
model.fit(train_x, train_y, valid_x, valid_y, batch_size=16, epochs=10)

# Persist the trained model before evaluating it.
model.save('time_ner.h5')

# Evaluate on the test split.
model.evaluate(test_x, test_y)
Пример #30
0
# Load the pickled test split, fold it into the training data, and train a
# BiLSTM-CRF labeling model on a BERT embedding with TensorBoard, checkpoint
# and learning-rate callbacks. x_train/y_train and x_valid/y_valid are
# expected to be defined earlier in the script.

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load data_test.pkl if it comes from a trusted source.
with open("data_test.pkl", "rb") as f:
    x_test, y_test = pickle.load(f)
# Convert every sample, and each outer collection, to a plain list.
x_train, y_train = list(map(list, x_train)), list(map(list, y_train))
x_valid, y_valid = list(map(list, x_valid)), list(map(list, y_valid))
x_test, y_test = list(map(list, x_test)), list(map(list, y_test))
# Skip testing for now
# The test samples are appended to the training data.
x_train, y_train = x_train + x_test, y_train + y_test

# Output locations: the model directory, a logs sub-directory, and the
# per-epoch weights file inside the logs directory.
model_dir = 'bert_tagger'
log_dir = os.path.join(model_dir, 'logs')
weights_path = os.path.join(log_dir, 'weights.h5')
BERT_PATH = '/mnt/DATA/data/embeddings/uncased_L-12_H-768_A-12'
# Not referenced in the visible part of this script.
EARLY_STOP = 10

bert_embed = BERTEmbedding(BERT_PATH, task=kashgari.LABELING)
model = BiLSTM_CRF_Model(bert_embed)
# x_valid / y_valid are passed positionally as the third and fourth arguments.
model.fit(x_train,
          y_train,
          x_valid,
          y_valid,
          epochs=10,
          batch_size=64,
          callbacks=[
              TensorBoard(log_dir=log_dir, write_graph=False),
              ModelCheckpoint(weights_path, save_weights_only=True),
              ReduceLROnPlateau()
          ])
print('Saving the model...')
model.save(model_dir)

from kashgari.utils import load_model