Пример #1
0
 def __init__(self,
              model_path,
              type_filter=False,
              save_label=False,
              batch=32,
              save_expand_subject=True):
     """Load the tagging model and remember the run-time options.

     Args:
         model_path: Location handed to ``BLSTMCRFModel.load_model``.
         type_filter: Stored unchanged on ``self.type_filter``.
         save_label: Stored unchanged on ``self.save_label``.
         batch: Stored unchanged on ``self.batch``.
         save_expand_subject: Stored unchanged on
             ``self.save_expand_subject``.
     """
     # mention -> entity_json_line
     # ``subject_id_dict`` is not a parameter; it is resolved from the
     # enclosing scope at call time.
     self.subject_id_dict = subject_id_dict

     loaded_model = BLSTMCRFModel.load_model(model_path)
     self._model = loaded_model

     # Plain option flags, kept exactly as passed in.
     self.type_filter = type_filter
     self.batch = batch
     self.save_label = save_label
     self.save_expand_subject = save_expand_subject
Пример #2
0
 def train_model(self):
     """Fit a new ``BLSTMCRFModel`` on this object's data and save it.

     The data comes from ``self.data_load(validation_split=0.2)``, which is
     expected to return a 4-tuple
     ``(x_train, y_train, x_valid, y_valid)``. The model is built on
     ``self.embedding``, trained for ``self.EPOCHS`` epochs with batches of
     ``self.BATCH_SIZE`` and written to ``self.model_path``.
     """
     splits = self.data_load(validation_split=0.2)
     x_train, y_train, x_valid, y_valid = splits

     tagger = BLSTMCRFModel(self.embedding)
     tagger.fit(
         x_train,
         y_train,
         x_validate=x_valid,
         y_validate=y_valid,
         epochs=self.EPOCHS,
         batch_size=self.BATCH_SIZE,
     )
     tagger.save(self.model_path)
def main():
    """Evaluate a saved BLSTM-CRF model on the CoNLL-2003 test split.

    The experiment folder name is read from ``model_fold`` in the
    ``EVALUATION`` section of ``./config.ini``. The model is loaded from
    ``experiments/<model_fold>/model`` and the evaluation report is written
    to ``report_evaluate.log`` in the same experiment folder.
    """
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    evaluation_section = cp["EVALUATION"]
    model_fold = evaluation_section.get("model_fold")
    output_dir = os.path.join('experiments', model_fold)

    test_x, test_y = CoNLL2003Corpus.get_sequence_tagging_data('test')

    model = BLSTMCRFModel.load_model(os.path.join(output_dir, 'model'))
    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)

    report_path = os.path.join(output_dir, 'report_evaluate.log')
    with open(report_path, 'w') as f:
        f.write(f"The evaluate report is :\n {str(report_evaluate)}\n")
Пример #4
0
def main():
    """Tag one hard-coded English sentence with a saved BLSTM-CRF model.

    The experiment folder name is read from ``model_fold`` in the ``TEST``
    section of ``./config.ini`` and the model is loaded from
    ``experiments/<model_fold>/model``. The sentence is split on whitespace
    and predicted twice (plain output and ``output_dict=True``); both
    results are printed, the plain one is also logged and written to
    ``result_predict.log`` in the experiment folder.
    """
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_dir = os.path.join('experiments', cp["TEST"].get("model_fold"))

    model = BLSTMCRFModel.load_model(os.path.join(output_dir, 'model'))

    sentence = 'China and the United States are about the same size'
    sentence_list = sentence.split()
    result = model.predict(sentence_list)
    result_dict = model.predict(sentence_list, output_dict=True)

    print(f'the sentence is {sentence}')
    print(f'the result is {result}')
    print(f'the result of dict is {result_dict}')
    logging.info('test predict: {} -> {}'.format(sentence_list, result))

    log_path = os.path.join(output_dir, 'result_predict.log')
    with open(log_path, 'w') as f:
        f.write(f"The predict result is : {str(result)}\n")
Пример #5
0
def main():
    """Train a BERT-embedded BLSTM-CRF tagger on CoNLL-2003 and evaluate it.

    All settings are read from the ``TRAIN`` section of ``./config.ini``.
    Outputs go under ``experiments/<yymmdd>/<output_fold>/``:

    * ``src/`` - copies of the config file and of this script,
    * the checkpoint file (weights or full model, per ``save_weights_only``),
    * ``training.csv`` and TensorBoard data under ``logs/``,
    * the final model under ``model``,
    * ``report_evaluate.log`` with the test-split evaluation report.
    """
    # parser config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    train_cfg = cp["TRAIN"]
    output_fold = train_cfg.get("output_fold")
    epochs = train_cfg.getint("epochs")
    batch_size = train_cfg.getint("batch_size")
    generator_workers = train_cfg.getint("generator_workers")
    output_weights_name = train_cfg.get("output_weights_name")
    sequence_length_max = train_cfg.getint("sequence_length_max")
    output_model_name = train_cfg.get("output_model_name")
    save_weights_only = train_cfg.getboolean("save_weights_only")
    cyclicLR_mode = train_cfg.get("cyclicLR_mode")
    base_lr = train_cfg.getfloat("base_lr")
    max_lr = train_cfg.getfloat("max_lr")

    # One experiment folder per day and per configured run name.
    today = datetime.date.today()
    formatted_today = today.strftime('%y%m%d')
    output_dir = os.path.join('experiments', formatted_today, output_fold)
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    output_dir_src = os.path.join(output_dir, 'src')
    os.makedirs(output_dir_src, exist_ok=True)
    print(f"backup config file to {output_dir_src}")
    shutil.copy(config_file,
                os.path.join(output_dir_src, os.path.basename(config_file)))
    # Copy from __file__ itself: the bare basename only resolves when the
    # current working directory happens to be the script's directory.
    train_file = os.path.basename(__file__)
    shutil.copy(__file__, os.path.join(output_dir_src, train_file))

    train_x, train_y = CoNLL2003Corpus.get_sequence_tagging_data('train')
    validate_x, validate_y = CoNLL2003Corpus.get_sequence_tagging_data(
        'validate')
    test_x, test_y = CoNLL2003Corpus.get_sequence_tagging_data('test')

    embedding = BERTEmbedding('bert-large-cased', sequence_length_max)
    # `BLSTMModel` and `CNNLSTMModel` can be used here as well
    model = BLSTMCRFModel(embedding)
    # model.build_model(train_x, train_y)
    # model.build_multi_gpu_model(gpus=2)
    # print(model.summary())

    # The checkpoint file name depends on whether only weights are saved.
    if save_weights_only:
        model_weights = os.path.join(output_dir, output_weights_name)
    else:
        model_weights = os.path.join(output_dir, output_model_name)

    checkpoint = ModelCheckpoint(
        model_weights,
        save_weights_only=save_weights_only,
        save_best_only=True,
        verbose=1,
    )
    earlystop = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=20,
                              verbose=0,
                              mode='min')
    csv_logger = CSVLogger(os.path.join(output_dir, 'training.csv'))

    # step_size for the cyclic schedule is the number of batches per epoch.
    batch_size_cycliclr = ceil(len(train_x) / batch_size)
    gamma = 0.99994 if cyclicLR_mode == 'exp_range' else 1.
    # NOTE(review): `clr` is constructed but is commented out of `callbacks`
    # below, so the cyclic learning-rate schedule is currently inactive.
    clr = CyclicLR(mode=cyclicLR_mode,
                   step_size=batch_size_cycliclr,
                   base_lr=base_lr,
                   max_lr=max_lr,
                   gamma=gamma)
    save_min_loss = SaveMinLoss(filepath=output_dir)
    tb = TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                     batch_size=batch_size)
    callbacks = [
        checkpoint,
        tb,
        csv_logger,
        # clr,
        save_min_loss,
        earlystop,
    ]
    print("** start training **")
    model.fit(train_x,
              train_y,
              x_validate=validate_x,
              y_validate=validate_y,
              epochs=epochs,
              batch_size=batch_size,
              labels_weight=True,
              fit_kwargs={
                  'callbacks': callbacks,
                  'workers': generator_workers,
                  'use_multiprocessing': True,
                  'class_weight': 'auto',
              })

    model_path = os.path.join(output_dir, 'model')
    model.save(model_path)
    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)

    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluate report is : \n{str(report_evaluate)}")
Пример #6
0
 def __init__(self, model_path, type_filter=False):
     """Load the tagging model and keep the ``type_filter`` option.

     Args:
         model_path: Location handed to ``BLSTMCRFModel.load_model``.
         type_filter: Stored unchanged on ``self.type_filter``.
     """
     # mention -> entity_json_line
     # ``subject_id_dict`` is resolved from the enclosing scope, not passed in.
     self.subject_id_dict = subject_id_dict

     loaded_model = BLSTMCRFModel.load_model(model_path)
     self._model = loaded_model

     self.type_filter = type_filter
Пример #7
0
 def __init__(self, model_path):
     """Load the tagging model found at ``model_path``.

     Args:
         model_path: Location handed to ``BLSTMCRFModel.load_model``.
     """
     # mention -> entity_json_line
     # ``subject_id_dict`` is resolved from the enclosing scope, not passed in.
     self.subject_id_dict = subject_id_dict

     loaded_model = BLSTMCRFModel.load_model(model_path)
     self._model = loaded_model
Пример #8
0
                x = []
        return datas

    def model_predict(self, model, text):
        """Tag ``text`` with ``model`` and pair each character with its tag.

        Args:
            model: Object with a ``predict`` method. It is called with the
                value returned by ``self.build_input(text)`` and must return
                an iterable of tag sequences.
            text: The input to tag; it is iterated item by item (character
                by character for a ``str``).

        Returns:
            A list of ``(character, tag)`` tuples. The tag sequences returned
            by the model are concatenated in order; pairing stops at the
            shorter of the characters and the tags.
        """
        x_test = self.build_input(text)
        result = model.predict(x_test)
        # Flatten in a single pass; the previous repeated `tags + result[i]`
        # copied the accumulated list on every step.
        tags = [tag for sequence_tags in result for tag in sequence_tags]
        res = list(zip(text, tags))
        # The interactive loop in this script relies on this print to show
        # the result, so it is kept.
        print(res)
        return res


if __name__ == '__main__':
    import argparse

    # Command line: `--option train` trains a model; any other value
    # (default 'predict') starts the interactive prediction loop.
    cli = argparse.ArgumentParser()
    cli.add_argument('--option', type=str, default='predict')
    args = cli.parse_args()
    config = Config()

    bertner = BERTNER()
    if args.option != 'train':
        model = BLSTMCRFModel.load_model(bertner.model_path)
        # Read sentences from stdin forever; model_predict prints each result.
        while True:
            s = input('enter an sent:').strip()
            bertner.model_predict(model, s)
    else:
        bertner.train_model()
Пример #9
0
 def setUpClass(cls):
     """Create the class-wide fixtures: epoch count and a BERT-backed model."""
     cls.epochs = 5
     # The model is built on the embedding from EmbeddingManager.get_bert().
     cls.model = BLSTMCRFModel(EmbeddingManager.get_bert())
Пример #10
0
    def __init__(self, model_path):
        """Load the knowledge-base dictionary and the tagging model.

        Both steps run inside a ``tf.device('/gpu:0')`` context.

        Args:
            model_path: Location handed to ``BLSTMCRFModel.load_model``.
        """
        with tf.device('/gpu:0'):
            # mention -> entity_json_line
            kb_dic = super().get_kb_dic()
            self.subject_dic = kb_dic

            loaded_model = BLSTMCRFModel.load_model(model_path)
            self._model = loaded_model
Пример #11
0
from data_reduce import get_train_data, predict_reduce, loadData
import numpy as np


def reduce_text(news):
    """Turn a news record into a flat list of characters.

    The title and the content are joined with the ideographic full stop
    ``'。'``, every newline and tab is removed, and the remaining text is
    split into single characters.

    Args:
        news: Mapping with string values under ``'title'`` and ``'content'``.

    Returns:
        A list with one element per remaining character.
    """
    merged = news['title'] + '。' + news['content']
    for unwanted in ('\n', '\t'):
        merged = merged.replace(unwanted, '')
    return [char for char in merged]


if __name__ == '__main__':
    # Script entry point: train a BERT + BLSTM-CRF model on the text corpus,
    # save it, then reload it before the prediction phase.
    # NOTE(review): `time`, `BERTEmbedding` and `BLSTMCRFModel` are not
    # imported in the visible part of this file - confirm they are in scope.
    start = time()
    print('train start')
    train_x, train_y = get_train_data('data/train_text.txt')
    embedding = BERTEmbedding("bert-base-chinese", sequence_length=512)
    model = BLSTMCRFModel(embedding)
    # First 90% of the samples are used for training, the rest for validation.
    length = int(len(train_x) * 0.9)
    print(len(train_x[:length]), len(train_y[:length]))
    model.fit(train_x[:length], train_y[:length], train_x[length:], train_y[length:], epochs=5, batch_size=20)
    # model.fit(train_x[:length], train_y[:length], train_x[length:], train_y[length:], epochs=5, batch_size=128,
    #           labels_weight=True, default_labels_weight=100)
    # Same validation slices that were passed to fit() above, kept under names.
    valid_x = train_x[length:]
    valid_y = train_y[length:]
    model.save('models')
    print('train end')
    print('predict start')
    try:
        model = BLSTMCRFModel.load_model('models')
    except Exception:
        # The message means "model loading failed". Execution continues, and
        # `model` still refers to the in-memory model trained above.
        print('模型加载失败')
    newsId_set = set()
Пример #12
0
# Output locations for this run (absolute Windows paths).
model_path = r"D:\data\biendata\ccks2019_el\ner\m0"
log_filepath = r"D:\data\biendata\ccks2019_el\ner\m0log"

# Pre-trained BERT checkpoint directory; the commented line is an alternative.
# emn_path = r'D:\data\bert\chinese_L-12_H-768_A-12'
emn_path = r'D:\data\bert\chinese-bert_chinese_wwm_L-12_H-768_A-12'

# check_point = ModelCheckpoint(model_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")

# Early-stopping callback on val_loss (mode "min") with patience=2.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)
# early_stop = EarlyStopping(monitor="val_crf_accuracy", mode="max", patience=2)

# TensorBoard logging: graph is written, images and histograms are disabled.
log = TensorBoard(log_dir=log_filepath, write_images=False, write_graph=True, histogram_freq=0)

# NOTE(review): the second positional argument (50) is presumably the
# sequence length - confirm against the BERTEmbedding signature.
embedding = BERTEmbedding(emn_path, 50)

model = BLSTMCRFModel(embedding)

# Replaces the hyper-parameter dict on this model instance.
# NOTE(review): this only has an effect if the model reads
# `__base_hyper_parameters__` after this point - confirm.
model.__base_hyper_parameters__ = {
        'lstm_layer': {
            'units': 256,
            'return_sequences': True
        },
        'dense_layer': {
            'units': 64,
            'activation': 'tanh'
        }
    }


model.fit(train_x,
          train_y,
Пример #13
0
def eval_crf():
    """Evaluate the saved NER model on the 'validate' split.

    The model is loaded from a fixed local path and the data comes from
    ``dload.load_json_data('validate')``. The value returned by
    ``model.evaluate`` is not returned to the caller.
    """
    model = BLSTMCRFModel.load_model(r"D:\data\biendata\ccks2019_el\ner_model")
    validate_x, validate_y = dload.load_json_data('validate')
    model.evaluate(validate_x, validate_y)
Пример #14
0
## 面向脚本编程

from kashgari.tasks.seq_labeling import BLSTMCRFModel

from util import InputHelper

import logging

# Load the trained model
new_model = BLSTMCRFModel.load_model('./model')

# Read the test-set data
with open('./data/test', 'r', encoding='utf-8') as g:
    test_data = g.readlines()

# Predict on the test set: exactly one output row per input line, so the
# output stays aligned with the input even when a line cannot be processed.
with open('./keywords_test', 'w', encoding='utf-8') as g_key:
    for ids, line in enumerate(test_data):
        try:
            label = InputHelper().iob_iobes(
                new_model.predict(line.replace('\t', '。')))
            result = InputHelper().result_to_json(line, label)
            line_keys = [entity['word'] for entity in result['entities']]
            out_row = ','.join(line_keys) + '\n'
        except Exception as e:
            # Best effort is kept (an empty row is still written), but the
            # failure is reported instead of being silently swallowed.
            logging.warning('prediction failed for test line %d: %s', ids, e)
            g_key.write('\n')
        else:
            g_key.write(out_row)

# “人工智能”
# 清洗一些看得到的错误,取前三个。这里做的很糙
with open('./keywords_test', 'r', encoding='utf-8') as g:
    data = g.readlines()
with open('./keywords', 'w', encoding='utf-8') as f:
    for line in data:
Пример #15
0
# Continue training a previously saved model, evaluate it and save the result.
log_filepath = r"D:\data\biendata\ccks2019_el\ner\m0.1log"

# emn_path = r'D:\data\bert\chinese_L-12_H-768_A-12'
emn_path = r'D:\data\bert\chinese-bert_chinese_wwm_L-12_H-768_A-12'

# check_point = ModelCheckpoint(model_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min")

# Early-stopping callback on val_loss (mode "min") with patience=1.
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=1)
# early_stop = EarlyStopping(monitor="val_crf_accuracy", mode="max", patience=2)

log = TensorBoard(
    log_dir=log_filepath,
    write_images=False,
    write_graph=True,
    histogram_freq=0,
)

# Start from the existing model at `model_path_o` rather than a fresh one.
model = BLSTMCRFModel.load_model(model_path_o)

model.fit(
    train_x,
    train_y,
    x_validate=validate_x,
    y_validate=validate_y,
    epochs=40,
    batch_size=512,
    labels_weight=True,
    fit_kwargs={'callbacks': [early_stop, log]},
)

model.evaluate(test_x, test_y)

# The further-trained model goes to a new location, `model_path_n`.
model.save(model_path_n)
"""
继续训练
Пример #16
0
 def setUpClass(cls):
     """Create the class-wide fixtures: epoch count and a default model."""
     cls.epochs = 5
     # No embedding argument is passed, so the model uses its own default.
     default_model = BLSTMCRFModel()
     cls.model = default_model
Пример #17
0
import os

# TF_CPP_MIN_LOG_LEVEL must be in the environment before the deep-learning
# backend is first imported; setting it after the kashgari imports (which
# presumably pull in TensorFlow) is too late to silence import-time messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.seq_labeling import BLSTMCRFModel

from util import InputHelper

# Train a BLSTM-CRF tagger on top of a local BERT checkpoint and save it.
train_x, train_y = InputHelper().read_corpus('data', 'Bert_train')
embedding = BERTEmbedding('./chinese_L-12_H-768_A-12', sequence_length=256)
model = BLSTMCRFModel(embedding)
model.fit(train_x, train_y, epochs=10, batch_size=512)
model.save('./model')