def __init__(self, model_path, type_filter=False, save_label=False, batch=32,
             save_expand_subject=True):
    # mention -> entity_json_line
    self.subject_id_dict = subject_id_dict
    self._model = BLSTMCRFModel.load_model(model_path)
    # self._model = DDDDModel.load_model(model_path)
    self.type_filter = type_filter
    self.batch = batch
    self.save_label = save_label
    self.save_expand_subject = save_expand_subject
def train_model(self):
    x_train, y_train, x_valid, y_valid = self.data_load(validation_split=0.2)
    model = BLSTMCRFModel(self.embedding)
    model.fit(x_train, y_train,
              x_validate=x_valid,
              y_validate=y_valid,
              epochs=self.EPOCHS,
              batch_size=self.BATCH_SIZE)
    model.save(self.model_path)
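# A minimal usage sketch for the trainer above, assuming it is a method of the
# BERTNER class driven by the CLI snippet further down; nothing here beyond
# those names comes from the source:
bertner = BERTNER()
bertner.train_model()  # fits on data_load()'s 80/20 split, then saves to bertner.model_path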
def main():
    # parse config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    model_fold = cp["EVALUATION"].get("model_fold")
    output_dir = os.path.join('experiments', model_fold)

    test_x, test_y = CoNLL2003Corpus.get_sequence_tagging_data('test')
    model_path = os.path.join(output_dir, 'model')
    model = BLSTMCRFModel.load_model(model_path)
    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)
    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluation report is:\n{str(report_evaluate)}\n")
def main():
    # parse config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    model_fold = cp["TEST"].get("model_fold")
    output_dir = os.path.join('experiments', model_fold)
    model_path = os.path.join(output_dir, 'model')
    model = BLSTMCRFModel.load_model(model_path)

    sentence = 'China and the United States are about the same size'
    sentence_list = sentence.split()
    result = model.predict(sentence_list)
    result_dict = model.predict(sentence_list, output_dict=True)
    print(f'the sentence is {sentence}')
    print(f'the result is {result}')
    print(f'the result dict is {result_dict}')
    logging.info('test predict: {} -> {}'.format(sentence_list, result))
    with open(os.path.join(output_dir, 'result_predict.log'), 'w') as f:
        f.write(f"The predict result is: {str(result)}\n")
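# For reference, a hypothetical config.ini serving the two mains above; the
# section and key names come from the code, the values are assumptions:
#
# [EVALUATION]
# model_fold = blstm_crf_bert
#
# [TEST]
# model_fold = blstm_crf_bert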
def main():
    # parse config
    config_file = "./config.ini"
    cp = ConfigParser()
    cp.read(config_file)

    # default config
    output_fold = cp["TRAIN"].get("output_fold")
    epochs = cp["TRAIN"].getint("epochs")
    batch_size = cp["TRAIN"].getint("batch_size")
    generator_workers = cp["TRAIN"].getint("generator_workers")
    output_weights_name = cp["TRAIN"].get("output_weights_name")
    sequence_length_max = cp["TRAIN"].getint("sequence_length_max")
    output_model_name = cp["TRAIN"].get("output_model_name")
    save_weights_only = cp["TRAIN"].getboolean("save_weights_only")
    cyclicLR_mode = cp["TRAIN"].get("cyclicLR_mode")
    base_lr = cp["TRAIN"].getfloat("base_lr")
    max_lr = cp["TRAIN"].getfloat("max_lr")

    today = datetime.date.today()
    formatted_today = today.strftime('%y%m%d')
    output_dir = os.path.join('experiments', formatted_today, output_fold)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # back up the config file and this script alongside the run outputs
    output_dir_src = os.path.join(output_dir, 'src')
    if not os.path.isdir(output_dir_src):
        os.makedirs(output_dir_src)
    print(f"backup config file to {output_dir_src}")
    shutil.copy(config_file,
                os.path.join(output_dir_src, os.path.split(config_file)[1]))
    train_file = os.path.basename(__file__)
    shutil.copy(train_file, os.path.join(output_dir_src, train_file))

    train_x, train_y = CoNLL2003Corpus.get_sequence_tagging_data('train')
    validate_x, validate_y = CoNLL2003Corpus.get_sequence_tagging_data('validate')
    test_x, test_y = CoNLL2003Corpus.get_sequence_tagging_data('test')

    embedding = BERTEmbedding('bert-large-cased', sequence_length_max)
    # `BLSTMModel` and `CNNLSTMModel` are also available
    model = BLSTMCRFModel(embedding)
    # model.build_model(train_x, train_y)
    # model.build_multi_gpu_model(gpus=2)
    # print(model.summary())

    if save_weights_only:
        model_weights = os.path.join(output_dir, output_weights_name)
    else:
        model_weights = os.path.join(output_dir, output_model_name)

    checkpoint = ModelCheckpoint(
        model_weights,
        save_weights_only=save_weights_only,
        save_best_only=True,
        verbose=1,
    )
    earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=20,
                              verbose=0, mode='min')
    csv_logger = CSVLogger(os.path.join(output_dir, 'training.csv'))

    # one CyclicLR step per batch: step_size = number of batches per epoch
    batch_size_cycliclr = ceil(len(train_x) / batch_size)
    if cyclicLR_mode == 'exp_range':
        gamma = 0.99994
    else:
        gamma = 1.
    clr = CyclicLR(mode=cyclicLR_mode, step_size=batch_size_cycliclr,
                   base_lr=base_lr, max_lr=max_lr, gamma=gamma)
    save_min_loss = SaveMinLoss(filepath=output_dir)
    tb = TensorBoard(log_dir=os.path.join(output_dir, "logs"),
                     batch_size=batch_size)
    callbacks = [
        checkpoint,
        tb,
        csv_logger,
        # clr,
        save_min_loss,
        earlystop,
    ]

    print("** start training **")
    model.fit(train_x, train_y,
              x_validate=validate_x,
              y_validate=validate_y,
              epochs=epochs,
              batch_size=batch_size,
              labels_weight=True,
              fit_kwargs={
                  'callbacks': callbacks,
                  'workers': generator_workers,
                  'use_multiprocessing': True,
                  'class_weight': 'auto',
              })
    model_path = os.path.join(output_dir, 'model')
    model.save(model_path)

    report_evaluate = model.evaluate(test_x, test_y, debug_info=True)
    with open(os.path.join(output_dir, 'report_evaluate.log'), 'w') as f:
        f.write(f"The evaluation report is:\n{str(report_evaluate)}")
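# A hypothetical [TRAIN] section covering every key the script above reads;
# the keys come from the code, the values are illustrative assumptions:
#
# [TRAIN]
# output_fold = blstm_crf_bert
# epochs = 100
# batch_size = 32
# generator_workers = 4
# output_weights_name = weights.h5
# output_model_name = model.h5
# save_weights_only = true
# sequence_length_max = 128
# cyclicLR_mode = triangular
# base_lr = 0.0001
# max_lr = 0.001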
def __init__(self, model_path, type_filter=False):
    # mention -> entity_json_line
    self.subject_id_dict = subject_id_dict
    self._model = BLSTMCRFModel.load_model(model_path)
    self.type_filter = type_filter
def __init__(self, model_path):
    # mention -> entity_json_line
    self.subject_id_dict = subject_id_dict
    self._model = BLSTMCRFModel.load_model(model_path)
    x = []
    return datas

def model_predict(self, model, text):
    x_test = self.build_input(text)
    result = model.predict(x_test)
    chars = [i for i in text]
    tags = []
    # flatten the per-sequence tag lists into one tag list
    for i in range(len(result)):
        tags = tags + result[i]
    res = list(zip(chars, tags))
    print(res)
    return res


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--option', type=str, default='predict')
    args = parser.parse_args()

    config = Config()
    bertner = BERTNER()
    if args.option == 'train':
        bertner.train_model()
    else:
        model = BLSTMCRFModel.load_model(bertner.model_path)
        while True:
            s = input('enter a sentence: ').strip()
            bertner.model_predict(model, s)
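# `build_input` is not shown in this snippet; a minimal sketch of what it
# might look like, assuming predict expects a batch (list) of character
# sequences, as model_predict's result handling suggests:
def build_input(self, text):
    # one character-level sequence per input text
    return [[char for char in text]]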
def setUpClass(cls):
    cls.epochs = 5
    embedding = EmbeddingManager.get_bert()
    cls.model = BLSTMCRFModel(embedding)
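# A companion test-method sketch for the fixture above; the test name and the
# toy corpus are illustrative assumptions, not from the source:
def test_fit_and_predict(self):
    x = [['我', '爱', '北', '京']] * 32
    y = [['O', 'O', 'B-LOC', 'I-LOC']] * 32
    self.model.fit(x, y, epochs=self.epochs)
    tags = self.model.predict(x[0])
    self.assertEqual(len(tags), len(x[0]))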
def __init__(self, model_path):
    with tf.device('/gpu:0'):
        # mention -> entity_json_line
        self.subject_dic = super().get_kb_dic()
        self._model = BLSTMCRFModel.load_model(model_path)
from time import time

import numpy as np
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.seq_labeling import BLSTMCRFModel

from data_reduce import get_train_data, predict_reduce, loadData


def reduce_text(news):
    text = news['title'] + '。' + news['content']
    text = text.replace('\n', '').replace('\t', '')
    return list(text)


if __name__ == '__main__':
    start = time()
    print('train start')
    train_x, train_y = get_train_data('data/train_text.txt')
    embedding = BERTEmbedding("bert-base-chinese", sequence_length=512)
    model = BLSTMCRFModel(embedding)
    length = int(len(train_x) * 0.9)  # 90/10 train/validation split
    print(len(train_x[:length]), len(train_y[:length]))
    model.fit(train_x[:length], train_y[:length],
              train_x[length:], train_y[length:],
              epochs=5, batch_size=20)
    # model.fit(train_x[:length], train_y[:length], train_x[length:], train_y[length:],
    #           epochs=5, batch_size=128, labels_weight=True, default_labels_weight=100)
    valid_x = train_x[length:]
    valid_y = train_y[length:]
    model.save('models')
    print('train end')

    print('predict start')
    try:
        model = BLSTMCRFModel.load_model('models')
    except Exception:
        print('failed to load the model')
    newsId_set = set()
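# Sketch of how reduce_text presumably feeds the prediction phase above; the
# `news` dict shape follows reduce_text's own accesses, and the sample values
# are assumptions:
news = {'title': '标题', 'content': '正文'}
chars = reduce_text(news)    # character list: title + '。' + content
tags = model.predict(chars)  # one tag per character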
model_path = r"D:\data\biendata\ccks2019_el\ner\m0" log_filepath = r"D:\data\biendata\ccks2019_el\ner\m0log" # emn_path = r'D:\data\bert\chinese_L-12_H-768_A-12' emn_path = r'D:\data\bert\chinese-bert_chinese_wwm_L-12_H-768_A-12' # check_point = ModelCheckpoint(model_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min") early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2) # early_stop = EarlyStopping(monitor="val_crf_accuracy", mode="max", patience=2) log = TensorBoard(log_dir=log_filepath, write_images=False, write_graph=True, histogram_freq=0) embedding = BERTEmbedding(emn_path, 50) model = BLSTMCRFModel(embedding) model.__base_hyper_parameters__ = { 'lstm_layer': { 'units': 256, 'return_sequences': True }, 'dense_layer': { 'units': 64, 'activation': 'tanh' } } model.fit(train_x, train_y,
def eval_crf():
    model_path = r"D:\data\biendata\ccks2019_el\ner_model"
    model = BLSTMCRFModel.load_model(model_path)
    validate_x, validate_y = dload.load_json_data('validate')
    model.evaluate(validate_x, validate_y)
## script-style usage
from kashgari.tasks.seq_labeling import BLSTMCRFModel
from util import InputHelper

# load the model
new_model = BLSTMCRFModel.load_model('./model')

# read the test set
with open('./data/test', 'r', encoding='utf-8') as g:
    test_data = g.readlines()

# predict on the test set
with open('./keywords_test', 'w', encoding='utf-8') as g_key:
    for ids, line in enumerate(test_data):
        try:
            label = InputHelper().iob_iobes(
                new_model.predict(line.replace('\t', '。')))
            result = InputHelper().result_to_json(line, label)
            line_keys = [entity['word'] for entity in result['entities']]
            g_key.write(','.join(line_keys) + '\n')
        except Exception as e:
            g_key.write('\n')  # "artificial intelligence"

# clean up some visible errors and keep the first three keywords; this pass is rough
with open('./keywords_test', 'r', encoding='utf-8') as g:
    data = g.readlines()
with open('./keywords', 'w', encoding='utf-8') as f:
    for line in data:
log_filepath = r"D:\data\biendata\ccks2019_el\ner\m0.1log" # emn_path = r'D:\data\bert\chinese_L-12_H-768_A-12' emn_path = r'D:\data\bert\chinese-bert_chinese_wwm_L-12_H-768_A-12' # check_point = ModelCheckpoint(model_path, monitor="val_loss", verbose=1, save_best_only=True, mode="min") early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=1) # early_stop = EarlyStopping(monitor="val_crf_accuracy", mode="max", patience=2) log = TensorBoard(log_dir=log_filepath, write_images=False, write_graph=True, histogram_freq=0) model = BLSTMCRFModel.load_model(model_path_o) model.fit(train_x, train_y, x_validate=validate_x, y_validate=validate_y, epochs=40, batch_size=512, labels_weight=True, fit_kwargs={'callbacks': [early_stop, log]}) model.evaluate(test_x, test_y) model.save(model_path_n) """ 继续训练
def setUpClass(cls):
    cls.epochs = 5
    cls.model = BLSTMCRFModel()
from kashgari.embeddings import BERTEmbedding
from kashgari.tasks.seq_labeling import BLSTMCRFModel
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from util import InputHelper

train_x, train_y = InputHelper().read_corpus('data', 'Bert_train')
embedding = BERTEmbedding('./chinese_L-12_H-768_A-12', sequence_length=256)
model = BLSTMCRFModel(embedding)
model.fit(train_x, train_y, epochs=10, batch_size=512)
model.save('./model')
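# Optional smoke test after training, assuming the same load/predict API the
# other snippets above use; the sample sentence is an assumption:
new_model = BLSTMCRFModel.load_model('./model')
print(new_model.predict(list('今天天气不错')))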