def main():
    """Train a BERT + BiLSTM-CRF NER model on the Chinese Daily corpus and report results."""
    train_x, train_y = ChineseDailyNerCorpus.load_data("train")
    valid_x, valid_y = ChineseDailyNerCorpus.load_data("validate")
    test_x, test_y = ChineseDailyNerCorpus.load_data("test")

    print(f"train data count: {len(train_x)}")
    print(f"validate data count: {len(valid_x)}")
    print(f"test data count: {len(test_x)}")

    # BERT embedding feeding a BiLSTM-CRF sequence-labeling head.
    embedding = BERTEmbedding("models/chinese_L-12_H-768_A-12",
                              task=kashgari.LABELING,
                              sequence_length=100)
    model = BiLSTM_CRF_Model(embedding)
    model.fit(train_x, train_y,
              x_validate=valid_x,
              y_validate=valid_y,
              epochs=1,
              batch_size=512)

    model.save("models/ner.h5")
    model.evaluate(test_x, test_y)

    # NOTE(review): labeling models normally expose `predict`/`predict_entities`;
    # confirm `predict_classes` exists on this model class.
    predictions = model.predict_classes(test_x)
    print(predictions)
def test_predict_and_callback(self):
    """Truncated prediction honours the model's sequence_length; untruncated does not."""
    from kashgari.corpus import ChineseDailyNerCorpus
    from kashgari.callbacks import EvalCallBack

    train_x, train_y = ChineseDailyNerCorpus.load_data('train')
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

    model = BiGRU_Model(sequence_length=10)
    eval_callback = EvalCallBack(kash_model=model,
                                 x_data=valid_x[:200],
                                 y_data=valid_y[:200],
                                 truncating=True,
                                 step=1)
    model.fit(train_x[:300], train_y[:300],
              valid_x[:200], valid_y[:200],
              epochs=1,
              callbacks=[eval_callback])

    # With truncating=True every predicted sequence is capped at length 10.
    truncated = model.predict(train_x[:200], truncating=True)
    assert all(len(seq) <= 10 for seq in truncated)

    # Without truncating, at least one sequence exceeds the cap.
    untruncated = model.predict(train_x[:200])
    assert not all(len(seq) <= 10 for seq in untruncated)
def test_labeling_eval_callback(self):
    """Smoke-test EvalCallBack wired into a labeling model's fit loop."""
    train_x, train_y = ChineseDailyNerCorpus.load_data()
    test_x, test_y = ChineseDailyNerCorpus.load_data('test')

    # Keep the run fast: train on a 1k-sample slice for a single epoch.
    train_x, train_y = train_x[:1000], train_y[:1000]

    model = Labeling_BiLSTM_Model()
    evaluator = callbacks.EvalCallBack(model, test_x, test_y, step=1)
    model.fit(train_x, train_y, callbacks=[evaluator], epochs=1)
def test_load_data(self):
    """Each corpus subset loads as non-empty, aligned (x, y) pairs."""
    train_x, train_y = ChineseDailyNerCorpus.load_data()
    assert len(train_x) == len(train_y)
    assert len(train_x) > 0
    # Tokens and labels must be distinct sequences, not aliases of each other.
    assert train_x[:5] != train_y[:5]

    # Same invariants for the evaluation subsets (checked in this order:
    # 'test' first, then 'valid', as in the original test).
    for subset in ('test', 'valid'):
        xs, ys = ChineseDailyNerCorpus.load_data(subset)
        assert len(xs) == len(ys)
        assert len(xs) > 0
def main():
    """Scratch/debug script: rebuild a Kashgari model from its Keras JSON architecture."""
    with open(arch_file, 'r') as f:
        model_json = f.read()

    # model = load_model(model_dir, custom_objects={"BiLSTM_CRF_Model": BiLSTM_CRF_Model})
    model = model_from_json(model_json,
                            custom_objects={"BiLSTM_CRF_Model": BiLSTM_CRF_Model})
    # model.load_weights(weight_file)
    keras.utils.plot_model(model)
    return

    # ------------------------------------------------------------------
    # Everything below is unreachable (kept as debugging notes).
    # Failure observed in an earlier run:
    #   WARNING:root:Sequence length will auto set at 95% of sequence length
    #   Traceback (most recent call last):
    #     File "keras_load.py", line 15, in <module>
    #       model.load_weights(weight_file)
    #   AttributeError: 'BiLSTM_CRF_Model' object has no attribute 'load_weights'
    # ------------------------------------------------------------------
    print(model.__doc__)
    print(dir(model))
    # Previous run printed:
    #   Bidirectional LSTM CRF Sequence Labeling Model
    # and a dir() listing including: build_model, build_model_arc, compile_model,
    # embedding, evaluate, fit, predict, predict_entities, save, tf_model, ...

    test_x, test_y = ChineseDailyNerCorpus.load_data("test")
    print("\n test_x:\n{}\n\n".format(test_x[0:5]))
    # predictions = model.predict(test_x[0:5])
    predictions = model.predict_entities(test_x[0:5])
    print(predictions)
def test_bert_embedding(self):
    """Stacked BERT + numeric-feature embedding produces a (3, 12, 24) tensor."""
    text, label = ChineseDailyNerCorpus.load_data()
    # Synthetic per-token binary feature: one row per sample, 12 positions each.
    is_bold = np.random.randint(1, 3, (len(text), 12))

    bert_path = get_file('bert_sample_model',
                         "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                         cache_dir=DATA_PATH,
                         untar=True)
    text_embedding = BERTEmbedding(bert_path,
                                   task=kashgari.LABELING,
                                   sequence_length=12)
    num_feature_embedding = NumericFeaturesEmbedding(2, 'is_bold', sequence_length=12)

    stacked = StackedEmbedding([text_embedding, num_feature_embedding])
    stacked.analyze_corpus((text, is_bold), label)

    tensor = stacked.process_x_dataset((text[:3], is_bold[:3]))
    print(tensor[0][0].shape)
    print(tensor[0][1].shape)
    print(tensor[1].shape)
    print(stacked.embed_model.input_shape)
    print(stacked.embed_model.summary())

    embedded = stacked.embed((text[:3], is_bold[:3]))
    assert embedded.shape == (3, 12, 24)
def test_embed(self):
    """embed/embed_one pad or truncate to the configured sequence length."""
    samples = [
        ['我', '想', '看'],
        ['我', '想', '看', '权力的游戏'],
        ['Hello', 'world'],
    ]

    # Classification task: sequence length auto-derived (15 here).
    embedding = self.embedding_class(task=kashgari.CLASSIFICATION, **self.config)
    valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
    embedding.analyze_corpus(valid_x, valid_y)
    assert embedding.embed_one(['我', '想', '看']).shape == (15, 50257)
    assert embedding.embed(samples).shape == (3, 15, 50257)

    # Labeling task with an explicit sequence_length of 10.
    embedding = self.embedding_class(task=kashgari.LABELING,
                                     sequence_length=10,
                                     **self.config)
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
    embedding.analyze_corpus(valid_x, valid_y)
    assert embedding.embed_one(['我', '想', '看']).shape == (10, 50257)
    assert embedding.embed(samples).shape == (3, 10, 50257)
def test_init_with_processor(self):
    """An embedding built around a pre-fitted LabelingProcessor still embeds correctly."""
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

    processor = LabelingProcessor()
    processor.analyze_corpus(valid_x, valid_y)

    embedding = self.embedding_class(sequence_length=11,
                                     processor=processor,
                                     **self.config)
    embedding.analyze_corpus(valid_x, valid_y)
    assert embedding.embed_one(['我', '想', '看']).shape == (11, 50257)
def run_with_model_class(self, model_class: Type[ABCLabelingModel], epochs: int):
    """Train `model_class` with BERT features on the Chinese Daily NER corpus.

    Returns the model's evaluation report on the test split.
    """
    bert_path = get_bert_path()

    train_x, train_y = ChineseDailyNerCorpus.load_data('train')
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
    test_x, test_y = ChineseDailyNerCorpus.load_data('test')

    bert_embed = BertEmbedding(bert_path)
    model = model_class(bert_embed)

    # Route TensorBoard metrics into a per-model-class log directory.
    log_path = os.path.join(log_root, model_class.__name__)
    file_writer = tf.summary.create_file_writer(log_path + "/metrics")
    file_writer.set_as_default()

    callbacks = [EvalCallBack(model, test_x, test_y, step=1, truncating=True)]
    # callbacks = []
    model.fit(train_x, train_y, valid_x, valid_y,
              epochs=epochs,
              callbacks=callbacks)
    report = model.evaluate(test_x, test_y)

    # Drop model/embedding references before the next run.
    del model
    del bert_embed
    return report
def test_embedding(self):
    """Bare-text + numeric-feature stacked embedding yields a (3, 12, 116) tensor."""
    text, label = ChineseDailyNerCorpus.load_data()
    # Synthetic per-token binary feature aligned to a 12-step sequence.
    is_bold = np.random.randint(1, 3, (len(text), 12))

    text_embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=12)
    num_feature_embedding = NumericFeaturesEmbedding(2, 'is_bold', sequence_length=12)

    stacked = StackedEmbedding([text_embedding, num_feature_embedding])
    stacked.analyze_corpus((text, is_bold), label)
    stacked.process_x_dataset((text[:3], is_bold[:3]))

    embedded = stacked.embed((text[:3], is_bold[:3]))
    assert embedded.shape == (3, 12, 116)
def test_base_use_case(self):
    """Seq2Seq trains, predicts, and survives a save/load round-trip unchanged."""
    x, y = ChineseDailyNerCorpus.load_data('test')
    x, y = x[:200], y[:200]

    seq2seq = Seq2Seq(hidden_size=64,
                      encoder_seq_length=64,
                      decoder_seq_length=64)
    seq2seq.fit(x, y, epochs=1)
    res, att = seq2seq.predict(x)

    # Round-trip through disk and confirm identical predictions and attention.
    model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
    seq2seq.save(model_path)
    restored = Seq2Seq.load_model(model_path)
    res2, att2 = restored.predict(x)
    assert res2 == res
    assert (att2 == att).all()
class NERCorpus(object):
    """Small pool of NER datasets shared across the test-suite."""

    chinese_daily = ChineseDailyNerCorpus.load_data('valid')
    coll2003 = CONLL2003ENCorpus.load_data('valid')

    # Test data for issue https://github.com/BrikerMan/Kashgari/issues/187
    custom_1 = (custom_x, custom_ner_y1)
    custom_2 = (custom_x, custom_ner_y2)

    @classmethod
    def load_corpus(cls, name=None):
        """Return the corpus registered under `name`.

        When `name` is None a corpus is chosen at random. Raises KeyError
        for an unknown name.
        """
        data_dict = {
            'chinese_daily': cls.chinese_daily,
            # BUG FIX: this key previously mapped to cls.chinese_daily, so the
            # CONLL-2003 data was never actually served.
            'coll2003': cls.coll2003,
            'custom_1': cls.custom_1,
            'custom_2': cls.custom_2,
        }
        if name is None:
            name = random.choice(list(data_dict.keys()))
        return data_dict[name]
def test_batch_generator(self):
    """BatchDataSet.take() yields exactly the requested number of batches."""
    x, y = ChineseDailyNerCorpus.load_data('valid')

    text_processor = SequenceProcessor()
    label_processor = SequenceProcessor(build_vocab_from_labels=True, min_count=1)

    corpus_gen = CorpusGenerator(x, y)
    text_processor.build_vocab_generator([corpus_gen])
    label_processor.build_vocab_generator([corpus_gen])

    dataset = BatchDataSet(corpus_gen,
                           text_processor=text_processor,
                           label_processor=label_processor,
                           segment=False,
                           seq_length=None,
                           max_position=100,
                           batch_size=12)

    # Taking the full length and taking a single batch both yield exact counts.
    full_len = len(dataset)
    assert len(list(dataset.take(full_len))) == full_len
    assert len(list(dataset.take(1))) == 1
} def build_model_arc(self) -> None: output_dim = self.label_processor.vocab_size config = self.hyper_parameters embed_model = self.embedding.embed_model layer_stack = [ L.Bidirectional(L.LSTM(**config['layer_blstm']), name='layer_blstm'), L.Dropout(**config['layer_dropout'], name='layer_dropout'), L.Dense(output_dim, **config['layer_time_distributed']), L.Activation(**config['layer_activation']) ] tensor = embed_model.output for layer in layer_stack: tensor = layer(tensor) self.tf_model = keras.Model(embed_model.inputs, tensor) if __name__ == "__main__": from kashgari.corpus import ChineseDailyNerCorpus x, y = ChineseDailyNerCorpus.load_data() x_valid, y_valid = ChineseDailyNerCorpus.load_data('valid') model = BiLSTM_Model() model.fit(x, y, x_valid, y_valid, epochs=2) model.evaluate(*ChineseDailyNerCorpus.load_data('test'))
import unittest import os import time import tempfile import numpy as np import kashgari from tests.corpus import NERCorpus from kashgari.corpus import ChineseDailyNerCorpus from kashgari.embeddings import WordEmbedding from kashgari.tasks.labeling import CNN_LSTM_Model from kashgari.macros import DATA_PATH from tensorflow.python.keras.utils import get_file valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid') sample_w2v_path = get_file('sample_w2v.txt', "http://s3.bmio.net/kashgari/sample_w2v.txt", cache_dir=DATA_PATH) class TestCNN_LSTM_Model(unittest.TestCase): @classmethod def setUpClass(cls): cls.model_class = CNN_LSTM_Model def test_basic_use_build(self): x, y = NERCorpus.load_corpus() model = self.model_class()
import kashgari
from kashgari.tasks.labeling import BiLSTM_Model
from kashgari.corpus import ChineseDailyNerCorpus

# Restore a previously saved NER model.
model = kashgari.utils.load_model('saved_ner_model')

# NOTE(review): load_data usually takes a subset name ('train'/'valid'/'test');
# confirm this corpus build accepts a file path argument.
test_x, test_y = ChineseDailyNerCorpus.load_data('./data/test.txt')

model.evaluate(test_x, test_y)
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Load the dataset splits from the paths used by the original script.
train_x, train_y = ChineseDailyNerCorpus.load_data('./data/train.txt')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('./data/dev.txt')
test_x, test_y = ChineseDailyNerCorpus.load_data('./data/test.txt')

bert_embed = BERTEmbedding('./chinese_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

# Alternatives: `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model`, `BiGRU_CRF_Model`.
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x, train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)
model.save('saved_ner_model')
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Built-in dataset; swap in your own data as long as the format matches.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

import kashgari
from kashgari.tasks.labeling import BiLSTM_Model
from kashgari.embeddings import BERTEmbedding

# Baidu ERNIE weights loaded through the BERT embedding interface.
bert_embed = BERTEmbedding('baidu_ernie',
                           task=kashgari.LABELING,
                           sequence_length=100)
model = BiLSTM_Model(bert_embed)
model.fit(train_x, train_y, valid_x, valid_y)
config = self.hyper_parameters embed_model = self.embedding.embed_model layer_conv = L.Conv1D(**config['layer_conv'], name='layer_conv') layer_lstm = L.LSTM(**config['layer_lstm'], name='layer_lstm') layer_dropout = L.Dropout(**config['layer_dropout'], name='layer_dropout') layer_time_distributed = L.TimeDistributed( L.Dense(output_dim, **config['layer_time_distributed']), name='layer_time_distributed') layer_activation = L.Activation(**config['layer_activation']) tensor = layer_conv(embed_model.output) tensor = layer_lstm(tensor) tensor = layer_dropout(tensor) tensor = layer_time_distributed(tensor) output_tensor = layer_activation(tensor) self.tf_model = keras.Model(embed_model.inputs, output_tensor) if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) from kashgari.corpus import ChineseDailyNerCorpus valid_x, valid_y = ChineseDailyNerCorpus.load_data('train') model = BiLSTM_CRF_Model() model.fit(valid_x, valid_y, epochs=50, batch_size=64) model.evaluate(valid_x, valid_y)
logging.debug('------ sample {} ------'.format(index)) logging.debug('x : {}'.format(x_data[index])) logging.debug('y_true : {}'.format(y_true[index])) logging.debug('y_pred : {}'.format(y_pred[index])) report = classification_report(y_true, y_pred, digits=digits) print(classification_report(y_true, y_pred, digits=digits)) return report def build_model_arc(self): raise NotImplementedError if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) from kashgari.tasks.labeling import BiLSTM_Model from kashgari.corpus import ChineseDailyNerCorpus from kashgari.utils import load_model train_x, train_y = ChineseDailyNerCorpus.load_data('train', shuffle=False) valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid') train_x, train_y = train_x[:5120], train_y[:5120] model = load_model('/Users/brikerman/Desktop/blstm_model') # model.build_model(train_x[:100], train_y[:100]) # model.fit(train_x[:1000], train_y[:1000], epochs=10) # model.evaluate(train_x[:20], train_y[:20]) print("Hello world")
result = [] for seq in sequences: if self.add_bos_eos: seq = [self.token_pad] + seq + [self.token_pad] result.append([self.label_word2idx[label] for label in seq]) return result def reverse_numerize_label_word_sequences(self, sequences, lengths=None): result = [] for index, seq in enumerate(sequences): labels = [] if self.add_bos_eos: seq = seq[1:] for idx in seq: labels.append(self.idx2label_word[idx]) if lengths is not None: labels = labels[:lengths[index]] result.append(labels) return result if __name__ == "__main__": from kashgari.corpus import ChineseDailyNerCorpus x, y = ChineseDailyNerCorpus.load_data() p = LabelingProcessor() p.analyze_corpus(x, y) r = p.process_x_dataset(x, subset=[10, 12, 20]) print(r)
def get_train_examples(self, *args):
    """Build examples from the 'train' split of the Chinese Daily NER corpus."""
    xs, ys = ChineseDailyNerCorpus.load_data('train')
    return self._create_example(self.load_data2set(xs, ys), "train")
from kashgari.corpus import ChineseDailyNerCorpus

train_x, train_y = ChineseDailyNerCorpus.load_data("train")
valid_x, valid_y = ChineseDailyNerCorpus.load_data("validate")
test_x, test_y = ChineseDailyNerCorpus.load_data("test")

# Report split sizes.
print("train len: {}".format(len(train_x)))
print("valid len: {}".format(len(valid_x)))
print("test_x len: {}".format(len(test_x)))
print("test_y len: {}".format(len(test_y)))
print("\n\n")

# Peek at the first two test samples.
print("test_x[0]: {}".format(test_x[0]))
print("test_y[0]: {}".format(test_y[0]))
print("\n\n")
print("test_x[1]: {}".format(test_x[1]))
print("test_y[1]: {}".format(test_y[1]))

# for i in range(len(test_x)):
#     print("test_x[{}]: {}".format(i, test_x[i]))
#     print("test_y[{}]: {}".format(i, test_y[i]))
#     continue
from kashgari.utils import load_model
from kashgari.corpus import ChineseDailyNerCorpus

# Restore the trained NER model and inspect its interface.
model = load_model('models/ner.h5')
print(dir(model))

test_x, test_y = ChineseDailyNerCorpus.load_data("test")
print("\n test_x:\n{}\n\n".format(test_x[0:5]))

# Evaluate on a 5-sample slice.
metrics = model.evaluate(test_x[0:5], test_y[0:5])
print("\n\n")
print(metrics)
print("\n\n")

print("\n=================predicton==============\n")
predictions = model.predict(test_x[0:5])
print(predictions)
print("\n\n")

print("\n=================predicton entities==============\n")
predictions = model.predict_entities(test_x[0:5])
print(predictions)
def get_dev_examples(self, *args):
    """Build examples from the 'validate' split of the Chinese Daily NER corpus."""
    xs, ys = ChineseDailyNerCorpus.load_data('validate')
    return self._create_example(self.load_data2set(xs, ys), "dev")
""" pathlib.Path(model_path).mkdir(exist_ok=True, parents=True) with open(os.path.join(model_path, 'model_info.json'), 'w') as f: f.write(json.dumps(self.info(), indent=2, ensure_ascii=True)) f.close() self.tf_model.save_weights(os.path.join(model_path, 'model_weights.h5')) logging.info('model saved to {}'.format(os.path.abspath(model_path))) if __name__ == "__main__": from kashgari.tasks.labeling import CNN_LSTM_Model from kashgari.corpus import ChineseDailyNerCorpus train_x, train_y = ChineseDailyNerCorpus.load_data('valid') model = CNN_LSTM_Model() model.build_model(train_x[:100], train_y[:100]) r = model.predict_entities(train_x[:5]) model.save('./res') import pprint pprint.pprint(r) model.evaluate(train_x[:20], train_y[:20]) print("Hello world") print(model.predict(train_x[:20]))
def get_test_examples(self, *args):
    """Build examples from the 'test' split, capped at 1000 samples."""
    xs, ys = ChineseDailyNerCorpus.load_data('test')
    return self._create_example(
        self.load_data2set(xs, ys, max_num=1000), "test")