def main():
    """Train a BERT + BiLSTM-CRF NER model on the Chinese Daily corpus."""
    # Load the three corpus splits.
    train_x, train_y = ChineseDailyNerCorpus.load_data("train")
    valid_x, valid_y = ChineseDailyNerCorpus.load_data("validate")
    test_x, test_y = ChineseDailyNerCorpus.load_data("test")

    for split_name, split_x in (("train", train_x),
                                ("validate", valid_x),
                                ("test", test_x)):
        print(f"{split_name} data count: {len(split_x)}")

    # BERT embedding with sequences padded/truncated to 100 tokens.
    bert_embed = BERTEmbedding("models/chinese_L-12_H-768_A-12",
                               task=kashgari.LABELING,
                               sequence_length=100)

    model = BiLSTM_CRF_Model(bert_embed)
    model.fit(train_x, train_y,
              x_validate=valid_x,
              y_validate=valid_y,
              epochs=1,
              batch_size=512)

    model.save("models/ner.h5")
    model.evaluate(test_x, test_y)
    print(model.predict_classes(test_x))
示例#2
0
    def test_predict_and_callback(self):
        """Fit a small BiGRU model and check predict() truncation behaviour."""
        from kashgari.corpus import ChineseDailyNerCorpus
        from kashgari.callbacks import EvalCallBack

        train_x, train_y = ChineseDailyNerCorpus.load_data('train')
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

        model = BiGRU_Model(sequence_length=10)

        # Evaluate on a small validation slice after every epoch.
        eval_callback = EvalCallBack(kash_model=model,
                                     x_data=valid_x[:200],
                                     y_data=valid_y[:200],
                                     truncating=True,
                                     step=1)

        model.fit(train_x[:300], train_y[:300],
                  valid_x[:200], valid_y[:200],
                  epochs=1,
                  callbacks=[eval_callback])

        # With truncating=True every predicted sequence is capped at length 10.
        truncated = model.predict(train_x[:200], truncating=True)
        assert all(len(seq) <= 10 for seq in truncated)

        # Without truncation at least one sequence must exceed length 10.
        untruncated = model.predict(train_x[:200])
        assert not all(len(seq) <= 10 for seq in untruncated)
示例#3
0
    def test_labeling_eval_callback(self):
        """Train one epoch with an EvalCallBack attached."""
        train_x, train_y = ChineseDailyNerCorpus.load_data()
        test_x, test_y = ChineseDailyNerCorpus.load_data('test')

        # Keep the run fast with a small training slice.
        train_x, train_y = train_x[:1000], train_y[:1000]

        model = Labeling_BiLSTM_Model()
        eval_callback = callbacks.EvalCallBack(model, test_x, test_y, step=1)
        model.fit(train_x, train_y, callbacks=[eval_callback], epochs=1)
示例#4
0
    def test_load_data(self):
        """Every corpus subset loads as two parallel, non-empty sequences."""
        train_x, train_y = ChineseDailyNerCorpus.load_data()
        assert len(train_x) == len(train_y)
        assert len(train_x) > 0
        # Tokens and labels must be distinct sequences.
        assert train_x[:5] != train_y[:5]

        for subset in ('test', 'valid'):
            xs, ys = ChineseDailyNerCorpus.load_data(subset)
            assert len(xs) == len(ys)
            assert len(xs) > 0
示例#5
0
def main():
    """Rebuild a Keras model from its JSON architecture, plot it, and return.

    NOTE(review): everything after the bare ``return`` below is unreachable
    debugging leftovers, kept as a record of a failed weight-loading attempt.
    """
    model_json = None
    with open(arch_file, 'r') as f:
        model_json = f.read()

    # model = load_model(model_dir, custom_objects={"BiLSTM_CRF_Model": BiLSTM_CRF_Model})

    # Architecture only — no weights are loaded here.
    model = model_from_json(model_json, custom_objects={
                            "BiLSTM_CRF_Model": BiLSTM_CRF_Model})
    # model.load_weights(weight_file)

    keras.utils.plot_model(model)
    return

    # ---- unreachable from here on (early return above) ----
    """
    WARNING:root:Sequence length will auto set at 95% of sequence length
    Traceback (most recent call last):
    File "keras_load.py", line 15, in <module>
        model.load_weights(weight_file)
    AttributeError: 'BiLSTM_CRF_Model' object has no attribute 'load_weights'
    """

    print(model.__doc__)
    print(dir(model))
    # WARNING:root:Sequence length will auto set at 95% of sequence length
    # Bidirectional LSTM CRF Sequence Labeling Model
    # ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__',
    # '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__task__', '__weakref__', 'build_model', 'build_model_arc', 'build_multi_gpu_model', 'build_tpu_model', 'compile_model', 'embedding', 'evaluate', 'fit',
    #  'fit_without_generator', 'get_data_generator', 'get_default_hyper_parameters', 'hyper_parameters', 'info', 'label2idx', 'model_info', 'pre_processor', 'predict', 'predict_entities', 'processor', 'save', 'task', 'tf_model', 'token2idx']

    test_x, test_y = ChineseDailyNerCorpus.load_data("test")
    print("\n test_x:\n{}\n\n".format(test_x[0:5]))
    # predictions = model.predict(test_x[0:5])
    predictions = model.predict_entities(test_x[0:5])
    print(predictions)
示例#6
0
    def test_bert_embedding(self):
        """Stack a BERT text embedding with a numeric-feature embedding."""
        text, label = ChineseDailyNerCorpus.load_data()
        # Random feature (value 1 or 2) per token position, 12 positions/sample.
        is_bold = np.random.randint(1, 3, (len(text), 12))

        # Download a small sample BERT checkpoint (cached under DATA_PATH).
        bert_path = get_file(
            'bert_sample_model',
            "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
            cache_dir=DATA_PATH,
            untar=True)

        text_embedding = BERTEmbedding(bert_path,
                                       task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stack_embedding = StackedEmbedding(
            [text_embedding, num_feature_embedding])
        stack_embedding.analyze_corpus((text, is_bold), label)

        tensor = stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
        print(tensor[0][0].shape)
        print(tensor[0][1].shape)
        print(tensor[1].shape)
        print(stack_embedding.embed_model.input_shape)
        print(stack_embedding.embed_model.summary())
        # Combined embedding: batch of 3, 12 timesteps, 24-dim features.
        r = stack_embedding.embed((text[:3], is_bold[:3]))
        assert r.shape == (3, 12, 24)
示例#7
0
    def test_embed(self):
        """Embedding output shapes for classification and labeling tasks."""
        single = ['我', '想', '看']
        batch = [single,
                 ['我', '想', '看', '权力的游戏'],
                 ['Hello', 'world']]

        # Classification task: sequence length is auto-detected (15 here).
        embedding = self.embedding_class(task=kashgari.CLASSIFICATION,
                                         **self.config)
        valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
        embedding.analyze_corpus(valid_x, valid_y)

        # 50257 is presumably the GPT-2 vocabulary size — embedding width.
        assert embedding.embed_one(single).shape == (15, 50257)
        assert embedding.embed(batch).shape == (3, 15, 50257)

        # Labeling task: fixed sequence length of 10.
        embedding = self.embedding_class(task=kashgari.LABELING,
                                         sequence_length=10,
                                         **self.config)
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
        embedding.analyze_corpus(valid_x, valid_y)

        assert embedding.embed_one(single).shape == (10, 50257)
        assert embedding.embed(batch).shape == (3, 10, 50257)
示例#8
0
    def test_init_with_processor(self):
        """An externally prepared processor can be injected into the embedding."""
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

        # Build the vocabulary up front with a stand-alone processor.
        processor = LabelingProcessor()
        processor.analyze_corpus(valid_x, valid_y)

        embedding = self.embedding_class(sequence_length=11,
                                         processor=processor,
                                         **self.config)
        embedding.analyze_corpus(valid_x, valid_y)

        expected_shape = (11, 50257)
        assert embedding.embed_one(['我', '想', '看']).shape == expected_shape
示例#9
0
    def run_with_model_class(self, model_class: Type[ABCLabelingModel], epochs: int):
        """Train *model_class* with BERT embeddings and return its test report.

        TensorBoard metric logs are written under ``log_root/<model name>``.
        """
        bert_path = get_bert_path()

        train_x, train_y = ChineseDailyNerCorpus.load_data('train')
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
        test_x, test_y = ChineseDailyNerCorpus.load_data('test')

        bert_embed = BertEmbedding(bert_path)
        model = model_class(bert_embed)

        log_path = os.path.join(log_root, model_class.__name__)
        file_writer = tf.summary.create_file_writer(log_path + "/metrics")
        file_writer.set_as_default()
        # Evaluate on the held-out test split after every epoch.
        callbacks = [EvalCallBack(model, test_x, test_y, step=1, truncating=True)]
        # callbacks = []
        model.fit(train_x, train_y, valid_x, valid_y, epochs=epochs, callbacks=callbacks)

        report = model.evaluate(test_x, test_y)
        # Drop references to the large model/embedding before the next run.
        del model
        del bert_embed
        return report
示例#10
0
    def test_embedding(self):
        """Stacked text + numeric-feature embedding yields a (3, 12, 116) tensor."""
        text, label = ChineseDailyNerCorpus.load_data()
        # Random feature (value 1 or 2) per token position, 12 positions/sample.
        is_bold = np.random.randint(1, 3, (len(text), 12))

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stacked = StackedEmbedding([text_embedding, num_feature_embedding])
        stacked.analyze_corpus((text, is_bold), label)

        sample = (text[:3], is_bold[:3])
        stacked.process_x_dataset(sample)
        assert stacked.embed(sample).shape == (3, 12, 116)
示例#11
0
    def test_base_use_case(self):
        """Train a tiny Seq2Seq, save it, reload it and compare predictions."""
        x, y = ChineseDailyNerCorpus.load_data('test')
        x = x[:200]
        y = y[:200]
        seq2seq = Seq2Seq(hidden_size=64,
                          encoder_seq_length=64,
                          decoder_seq_length=64)
        seq2seq.fit(x, y, epochs=1)
        res, att = seq2seq.predict(x)

        # Unique temp path per run via the current timestamp.
        model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        seq2seq.save(model_path)

        # The reloaded model must reproduce predictions and attention exactly.
        s2 = Seq2Seq.load_model(model_path)
        res2, att2 = s2.predict(x)

        assert res2 == res
        # Attention is array-like, so compare element-wise.
        assert (att2 == att).all()
示例#12
0
File: corpus.py  Project: JQIANG125/GPT
class NERCorpus(object):
    """Small collection of NER corpora for the test-suite.

    The built-in corpora are loaded once, at class-definition time.
    """
    chinese_daily = ChineseDailyNerCorpus.load_data('valid')
    coll2003 = CONLL2003ENCorpus.load_data('valid')

    # Test data for issue https://github.com/BrikerMan/Kashgari/issues/187
    custom_1 = (custom_x, custom_ner_y1)
    custom_2 = (custom_x, custom_ner_y2)

    @classmethod
    def load_corpus(cls, name=None):
        """Return the (x, y) pair for *name*, or a random corpus when None.

        Raises KeyError for an unknown name.
        """
        data_dict = {
            'chinese_daily': cls.chinese_daily,
            # BUG FIX: previously mapped to cls.chinese_daily, so the
            # CoNLL-2003 corpus was never actually returned.
            'coll2003': cls.coll2003,
            'custom_1': cls.custom_1,
            'custom_2': cls.custom_2
        }

        if name is None:
            name = random.choice(list(data_dict.keys()))
        return data_dict[name]
示例#13
0
    def test_batch_generator(self):
        """BatchDataSet.take() yields exactly the requested number of batches."""
        x, y = ChineseDailyNerCorpus.load_data('valid')

        text_processor = SequenceProcessor()
        label_processor = SequenceProcessor(build_vocab_from_labels=True,
                                            min_count=1)

        corpus_gen = CorpusGenerator(x, y)

        # Both processors build their vocabularies from the same generator.
        for processor in (text_processor, label_processor):
            processor.build_vocab_generator([corpus_gen])

        dataset = BatchDataSet(corpus_gen,
                               text_processor=text_processor,
                               label_processor=label_processor,
                               segment=False,
                               seq_length=None,
                               max_position=100,
                               batch_size=12)

        full_len = len(dataset)
        assert len(list(dataset.take(full_len))) == full_len
        assert len(list(dataset.take(1))) == 1
示例#14
0
        }

    def build_model_arc(self) -> None:
        """Assemble the BiLSTM labeling graph on top of the embedding model."""
        # One output unit per distinct label.
        output_dim = self.label_processor.vocab_size

        config = self.hyper_parameters
        embed_model = self.embedding.embed_model

        # NOTE(review): unlike the CNN-LSTM variant elsewhere in this project,
        # the Dense layer is not wrapped in TimeDistributed here. Dense applies
        # per-timestep on 3D input anyway, but confirm this is intentional.
        layer_stack = [
            L.Bidirectional(L.LSTM(**config['layer_blstm']),
                            name='layer_blstm'),
            L.Dropout(**config['layer_dropout'], name='layer_dropout'),
            L.Dense(output_dim, **config['layer_time_distributed']),
            L.Activation(**config['layer_activation'])
        ]
        # Chain the layers onto the embedding output.
        tensor = embed_model.output
        for layer in layer_stack:
            tensor = layer(tensor)

        self.tf_model = keras.Model(embed_model.inputs, tensor)


if __name__ == "__main__":
    from kashgari.corpus import ChineseDailyNerCorpus

    # Train two epochs on the train split, validate on 'valid', score on 'test'.
    x, y = ChineseDailyNerCorpus.load_data()
    x_valid, y_valid = ChineseDailyNerCorpus.load_data('valid')
    model = BiLSTM_Model()
    model.fit(x, y, x_valid, y_valid, epochs=2)
    model.evaluate(*ChineseDailyNerCorpus.load_data('test'))
示例#15
0
import unittest

import os
import time
import tempfile
import numpy as np
import kashgari
from tests.corpus import NERCorpus
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.embeddings import WordEmbedding
from kashgari.tasks.labeling import CNN_LSTM_Model
from kashgari.macros import DATA_PATH

from tensorflow.python.keras.utils import get_file

# Validation split loaded once at import time and shared by the tests below.
valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

# Download a small word2vec sample for embedding tests (cached in DATA_PATH).
sample_w2v_path = get_file('sample_w2v.txt',
                           "http://s3.bmio.net/kashgari/sample_w2v.txt",
                           cache_dir=DATA_PATH)


class TestCNN_LSTM_Model(unittest.TestCase):
    """Smoke tests for the CNN-LSTM sequence-labeling model."""
    @classmethod
    def setUpClass(cls):
        # Subclasses can override model_class to re-use these tests.
        cls.model_class = CNN_LSTM_Model

    def test_basic_use_build(self):
        # NOTE(review): this snippet appears truncated — the model is built
        # but no assertion or fit follows in the visible source.
        x, y = NERCorpus.load_corpus()

        model = self.model_class()
示例#16
0
import kashgari
from kashgari.tasks.labeling import BiLSTM_Model
from kashgari.corpus import ChineseDailyNerCorpus

# NOTE(review): load_data is called elsewhere with subset names
# ('train'/'valid'/'test'); verify that a file path is supported here.
test_x, test_y = ChineseDailyNerCorpus.load_data('./data/test.txt')
# Restore a previously saved model and score it on the test data.
model = kashgari.utils.load_model('saved_ner_model')
model.evaluate(test_x, test_y)
示例#17
0
import kashgari
from kashgari.embeddings import BERTEmbedding
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model
# NOTE(review): load_data is called elsewhere with subset names; verify that
# file paths are supported here.
train_x, train_y = ChineseDailyNerCorpus.load_data('./data/train.txt')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('./data/dev.txt')
test_x, test_y  = ChineseDailyNerCorpus.load_data('./data/test.txt')

# BERT embedding with sentences padded/truncated to 100 tokens.
bert_embed = BERTEmbedding('./chinese_L-12_H-768_A-12',
                           task=kashgari.LABELING,
                           sequence_length=100)

# Alternatives: `CNN_LSTM_Model`, `BiLSTM_Model`, `BiGRU_Model` or `BiGRU_CRF_Model`
model = BiLSTM_CRF_Model(bert_embed)
model.fit(train_x,
          train_y,
          x_validate=valid_x,
          y_validate=valid_y,
          epochs=20,
          batch_size=512)

model.save('saved_ner_model')
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.tasks.labeling import BiLSTM_CRF_Model

# Load the built-in dataset; swap in your own data as long as the format matches.
train_x, train_y = ChineseDailyNerCorpus.load_data('train')
test_x, test_y = ChineseDailyNerCorpus.load_data('test')
valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')



import kashgari
from kashgari.tasks.labeling import BiLSTM_Model
from kashgari.embeddings import BERTEmbedding

# BERTEmbedding also loads ERNIE-format checkpoints.
bert_embed = BERTEmbedding('baidu_ernie',
                           task=kashgari.LABELING,
                           sequence_length=100)
model = BiLSTM_Model(bert_embed)
model.fit(train_x, train_y, valid_x, valid_y)
示例#19
0
        config = self.hyper_parameters
        embed_model = self.embedding.embed_model

        layer_conv = L.Conv1D(**config['layer_conv'], name='layer_conv')
        layer_lstm = L.LSTM(**config['layer_lstm'], name='layer_lstm')
        layer_dropout = L.Dropout(**config['layer_dropout'],
                                  name='layer_dropout')
        layer_time_distributed = L.TimeDistributed(
            L.Dense(output_dim, **config['layer_time_distributed']),
            name='layer_time_distributed')
        layer_activation = L.Activation(**config['layer_activation'])

        tensor = layer_conv(embed_model.output)
        tensor = layer_lstm(tensor)
        tensor = layer_dropout(tensor)
        tensor = layer_time_distributed(tensor)
        output_tensor = layer_activation(tensor)

        self.tf_model = keras.Model(embed_model.inputs, output_tensor)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    from kashgari.corpus import ChineseDailyNerCorpus

    # NOTE(review): despite the names, this loads the *train* split and both
    # trains and evaluates on the same data.
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('train')

    model = BiLSTM_CRF_Model()
    model.fit(valid_x, valid_y, epochs=50, batch_size=64)
    model.evaluate(valid_x, valid_y)
示例#20
0
                logging.debug('------ sample {} ------'.format(index))
                logging.debug('x      : {}'.format(x_data[index]))
                logging.debug('y_true : {}'.format(y_true[index]))
                logging.debug('y_pred : {}'.format(y_pred[index]))
        report = classification_report(y_true, y_pred, digits=digits)
        print(classification_report(y_true, y_pred, digits=digits))
        return report

    def build_model_arc(self):
        # Abstract hook: concrete model classes must build self.tf_model here.
        raise NotImplementedError


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    from kashgari.tasks.labeling import BiLSTM_Model
    from kashgari.corpus import ChineseDailyNerCorpus
    from kashgari.utils import load_model

    train_x, train_y = ChineseDailyNerCorpus.load_data('train', shuffle=False)
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

    train_x, train_y = train_x[:5120], train_y[:5120]

    # NOTE(review): hard-coded developer path — this only runs on that machine.
    model = load_model('/Users/brikerman/Desktop/blstm_model')
    # model.build_model(train_x[:100], train_y[:100])

    # model.fit(train_x[:1000], train_y[:1000], epochs=10)

    # model.evaluate(train_x[:20], train_y[:20])
    print("Hello world")
示例#21
0
        result = []
        for seq in sequences:
            if self.add_bos_eos:
                seq = [self.token_pad] + seq + [self.token_pad]
            result.append([self.label_word2idx[label] for label in seq])
        return result

    def reverse_numerize_label_word_sequences(self, sequences, lengths=None):
        """Map index sequences back to their label words.

        When ``self.add_bos_eos`` is set, the leading BOS index is dropped from
        each sequence; when *lengths* is given, each result is truncated to the
        corresponding entry.
        """
        result = []
        for position, seq in enumerate(sequences):
            if self.add_bos_eos:
                # Strip the BOS token prepended during numerization.
                seq = seq[1:]
            labels = [self.idx2label_word[idx] for idx in seq]
            if lengths is not None:
                labels = labels[:lengths[position]]
            result.append(labels)
        return result


if __name__ == "__main__":
    from kashgari.corpus import ChineseDailyNerCorpus

    # Build vocabularies from the corpus, then vectorize three sample indices.
    x, y = ChineseDailyNerCorpus.load_data()
    p = LabelingProcessor()
    p.analyze_corpus(x, y)
    # subset selects only the samples at indices 10, 12 and 20.
    r = p.process_x_dataset(x, subset=[10, 12, 20])
    print(r)
示例#22
0
    def get_train_examples(self, *args):
        """Return training examples built from the Chinese Daily NER train split."""
        train_x, train_y = ChineseDailyNerCorpus.load_data('train')

        return self._create_example(self.load_data2set(train_x, train_y),
                                    "train")
from kashgari.corpus import ChineseDailyNerCorpus

# Load all three splits and print a few samples for manual inspection.
# NOTE(review): the subset here is "validate" while other snippets use
# 'valid' — confirm which name this corpus version accepts.
train_x, train_y = ChineseDailyNerCorpus.load_data("train")
valid_x, valid_y = ChineseDailyNerCorpus.load_data("validate")
test_x, test_y = ChineseDailyNerCorpus.load_data("test")

print("train len: {}".format(len(train_x)))
print("valid len: {}".format(len(valid_x)))

print("test_x len: {}".format(len(test_x)))
print("test_y len: {}".format(len(test_y)))

print("\n\n")

print("test_x[0]: {}".format(test_x[0]))
print("test_y[0]: {}".format(test_y[0]))

print("\n\n")

print("test_x[1]: {}".format(test_x[1]))
print("test_y[1]: {}".format(test_y[1]))

# for i in range(len(test_x)):
#     print("test_x[{}]: {}".format(i, test_x[i]))
#     print("test_y[{}]: {}".format(i, test_y[i]))
#     continue
示例#24
0
from kashgari.utils import load_model
from kashgari.corpus import ChineseDailyNerCorpus

# Restore a trained NER model and exercise evaluate/predict/predict_entities.
model = load_model('models/ner.h5')

print(dir(model))

test_x, test_y = ChineseDailyNerCorpus.load_data("test")
print("\n test_x:\n{}\n\n".format(test_x[0:5]))

# Score on a five-sample slice only.
metrics = model.evaluate(test_x[0:5], test_y[0:5])
print("\n\n")
print(metrics)
print("\n\n")

print("\n=================predicton==============\n")
predictions = model.predict(test_x[0:5])
print(predictions)
print("\n\n")

print("\n=================predicton entities==============\n")
predictions = model.predict_entities(test_x[0:5])
print(predictions)
示例#25
0
 def get_dev_examples(self, *args):
     """Return dev examples built from the 'validate' split."""
     valid_x, valid_y = ChineseDailyNerCorpus.load_data('validate')
     return self._create_example(self.load_data2set(valid_x, valid_y),
                                 "dev")
示例#26
0
        """
        pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)

        with open(os.path.join(model_path, 'model_info.json'), 'w') as f:
            f.write(json.dumps(self.info(), indent=2, ensure_ascii=True))
            f.close()

        self.tf_model.save_weights(os.path.join(model_path,
                                                'model_weights.h5'))
        logging.info('model saved to {}'.format(os.path.abspath(model_path)))


if __name__ == "__main__":
    from kashgari.tasks.labeling import CNN_LSTM_Model
    from kashgari.corpus import ChineseDailyNerCorpus

    # NOTE(review): despite the names, this loads the 'valid' split for training.
    train_x, train_y = ChineseDailyNerCorpus.load_data('valid')

    model = CNN_LSTM_Model()
    # build_model constructs the graph without training it.
    model.build_model(train_x[:100], train_y[:100])
    r = model.predict_entities(train_x[:5])
    model.save('./res')
    import pprint

    pprint.pprint(r)
    model.evaluate(train_x[:20], train_y[:20])
    print("Hello world")

    print(model.predict(train_x[:20]))
示例#27
0
 def get_test_examples(self, *args):
     """Return up to 1000 test examples built from the 'test' split."""
     test_x, test_y = ChineseDailyNerCorpus.load_data('test')
     return self._create_example(
         self.load_data2set(test_x, test_y, max_num=1000), "test")