    def test_embedding(self):
        embed = NumericFeaturesEmbedding(2,
                                         feature_name='is_bold',
                                         sequence_length=10,
                                         embedding_size=30)
        embed.embed_model.summary()
        assert embed.embed_one([1, 2]).shape == (10, 30)
        assert embed.embed([[1, 2]]).shape == (1, 10, 30)
Example #2
    def test_training(self):
        text = ['NLP', 'Projects', 'Project', 'Name', ':']
        start_of_p = [1, 2, 1, 2, 2]
        bold = [1, 1, 1, 1, 2]
        center = [1, 1, 2, 2, 2]
        label = [
            'B-Category', 'I-Category', 'B-ProjectName', 'I-ProjectName',
            'I-ProjectName'
        ]

        text_list = [text] * 300
        start_of_p_list = [start_of_p] * 300
        bold_list = [bold] * 300
        center_list = [center] * 300
        label_list = [label] * 300

        # You can use WordEmbedding or BERTEmbedding for your text embedding
        SEQUENCE_LEN = 100
        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=SEQUENCE_LEN)
        start_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='start_of_p',
            sequence_length=SEQUENCE_LEN)

        bold_embedding = NumericFeaturesEmbedding(feature_count=2,
                                                  feature_name='bold',
                                                  sequence_length=SEQUENCE_LEN,
                                                  embedding_size=10)

        center_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='center',
            sequence_length=SEQUENCE_LEN)

        # The first embedding in the stack must be the text embedding
        stack_embedding = StackedEmbedding([
            text_embedding, start_of_p_embedding, bold_embedding,
            center_embedding
        ])

        x = (text_list, start_of_p_list, bold_list, center_list)
        y = label_list
        stack_embedding.analyze_corpus(x, y)

        model = BiLSTM_Model(embedding=stack_embedding)
        model.build_model(x, y)
        model.tf_model.summary()

        model.fit(x, y, epochs=2)

        model_path = os.path.join('./saved_models/',
                                  model.__class__.__module__,
                                  model.__class__.__name__)
        model.save(model_path)

        new_model = kashgari.utils.load_model(model_path)
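
As a follow-up, here is a minimal, hedged sketch of running inference with the reloaded model. It reuses text_list, start_of_p_list, bold_list and center_list from the snippet above and assumes the model returned by kashgari.utils.load_model exposes the same predict API as the original labeling model:

        # With a StackedEmbedding, x must stay a tuple ordered exactly like the
        # embeddings passed to StackedEmbedding (text first, then the features).
        sample_x = (text_list[:2], start_of_p_list[:2], bold_list[:2],
                    center_list[:2])
        predictions = new_model.predict(sample_x)
        print(predictions)  # e.g. [['B-Category', 'I-Category', ...], ...]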
Example #3
    def test_bert_embedding(self):
        text, label = ChineseDailyNerCorpus.load_data()
        is_bold = np.random.randint(1, 3, (len(text), 12))

        bert_path = get_file(
            'bert_sample_model',
            "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
            cache_dir=DATA_PATH,
            untar=True)

        text_embedding = BERTEmbedding(bert_path,
                                       task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stack_embedding = StackedEmbedding(
            [text_embedding, num_feature_embedding])
        stack_embedding.analyze_corpus((text, is_bold), label)

        tensor = stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
        print(tensor[0][0].shape)
        print(tensor[0][1].shape)
        print(tensor[1].shape)
        print(stack_embedding.embed_model.input_shape)
        print(stack_embedding.embed_model.summary())
        r = stack_embedding.embed((text[:3], is_bold[:3]))
        assert r.shape == (3, 12, 24)
Example #4
    def test_embedding(self):
        text, label = ChineseDailyNerCorpus.load_data()
        is_bold = np.random.randint(1, 3, (len(text), 12))

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=12)
        num_feature_embedding = NumericFeaturesEmbedding(2,
                                                         'is_bold',
                                                         sequence_length=12)

        stack_embedding = StackedEmbedding(
            [text_embedding, num_feature_embedding])
        stack_embedding.analyze_corpus((text, is_bold), label)

        stack_embedding.process_x_dataset((text[:3], is_bold[:3]))
        r = stack_embedding.embed((text[:3], is_bold[:3]))
        assert r.shape == (3, 12, 116)
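
A hedged note on the asserted shape: the final dimension (116) is presumably the concatenation of the two sub-embeddings' output sizes, since BareEmbedding defaults to an embedding_size of 100 and NumericFeaturesEmbedding defaults to 8 * feature_count = 16 when no embedding_size is given. Assuming both embedding objects expose an embedding_size attribute, this can be checked directly:

        # Hedged check: 100 (BareEmbedding default) + 16 (8 * feature_count)
        assert (text_embedding.embedding_size
                + num_feature_embedding.embedding_size) == 116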
Example #5
    def train(self, tokens, tags):

        x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)

        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=self.chunk_size)
        first_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='first_of_p',
            sequence_length=self.chunk_size)

        stack_embedding = StackedEmbedding(
            [text_embedding, first_of_p_embedding])

        stack_embedding.analyze_corpus(x, y)

        from kashgari.tasks.labeling import BiLSTM_CRF_Model
        self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
        self.model.fit(x, y, batch_size=1, epochs=20)
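
The snippet only covers training; a companion inference method might look roughly like the sketch below. It assumes the inputs are already chunked the same way prepare_data_fit chunks them (the method name and exact input format here are hypothetical) and that the trained BiLSTM_CRF_Model exposes the standard predict API:

    def predict(self, tokens, first_of_p):
        # Hypothetical companion to train(): inputs must mirror the
        # StackedEmbedding order used above (text first, then the
        # 'first_of_p' numeric feature), chunked to chunk_size.
        x = (tokens, first_of_p)
        return self.model.predict(x)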