Пример #1
0
 def test_speed(self):
     start = time.time()
     tokens = text.split()
     tokens = [(token, ) for token in tokens]
     transformer = TaggedTransformer(template)
     x2 = transformer.transform([tokens])
     end = time.time()
     duration = end - start
     self.assertLess(duration, 3)
Пример #2
0
class TestTaggedFeature(TestCase):
    def setUp(self):
        self.tagged_transformer = TaggedTransformer(templates)

    def test_word2features(self):
        sent = [["người", "N", "B-NP"], ["nghèo", "A", "I-NP"]]
        actual = self.tagged_transformer.transform([sent])[0][0][0]
        expected = "T[0].lower=người"
        self.assertEqual(expected, actual)
Пример #3
0
    def train(self, params):
        features = self.tagger.features
        print(features)
        transformer = TaggedTransformer(features)
        logger.info("Start feature extraction")
        X_train, y_train = transformer.transform(self.corpus.train,
                                                 contain_labels=True)
        X_test, y_test = transformer.transform(self.corpus.test,
                                               contain_labels=True)
        logger.info("Finish feature extraction")

        # Train
        logger.info("Start train")
        trainer = pycrfsuite.Trainer(verbose=True)
        for xseq, yseq in zip(X_train, y_train):
            trainer.append(xseq, yseq)
        trainer.set_params(params)
        filename = 'tmp/model.tmp'
        trainer.train(filename)
        logger.info("Finish train")

        # Tagger
        logger.info("Start tagger")
        tagger = pycrfsuite.Tagger()
        tagger.open(filename)
        y_pred = [tagger.tag(x_seq) for x_seq in X_test]
        sentences = [[item[0] for item in sentence]
                     for sentence in self.corpus.test]
        sentences = zip(sentences, y_test, y_pred)
        texts = []
        for s in sentences:
            tokens, y_true, y_pred = s
            tokens_ = ["\t".join(item) for item in zip(tokens, y_true, y_pred)]
            text = "\n".join(tokens_)
            texts.append(text)
        text = "\n\n".join(texts)
        open("tmp/output.txt", "w").write(text)
        evaluate_("tmp/output.txt")
        logger.info("Finish tagger")
Пример #4
0
def train(train_path, model_path):
    train_set = []

    train_set += load_dataset(train_path)
    print("Load data from file", train_path)
    transformer = TaggedTransformer(template)
    X, y = transformer.transform(train_set, contain_labels=True)

    # train
    params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  #
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    folder = dirname(model_path)
    try:
        makedirs(folder)
    except:
        pass
    estimator = CRF(params=params, filename=model_path)
    estimator.fit(X, y)
Пример #5
0
    # =========================================================================#
    #                                Transformer
    # =========================================================================#
    template = [
        "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",
        "T[0].istitle", "T[-1].istitle", "T[1].istitle", "T[-2].istitle", "T[2].istitle",
        # word unigram and bigram
        "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
        "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
        # pos unigram and bigram
        "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]",
        "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]",
        # ner
        "T[-3][3]", "T[-2][3]", "T[-1][3]",
    ]
    transformer = TaggedTransformer(template)

    # flow.transform(transformer)
    X_train, y_train = transformer.transform(train_sentences)
    X_test, y_test = transformer.transform(test_sentences)


    # =========================================================================#
    #                               Models
    # =========================================================================#
    parameters = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  #
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
Пример #6
0
Файл: main.py Проект: anhlbt/ner
    # =========================================================================#
    #                                Transformer
    # =========================================================================#
    template = [
        "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower",
        "T[0].istitle", "T[-1].istitle", "T[1].istitle", "T[-2].istitle", "T[2].istitle",
        # word unigram and bigram
        "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]",
        "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]",
        # pos unigram and bigram
        "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]",
        "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]",
        # ner
        "T[-3][3]", "T[-2][3]", "T[-1][3]",
    ]
    transformer = TaggedTransformer(template)

    flow.transform(transformer)

    # =========================================================================#
    #                               Models
    # =========================================================================#
    crf_params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,  #
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    flow.add_model(Model(CRF(params=crf_params), "CRF"))
Пример #7
0
    # "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict", "T[1].is_in_dict", "T[2].is_in_dict",
    # "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict", "T[1,2].is_in_dict",
    # "T[-2,0].is_in_dict", "T[-1,1].is_in_dict", "T[0,2].is_in_dict",

    # "T[-2,-1].lower", "T[-1,0].lower", "T[0,1].lower", "T[1,2].lower",
    # word unigram and bigram and trigram
    "T[-2]",
    "T[-1]",
    "T[0]",
    "T[1]",
    "T[2]",
    "T[-2,-1]",
    "T[-1,0]",
    "T[0,1]",
    "T[1,2]",
    "T[-2,0]",
    "T[-1,1]",
    "T[0,2]",
]
transformer = TaggedTransformer(features)
import time
durations = []
for i in range(10):
    start = time.time()
    X_train, y_train = transformer.transform(corpus.train, contain_labels=True)
    end = time.time()
    duration = end - start
    durations.append(duration)
    print(duration)
print(pd.Series(durations).describe())
Пример #8
0
 def setUp(self):
     self.tagged_transformer = TaggedTransformer(templates)