def test_speed(self):
    """Feature extraction over the sample text must finish in under 3 seconds.

    The timer deliberately starts before tokenisation so the measured
    duration covers the whole pipeline, as in the original benchmark.
    """
    started_at = time.time()
    words = text.split()
    tagged = [(word,) for word in words]
    extractor = TaggedTransformer(template)
    extracted = extractor.transform([tagged])
    elapsed = time.time() - started_at
    self.assertLess(elapsed, 3)
class TestTaggedFeature(TestCase):
    """Unit tests for TaggedTransformer feature extraction."""

    def setUp(self):
        # Fresh transformer per test, built from the module-level templates.
        self.tagged_transformer = TaggedTransformer(templates)

    def test_word2features(self):
        """First feature of the first token is its lowercased surface form."""
        sentence = [["người", "N", "B-NP"], ["nghèo", "A", "I-NP"]]
        transformed = self.tagged_transformer.transform([sentence])
        actual = transformed[0][0][0]
        expected = "T[0].lower=người"
        self.assertEqual(expected, actual)
def train(self, params):
    """Train a CRF model on the corpus train split, tag the test split, and
    write a token/gold/predicted report to tmp/output.txt for evaluation.

    Args:
        params: dict of pycrfsuite trainer parameters (c1, c2,
            max_iterations, feature.possible_transitions, ...).

    Side effects: writes tmp/model.tmp (trained model) and tmp/output.txt
    (tab-separated report), then runs evaluate_() on the report.
    """
    features = self.tagger.features
    # Route the feature dump through the logger instead of a stray print().
    logger.debug("features: %s", features)
    transformer = TaggedTransformer(features)

    logger.info("Start feature extraction")
    X_train, y_train = transformer.transform(self.corpus.train, contain_labels=True)
    X_test, y_test = transformer.transform(self.corpus.test, contain_labels=True)
    logger.info("Finish feature extraction")

    # Train
    logger.info("Start train")
    trainer = pycrfsuite.Trainer(verbose=True)
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
    trainer.set_params(params)
    filename = 'tmp/model.tmp'
    trainer.train(filename)
    logger.info("Finish train")

    # Tagger
    logger.info("Start tagger")
    tagger = pycrfsuite.Tagger()
    tagger.open(filename)
    y_pred = [tagger.tag(x_seq) for x_seq in X_test]
    sentences = [[item[0] for item in sentence] for sentence in self.corpus.test]
    texts = []
    # One "token<TAB>gold<TAB>predicted" line per token, blank line between
    # sentences. Loop variables renamed so they no longer shadow y_pred.
    for tokens, gold_seq, pred_seq in zip(sentences, y_test, y_pred):
        rows = ["\t".join(item) for item in zip(tokens, gold_seq, pred_seq)]
        texts.append("\n".join(rows))
    report = "\n\n".join(texts)
    # Context manager closes the handle deterministically; the original
    # open(...).write(...) leaked it until garbage collection.
    with open("tmp/output.txt", "w") as f:
        f.write(report)
    evaluate_("tmp/output.txt")
    logger.info("Finish tagger")
def train(train_path, model_path):
    """Train a CRF tagger from a dataset file and persist it to model_path.

    Args:
        train_path: path to a dataset file readable by load_dataset().
        model_path: destination file for the trained CRF model; its parent
            directory is created if missing.
    """
    train_set = []
    train_set += load_dataset(train_path)
    print("Load data from file", train_path)
    transformer = TaggedTransformer(template)
    X, y = transformer.transform(train_set, contain_labels=True)

    # train
    params = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 1000,
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }
    folder = dirname(model_path)
    # exist_ok=True replaces the bare `except: pass`, which also swallowed
    # real OSErrors (e.g. permission failures). The guard handles a bare
    # filename, where dirname() returns "" and makedirs would raise.
    if folder:
        makedirs(folder, exist_ok=True)
    estimator = CRF(params=params, filename=model_path)
    estimator.fit(X, y)
# =========================================================================# # Transformer # =========================================================================# template = [ "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower", "T[0].istitle", "T[-1].istitle", "T[1].istitle", "T[-2].istitle", "T[2].istitle", # word unigram and bigram "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]", "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]", # pos unigram and bigram "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]", "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]", # ner "T[-3][3]", "T[-2][3]", "T[-1][3]", ] transformer = TaggedTransformer(template) # flow.transform(transformer) X_train, y_train = transformer.transform(train_sentences) X_test, y_test = transformer.transform(test_sentences) # =========================================================================# # Models # =========================================================================# parameters = { 'c1': 1.0, # coefficient for L1 penalty 'c2': 1e-3, # coefficient for L2 penalty 'max_iterations': 1000, # # include transitions that are possible, but not observed 'feature.possible_transitions': True
# =========================================================================# # Transformer # =========================================================================# template = [ "T[-2].lower", "T[-1].lower", "T[0].lower", "T[1].lower", "T[2].lower", "T[0].istitle", "T[-1].istitle", "T[1].istitle", "T[-2].istitle", "T[2].istitle", # word unigram and bigram "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]", "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]", # pos unigram and bigram "T[-2][1]", "T[-1][1]", "T[0][1]", "T[1][1]", "T[2][1]", "T[-2,-1][1]", "T[-1,0][1]", "T[0,1][1]", "T[1,2][1]", # ner "T[-3][3]", "T[-2][3]", "T[-1][3]", ] transformer = TaggedTransformer(template) flow.transform(transformer) # =========================================================================# # Models # =========================================================================# crf_params = { 'c1': 1.0, # coefficient for L1 penalty 'c2': 1e-3, # coefficient for L2 penalty 'max_iterations': 1000, # # include transitions that are possible, but not observed 'feature.possible_transitions': True } flow.add_model(Model(CRF(params=crf_params), "CRF"))
# "T[-2].is_in_dict", "T[-1].is_in_dict", "T[0].is_in_dict", "T[1].is_in_dict", "T[2].is_in_dict", # "T[-2,-1].is_in_dict", "T[-1,0].is_in_dict", "T[0,1].is_in_dict", "T[1,2].is_in_dict", # "T[-2,0].is_in_dict", "T[-1,1].is_in_dict", "T[0,2].is_in_dict", # "T[-2,-1].lower", "T[-1,0].lower", "T[0,1].lower", "T[1,2].lower", # word unigram and bigram and trigram "T[-2]", "T[-1]", "T[0]", "T[1]", "T[2]", "T[-2,-1]", "T[-1,0]", "T[0,1]", "T[1,2]", "T[-2,0]", "T[-1,1]", "T[0,2]", ] transformer = TaggedTransformer(features) import time durations = [] for i in range(10): start = time.time() X_train, y_train = transformer.transform(corpus.train, contain_labels=True) end = time.time() duration = end - start durations.append(duration) print(duration) print(pd.Series(durations).describe())
def setUp(self):
    """Build the TaggedTransformer under test from the shared templates.

    `templates` is defined elsewhere in the module — presumably the
    feature-template list used across these tests; confirm at file top.
    """
    self.tagged_transformer = TaggedTransformer(templates)