def _get_pretrain_model():
    """Train a fresh PerceptronTagger, pickle it, and print its accuracy.

    Relies on module-level ``train_sents``, ``test_sents`` and ``PICKLE``
    (defined elsewhere in this file).
    """
    # load=False gives an untrained tagger; train() fits it and writes
    # the pickled model to PICKLE.
    tagger = PerceptronTagger(load=False)
    tagger.train(sentences=train_sents, save_loc=PICKLE)
    print('Accuracy : ', tagger.evaluate(test_sents))
ret, tags, err = pipe_through_prog(prog, ' '.join(tokens)) return tags def tag_sents(self, sentences): text = [] for s in sentences: text.append(' '.join(s)) return self.tag(text) def evaluate(self, gold): tagged_sents = self.tag_sents(untag(sent) for sent in gold) gold_tokens = list(itertools.chain(*gold)) return accuracy(gold_tokens, tagged_sents) if __name__ == '__main__': sents = treebank.tagged_sents() PT = PerceptronTagger() now = time.time() PT.tag_sents(untag(sent) for sent in sents) pt_time = time.time() - now headers = ['Library', 'Accuracy', 'Time (sec)'] table = [['NLTK', round(PT.evaluate(sents), 3), round(pt_time, 3)], [ 'Prose', round(APTagger().evaluate(sents), 3), round(AP_TIME, 3) ]] print(tabulate(table, headers, tablefmt='pipe'))
gold_tokens = list(itertools.chain(*gold)) print(json.dumps(gold_tokens)) print(len(tagged_sents), len(gold_tokens)) return accuracy(gold_tokens, tagged_sents) if __name__ == '__main__': sents = treebank.tagged_sents() PT = PerceptronTagger() print("Timing NLTK ...") pt_times = [] for _ in range(5): now = time.time() PT.tag_sents(untag(sent) for sent in sents) pt_times.append(time.time() - now) pt_time = round(sum(pt_times) / len(pt_times), 3) '''NOTE: Moved to tag_test.go print("Timing prose ...") acc = round(APTagger().evaluate(sents), 3) ap_time = round(sum(AP_TIME) / len(AP_TIME), 3) ''' print("Evaluating accuracy ...") headers = ['Library', 'Accuracy', '5-Run Average (sec)'] table = [ ['NLTK', round(PT.evaluate(sents), 3), pt_time], # ['`prose`', acc, ap_time] ] print(tabulate(table, headers, tablefmt='pipe'))
import cowparser as cp
from itertools import islice

from nltk.tag.perceptron import PerceptronTagger


def _take_sentences(source, count):
    """Pull up to `count` sentences from `source`, reducing each
    (token, tag, extra) triple to a (token, tag) pair."""
    return [[(token, tag) for token, tag, _ in data]
            for _metadata, data in islice(source, count)]


# Stream the corpus once: the first slice becomes the training set and
# the following slice the test set, so the two sets are disjoint.
gen = cp.sentences_for_dir(separate=False)

# The original loops broke *after* appending when i hit the limit, so
# the counts are inclusive: 2,000,001 training and 5,001 test sentences.
train_sents = _take_sentences(gen, 2000001)
test_sents = _take_sentences(gen, 5001)

# load=False gives an untrained tagger; train() fits it and saves the
# model to the given path.
pt = PerceptronTagger(load=False)
pt.train(train_sents, 'model2.perc.dutch_tagger')
print(pt.evaluate(test_sents))