예제 #1
0
def get_tagger():
    """Build and return a Brill tagger trained on Brown-corpus sentences.

    The initial tagger is a backoff chain: bigram -> regexp ``patterns``
    -> default 'NN'.  A FastBrillTaggerTrainer then learns up to 100
    transformation rules (min_score=3) on top of it.

    Fix over the original: the unused ``gold_data``/``testing_data``
    locals (dead code — computed but never read) were removed.
    """
    d_tagger = nltk.DefaultTagger('NN')
    re_tagger = nltk.RegexpTagger(patterns, backoff=d_tagger)

    print("Loading tagged data... ")
    tagged_data = brown_tagged_sents
    # Train on the first 80% of the first 1000 sentences.
    cutoff = int(1000 * .8)
    training_data = tagged_data[:cutoff]
    print("Done loading.")

    bigram_tagger = tag.BigramTagger(training_data, backoff=re_tagger)

    # Standard Brill template set: symmetric tag/word contexts within
    # 1-3 tokens, plus the two asymmetric (-1, +1) contexts.
    templates = [
      brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
      brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
      brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
      brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1)),
      ]
    trainer = brill.FastBrillTaggerTrainer(bigram_tagger, templates, 0)
    brill_tagger = trainer.train(training_data, max_rules=100, min_score=3)

    return brill_tagger
예제 #2
0
    def train(self, sentence_list):
        """Train this tagger from the tagged sentences provided.

        Builds a sequential backoff chain (default 'NN' -> affix ->
        unigram -> bigram -> trigram) over *sentence_list*, then refines
        it with Brill transformation rules (max 100 rules, min score 3).
        The resulting tagger is stored on ``self.tagger``.
        """
        # Fold the sequential taggers into one backoff chain.
        fallback = DefaultTagger('NN')
        for tagger_cls in (AffixTagger, UnigramTagger,
                           BigramTagger, TrigramTagger):
            fallback = tagger_cls(sentence_list, backoff=fallback)

        # Symmetric tag/word templates at the four standard spans,
        # followed by the two asymmetric (-1, +1) templates.
        sym_bounds = [(1, 1), (2, 2), (1, 2), (1, 3)]
        templates = [
            brill.SymmetricProximateTokensTemplate(rule, bound)
            for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule)
            for bound in sym_bounds
        ]
        templates.append(
            brill.ProximateTokensTemplate(brill.ProximateTagsRule,
                                          (-1, -1), (1, 1)))
        templates.append(
            brill.ProximateTokensTemplate(brill.ProximateWordsRule,
                                          (-1, -1), (1, 1)))

        trainer = brill.FastBrillTaggerTrainer(fallback, templates)
        self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
예제 #3
0
def train_brill_tagger(initial_tagger, train_sents, end, trace=0, **kwargs):
    """Train a Brill tagger on *train_sents*, seeded by *initial_tagger*.

    Uses one symmetric tag-context and one symmetric word-context
    template, each spanning positions 1..*end*.  Extra keyword
    arguments are forwarded to ``trainer.train``.
    """
    span = (1, end)
    templates = [
        brill.SymmetricProximateTokensTemplate(rule_kind, span)
        for rule_kind in (brill.ProximateTagsRule, brill.ProximateWordsRule)
    ]

    trainer = brill.FastBrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=trace)
    return trainer.train(train_sents, **kwargs)
예제 #4
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger on *train_sents*, starting from *initial_tagger*.

    Template set: one symmetric tag-context and one symmetric
    word-context template over the four standard bounds, plus one
    asymmetric (-1, +1) template per rule kind.  Extra keyword
    arguments are forwarded to ``trainer.train``.
    """
    sym_bounds = [(1, 1), (2, 2), (1, 2), (1, 3)]
    asym_bounds = [(-1, -1), (1, 1)]
    rule_kinds = (brill.ProximateTagsRule, brill.ProximateWordsRule)

    templates = [
        brill.SymmetricProximateTokensTemplate(rule, *sym_bounds)
        for rule in rule_kinds
    ]
    for rule in rule_kinds:
        templates.append(brill.ProximateTokensTemplate(rule, *asym_bounds))

    trainer = brill.FastBrillTaggerTrainer(
        initial_tagger, templates, deterministic=True)
    return trainer.train(train_sents, **kwargs)
예제 #5
0
def get_brill(train_sents=None):
    """Return a Brill ('braubt') tagger trained on *train_sents*.

    When *train_sents* is None, the CoNLL-2000 and Brown tagged
    sentences are used.  The initial tagger is an affix/unigram/
    bigram/trigram backoff chain over a regexp tagger built from
    ``word_patterns``.
    """
    import nltk, nltk.tag
    from nltk.tag import brill

    if train_sents is None:
        train_sents = (nltk.corpus.conll2000.tagged_sents()
                       + nltk.corpus.brown.tagged_sents())

    sequential_classes = [
        nltk.tag.AffixTagger,
        nltk.tag.UnigramTagger,
        nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger,
    ]
    raubt_tagger = backoff_tagger(
        train_sents,
        sequential_classes,
        backoff=nltk.tag.RegexpTagger(word_patterns))

    # Symmetric tag/word templates at the four standard spans, then
    # the two asymmetric (-1, +1) templates.
    sym_bounds = [(1, 1), (2, 2), (1, 2), (1, 3)]
    templates = [
        brill.SymmetricProximateTokensTemplate(rule, bound)
        for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule)
        for bound in sym_bounds
    ]
    for rule in (brill.ProximateTagsRule, brill.ProximateWordsRule):
        templates.append(
            brill.ProximateTokensTemplate(rule, (-1, -1), (1, 1)))

    trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)
    return braubt_tagger
    def __init__(self, msr_data=True, msr_data_location=params.msr_dataset):
        """Build a Bangla Brill tagger from the NLTK Indian corpus,
        optionally augmented with the MSR XML dataset.

        @param msr_data: also read tagged sentences from the MSR XML files
        @param msr_data_location: directory containing the MSR XML files

        Fixes over the original: the Py2-only tuple-parameter lambda
        (``lambda (a, b): ...``, removed by PEP 3113) is replaced with a
        comprehension; the lazy ``map`` objects are materialized as lists
        (under Python 3 they would be exhausted by the UnigramTagger
        check before being appended); the working directory is restored
        in a ``finally`` so a parse error cannot leave it changed.
        """
        tagged_sents = nltk.corpus.indian.tagged_sents(fileids="bangla.pos")

        # Keep only sentences a UnigramTagger accepts; malformed entries
        # raise ValueError and are dropped.
        filtered_sents1 = []
        for sent in tagged_sents:
            try:
                nltk.tag.UnigramTagger([sent])
                filtered_sents1.append(sent)
            except ValueError:
                pass

        filtered_sents2 = []

        if msr_data:
            cwdir = os.getcwd()
            listing = os.listdir(msr_data_location)
            os.chdir(msr_data_location)
            try:
                for fileName in listing:
                    xmlTree = parse(fileName)
                    nodes = xmlTree.getElementsByTagName('sentence')
                    try:
                        for node in nodes:
                            tokens = node.childNodes[0].nodeValue.split()
                            # Each token looks like "word\TAG.extra": keep the
                            # word and the first dotted component of the tag.
                            split_tokens = [t.split("\\") for t in tokens]
                            pairs = [(a.encode("utf-8"),
                                      b.encode().split(".")[0])
                                     for a, b in split_tokens]
                            nltk.tag.UnigramTagger([pairs])
                            filtered_sents2.append(pairs)
                    except ValueError:
                        pass
            finally:
                # Always restore the working directory, even on error.
                os.chdir(cwdir)

        total_set = filtered_sents1 + filtered_sents2

        aubt_tagger = [
            nltk.tag.AffixTagger, nltk.tag.UnigramTagger,
            nltk.tag.BigramTagger, nltk.tag.TrigramTagger
        ]
        # Standard Brill template set: symmetric tag/word contexts at the
        # four usual spans plus the asymmetric (-1, +1) contexts.
        brill_templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                                   (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                                   (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                          (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                          (1, 1))
        ]

        btrainer = nltk.tag.brill.FastBrillTaggerTrainer(
            self.backoff_tagger(total_set, aubt_tagger), brill_templates)
        self.tagger = btrainer.train(total_set, max_rules=300, min_score=3)
예제 #7
0
# Author: Leonel F. de Alencar <*****@*****.**>
#
# URL: <http://sourceforge.net/projects/aelius/>
# For license information, see LICENSE.TXT
#
# $Id: ConstroiBRUBT.py $

from pickle import dump
from nltk.tag import brill
from nltk.corpus import TaggedCorpusReader
from .Extras import carrega
from .AnotaCorpus import abre_etiquetador
from .ConstroiRUBT import EXEMPLO, SENTENCA

# Standard Brill transformation templates: symmetric tag/word contexts
# at spans (1,1), (2,2), (1,2) and (1,3), plus the two asymmetric
# (-1, +1) tag and word contexts.
TEMPLATES = [
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
    brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
    brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1))
]

# Baseline tagger loaded from the pickled RUBT model shipped with Aelius.
INICIAL = carrega("AeliusRUBT.pkl")


def treina(expressao_regular,
예제 #8
0
def demo(num_sents=100,
         max_rules=200,
         min_score=2,
         error_output="errors.out",
         rule_output="rules.out",
         randomize=False,
         train=.8,
         trace=3):
    """
    Brill Tagger Demonstration

    Fixes over the original: Python-2-only ``print`` statements and the
    removed ``file()`` builtin are replaced with Python-3-valid
    equivalents, and both output files are managed with ``with`` (the
    rule file was previously never closed).

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: L{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}
    @param randomize: whether the training data should be a random subset of the corpus
    @type randomize: L{boolean}
    @param train: the fraction of the the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """

    from nltk.corpus import treebank
    from nltk import tag
    from nltk.tag import brill

    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is reserved
    # for testing.

    print("Loading tagged data...")
    sents = []
    for item in treebank.items:
        sents.extend(treebank.tagged(item))
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the first num_sents sentences into one token stream, then
    # split it into training and gold portions at the `train` fraction.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)

    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]

    testing_data = [t[0] for t in gold_data]

    # Unigram tagger

    print("Training unigram tagger:", end=' ')
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger

    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule,
                                               (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule,
                                               (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1),
                                      (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1),
                                      (1, 1)),
    ]

    #trainer = brill.FastBrillTrainer(u, templates, trace)
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    print()
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # `with` guarantees the rule file is closed (the original leaked it).
    with open(rule_output, 'w') as printRules:
        for rule in b.rules():
            print(str(rule))
            printRules.write(str(rule) + "\n\n")

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    with open(error_output, 'w') as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")
    print("Done; rules and errors saved to %s and %s." % (rule_output,
                                                          error_output))