def train(self, data):
        """Fit an HMM baseline on ``data``, then a Brill tagger on top of it.

        Sets ``self.baseline_tagger`` (the ``tagger`` attribute of the trained
        ``HMMTagger``), ``self.trainer`` and ``self.tagger``.

        :param data: training corpus; handed unchanged to both
            ``HMMTagger.train`` and ``BrillTaggerTrainer.train``.
        """
        # Baseline: the tagger produced by an HMMTagger trained on the same data.
        baseline = HMMTagger()
        baseline.train(data)
        self.baseline_tagger = baseline.tagger

        # Relative token offsets used by the single-feature templates; the
        # same windows are applied first to tags (Pos), then to words (Word).
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )

        rule_templates = []
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                rule_templates.append(brill.Template(feature(list(window))))
            # One two-feature template: previous token combined with next token.
            rule_templates.append(brill.Template(feature([-1]), feature([1])))

        self.trainer = BrillTaggerTrainer(self.baseline_tagger,
                                          templates=rule_templates)
        self.tagger = self.trainer.train(data)
Exemplo n.º 2
0
 def get_brill_tagger(self):
     """Train and return a Brill tagger from ``tagged_input_sentences.txt``.

     The training sentences are read from the current directory with
     ``TaggedCorpusReader`` using ``/`` as separator. The initial tagger is
     whatever ``load`` returns for the maxent treebank pickle (presumably
     ``nltk.data.load`` -- confirm against the file's imports).

     :returns: the tagger produced by ``BrillTaggerTrainer.train`` with
         ``max_rules=10``.
     """
     reader = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
     tagged_sentences = list(reader.tagged_sents())
     base_tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

     # Offsets for the single-feature templates, used for tags and then words.
     spans = ((-1,), (1,), (-2,), (2,), (-2, -1), (1, 2),
              (-3, -2, -1), (1, 2, 3))
     rule_templates = []
     for feature_cls in (brill.Pos, brill.Word):
         rule_templates += [
             brill.Template(feature_cls(list(span))) for span in spans
         ]
         # previous-token feature combined with next-token feature
         rule_templates.append(
             brill.Template(feature_cls([-1]), feature_cls([1])))

     brill_trainer = BrillTaggerTrainer(base_tagger,
                                        templates=rule_templates,
                                        trace=3)
     return brill_trainer.train(tagged_sentences, max_rules=10)
Exemplo n.º 3
0
    def __init__(self, **kwargs):
        """Store the initial tagger and prepare a deterministic Brill trainer.

        :param kwargs: forwarded to the parent initializer; must contain a
            ``"tagger"`` entry, which becomes ``self.tagger`` and the initial
            tagger of ``self.brillTrainer``.
        :raises KeyError: if ``"tagger"`` is missing from ``kwargs``.
        """
        super().__init__(**kwargs)
        self.tagger = kwargs["tagger"]

        # Offsets for the single-feature templates; applied to tags, then words.
        offsets = [(-1,), (1,), (-2,), (2,), (-2, -1), (1, 2),
                   (-3, -2, -1), (1, 2, 3)]

        templates = []
        for make_feature in (brill.Pos, brill.Word):
            templates.extend(
                brill.Template(make_feature(list(window)))
                for window in offsets
            )
            # previous token and next token combined in one template
            templates.append(
                brill.Template(make_feature([-1]), make_feature([1])))

        self.brillTrainer = BrillTaggerTrainer(
            self.tagger, templates, deterministic=True)
Exemplo n.º 4
0
def nltk_brill_pos_tagger(input_dict):
    """Learn a Brill (transformation-based) POS tagger from an input dict.

    An initial tagger assigns a first tag sequence; the trainer then learns
    an ordered list of correction rules from the training corpus.

    Keys read from ``input_dict``:

    :param training_corpus: dict with ``'corpus'`` and ``'chunk'`` entries;
        both are handed to ``corpus_reader`` to obtain the training
        sentences.
    :param initial_tagger: dict whose ``'object'`` entry is the initial
        tagger; when this value is falsy, ``DefaultTagger('-None-')`` is
        used instead.
    :param max_rules: upper bound on the number of learned rules
        (converted with ``int``).
    :param min_score: minimum score a rule must reach to be kept
        (converted with ``int``).
    :param templates: name of a zero-argument callable in
        ``nltk.tag.brill`` that returns the templates to train with.

    Training always runs with ``deterministic=True``. ``settings.DEBUG`` is
    passed as the trace level and, when truthy, every learned rule is
    printed.

    :returns pos_tagger: ``{'pos_tagger': {'function': 'tag_sents',
        'object': <trained tagger>}}``.
    """
    corpus_spec = input_dict['training_corpus']
    chunk = corpus_spec['chunk']
    corpus = corpus_spec['corpus']
    training_corpus = corpus_reader(corpus, chunk)

    if input_dict['initial_tagger']:
        initial_tagger = input_dict['initial_tagger']['object']
    else:
        # No tagger supplied: start from a constant '-None-' tag.
        initial_tagger = DefaultTagger('-None-')

    rule_limit = int(input_dict['max_rules'])
    score_floor = int(input_dict['min_score'])

    # The template set is selected by name among the factories of nltk.tag.brill.
    template_factory = getattr(nltk.tag.brill, input_dict['templates'])
    templates = template_factory()

    trainer = BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
        trace=settings.DEBUG,
    )
    brill_tagger = trainer.train(
        training_corpus,
        max_rules=rule_limit,
        min_score=score_floor,
    )

    if settings.DEBUG:
        for rule in brill_tagger.rules():
            print(str(rule))

    result = {'function': 'tag_sents', 'object': brill_tagger}
    return {'pos_tagger': result}
Exemplo n.º 5
0
def nltk_brill_pos_tagger(input_dict):
    """Learn a Brill (transformation-based) POS tagger from an input dict.

    An initial tagger assigns a first tag sequence; the trainer then learns
    an ordered list of correction rules from the training corpus.

    Keys read from ``input_dict``:

    :param training_corpus: passed to ``corpus_reader``; only the first
        1000 items of the result are used for training.
    :param initial_tagger: dict whose ``'object'`` entry is the initial
        tagger; when this value is falsy, ``DefaultTagger('-None-')`` is
        used instead.
    :param max_rules: upper bound on the number of learned rules
        (converted with ``int``).
    :param min_score: minimum score a rule must reach to be kept
        (converted with ``int``).
    :param templates: name of a zero-argument callable in
        ``nltk.tag.brill`` that returns the templates to train with.

    Training always runs with ``deterministic=True``. ``settings.DEBUG`` is
    passed as the trace level and, when truthy, every learned rule is
    printed.

    :returns pos_tagger: ``{'pos_tagger': {'function': 'tag_sents',
        'object': <trained tagger>}}``.
    """
    all_sentences = corpus_reader(input_dict['training_corpus'])
    training_corpus = all_sentences[:1000]

    if input_dict['initial_tagger']:
        initial_tagger = input_dict['initial_tagger']['object']
    else:
        # No tagger supplied: start from a constant '-None-' tag.
        initial_tagger = DefaultTagger('-None-')

    rule_limit = int(input_dict['max_rules'])
    score_floor = int(input_dict['min_score'])

    # The template set is selected by name among the factories of nltk.tag.brill.
    template_factory = getattr(nltk.tag.brill, input_dict['templates'])
    templates = template_factory()

    trainer = BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
        trace=settings.DEBUG,
    )
    brill_tagger = trainer.train(
        training_corpus,
        max_rules=rule_limit,
        min_score=score_floor,
    )

    if settings.DEBUG:
        for rule in brill_tagger.rules():
            print(str(rule))

    result = {'function': 'tag_sents', 'object': brill_tagger}
    return {'pos_tagger': result}
Exemplo n.º 6
0
def train_brill_tagger(initial_tagger, train_sents, end, trace=0, **kwargs):
    """Train a Brill tagger on ``train_sents`` with the fntbl37 template set.

    :param initial_tagger: tagger handed to ``BrillTaggerTrainer`` as the
        initial tagger.
    :param train_sents: training sentences, passed to ``trainer.train``.
    :param end: unused. It only fed a ``bounds`` local that was never read;
        the parameter is kept so existing call sites keep working.
    :param trace: trace level forwarded to ``BrillTaggerTrainer``.
    :param kwargs: extra keyword arguments forwarded to ``trainer.train``
        (for example ``max_rules`` / ``min_score``).
    :returns: the result of ``trainer.train``.
    """
    # fntbl37: the 37 templates taken from the postagging task of the
    # fntbl distribution, http://www.cs.jhu.edu/~rflorian/fntbl/
    templates = brill.fntbl37()

    trainer = BrillTaggerTrainer(initial_tagger, templates,
                                 deterministic=True, trace=trace)
    return trainer.train(train_sents, **kwargs)
Exemplo n.º 7
0
    def __init__(self):
        """Build the trainer and immediately train the Brill tagger.

        The initial tagger comes from ``get_initial_tagger()``, the templates
        from ``brill.fntbl37()``, and the training sentences from the first
        element returned by ``utils.training_testing_dataset()``. Training is
        capped at 20 rules. The results are stored on ``self.trainer`` and
        ``self.tagger``.
        """
        base_tagger = get_initial_tagger()
        templates = brill.fntbl37()

        self.trainer = BrillTaggerTrainer(
            base_tagger, templates, deterministic=True, trace=0)

        # Only the training half of the split is needed here.
        train_sents, _held_out = utils.training_testing_dataset()
        self.tagger = self.trainer.train(train_sents, max_rules=20)
        print('Brill tagger training completed')
def get_braupt_tagger():
    """Train a Brill tagger over a regexp/affix/n-gram backoff chain.

    The CoNLL-2000 tagged sentences are used both to build the backoff
    chain (via ``backoff_tagger``) and to learn the Brill rules. The chain
    is built from the Affix, Unigram, Bigram and Trigram tagger classes with
    a ``RegexpTagger`` over ``get_word_patterns()`` as the final backoff.

    :returns: the tagger learned with the ``brill24`` templates,
        ``max_rules=100`` and ``min_score=3``.
    """
    training_sents = nltk.corpus.conll2000.tagged_sents()

    patterns = get_word_patterns()
    tagger_classes = [
        nltk.tag.AffixTagger,
        nltk.tag.UnigramTagger,
        nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger,
    ]
    initial_tagger = backoff_tagger(
        training_sents,
        tagger_classes,
        backoff=nltk.tag.RegexpTagger(patterns),
    )

    brill_trainer = BrillTaggerTrainer(initial_tagger, brill.brill24())
    return brill_trainer.train(training_sents, max_rules=100, min_score=3)
Exemplo n.º 9
0
    def turkish_brill_tagger(self):
        """Train, evaluate and dump a Brill tagger over an n-gram backoff chain.

        Steps:

        1. Build the train / evaluation sets from ``self.get_cutoff()`` and
           ``self.development_size``.
        2. Train unigram -> bigram -> trigram taggers, each backing off to
           the previous one (the unigram backs off to ``self.re_tagger``).
        3. Train a Brill tagger on top of the trigram tagger and print its
           accuracy on the evaluation set.
        4. For folds 1-4: re-randomize the sentences, rebuild both sets,
           retrain the Brill tagger with the same trainer and print its
           accuracy.
        5. Dump the tagger from the last fold via ``dump_tagger_to_file``.

        Uses ``self.max_rules`` and ``self.min_score`` as training limits.
        """
        train_data = self.set_train_set(self.get_cutoff())
        eval_data = self.set_evaluation_set(self.get_cutoff(),
                                            self.development_size)

        tr_unigram = self.turkish_unigram_tagger(train_data, eval_data,
                                                 self.re_tagger)
        tr_bigram = self.turkish_bigram_tagger(train_data, eval_data,
                                               tr_unigram)
        tr_trigram = self.turkish_trigram_tagger(train_data, eval_data,
                                                 tr_bigram)

        # Template(Feature, (start, end), ...) builds one feature per range.
        templates = [
            Template(brill.Pos, (1, 1)),
            Template(brill.Pos, (2, 2)),
            Template(brill.Pos, (1, 2)),
            Template(brill.Pos, (1, 3)),
            Template(brill.Word, (1, 1)),
            Template(brill.Word, (2, 2)),
            Template(brill.Word, (1, 2)),
            Template(brill.Word, (1, 3)),
            Template(brill.Pos, (-1, -1), (1, 1)),
            # Word counterpart of the previous/next template. The original
            # repeated the Pos template here, which only yields duplicate
            # candidate rules.
            Template(brill.Word, (-1, -1), (1, 1)),
        ]

        br_trainer = BrillTaggerTrainer(tr_trigram, templates)
        tr_brill_tagger = br_trainer.train(train_data, self.max_rules,
                                           self.min_score)
        print("TR initial Brill accuracy: %s" %
              tr_brill_tagger.evaluate(eval_data))

        # NOTE(review): the trigram initial tagger is not retrained per fold,
        # so it may have seen sentences that end up in a fold's evaluation
        # set -- confirm whether that is intended.
        for i in range(1, 5):
            self.randomize_sentences()
            training_data = self.set_train_set(self.get_cutoff())
            evaluation_data = self.set_evaluation_set(self.get_cutoff(),
                                                      self.development_size)
            print('Fold: %s' % i)
            tr_brill_tagger = br_trainer.train(training_data, self.max_rules,
                                               self.min_score)
            print("TR Brill accuracy: %s" %
                  tr_brill_tagger.evaluate(evaluation_data))

        self.dump_tagger_to_file(tr_brill_tagger)
Exemplo n.º 10
0
class BrillTagger(object):
    """Brill tagger that is trained as soon as it is constructed."""

    def __init__(self):
        """Build the trainer and immediately train the Brill tagger.

        The initial tagger comes from ``get_initial_tagger()``, the templates
        from ``brill.fntbl37()``, and the training sentences from the first
        element returned by ``utils.training_testing_dataset()``. Training is
        capped at 20 rules.
        """
        base_tagger = get_initial_tagger()
        templates = brill.fntbl37()

        self.trainer = BrillTaggerTrainer(
            base_tagger, templates, deterministic=True, trace=0)

        # Only the training half of the split is needed here.
        train_sents, _held_out = utils.training_testing_dataset()
        self.tagger = self.trainer.train(train_sents, max_rules=20)
        print('Brill tagger training completed')

    def tag(self, sent_tokens):
        """Tag every sentence in ``sent_tokens``.

        :param sent_tokens: iterable of sentences, each an iterable of tokens.
        :returns: list with one ``self.tagger.tag(...)`` result per sentence,
            in input order; each sentence is passed to the tagger as a list.
        """
        return [self.tagger.tag(list(sentence)) for sentence in sent_tokens]
Exemplo n.º 11
0
class BrillTagger(Tagger):
    """Brill tagger that uses an HMM tagger as its baseline."""

    def train(self, data):
        """Fit an HMM baseline on ``data``, then a Brill tagger on top of it.

        Sets ``self.baseline_tagger`` (the ``tagger`` attribute of the trained
        ``HMMTagger``), ``self.trainer`` and ``self.tagger``.

        :param data: training corpus; handed unchanged to both
            ``HMMTagger.train`` and ``BrillTaggerTrainer.train``.
        """
        # Baseline: the tagger produced by an HMMTagger trained on the same data.
        baseline = HMMTagger()
        baseline.train(data)
        self.baseline_tagger = baseline.tagger

        # Relative token offsets used by the single-feature templates; the
        # same windows are applied first to tags (Pos), then to words (Word).
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )

        rule_templates = []
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                rule_templates.append(brill.Template(feature(list(window))))
            # One two-feature template: previous token combined with next token.
            rule_templates.append(brill.Template(feature([-1]), feature([1])))

        self.trainer = BrillTaggerTrainer(self.baseline_tagger,
                                          templates=rule_templates)
        self.tagger = self.trainer.train(data)

    def test(self, test_data):
        """Log the baseline accuracy and return the Brill tagger's accuracy.

        :param test_data: evaluation corpus passed to both taggers'
            ``evaluate`` methods.
        :returns: the value of ``self.tagger.evaluate(test_data)``.
        """
        baseline_percent = self.baseline_tagger.evaluate(test_data) * 100.0
        message = "Baseline tagger accuracy: {:.2f}%".format(baseline_percent)
        logger.info(message)
        return self.tagger.evaluate(test_data)
Exemplo n.º 12
0
class Brill(NltkModel):
    """Model wrapper that trains a Brill tagger on top of a given tagger."""

    def __init__(self, **kwargs):
        """Store the initial tagger and prepare a deterministic Brill trainer.

        :param kwargs: forwarded to the parent initializer; must contain a
            ``"tagger"`` entry, which becomes ``self.tagger`` and the initial
            tagger of ``self.brillTrainer``.
        :raises KeyError: if ``"tagger"`` is missing from ``kwargs``.
        """
        super().__init__(**kwargs)
        self.tagger = kwargs["tagger"]

        # Offsets for the single-feature templates; applied to tags, then words.
        offsets = [(-1,), (1,), (-2,), (2,), (-2, -1), (1, 2),
                   (-3, -2, -1), (1, 2, 3)]

        templates = []
        for make_feature in (brill.Pos, brill.Word):
            templates.extend(
                brill.Template(make_feature(list(window)))
                for window in offsets
            )
            # previous token and next token combined in one template
            templates.append(
                brill.Template(make_feature([-1]), make_feature([1])))

        self.brillTrainer = BrillTaggerTrainer(
            self.tagger, templates, deterministic=True)

    def train(self, text):
        """Train the Brill tagger on ``text`` and keep it in ``self.trainedBrill``.

        :param text: training corpus passed to ``self.brillTrainer.train``.
        """
        trained = self.brillTrainer.train(text)
        self.trainedBrill = trained

    def test(self, testText):
        """Return the trained tagger's ``evaluate`` score on ``testText``.

        Requires ``train`` to have been called first, since it reads
        ``self.trainedBrill``.
        """
        return self.trainedBrill.evaluate(testText)
Exemplo n.º 13
0
        # Per-code sentence sets for this fold -- presumably td = training
        # data and vd = validation data; confirm against the enclosing loop,
        # which is not visible here.
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        # The HMM base tagger is cached on disk under a name built from the
        # model prefix, CV_FOLDS, the fold index, the code and the STEM flag.
        hmm_fname = "%s_cv-%i_fold-%i_code-%s_stemed-%s.dill" % (hmm_model_prefix, CV_FOLDS, fold, code, str(STEM))
        if os.path.exists(hmm_fname):
            # NOTE(review): dill.load can execute arbitrary code from the
            # file; only safe while these cache files are trusted.
            with open(hmm_fname, "rb") as f:
                base_tagger = dill.load(f)
        else:
            # No cached model: train the HMM on td and store it for reuse.
            hmm_trainer = HiddenMarkovModelTrainer()
            base_tagger = hmm_trainer.train_supervised(td)
            with open(hmm_fname, "wb") as f:
                dill.dump(base_tagger, f)

        #See: http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/
        #and http://streamhacker.com/2014/12/02/nltk-3/ for changes to interface

        # Learn Brill correction rules on top of the HMM tagger and keep the
        # resulting model per code.
        trainer = BrillTaggerTrainer(base_tagger, templates, deterministic=True)
        model = trainer.train(td, max_rules=MAX_RULES, min_score=MIN_SCORE)
        code2model[code] = model

        # Gold labels for this code, converted by to_flattened_binary_tags.
        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        # Model predictions on both sets.
        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    # Fold the per-code dictionaries into the cross-validation accumulators.
    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
Exemplo n.º 14
0
    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger.

        A regexp -> unigram -> bigram -> trigram backoff chain is trained on
        the first split and used as the initial tagger. The Brill trainer is
        then run once on that split and once more per fold
        (``self.num_groups`` folds), each fold on a freshly reshuffled split.

        Reads ``self.tagged_data_list`` (shuffled in place), ``self.dev_size``,
        ``self.train_size``, ``self.max_rules``, ``self.min_score`` and
        ``self.num_groups``.

        :param templates: Brill templates; defaults to ``brill.nltkdemo18()``.
        :param verbose: when truthy, print the accuracy of every tagger on
            the current test split; when falsy, no evaluation is run.
        """
        if templates is None:
            templates = brill.nltkdemo18()

        def shuffled_split():
            # Re-seeding with the same value applies the same permutation to
            # the list's *current* order, so the splits are reproducible but
            # still differ from call to call.
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)
            return (self.tagged_data_list[:cutoff],
                    self.tagged_data_list[cutoff:self.dev_size])

        def report(heading, tagger, held_out):
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            if verbose:
                print("{}:\n{}\n".format(heading, tagger.evaluate(held_out)))

        training_data, test_data = shuffled_split()

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
                                     (r'.*', 'N')])
        report("Regular expression tagger accuracy", regex_tagger, test_data)

        # n-gram taggers, each backing off to the previous one
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        report("Unigram tagger accuracy", unigram_tagger, test_data)

        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        report("Bigram tagger accuracy", bigram_tagger, test_data)

        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        report("Trigram tagger accuracy", trigram_tagger, test_data)

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Initial Brill tagger accuracy", brill_tagger, test_data)

        # folding: retrain on a new random split each time
        for fold in range(self.num_groups):
            training_data, test_data = shuffled_split()

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)
            report("Brill tagger accuracy, fold {}".format(fold + 1),
                   brill_tagger, test_data)
Exemplo n.º 15
0
    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger.

        A regexp -> unigram -> bigram -> trigram backoff chain is trained on
        the first split and used as the initial tagger. The Brill trainer is
        then run once on that split and once more per fold
        (``self.num_groups`` folds), each fold on a freshly reshuffled split.

        Reads ``self.tagged_data_list`` (shuffled in place), ``self.dev_size``,
        ``self.train_size``, ``self.max_rules``, ``self.min_score`` and
        ``self.num_groups``.

        :param templates: Brill templates; defaults to ``brill.nltkdemo18()``.
        :param verbose: when truthy, print the accuracy of every tagger on
            the current test split; when falsy, no evaluation is run.
        """
        if templates is None:
            templates = brill.nltkdemo18()

        def shuffled_split():
            # Re-seeding with the same value applies the same permutation to
            # the list's *current* order, so the splits are reproducible but
            # still differ from call to call.
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)
            return (self.tagged_data_list[:cutoff],
                    self.tagged_data_list[cutoff:self.dev_size])

        def report(heading, tagger, held_out):
            # print() with a single argument behaves the same on Python 2
            # and Python 3.
            if verbose:
                print("{}:\n{}\n".format(heading, tagger.evaluate(held_out)))

        training_data, test_data = shuffled_split()

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
            (r'.*', 'N')
            ])
        report("Regular expression tagger accuracy", regex_tagger, test_data)

        # n-gram taggers, each backing off to the previous one
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        report("Unigram tagger accuracy", unigram_tagger, test_data)

        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        report("Bigram tagger accuracy", bigram_tagger, test_data)

        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        report("Trigram tagger accuracy", trigram_tagger, test_data)

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Initial Brill tagger accuracy", brill_tagger, test_data)

        # folding: retrain on a new random split each time
        for fold in range(self.num_groups):
            training_data, test_data = shuffled_split()

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)
            report("Brill tagger accuracy, fold {}".format(fold + 1),
                   brill_tagger, test_data)
        # NOTE(review): from here on the code reads names (hmm_model_prefix,
        # CV_FOLDS, fold, code, STEM, td, vd, ...) that are not defined in the
        # visible part of this method -- it looks like a separate snippet.
        #
        # The HMM base tagger is cached on disk under a name built from the
        # model prefix, CV_FOLDS, the fold index, the code and the STEM flag.
        hmm_fname = "%s_cv-%i_fold-%i_code-%s_stemed-%s.dill" % (
            hmm_model_prefix, CV_FOLDS, fold, code, str(STEM))
        if os.path.exists(hmm_fname):
            # NOTE(review): dill.load can execute arbitrary code from the
            # file; only safe while these cache files are trusted.
            with open(hmm_fname, "rb") as f:
                base_tagger = dill.load(f)
        else:
            # No cached model: train the HMM on td and store it for reuse.
            hmm_trainer = HiddenMarkovModelTrainer()
            base_tagger = hmm_trainer.train_supervised(td)
            with open(hmm_fname, "wb") as f:
                dill.dump(base_tagger, f)

        #See: http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/
        #and http://streamhacker.com/2014/12/02/nltk-3/ for changes to interface

        # Learn Brill correction rules on top of the HMM tagger and keep the
        # resulting model per code.
        trainer = BrillTaggerTrainer(base_tagger,
                                     templates,
                                     deterministic=True)
        model = trainer.train(td, max_rules=MAX_RULES, min_score=MIN_SCORE)
        code2model[code] = model

        # Gold labels for this code, converted by to_flattened_binary_tags.
        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        # Model predictions on both sets.
        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(
            td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(
            vd_predictions)
Exemplo n.º 17
0
def train_brill_tagger(train_data):
    """Train a Brill tagger over a regexp/unigram/bigram/trigram backoff chain.

    The initial tagger is a ``TrigramTagger`` that backs off to a
    ``BigramTagger``, a ``UnigramTagger`` and finally a hand-written
    ``RegexpTagger``. Brill rules are learned with the ``fntbl37`` templates,
    capped at 10 rules.

    Side effects: prints ``train_data`` and pickles the trigram backoff
    tagger to ``t2.pkl`` in the current directory.

    :param train_data: tagged training sentences, used for every n-gram
        tagger and for the Brill trainer.
    :returns: the trained Brill tagger.
    """
    from pickle import dump

    import nltk
    from nltk import BigramTagger, TrigramTagger, UnigramTagger
    # The brill tagger module in NLTK.
    from nltk.tag.brill_trainer import BrillTaggerTrainer

    templates = nltk.tag.brill.fntbl37()

    # Regular expression (Regex) Tagger as a default tagger. Patterns are
    # listed from most specific to the catch-all '.*' at the end.
    default_tagger = nltk.RegexpTagger(
        [(r'^[Jj]ing', 'ABN'),
         (r'^[pP]yn', 'CAV'),
         (r'^[nN]ga$', '1PSG'),
         (r'^[pP]hi$', '2PG'),
         (r'^[pP]ha$', '2PF'),
         (r'^[mM]e$', '2PM'),
         (r'^[iI]$', '3PSG'),
         (r'^[bB]an$', 'INP'),
         (r'^[Kk]a$', '3PSF'),
         (r'^[uU]$', '3PSM'),
         (r'^[kK]i$', '3PPG'),
         (r'(sha|da|na|hapoh|halor|ha|naduh|shaduh|hapdeng|haduh)$', 'IN'),
         (r'(bad|ruh|namar|hynrei|tangba|katba|katta)$', 'COC'),
         (r'(lada|haba|khnang|ynda)$', 'SUC'),
         (r'(katkum|kat|pat|wat|tang|lang)$', 'AD'),
         (r'(bun|baroh)$', 'QNT'),
         (r'^-?[0-9]+(.[0-9]+)?$', 'CN'),
         (r'(dei|long|don)$', 'CO'),
         (r'^[jJ]ong$', 'POP'),
         (r'^[sS]hah$', 'PAV'),
         (r'^[lL]ah$', 'MOD'),
         (r'^[lL]a$', 'VST'),
         (r'(ym|em|khlem|nym|kam)$', 'NEG'),
         (r'^hi$', 'EM'),
         (r'.*lade$', 'RFP'),
         (r'(dang|nang)$', 'VPP'),
         (r'([uU]n|[kK]an|[kK]in|[sS]a|[yY]n|[nN]gin|[pP]hin)$', 'VFT'),
         (r'(.*ngut|.*tylli)$', 'ADJ'),
         (r'^[bB]a$', 'COM'),
         (r'^\W+$', 'SYM'),
         (r'[^a-z\W]a$', 'IN'),
         (r'([vV]ote|[bB]ye|[cC]onstituency|[sS]outh)$', 'FR'),
         (r'.*', 'CMN'),
         ])

    # Backoff chain: trigram -> bigram -> unigram -> regexp.
    t0 = default_tagger
    print(train_data)
    t1 = UnigramTagger(train_data, backoff=t0)
    t2 = BigramTagger(train_data, backoff=t1)
    t3 = TrigramTagger(train_data, backoff=t2)

    trainer = BrillTaggerTrainer(initial_tagger=t3,
                                 templates=templates, trace=3,
                                 deterministic=True)
    brill_tagger = trainer.train(train_data, max_rules=10)

    # Saving the tagger for future use. The context manager closes the file
    # even if pickling fails.
    # NOTE(review): this stores the trigram backoff tagger (t3), not the
    # Brill tagger, under the name 't2.pkl' -- confirm that is intended.
    with open('t2.pkl', 'wb') as output:
        dump(t3, output, -1)
    return brill_tagger
Exemplo n.º 18
0
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1]))]        
# Learn a Brill tagger on top of ``postag`` with the templates listed above.
# trace=3 asks the trainer for verbose progress output and max_rules=10 caps
# the number of learned rules (see the NLTK BrillTaggerTrainer docs).
# ``postag`` and ``traindata`` are defined outside this excerpt.
trainer = BrillTaggerTrainer(postag, templates = templates, trace = 3)
brill_tagger = trainer.train(traindata, max_rules = 10)


# # Source and Destination Extraction From Sentence # 

# In[12]:

def extract_location(inp):
    tagged = brill_tagger.tag(word_tokenize(inp))
    source = None
    destination = None
    chunkGram = """Source: {<IN>(<NN.*><,>?)+}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Source'):