Python BrillTaggerTrainer 예제들, nltk.tag.brill_trainer.BrillTaggerTrainer Python 예제들

예제 #1

0

파일 보기

파일: tagger.py 프로젝트: michaellzc/cmput497_a3_yonael_zichun3

    def train(self, data):
        """Train an HMM baseline, then a Brill tagger that corrects it.

        Stores the baseline on ``self.baseline_tagger``, the trainer on
        ``self.trainer`` and the trained Brill tagger on ``self.tagger``.

        :param data: training data, handed unchanged to both
            ``HMMTagger.train`` and ``BrillTaggerTrainer.train``.
        """
        # Baseline: the tagger exposed by the project's HMMTagger wrapper.
        baseline = HMMTagger()
        baseline.train(data)
        self.baseline_tagger = baseline.tagger

        # Relative token offsets used by the single-feature templates; the
        # same windows are applied first to tags (Pos), then to words (Word).
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )
        templates = []
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                templates.append(brill.Template(feature(list(window))))
            # Two-feature template: previous and next token together.
            templates.append(brill.Template(feature([-1]), feature([1])))

        self.trainer = BrillTaggerTrainer(
            self.baseline_tagger, templates=templates
        )
        self.tagger = self.trainer.train(data)

예제 #2

0

파일 보기

파일: kabbot.py 프로젝트: gnespatel1618/KabBot

 def get_brill_tagger(self):
     """Train and return a Brill tagger from 'tagged_input_sentences.txt'.

     The tagged sentences are read from the current directory with ``/``
     as the word/tag separator. The initial tagger is whatever ``load``
     returns for the maxent treebank pickle path.
     """
     reader = TaggedCorpusReader('.',
                                 'tagged_input_sentences.txt',
                                 sep="/")
     sentences = list(reader.tagged_sents())
     initial_tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

     # Relative token offsets used by the single-feature templates; applied
     # first to tags (Pos), then to words (Word).
     windows = (
         (-1,), (1,), (-2,), (2,),
         (-2, -1), (1, 2),
         (-3, -2, -1), (1, 2, 3),
     )
     templates = []
     for feature in (brill.Pos, brill.Word):
         for window in windows:
             templates.append(brill.Template(feature(list(window))))
         # Two-feature template: previous and next token together.
         templates.append(brill.Template(feature([-1]), feature([1])))

     # Learn at most 10 rules, with trace level 3.
     rule_trainer = BrillTaggerTrainer(initial_tagger,
                                       templates=templates,
                                       trace=3)
     return rule_trainer.train(sentences, max_rules=10)

예제 #3

0

파일 보기

    def __init__(self, **kwargs):
        """Set up a deterministic Brill trainer around ``kwargs["tagger"]``.

        All keyword arguments are forwarded to the parent initialiser; the
        ``tagger`` entry is required and is used as the initial tagger.
        """
        super().__init__(**kwargs)
        self.tagger = kwargs["tagger"]

        # Relative token offsets used by the single-feature templates; applied
        # first to tags (Pos), then to words (Word).
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )
        templates = []
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                templates.append(brill.Template(feature(list(window))))
            # Two-feature template: previous and next token together.
            templates.append(brill.Template(feature([-1]), feature([1])))

        self.brillTrainer = BrillTaggerTrainer(
            self.tagger, templates, deterministic=True
        )

예제 #4

0

파일 보기

파일: part_of_speech_tagging.py 프로젝트: BillTheBest/tf_core

def nltk_brill_pos_tagger(input_dict):
    """Train a Brill transformation-based tagger from widget inputs.

    A Brill tagger first tags text with an initial tagger and then applies
    an ordered list of learned transformation rules to correct individual
    tags. Here the rules are learned with ``BrillTaggerTrainer``.

    :param input_dict: mapping with the following entries:

        * ``'training_corpus'``: mapping with ``'corpus'`` and ``'chunk'``,
          both passed to ``corpus_reader`` to obtain the training data.
        * ``'initial_tagger'``: falsy, or a mapping whose ``'object'`` is
          the initial tagger. When falsy, ``DefaultTagger('-None-')`` is
          used.
        * ``'max_rules'``: maximum number of rules to learn (cast to int).
        * ``'min_score'``: minimum score a rule must reach (cast to int).
        * ``'templates'``: name of a zero-argument template factory in
          ``nltk.tag.brill``.

    :returns: ``{'pos_tagger': {'function': 'tag_sents', 'object': tagger}}``
        where ``tagger`` is the trained Brill tagger.
    """
    chunk = input_dict['training_corpus']['chunk']
    corpus = input_dict['training_corpus']['corpus']
    training_corpus = corpus_reader(corpus, chunk)

    if input_dict['initial_tagger']:
        initial_tagger = input_dict['initial_tagger']['object']
    else:
        initial_tagger = DefaultTagger('-None-')

    max_rules = int(input_dict['max_rules'])  # default 200
    min_score = int(input_dict['min_score'])  # default 2

    # Look up the template factory by name and call it.
    template_factory = getattr(nltk.tag.brill, input_dict['templates'])
    templates = template_factory()

    # Training is always deterministic; tracing follows the DEBUG setting.
    trainer = BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
        trace=settings.DEBUG,
    )
    brill_tagger = trainer.train(
        training_corpus,
        max_rules=max_rules,
        min_score=min_score,
    )

    if settings.DEBUG:
        for rule in brill_tagger.rules():
            print(str(rule))

    tagger_entry = {'function': 'tag_sents', 'object': brill_tagger}
    return {'pos_tagger': tagger_entry}

예제 #5

0

파일 보기

파일: part_of_speech_tagging.py 프로젝트: xflows/textflows

def nltk_brill_pos_tagger(input_dict):
    """Train a Brill transformation-based tagger from widget inputs.

    A Brill tagger first tags text with an initial tagger and then applies
    an ordered list of learned transformation rules to correct individual
    tags. Here the rules are learned with ``BrillTaggerTrainer``.

    :param input_dict: mapping with the following entries:

        * ``'training_corpus'``: passed to ``corpus_reader``; only the
          first 1000 items of the result are used for training.
        * ``'initial_tagger'``: falsy, or a mapping whose ``'object'`` is
          the initial tagger. When falsy, ``DefaultTagger('-None-')`` is
          used.
        * ``'max_rules'``: maximum number of rules to learn (cast to int).
        * ``'min_score'``: minimum score a rule must reach (cast to int).
        * ``'templates'``: name of a zero-argument template factory in
          ``nltk.tag.brill``.

    :returns: ``{'pos_tagger': {'function': 'tag_sents', 'object': tagger}}``
        where ``tagger`` is the trained Brill tagger.
    """
    # Cap the training data at the first 1000 items.
    training_corpus = corpus_reader(input_dict['training_corpus'])[:1000]

    if input_dict['initial_tagger']:
        initial_tagger = input_dict['initial_tagger']['object']
    else:
        initial_tagger = DefaultTagger('-None-')

    max_rules = int(input_dict['max_rules'])  # default 200
    min_score = int(input_dict['min_score'])  # default 2

    # Look up the template factory by name and call it.
    template_factory = getattr(nltk.tag.brill, input_dict['templates'])
    templates = template_factory()

    # Training is always deterministic; tracing follows the DEBUG setting.
    trainer = BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
        trace=settings.DEBUG,
    )
    brill_tagger = trainer.train(
        training_corpus,
        max_rules=max_rules,
        min_score=min_score,
    )

    if settings.DEBUG:
        for rule in brill_tagger.rules():
            print(str(rule))

    tagger_entry = {'function': 'tag_sents', 'object': brill_tagger}
    return {'pos_tagger': tagger_entry}

예제 #6

0

파일 보기

def train_brill_tagger(initial_tagger, train_sents, end, trace=0, **kwargs):
    """Train a Brill tagger on top of ``initial_tagger`` with fntbl37 templates.

    :param initial_tagger: tagger whose output the learned rules correct.
    :param train_sents: tagged training sentences passed to
        ``BrillTaggerTrainer.train``.
    :param end: unused; kept only so existing callers keep working.
    :param trace: trace level forwarded to ``BrillTaggerTrainer``.
    :param kwargs: extra keyword arguments forwarded to
        ``BrillTaggerTrainer.train``.
    :returns: whatever ``BrillTaggerTrainer.train`` returns.
    """
    # The 37 templates from the postagging task of the fntbl distribution
    # (http://www.cs.jhu.edu/~rflorian/fntbl/).
    templates = brill.fntbl37()

    trainer = BrillTaggerTrainer(initial_tagger, templates,
                                 deterministic=True, trace=trace)
    return trainer.train(train_sents, **kwargs)

예제 #7

0

파일 보기

파일: BrillTagger.py 프로젝트: AnandN5/qint_nlp

    def __init__(self):
        """Train a Brill tagger at construction time.

        The initial tagger comes from ``get_initial_tagger()`` and the
        templates from ``brill.fntbl37()``. Training uses the first element
        returned by ``utils.training_testing_dataset()`` and learns at most
        20 rules. The trainer and the trained tagger are kept on
        ``self.trainer`` and ``self.tagger``.
        """
        base_tagger = get_initial_tagger()
        templates = brill.fntbl37()

        self.trainer = BrillTaggerTrainer(
            base_tagger, templates, deterministic=True, trace=0
        )

        # Only the training part of the split is used here.
        train_sents, _test_sents = utils.training_testing_dataset()
        self.tagger = self.trainer.train(train_sents, max_rules=20)
        print('Brill tagger training completed')

예제 #8

0

파일 보기

파일: clean_text.py 프로젝트: 1436722103/reveal-user-annotation

def get_braupt_tagger():
    """Train a Brill tagger over an n-gram backoff chain on CoNLL-2000.

    The initial tagger is built by ``backoff_tagger`` from the affix,
    unigram, bigram and trigram tagger classes, with a regexp tagger over
    ``get_word_patterns()`` as the final backoff. Rules are learned from
    the ``brill24`` templates with ``max_rules=100`` and ``min_score=3``.

    :returns: the trained Brill tagger.
    """
    training_sents = nltk.corpus.conll2000.tagged_sents()

    regexp_fallback = nltk.tag.RegexpTagger(get_word_patterns())
    tagger_classes = [
        nltk.tag.AffixTagger,
        nltk.tag.UnigramTagger,
        nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger,
    ]
    initial_tagger = backoff_tagger(training_sents, tagger_classes,
                                    backoff=regexp_fallback)

    rule_trainer = BrillTaggerTrainer(initial_tagger, brill.brill24())
    return rule_trainer.train(training_sents, max_rules=100, min_score=3)

예제 #9

0

파일 보기

    def turkish_brill_tagger(self):
        """Train, evaluate and dump a Brill tagger over an n-gram chain.

        A unigram -> bigram -> trigram chain is built with this object's
        ``turkish_*_tagger`` helpers (the unigram one is given
        ``self.re_tagger``). A Brill tagger is trained on top of the trigram
        tagger and its accuracy printed. The data is then reshuffled and the
        Brill tagger retrained and evaluated four more times. The tagger
        from the last round is passed to ``self.dump_tagger_to_file``.
        """
        train_data = self.set_train_set(self.get_cutoff())
        eval_data = self.set_evaluation_set(self.get_cutoff(),
                                            self.development_size)

        tr_unigram = self.turkish_unigram_tagger(train_data, eval_data,
                                                 self.re_tagger)
        tr_bigram = self.turkish_bigram_tagger(train_data, eval_data,
                                               tr_unigram)
        tr_trigram = self.turkish_trigram_tagger(train_data, eval_data,
                                                 tr_bigram)

        # Each (start, end) tuple is handed to the feature class by Template.
        templates = [
            Template(brill.Pos, (1, 1)),
            Template(brill.Pos, (2, 2)),
            Template(brill.Pos, (1, 2)),
            Template(brill.Pos, (1, 3)),
            Template(brill.Word, (1, 1)),
            Template(brill.Word, (2, 2)),
            Template(brill.Word, (1, 2)),
            Template(brill.Word, (1, 3)),
            Template(brill.Pos, (-1, -1), (1, 1)),
            # The original listed the Pos template above twice. The duplicate
            # added nothing, so the second entry is now the word-based
            # counterpart, mirroring the Pos/Word pairing of the entries above.
            Template(brill.Word, (-1, -1), (1, 1)),
        ]

        br_trainer = BrillTaggerTrainer(tr_trigram, templates)
        tr_brill_tagger = br_trainer.train(train_data, self.max_rules,
                                           self.min_score)
        print("TR initial Brill accuracy: %s" %
              tr_brill_tagger.evaluate(eval_data))

        # Four further rounds on reshuffled data. The initial tagger inside
        # br_trainer is not rebuilt between rounds.
        for i in range(1, 5):
            self.randomize_sentences()
            training_data = self.set_train_set(self.get_cutoff())
            evaluation_data = self.set_evaluation_set(self.get_cutoff(),
                                                      self.development_size)
            print('Fold: %s' % i)
            tr_brill_tagger = br_trainer.train(training_data, self.max_rules,
                                               self.min_score)
            print("TR Brill accuracy: %s" %
                  tr_brill_tagger.evaluate(evaluation_data))

        # Only the tagger from the final round is persisted.
        self.dump_tagger_to_file(tr_brill_tagger)

예제 #10

0

파일 보기

파일: BrillTagger.py 프로젝트: AnandN5/qint_nlp

class BrillTagger(object):
    """Brill tagger that is trained when the object is constructed."""

    def __init__(self):
        """Train a Brill tagger with the fntbl37 templates.

        The initial tagger comes from ``get_initial_tagger()``. Training
        uses the first element returned by
        ``utils.training_testing_dataset()`` and learns at most 20 rules.
        """
        base_tagger = get_initial_tagger()
        templates = brill.fntbl37()

        self.trainer = BrillTaggerTrainer(
            base_tagger, templates, deterministic=True, trace=0
        )

        # Only the training part of the split is used here.
        train_sents, _test_sents = utils.training_testing_dataset()
        self.tagger = self.trainer.train(train_sents, max_rules=20)
        print('Brill tagger training completed')

    def tag(self, sent_tokens):
        """Tag every sentence in ``sent_tokens``.

        :param sent_tokens: iterable of sentences, each an iterable of
            tokens.
        :returns: list with one ``self.tagger.tag`` result per sentence,
            in input order.
        """
        return [self.tagger.tag(list(sent)) for sent in sent_tokens]

예제 #11

0

파일 보기

파일: tagger.py 프로젝트: michaellzc/cmput497_a3_yonael_zichun3

class BrillTagger(Tagger):
    """Brill tagger that learns corrections on top of an HMM baseline."""

    def train(self, data):
        """Train an HMM baseline, then a Brill tagger that corrects it.

        Stores the baseline on ``self.baseline_tagger``, the trainer on
        ``self.trainer`` and the trained Brill tagger on ``self.tagger``.

        :param data: training data, handed unchanged to both
            ``HMMTagger.train`` and ``BrillTaggerTrainer.train``.
        """
        # Baseline: the tagger exposed by the project's HMMTagger wrapper.
        baseline = HMMTagger()
        baseline.train(data)
        self.baseline_tagger = baseline.tagger

        # Relative token offsets used by the single-feature templates; the
        # same windows are applied first to tags (Pos), then to words (Word).
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )
        templates = []
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                templates.append(brill.Template(feature(list(window))))
            # Two-feature template: previous and next token together.
            templates.append(brill.Template(feature([-1]), feature([1])))

        self.trainer = BrillTaggerTrainer(
            self.baseline_tagger, templates=templates
        )
        self.tagger = self.trainer.train(data)

    def test(self, test_data):
        """Log the baseline's accuracy and return the Brill tagger's.

        :param test_data: data passed to each tagger's ``evaluate``.
        :returns: result of ``self.tagger.evaluate(test_data)``.
        """
        baseline_percent = self.baseline_tagger.evaluate(test_data) * 100.0
        logger.info(
            "Baseline tagger accuracy: {:.2f}%".format(baseline_percent)
        )
        return self.tagger.evaluate(test_data)

예제 #12

0

파일 보기

class Brill(NltkModel):
    """Model wrapper that trains a Brill tagger over a supplied tagger."""

    def __init__(self, **kwargs):
        """Set up a deterministic Brill trainer around ``kwargs["tagger"]``.

        All keyword arguments are forwarded to the parent initialiser; the
        ``tagger`` entry is required and is used as the initial tagger.
        """
        super().__init__(**kwargs)
        self.tagger = kwargs["tagger"]

        # Relative token offsets used by the single-feature templates; applied
        # first to tags (Pos), then to words (Word).
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )
        templates = []
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                templates.append(brill.Template(feature(list(window))))
            # Two-feature template: previous and next token together.
            templates.append(brill.Template(feature([-1]), feature([1])))

        self.brillTrainer = BrillTaggerTrainer(
            self.tagger, templates, deterministic=True
        )

    def train(self, text):
        """Train on ``text`` and keep the result on ``self.trainedBrill``."""
        self.trainedBrill = self.brillTrainer.train(text)

    def test(self, testText):
        """Return ``self.trainedBrill.evaluate(testText)``."""
        return self.trainedBrill.evaluate(testText)

예제 #13

0

파일 보기

파일: brillTagger.py 프로젝트: simonhughes22/PythonNlpResearch

        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        hmm_fname = "%s_cv-%i_fold-%i_code-%s_stemed-%s.dill" % (hmm_model_prefix, CV_FOLDS, fold, code, str(STEM))
        if os.path.exists(hmm_fname):
            with open(hmm_fname, "rb") as f:
                base_tagger = dill.load(f)
        else:
            hmm_trainer = HiddenMarkovModelTrainer()
            base_tagger = hmm_trainer.train_supervised(td)
            with open(hmm_fname, "wb") as f:
                dill.dump(base_tagger, f)

        #See: http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/
        #and http://streamhacker.com/2014/12/02/nltk-3/ for changes to interface

        trainer = BrillTaggerTrainer(base_tagger, templates, deterministic=True)
        model = trainer.train(td, max_rules=MAX_RULES, min_score=MIN_SCORE)
        code2model[code] = model

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
    merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
    merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)

예제 #14

0

파일 보기

파일: brill.py 프로젝트: menzenski/Razmetka

    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger.

        Builds a regexp -> unigram -> bigram -> trigram backoff chain on a
        shuffled split of ``self.tagged_data_list``, trains a Brill tagger
        on top of the trigram tagger, then retrains it ``self.num_groups``
        more times on reshuffled splits.

        :param templates: Brill templates to learn rules from; defaults to
            ``brill.nltkdemo18()`` when ``None``.
        :param verbose: when truthy, print each tagger's accuracy on the
            held-out slice.
        """
        if templates is None:
            templates = brill.nltkdemo18()

        def split_data():
            # Shuffle self.tagged_data_list in place and slice it into
            # (training, test) parts.
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)
            return (self.tagged_data_list[:cutoff],
                    self.tagged_data_list[cutoff:self.dev_size])

        def report(label, tagger, data):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            # The evaluation itself is skipped when not verbose.
            if verbose:
                print("{}:\n{}\n".format(label, tagger.evaluate(data)))

        training_data, test_data = split_data()

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
                                     (r'.*', 'N')])
        report("Regular expression tagger accuracy", regex_tagger, test_data)

        # Each n-gram tagger backs off to the previous, simpler one.
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        report("Unigram tagger accuracy", unigram_tagger, test_data)

        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        report("Bigram tagger accuracy", bigram_tagger, test_data)

        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        report("Trigram tagger accuracy", trigram_tagger, test_data)

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Initial Brill tagger accuracy", brill_tagger, test_data)

        # folding: reshuffle and retrain; the n-gram chain is not rebuilt.
        for i in range(0, self.num_groups):
            training_data, test_data = split_data()

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)
            report("Brill tagger accuracy, fold {}".format(i + 1),
                   brill_tagger, test_data)

        # NOTE(review): the final tagger is neither stored on self nor
        # returned here; confirm whether callers need it.

예제 #15

0

파일 보기

파일: brill.py 프로젝트: menzenski/Razmetka

    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger.

        Builds a regexp -> unigram -> bigram -> trigram backoff chain on a
        shuffled split of ``self.tagged_data_list``, trains a Brill tagger
        on top of the trigram tagger, then retrains it ``self.num_groups``
        more times on reshuffled splits.

        :param templates: Brill templates to learn rules from; defaults to
            ``brill.nltkdemo18()`` when ``None``.
        :param verbose: when truthy, print each tagger's accuracy on the
            held-out slice.
        """
        if templates is None:
            templates = brill.nltkdemo18()

        def split_data():
            # Shuffle self.tagged_data_list in place and slice it into
            # (training, test) parts.
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)
            return (self.tagged_data_list[:cutoff],
                    self.tagged_data_list[cutoff:self.dev_size])

        def report(label, tagger, data):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            # The evaluation itself is skipped when not verbose.
            if verbose:
                print("{}:\n{}\n".format(label, tagger.evaluate(data)))

        training_data, test_data = split_data()

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
            (r'.*', 'N')
            ])
        report("Regular expression tagger accuracy", regex_tagger, test_data)

        # Each n-gram tagger backs off to the previous, simpler one.
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        report("Unigram tagger accuracy", unigram_tagger, test_data)

        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        report("Bigram tagger accuracy", bigram_tagger, test_data)

        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        report("Trigram tagger accuracy", trigram_tagger, test_data)

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Initial Brill tagger accuracy", brill_tagger, test_data)

        # folding: reshuffle and retrain; the n-gram chain is not rebuilt.
        for i in range(0, self.num_groups):
            training_data, test_data = split_data()

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)
            report("Brill tagger accuracy, fold {}".format(i + 1),
                   brill_tagger, test_data)

        # NOTE(review): the final tagger is neither stored on self nor
        # returned here; confirm whether callers need it.

예제 #16

0

파일 보기

파일: brillTagger.py 프로젝트: IslamMohamedMosaad/PythonNlpResearch

        hmm_fname = "%s_cv-%i_fold-%i_code-%s_stemed-%s.dill" % (
            hmm_model_prefix, CV_FOLDS, fold, code, str(STEM))
        if os.path.exists(hmm_fname):
            with open(hmm_fname, "rb") as f:
                base_tagger = dill.load(f)
        else:
            hmm_trainer = HiddenMarkovModelTrainer()
            base_tagger = hmm_trainer.train_supervised(td)
            with open(hmm_fname, "wb") as f:
                dill.dump(base_tagger, f)

        #See: http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/
        #and http://streamhacker.com/2014/12/02/nltk-3/ for changes to interface

        trainer = BrillTaggerTrainer(base_tagger,
                                     templates,
                                     deterministic=True)
        model = trainer.train(td, max_rules=MAX_RULES, min_score=MIN_SCORE)
        code2model[code] = model

        wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
        wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

        td_predictions = model.tag_sents(to_sentences(td))
        vd_predictions = model.tag_sents(to_sentences(vd))

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(
            td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(
            vd_predictions)

예제 #17

0

파일 보기

def train_brill_tagger(train_data):
    """Train a Brill tagger over a regexp/n-gram backoff chain.

    A regexp tagger is the last-resort backoff for a unigram -> bigram ->
    trigram chain trained on ``train_data``. The Brill trainer then learns
    at most 10 rules from the fntbl37 templates on top of the trigram
    tagger. The trigram tagger is also pickled to ``t2.pkl`` in the
    current working directory.

    :param train_data: tagged training sentences; printed to stdout and
        passed to every tagger's training.
    :returns: the trained Brill tagger.
    """
    # Imports stay function-local so the block is self-contained.
    import nltk
    from nltk import BigramTagger, TrigramTagger, UnigramTagger
    # The brill tagger module in NLTK.
    from nltk.tag.brill_trainer import BrillTaggerTrainer
    from pickle import dump

    templates = nltk.tag.brill.fntbl37()
    # Regular expression (Regex) Tagger as a default tagger
    default_tagger = nltk.RegexpTagger(
        [(r'^[Jj]ing', 'ABN'),
         (r'^[pP]yn', 'CAV'),
         (r'^[nN]ga$', '1PSG'),
         (r'^[pP]hi$', '2PG'),
         (r'^[pP]ha$', '2PF'),
         (r'^[mM]e$', '2PM'),
         (r'^[iI]$', '3PSG'),
         (r'^[bB]an$', 'INP'),
         (r'^[Kk]a$', '3PSF'),
         (r'^[uU]$', '3PSM'),
         (r'^[kK]i$', '3PPG'),
         (r'(sha|da|na|hapoh|halor|ha|naduh|shaduh|hapdeng|haduh)$', 'IN'),
         (r'(bad|ruh|namar|hynrei|tangba|katba|katta)$', 'COC'),
         (r'(lada|haba|khnang|ynda)$', 'SUC'),
         (r'(katkum|kat|pat|wat|tang|lang)$', 'AD'),
         (r'(bun|baroh)$', 'QNT'),
         (r'^-?[0-9]+(.[0-9]+)?$', 'CN'),
         (r'(dei|long|don)$', 'CO'),
         (r'^[jJ]ong$', 'POP'),
         (r'^[sS]hah$', 'PAV'),
         (r'^[lL]ah$', 'MOD'),
         (r'^[lL]a$', 'VST'),
         (r'(ym|em|khlem|nym|kam)$', 'NEG'),
         (r'^hi$', 'EM'),
         (r'.*lade$', 'RFP'),
         (r'(dang|nang)$', 'VPP'),
         (r'([uU]n|[kK]an|[kK]in|[sS]a|[yY]n|[nN]gin|[pP]hin)$', 'VFT'),
         (r'(.*ngut|.*tylli)$', 'ADJ'),
         (r'^[bB]a$', 'COM'),
         (r'^\W+$', 'SYM'),
         (r'[^a-z\W]a$', 'IN'),
         (r'([vV]ote|[bB]ye|[cC]onstituency|[sS]outh)$', 'FR'),
         (r'.*', 'CMN')
         ])

    # Backoff chain: trigram -> bigram -> unigram -> regexp.
    t0 = default_tagger
    print(train_data)
    t1 = UnigramTagger(train_data, backoff=t0)
    t2 = BigramTagger(train_data, backoff=t1)
    t3 = TrigramTagger(train_data, backoff=t2)

    trainer = BrillTaggerTrainer(initial_tagger=t3,
                                 templates=templates, trace=3,
                                 deterministic=True)
    brill_tagger = trainer.train(train_data, max_rules=10)

    # Saving the Tagger for future use. The context manager closes the
    # file even if pickling raises.
    # NOTE(review): the file is named 't2.pkl' but receives t3 (the trigram
    # tagger), not the Brill tagger; confirm which object was intended.
    with open('t2.pkl', 'wb') as output:
        dump(t3, output, -1)
    return brill_tagger

예제 #18

0

파일 보기

        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1]))]        
# Learn at most 10 correction rules on top of `postag`, with trace level 3.
trainer = BrillTaggerTrainer(postag, templates = templates, trace = 3)
brill_tagger = trainer.train(traindata, max_rules = 10)


# # Source and Destination Extraction From Sentence # 

# In[12]:

def extract_location(inp):
    tagged = brill_tagger.tag(word_tokenize(inp))
    source = None
    destination = None
    chunkGram = """Source: {<IN>(<NN.*><,>?)+}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser.parse(tagged)
    for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Source'):