Exemplo n.º 1
0
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        kalimat = sent.encode('utf-8')

        ## :> --___<<IMPORTANT>>___--
        ##      Untuk beberapa hal default tagger harus dibiarkan 'XX'
        ##      dengan tujuan identifikasi Entitas
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           text = self.regexTokenizer(kalimat.lower().strip())
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           text = self.wordTokenizer(kalimat.lower().strip())
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: < 77%
        #      tanpa regextagger : > 83%
        """
        if verbose:
           # Semakin besar dokumen, semakin lama proses perhitungan akurasi
           # disarankan hanya untuk testing
           print "Calculating Tagger Accuracy..."
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print "Accuracy is: %4.2f %%" % (100.0 * self.tagAccuracy)
        
        return trigram_tagger.tag(text)
Exemplo n.º 2
0
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        kalimat = sent.encode('utf-8')

        text = self.regexTokenizer(kalimat.lower().strip())

        ## :> --___<<IMPORTANT>>___--
        ##      Untuk beberapa hal default tagger harus dibiarkan 'XX'
        ##      dengan tujuan identifikasi Entitas
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: 77%
        #      tanpa regextagger : > 90%
        """
        if verbose:
           # Semakin besar dokumen, semakin lama proses perhitungan akurasi
           # disarankan hanya untuk testing
           print ("Calculating Tagger Accuracy...")
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print ("Accuracy is: %s" % (self.tagAccuracy))
        
        return trigram_tagger.tag(text)
def train_trigram(data):
    trigram_tagger = TrigramTagger(data)
    with open(os.path.join(PACKDIR, "nib_marburg", "trigram.pickle"),
              "wb") as f:
        pickle.dump(trigram_tagger, f)
    res = trigram_tagger.tag(
        "uns ist in alten mæren wunders vil geseit".split(" "))
    print(res)
Exemplo n.º 4
0
def unigram_trigram_tagger(train_sentences):
    return TrigramTagger(
        train_sentences,
        backoff=UnigramTagger(
            train_sentences,
            backoff=DefaultTagger("NN")
        )
    )
Exemplo n.º 5
0
def ngramtagging(train): #PROSES POS TAGGING
    train_data = []
    train_data.append(train)
    backoff_tagger = DefaultTagger('nn')
    unigram_tagger = UnigramTagger(train_data, backoff = backoff_tagger)
    bigram_tagger = BigramTagger(train_data, backoff = unigram_tagger)
    trigram_tagger = TrigramTagger(train_data, backoff = bigram_tagger)
    return trigram_tagger
Exemplo n.º 6
0
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)

        self.dist = FreqDist()

        tagged_brown_path = os.path.join(os.path.dirname(__file__),
                                         '../data/brown_clawstags.pickle')
        train_sents = pickle.load(open(tagged_brown_path, 'rb'))

        # make sure all tuples are in the required format: (TAG, word)
        train_sents = [[t for t in sentence if len(t) == 2]
                       for sentence in train_sents]

        # default_tagger = DefaultTagger('nn')
        wn_tagger = WordNetTagger()
        names_tagger = NamesTagger(wn_tagger)
        coca_tagger = COCATagger(names_tagger)
        bigram_tagger = BigramTagger(train_sents, backoff=coca_tagger)
        trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)

        # doesn't include self cause it's a dumb tagger (would always return None)
        self._taggers = trigram_tagger._taggers
Exemplo n.º 7
0
    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger."""
        if templates is None:
            templates = brill.nltkdemo18()

        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)

        training_data = self.tagged_data_list[:cutoff]
        test_data = self.tagged_data_list[cutoff:self.dev_size]

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
                                     (r'.*', 'N')])
        if verbose == True:
            print "Regular expression tagger accuracy:\n{}\n".format(
                regex_tagger.evaluate(test_data))

        # unigram tagger
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        if verbose == True:
            print "Unigram tagger accuracy:\n{}\n".format(
                unigram_tagger.evaluate(test_data))

        # bigram tagger
        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        if verbose == True:
            print "Bigram tagger accuracy:\n{}\n".format(
                bigram_tagger.evaluate(test_data))

        # trigram tagger
        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        if verbose == True:
            print "Trigram tagger accuracy:\n{}\n".format(
                trigram_tagger.evaluate(test_data))

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        if verbose == True:
            print "Initial Brill tagger accuracy:\n{}\n".format(
                brill_tagger.evaluate(test_data))

        # folding
        for i in range(0, self.num_groups):
            # random splitting
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)

            training_data = self.tagged_data_list[:cutoff]
            test_data = self.tagged_data_list[cutoff:self.dev_size]

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)

            if verbose == True:
                print "Brill tagger accuracy, fold {}:\n{}\n".format(
                    i + 1, brill_tagger.evaluate(test_data))
Exemplo n.º 8
0
    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger."""
        if templates is None:
            templates = brill.nltkdemo18()

        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)

        training_data = self.tagged_data_list[:cutoff]
        test_data = self.tagged_data_list[cutoff:self.dev_size]

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
            (r'.*', 'N')
            ])
        if verbose == True:
            print "Regular expression tagger accuracy:\n{}\n".format(
                    regex_tagger.evaluate(test_data))

        # unigram tagger
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        if verbose == True:
            print "Unigram tagger accuracy:\n{}\n".format(
                    unigram_tagger.evaluate(test_data))

        # bigram tagger
        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        if verbose == True:
            print "Bigram tagger accuracy:\n{}\n".format(
                    bigram_tagger.evaluate(test_data))

        # trigram tagger
        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        if verbose == True:
            print "Trigram tagger accuracy:\n{}\n".format(
                    trigram_tagger.evaluate(test_data))

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        if verbose == True:
            print "Initial Brill tagger accuracy:\n{}\n".format(
                    brill_tagger.evaluate(test_data))

        # folding
        for i in range(0, self.num_groups):
            # random splitting
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)

            training_data = self.tagged_data_list[:cutoff]
            test_data = self.tagged_data_list[cutoff:self.dev_size]

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)

            if verbose == True:
                print "Brill tagger accuracy, fold {}:\n{}\n".format(
                        i+1, brill_tagger.evaluate(test_data))