def train_bigram(data):
    """Train a bigram tagger on *data*, pickle it, and print a sample tagging.

    The trained tagger is written to ``<PACKDIR>/nib_marburg/bigram.pickle``.
    A fixed sample sentence is then tagged and the result printed as a quick
    smoke check. Nothing is returned.
    """
    tagger = BigramTagger(data)

    pickle_path = os.path.join(PACKDIR, "nib_marburg", "bigram.pickle")
    with open(pickle_path, "wb") as out_file:
        pickle.dump(tagger, out_file)

    sample_tokens = "uns ist in alten mæren wunders vil geseit".split(" ")
    print(tagger.tag(sample_tokens))
def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
    """Tokenize *sent* and tag it with a trigram/bigram/unigram backoff chain.

    The sentence is UTF-8 encoded, lower-cased, stripped and passed through
    ``self.regexTokenizer``. A fresh tagger chain is trained from
    ``self.reader_train`` on every call.

    sent     -- sentence text to tag
    tagregex -- when true, a RegexpTagger built from
                ``patterns.regex_patterns`` sits between the unigram tagger
                and the default tagger
    deftag   -- tag assigned by the last-resort DefaultTagger
    verbose  -- when true, evaluate the chain against ``self.test_sent``,
                store the score in ``self.tagAccuracy`` and print it

    Returns the result of tagging the tokenized sentence.
    """
    encoded = sent.encode('utf-8')
    tokens = self.regexTokenizer(encoded.lower().strip())

    # IMPORTANT: in some cases the default tag has to stay 'XX' so that
    # entities can be identified later on.
    fallback = DefaultTagger(deftag)
    if tagregex:
        fallback = RegexpTagger(patterns.regex_patterns, backoff=fallback)

    unigram = UnigramTagger(self.reader_train, backoff=fallback)
    bigram = BigramTagger(self.reader_train, backoff=unigram)
    trigram = TrigramTagger(self.reader_train, backoff=bigram)

    # Accuracy figures noted by the original author, using the PAN
    # Localization Indonesian dataset "UI-1M-tagged.txt":
    #   with the regex tagger    : 77%
    #   without the regex tagger : > 90%

    if verbose:
        # The larger the document, the longer the accuracy computation
        # takes; recommended for testing only.
        print("Calculating Tagger Accuracy...")
        self.tagAccuracy = trigram.evaluate(self.test_sent)
        print("Accuracy is: %s" % (self.tagAccuracy))

    return trigram.tag(tokens)
def unigram_bigram_tagger(train_sentences):
    """Build a bigram tagger that backs off to unigram, then to tag "NN".

    Both n-gram taggers are trained on *train_sentences*. The outermost
    (bigram) tagger is returned.
    """
    fallback = DefaultTagger("NN")
    unigram = UnigramTagger(train_sentences, backoff=fallback)
    return BigramTagger(train_sentences, backoff=unigram)
def ngramtagging(train):
    """Return a trigram tagger backing off to bigram, unigram and 'nn'.

    *train* is wrapped in a one-element list before being handed to the
    taggers, so it is presumably a single tagged sentence rather than a
    whole corpus -- confirm against the caller.
    """
    # POS tagging setup: every n-gram tagger is trained on the same data.
    corpus = [train]

    fallback = DefaultTagger('nn')
    unigram = UnigramTagger(corpus, backoff=fallback)
    bigram = BigramTagger(corpus, backoff=unigram)
    return TrigramTagger(corpus, backoff=bigram)
def __init__(self, *args, **kwargs):
    """Set up the backoff chain from the pickled, tagged Brown corpus.

    All positional and keyword arguments are forwarded unchanged to
    ``SequentialBackoffTagger.__init__``.

    The chain is: trigram -> bigram -> COCATagger -> NamesTagger ->
    WordNetTagger. The n-gram taggers are trained on the sentences loaded
    from ``../data/brown_clawstags.pickle`` (relative to this module).
    """
    SequentialBackoffTagger.__init__(self, *args, **kwargs)
    self.dist = FreqDist()

    tagged_brown_path = os.path.join(os.path.dirname(__file__),
                                     '../data/brown_clawstags.pickle')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(tagged_brown_path, 'rb') as corpus_file:
        train_sents = pickle.load(corpus_file)

    # Drop any token tuple that is not a pair; the n-gram taggers need
    # exactly two elements per token.
    train_sents = [[t for t in sentence if len(t) == 2]
                   for sentence in train_sents]

    wn_tagger = WordNetTagger()
    names_tagger = NamesTagger(wn_tagger)
    coca_tagger = COCATagger(names_tagger)
    bigram_tagger = BigramTagger(train_sents, backoff=coca_tagger)
    trigram_tagger = TrigramTagger(train_sents, backoff=bigram_tagger)

    # Adopt the trigram tagger's chain directly. This object is left out of
    # it on purpose: it does no tagging itself (it would always return None).
    self._taggers = trigram_tagger._taggers
def train(self, templates=None, verbose=True):
    """Train a new Brill tagger.

    A regex -> unigram -> bigram -> trigram backoff chain is trained and
    used as the initial tagger for a ``BrillTaggerTrainer``. The Brill
    tagger is trained once, then retrained ``self.num_groups`` more times
    on freshly reshuffled splits.

    templates -- Brill rule templates; defaults to ``brill.nltkdemo18()``
    verbose   -- when truthy, evaluate every tagger on the held-out split
                 and print its accuracy

    Reads ``self.tagged_data_list``, ``self.dev_size``, ``self.train_size``,
    ``self.max_rules``, ``self.min_score`` and ``self.num_groups``.

    Side effects: re-seeds the global ``random`` generator and shuffles
    ``self.tagged_data_list`` in place.

    NOTE(review): the visible code neither returns nor stores the trained
    Brill tagger -- confirm whether callers expect it somewhere.
    """
    if templates is None:
        templates = brill.nltkdemo18()

    def split_data():
        # Reshuffle in place, then slice into (training, test). The seed is
        # always the list length, so every call applies the same
        # permutation to the list's *current* order.
        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)
        return (self.tagged_data_list[:cutoff],
                self.tagged_data_list[cutoff:self.dev_size])

    def report(label, tagger, data):
        # Evaluation can be slow, so only run it when output is wanted.
        # Single-argument print(...) prints the same on Python 2 and 3.
        if verbose:
            print("{}:\n{}\n".format(label, tagger.evaluate(data)))

    training_data, test_data = split_data()

    # very simple regular expression tagger
    # NOTE(review): the '.' in the number pattern is unescaped (matches any
    # character) and numbers are tagged 'PUNCT' -- confirm this is intended.
    regex_tagger = RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
        (r'.*', 'N'),
    ])
    report("Regular expression tagger accuracy", regex_tagger, test_data)

    # unigram tagger
    unigram_tagger = UnigramTagger(train=training_data,
                                   backoff=regex_tagger)
    report("Unigram tagger accuracy", unigram_tagger, test_data)

    # bigram tagger
    bigram_tagger = BigramTagger(train=training_data,
                                 backoff=unigram_tagger)
    report("Bigram tagger accuracy", bigram_tagger, test_data)

    # trigram tagger
    trigram_tagger = TrigramTagger(train=training_data,
                                   backoff=bigram_tagger)
    report("Trigram tagger accuracy", trigram_tagger, test_data)

    # first iteration
    trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                 templates=templates)
    brill_tagger = trainer.train(train_sents=training_data,
                                 max_rules=self.max_rules,
                                 min_score=self.min_score)
    report("Initial Brill tagger accuracy", brill_tagger, test_data)

    # folding
    for i in range(0, self.num_groups):
        # random splitting
        training_data, test_data = split_data()
        # note that .train method returns a BrillTagger() object
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        report("Brill tagger accuracy, fold {}".format(i + 1),
               brill_tagger, test_data)
def train(self, templates=None, verbose=True):
    """Train a new Brill tagger.

    Builds a regex -> unigram -> bigram -> trigram backoff chain, hands the
    trigram tagger to a ``BrillTaggerTrainer`` as the initial tagger, trains
    a Brill tagger once, and then retrains it ``self.num_groups`` times on
    reshuffled train/test splits.

    templates -- Brill rule templates; ``brill.nltkdemo18()`` when None
    verbose   -- when truthy, print each tagger's accuracy on the held-out
                 split (this runs ``evaluate`` and can be slow)

    Uses ``self.tagged_data_list``, ``self.dev_size``, ``self.train_size``,
    ``self.max_rules``, ``self.min_score`` and ``self.num_groups``.

    Side effects: the global ``random`` generator is re-seeded and
    ``self.tagged_data_list`` is shuffled in place.

    NOTE(review): the trained Brill tagger is neither returned nor stored
    by the visible code -- confirm how callers are meant to obtain it.
    """
    if templates is None:
        templates = brill.nltkdemo18()

    def make_split():
        # Shuffle in place and cut into (training, test). The seed is the
        # list length each time, so the same permutation is applied to
        # whatever order the list currently has.
        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)
        training = self.tagged_data_list[:cutoff]
        held_out = self.tagged_data_list[cutoff:self.dev_size]
        return training, held_out

    def show_accuracy(label, tagger, data):
        # Skip the evaluation entirely unless output was requested.
        # print(...) with one argument prints the same on Python 2 and 3.
        if verbose:
            print("{}:\n{}\n".format(label, tagger.evaluate(data)))

    training_data, test_data = make_split()

    # very simple regular expression tagger
    # NOTE(review): '.' in the numeric pattern is not escaped, and numeric
    # tokens get the tag 'PUNCT' -- confirm both are intentional.
    regex_tagger = RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
        (r'.*', 'N'),
    ])
    show_accuracy("Regular expression tagger accuracy",
                  regex_tagger, test_data)

    # unigram tagger
    unigram_tagger = UnigramTagger(train=training_data,
                                   backoff=regex_tagger)
    show_accuracy("Unigram tagger accuracy", unigram_tagger, test_data)

    # bigram tagger
    bigram_tagger = BigramTagger(train=training_data,
                                 backoff=unigram_tagger)
    show_accuracy("Bigram tagger accuracy", bigram_tagger, test_data)

    # trigram tagger
    trigram_tagger = TrigramTagger(train=training_data,
                                   backoff=bigram_tagger)
    show_accuracy("Trigram tagger accuracy", trigram_tagger, test_data)

    # first iteration
    trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                 templates=templates)
    brill_tagger = trainer.train(train_sents=training_data,
                                 max_rules=self.max_rules,
                                 min_score=self.min_score)
    show_accuracy("Initial Brill tagger accuracy", brill_tagger, test_data)

    # folding
    for i in range(0, self.num_groups):
        # random splitting
        training_data, test_data = make_split()
        # note that .train method returns a BrillTagger() object
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        show_accuracy("Brill tagger accuracy, fold {}".format(i + 1),
                      brill_tagger, test_data)