def parse_sections(sections):
    for section_name, lines in sections.iteritems():
        backup_tagger = nltk.data.load(
            'taggers/maxent_treebank_pos_tagger/english.pickle')
        medicines = MedicineTagger("drugs.txt", backup_tagger)
        tagger = RegexpTagger(tokens, medicines)
        sents = sent_tokenize(" ".join(lines))
        sents = [tagger.tag(nltk.word_tokenize(s)) for s in sents]
        #prev = []
        #w = word_tokenize(s)
        #for i in range(0, len(w)):
        #    prev.append(tagger.choose_tag(w, i, prev))
        #print ",".join([j or "" for j in prev])
        p = nltk.RegexpParser(parse_rules)
        for i in range(0, len(sents)):
            #print sents[i]
            if len(sents[i]) == 0:
                continue
            result = p.parse(sents[i])
            if (len(sys.argv) > 2 and sys.argv[2] == 'debug'):
                print result
            for child in result:
                # The only trees will be the ones we created.
                #if isinstance(child, nltk.tree.Tree):
                anything_useful(section_name, child)
def __init__(self, regexps=None, backoff=None):
    """Setup for RegexpLemmatizer()

    :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
    :param backoff: Next lemmatizer in backoff chain.
    """
    SequentialBackoffLemmatizer.__init__(self, backoff)
    RegexpTagger.__init__(self, regexps, backoff)
    self._regexs = regexps
def __init__(self, regexps=None, backoff=None):
    """Setup for RegexpLemmatizer()

    :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
    :param backoff: Next lemmatizer in backoff chain.
    """
    SequentialBackoffLemmatizer.__init__(self, backoff)
    RegexpTagger.__init__(self, regexps, backoff)
    self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
    self._regexs = [(re.compile(regexp), pattern,)
                    for regexp, pattern in regexps]
def __init__(self: object, regexps=None, source=None, backoff=None, verbose: bool = False):
    """Setup for RegexpLemmatizer()

    :type regexps: list
    :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
    :param backoff: Next lemmatizer in backoff chain.
    """
    SequentialBackoffLemmatizer.__init__(self, backoff=None, verbose=verbose)
    RegexpTagger.__init__(self, regexps, backoff)
    self._regexs = regexps
    self.source = source
def __init__(self: object, regexps=None, backoff=None, source: str = None, verbose: bool = False):
    """Setup for RegexpLemmatizer()

    :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
    :param backoff: Next lemmatizer in backoff chain
    :param source: String for labelling lemmatizer in repr; used by verbose mode
    :param verbose: Flag to include which lemmatizer assigned a given tag in the return tuple
    """
    SequentialEnsembleLemmatizer.__init__(self, backoff=None, verbose=verbose)
    RegexpTagger.__init__(self, regexps, backoff)
    self._regexs = regexps
    self.source = source
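The constructors above only store the (PATTERN, REPLACEMENT) pairs; the substitution itself happens in a choose_tag-style hook driven by the sequential backoff machinery. The following is a minimal illustrative sketch of how such a hook could apply the pairs, not the library's actual implementation:

import re

def choose_tag(self, tokens, index, history):
    # Illustrative sketch only; the real RegexpLemmatizer may differ.
    # Return the first substitution whose PATTERN matches the token,
    # or None so the next lemmatizer in the backoff chain is tried.
    for pattern, replacement in self._regexs:
        if re.search(pattern, tokens[index]):
            return re.sub(pattern, replacement, tokens[index])
    return None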
def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
    kalimat = sent.encode('utf-8')
    text = self.regexTokenizer(kalimat.lower().strip())

    ## :> --___<<IMPORTANT>>___--
    ## For some purposes the default tag must be left as 'XX'
    ## so that entities can be identified later.
    backoff_tagger = DefaultTagger(deftag)

    if tagregex:
        regexp_tagger = RegexpTagger(patterns.regex_patterns, backoff=backoff_tagger)
        unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
    else:
        unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)

    bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)

    # Using the PAN Localization Indonesian dataset "UI-1M-tagged.txt",
    # the tagger combination above reaches the following accuracy:
    #   with the regex tagger:    77%
    #   without the regex tagger: > 90%

    if verbose:
        # The larger the document, the longer the accuracy computation takes;
        # recommended for testing only.
        print("Calculating Tagger Accuracy...")
        self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
        print("Accuracy is: %s" % (self.tagAccuracy))

    return trigram_tagger.tag(text)
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

regexp_tagger = RegexpTagger([
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),  # articles
    (r'.*able$', 'JJ'),               # adjectives
    (r'.*ness$', 'NN'),               # nouns formed from adjectives
    (r'.*ly$', 'RB'),                 # adverbs
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # past tense verbs
    (r'.*', 'NN'),                    # nouns (default)
])
print(regexp_tagger.evaluate(test_data))
# 0.31306687929831556
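The snippet above assumes brown_tagged_sents and default_tagger are defined earlier. A minimal setup sketch that would make it runnable; the corpus category choice here is an assumption, so the accuracy figures in the comments may not reproduce exactly:

import nltk
from nltk.corpus import brown
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger

# nltk.download('brown')  # uncomment on first use
# Assumed setup: the original may have used a different corpus slice.
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger = DefaultTagger('NN')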
>>> unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
>>> print unigram_tagger.evaluate(test_data)

>>> bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
>>> print bigram_tagger.evaluate(test_data)

>>> trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
>>> print trigram_tagger.evaluate(test_data)

# Regex tagger
>>> from nltk.tag.sequential import RegexpTagger
>>> regexp_tagger = RegexpTagger(
...     [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
...      (r'(The|the|A|a|An|an)$', 'AT'),  # articles
...      (r'.*able$', 'JJ'),               # adjectives
...      (r'.*ness$', 'NN'),               # nouns formed from adjectives
...      (r'.*ly$', 'RB'),                 # adverbs
...      (r'.*s$', 'NNS'),                 # plural nouns
...      (r'.*ing$', 'VBG'),               # gerunds
...      (r'.*ed$', 'VBD'),                # past tense verbs
...      (r'.*', 'NN')])                   # nouns (default)
>>> print regexp_tagger.evaluate(test_data)

# NER tagger
>>> import nltk
>>> from nltk import ne_chunk
>>> from nltk import word_tokenize
>>> sent = "Mark is studying at Stanford University in California"
>>> print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False))
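The NER example at the end of the session needs a few NLTK data packages in addition to the tokenizer. A hedged setup sketch; the package names below are those used by classic NLTK releases and may differ slightly in newer versions:

import nltk

# Newer NLTK releases may use suffixed names such as
# 'averaged_perceptron_tagger_eng' or 'maxent_ne_chunker_tab'.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')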
def train(self, templates=None, verbose=True):
    """Train a new Brill tagger."""
    if templates is None:
        templates = brill.nltkdemo18()

    random.seed(len(self.tagged_data_list))
    random.shuffle(self.tagged_data_list)
    cutoff = int(self.dev_size * self.train_size)
    training_data = self.tagged_data_list[:cutoff]
    test_data = self.tagged_data_list[cutoff:self.dev_size]

    # very simple regular expression tagger
    regex_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
                                 (r'.*', 'N')])
    if verbose == True:
        print "Regular expression tagger accuracy:\n{}\n".format(
            regex_tagger.evaluate(test_data))

    # unigram tagger
    unigram_tagger = UnigramTagger(train=training_data, backoff=regex_tagger)
    if verbose == True:
        print "Unigram tagger accuracy:\n{}\n".format(
            unigram_tagger.evaluate(test_data))

    # bigram tagger
    bigram_tagger = BigramTagger(train=training_data, backoff=unigram_tagger)
    if verbose == True:
        print "Bigram tagger accuracy:\n{}\n".format(
            bigram_tagger.evaluate(test_data))

    # trigram tagger
    trigram_tagger = TrigramTagger(train=training_data, backoff=bigram_tagger)
    if verbose == True:
        print "Trigram tagger accuracy:\n{}\n".format(
            trigram_tagger.evaluate(test_data))

    # first iteration
    trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                 templates=templates)
    brill_tagger = trainer.train(train_sents=training_data,
                                 max_rules=self.max_rules,
                                 min_score=self.min_score)
    if verbose == True:
        print "Initial Brill tagger accuracy:\n{}\n".format(
            brill_tagger.evaluate(test_data))

    # folding
    for i in range(0, self.num_groups):
        # random splitting
        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)
        training_data = self.tagged_data_list[:cutoff]
        test_data = self.tagged_data_list[cutoff:self.dev_size]

        # note that .train method returns a BrillTagger() object
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        if verbose == True:
            print "Brill tagger accuracy, fold {}:\n{}\n".format(
                i + 1, brill_tagger.evaluate(test_data))