def exercise1():
    """Demo of POS tagging: tag an English sentence, inspect tagged
    corpora (Brown, Sinica), and score a 'NN'-everything baseline tagger.

    Side effects: prints tokens, tagged tokens, corpus samples, and the
    baseline tagger's accuracy on the Brown 'news' category.
    """
    # Tokenize and tag an English sentence with the default NLTK tagger.
    text = nltk.word_tokenize("You are a good man, but i don't like you!")
    print(text)
    print(nltk.pos_tag(text))
    # NOTE: the original called nltk.tag.pos_tag() with no arguments,
    # which raises TypeError (it requires a token list); removed.
    # Brown corpus words with their native Brown tags.
    words_tag = brown.tagged_words(categories='news')
    print(words_tag[:30])
    # Same words mapped onto the simplified universal tagset.
    # ('wsj' and 'brown' are not valid tagset-mapping targets and raised
    # at runtime; the native Brown tags are the default above, so those
    # two calls were dropped.)
    words_tag = brown.tagged_words(categories='news', tagset='universal')
    print(words_tag[:30])
    # Chinese: tagged sentences from the Sinica treebank.
    words_tag = sinica_treebank.tagged_sents()
    print(words_tag)
    # Baseline: tag every token 'NN' and measure accuracy on Brown news.
    raw = "You are a good man, but i don't love you!"
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    tagged_words = default_tagger.tag(tokens)  # was misspelled 'lagged_words'
    print(tagged_words)
    tagged_sents = brown.tagged_sents(categories='news')
    print(default_tagger.evaluate(tagged_sents))
def Tag_Word_model():
    """Train a backoff chain of taggers (default -> unigram -> bigram)
    on the Sinica treebank and print held-out accuracy.

    Returns:
        nltk.BigramTagger: the trained bigram tagger (with backoff).
    """
    sinica_treebank_tagged_sents = sinica_treebank.tagged_sents()
    size = int(len(sinica_treebank_tagged_sents) * 0.9)
    train_sents = sinica_treebank_tagged_sents[:size]  # 90% for training
    test_sents = sinica_treebank_tagged_sents[size:]   # 10% held out for testing
    t0 = nltk.DefaultTagger('Nab')  # default tag: common noun
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)  # unigram tagging
    t2 = nltk.BigramTagger(train_sents, backoff=t1)   # bigram tagging
    # dump_result(t2.tag(test_sents))
    # Fixed: evaluate on the held-out split, not on train_sents — scoring
    # on the training data overstates accuracy (the unigram backoff has
    # memorized it). Also converted the Python 2 `print` statement to a
    # print() call so the file parses under Python 3.
    print(t2.evaluate(test_sents))
    return t2
def load_featuresets():
    """Build (features, tag) training pairs from the Sinica treebank.

    For each token, features are computed from the untagged sentence,
    the token's position, and the tags of the two preceding tokens
    (''<START>'' markers at the beginning of a sentence).

    Returns:
        list[tuple]: (feature_dict, tag) pairs, one per corpus token.
    """
    tagged_sents = sinica.tagged_sents()
    featuresets = []
    for tagged_sent in tagged_sents:
        untagged_sent = nltk.tag.untag(tagged_sent)
        # Fixed: reset the tag history at each sentence boundary. The
        # '<START>' markers were previously initialized only once, so the
        # final tags of one sentence leaked into the first features of
        # the next sentence.
        prev_tag = '<START>'
        prev2_tag = '<START>'
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append(
                (get_features(untagged_sent, i, prev_tag, prev2_tag), tag)
            )
            prev2_tag = prev_tag
            prev_tag = tag
    return featuresets
def __init__(self, mode, language):
    """Prepare train/test splits and backoff taggers for a language.

    Args:
        mode: 'unigram' or 'bigram' — which tagger chain to build.
        language: 'en' (Brown news corpus, default tag 'NN') or
            'zh' (Sinica treebank, default tag 'Nab'); case-insensitive.

    Raises:
        ValueError: if `language` is neither 'en' nor 'zh'.
    """
    self.mode = mode
    if language.lower() == 'en':
        self.tagged_sents = brown.tagged_sents(categories='news')
        self.default_tagger = nltk.DefaultTagger('NN')
    elif language.lower() == 'zh':
        self.tagged_sents = sinica_treebank.tagged_sents()  # tagged per sentence
        self.default_tagger = nltk.DefaultTagger('Nab')
    else:
        # Fixed: a bare `raise` outside an `except` block fails with
        # "RuntimeError: No active exception to re-raise"; raise a
        # meaningful exception carrying the message instead of printing.
        raise ValueError('only supports en or zh as language.')
    # 90/10 train/test split; drop the full list once split to save memory.
    self.train_size = int(len(self.tagged_sents) * 0.9)
    self.train_sets = self.tagged_sents[:self.train_size]
    self.test_sets = self.tagged_sents[self.train_size:]
    del self.tagged_sents
    if self.mode == 'unigram':
        self.unigram_tagger = nltk.UnigramTagger(
            train=self.train_sets, backoff=self.default_tagger)
    if self.mode == 'bigram':
        # Bigram tagger backs off to a unigram tagger, which backs off
        # to the per-language default tagger.
        self.unigram_tagger = nltk.UnigramTagger(
            train=self.train_sets, backoff=self.default_tagger)
        self.bigram_tagger = nltk.BigramTagger(
            train=self.train_sets, backoff=self.unigram_tagger)
lambda: brown.tagged_sents(categories="science_fiction", tagset="universal"), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="universal"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="universal"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="universal"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="universal"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="universal"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="universal"), "Portuguese: MAC-MORPHO Corpus (Brazil)":
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='universal'), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='universal'), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'), 'Portuguese: MAC-MORPHO Corpus (Brazil)':
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='simple'), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='simple'), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='simple'), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'), 'Portuguese: MAC-MORPHO Corpus (Brazil)':
'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', simplify_tags=True), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', simplify_tags=True), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(simplify_tags=True), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True), 'Portuguese: MAC-MORPHO Corpus (Brazil)':
lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', simplify_tags=True), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', simplify_tags=True), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(simplify_tags=True), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True), 'Portuguese: MAC-MORPHO Corpus (Brazil)':
), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents( categories="romance", tagset="universal" ), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents( categories="humor", tagset="universal" ), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts( tagset="universal" ), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( tagset="universal" ), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( tagset="universal" ), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( tagset="universal" ), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( files="hindi.pos", tagset="universal" ), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( tagset="universal" ),
"English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="simple"), "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( categories=["news", "editorial", "reviews"], tagset="simple" ), "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="simple" ), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"), "Spanish: CESS-ESP 
Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white
VP -> V NP | V NP PP PP -> P NP V -> "saw" | "ate" | "walked" | '爱' NP -> "John" | "Mary" | "Bob" | Det N | Det N PP | '我' | '你' Det -> "a" | "an" | "the" | "my" N -> "man" | "dog" | "cat" | "telescope" | "park" P -> "in" | "on" | "by" | "with" ''') s = '我 爱 你' tokens = nltk.word_tokenize(s) from nltk.corpus import sinica_treebank sinica_treebank_tagged_sents = sinica_treebank.tagged_sents() # 以句为单位标 size = int(len(sinica_treebank_tagged_sents) * 0.9) train_sents = sinica_treebank_tagged_sents[:size] # 90% 数据作为训练集 test_sents = sinica_treebank_tagged_sents[size:] # 10% 数据作为测试集 t0 = nltk.DefaultTagger('Nab') # 词性的默认值为名词 t1 = nltk.UnigramTagger(train_sents, backoff=t0) # 一元标注 t2 = nltk.BigramTagger(train_sents, backoff=t1) # 多元(二元)标注 dump_result(t2.tag(tokens)) print(t2.evaluate(test_sents)) # 根据带标注的文本,评估标注器的正确率 exit() tag = nltk.pos_tag(tokens) for i in tag:
import nltk
from nltk.corpus import sinica_treebank as sinica
import Segmenter
from Segmenter import sent_segment
import PCFG
from PCFG import PCFGChino
import time

####################################
## This part is not important for testing the parser.

tagged_sents = sinica.tagged_sents()
sents = sinica.sents()
size = int(len(tagged_sents) * 0.9)
train_set = tagged_sents[:size]   # 90% for training
test_set = tagged_sents[size:]    # 10% held out for evaluation
##trigram_tagger= nltk.TrigramTagger(train_set)
##score= trigram_tagger.evaluate(test_set)

# Fixed: the script mixed Python 2 `print` statements with Python 3
# print() calls, so it could not run consistently under either
# interpreter; all prints are now Python 3 function calls.
print("Entrenando")
ini = time.time()
# Backoff chain: trigram -> bigram -> unigram -> default ('Nab' noun).
t0 = nltk.DefaultTagger('Nab')
t1 = nltk.UnigramTagger(train_set, backoff=t0)
t2 = nltk.BigramTagger(train_set, backoff=t1)
t3 = nltk.TrigramTagger(train_set, backoff=t2)
fin = time.time()
score = t3.evaluate(test_set)
print("Entrenamiento terminado ", str(fin - ini))
print("Evaluation Tagger= ", score)
####################################
## The parser is created below