def __init__(self, tsents=None):
    """
    :param tsents: list of annotated sentences; defaults to the full
        Floresta treebank when omitted.
    """
    # Load the corpus lazily: a ``floresta.tagged_sents()`` default
    # argument would be evaluated once at class-definition time, forcing
    # the corpus to be downloaded/parsed even when the caller supplies
    # its own sentences.
    self.__corpus = floresta.tagged_sents() if tsents is None else tsents
    self.__is_trained = False
    self.__tagger = None
def accuracy_measure():
    """Train and evaluate POS taggers on the Floresta and Mac-Morpho corpora.

    Splits each corpus 90/10 into train/test sets, evaluates the
    backoff-free taggers, then trains the backoff tagger chains, saving
    them to disk only when no pickled taggers are cached yet.
    """
    basicConfig(format='%(levelname)s %(message)s', level=INFO)
    info('reading tagged sentences')
    info('simplifying tags')
    # Tagged sentences (Floresta tags are simplified first).
    flo_tsents = simplified_sents_floresta(floresta.tagged_sents())
    mac_tsents = mac_morpho.tagged_sents()
    # FLORESTA train/test split: 90% train, 10% test.
    flo_size = int(len(flo_tsents) * 0.9)
    flo_train = flo_tsents[:flo_size]
    flo_test = flo_tsents[flo_size:]
    # MAC-MORPHO train/test split: 90% train, 10% test.
    mac_size = int(len(mac_tsents) * 0.9)
    mac_train = mac_tsents[:mac_size]
    mac_test = mac_tsents[mac_size:]
    no_backoff_taggers(flo_test, flo_train)
    no_backoff_taggers(mac_test, mac_train, corpus='macmorpho')
    # Persist the backoff taggers only when none exist yet (the original
    # if/else that assigned True/False collapses to a single negation).
    save = not pt.check_for_taggers()
    backoff_taggers(flo_test, flo_train, save)
    backoff_taggers(mac_test, mac_train, save, corpus='macmorpho')
def add_start_end(start, end):
    """Return the Floresta sentences in ``[start:end]`` flattened into a
    single list of (simplified tag, word) pairs, with explicit
    ("START", "START") / ("END", "END") markers around each sentence."""
    result = []
    for sentence in floresta.tagged_sents()[start:end]:
        result.append(("START", "START"))
        for word, tag in sentence:
            result.append((simplify_tag(tag), word))
        result.append(("END", "END"))
    return result
def tokenize():
    """Build a unigram tagger over the Floresta corpus, trained on all but
    the first 100 sentences and backed off to a default 'n' tagger."""
    corpus = [
        [(word.lower(), simplify_tag(tag)) for word, tag in sentence]
        for sentence in floresta.tagged_sents()
        if sentence
    ]
    test = corpus[:100]
    train = corpus[100:]
    fallback = nltk.DefaultTagger('n')
    return nltk.UnigramTagger(train, backoff=fallback)
def demo(corpus, num_sents):
    """
    Loads a few sentences from the Brown corpus or the Wall Street Journal
    corpus, trains them, tests the tagger's accuracy and tags an unseen
    sentence.

    @type corpus: C{str}
    @param corpus: Name of the corpus to load: C{brown}, C{treebank},
        C{floresta} or C{cintil}.
    @type num_sents: C{int}
    @param num_sents: Number of sentences to load from a corpus. Use a small
        number, as training might take a while.
    """
    # NOTE: Python 2 code (print statements).
    if corpus.lower() == "brown":
        from nltk.corpus import brown
        tagged_sents = brown.tagged_sents()[:num_sents]
    elif corpus.lower() == "treebank":
        from nltk.corpus import treebank
        tagged_sents = treebank.tagged_sents()[:num_sents]
    elif corpus.lower() == "floresta":
        from nltk.corpus import floresta
        tagged_sents = floresta.tagged_sents()[:num_sents]
    elif corpus.lower() == "cintil":
        print "Loading CINTIL"
        # Previous CoNLL column layouts kept for reference:
        #column_types = ['ignore','words','ignore','ignore','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/cintil/','cintil-fixed.conll',column_types)
        column_types = ['words','pos','ignore']
        #cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed.conll',column_types)
        cintil = ConllCorpusReader('/home/dsbatista/extract-publico-relationships/pos-tagger','cintil-fixed-reduced.conll',column_types)
        tagged_sents = cintil.tagged_sents()[:num_sents]
    else:
        print "Please load either the 'brown' or the 'treebank' corpus."

    # Hold out the first 10% of the sentences for evaluation; train the
    # maximum-entropy tagger on the remaining 90%.
    size = int(len(tagged_sents) * 0.1)
    train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
    maxent_tagger = MaxentPosTagger()
    maxent_tagger.train(train_sents)
    # Accuracy is computed but not printed here (reporting code is kept
    # below as a string literal).
    maxent_tagger.evaluate(test_sents)
    """ print "tagger accuracy (test %i sentences, after training %i):" % \
    (size, (num_sents - size)), maxent_tagger.evaluate(test_sents) print "\n\n" print "classify unseen sentence: ", maxent_tagger.tag(["Isto", "é", "bastante","rápido", "!"]) print "\n\n" print "show the 40 most informative features:" print maxent_tagger.classifier.show_most_informative_features(40) """
    # Pickle the trained tagger to disk (protocol 1).
    fModel = open('test.pkl',"wb")
    pickle.dump(maxent_tagger, fModel,1)
    fModel.close()
def floresta_tagger():
    """Prepare lowercased, tag-simplified Floresta sentences and split them
    into a 100-sentence test set and a training set (the remainder).

    Note: the splits are local variables only; the function returns None.
    """
    # import nltk.data
    # tagger = nltk.data.load("taggers/NAME_OF_TAGGER.pickle")
    from nltk.corpus import floresta
    sentences = []
    for sentence in floresta.tagged_sents():
        if sentence:
            sentences.append(
                [(word.lower(), simplify_tag(tag)) for word, tag in sentence]
            )
    train = sentences[100:]
    test = sentences[:100]
def retrieve_traindata():
    """Return Floresta sentences as lists of (lowercased word, universal tag)
    pairs, skipping empty sentences.

    The original implementation materialised a full intermediate corpus
    with simplified tags and then rebuilt every sentence a second time to
    convert those tags to the universal tagset; both transformations are
    applied here in a single pass with identical results.
    """
    return [
        [(w.lower(), convert_to_universal_tag(simplify_tag(t))) for w, t in sent]
        for sent in floresta.tagged_sents()
        if sent
    ]
def tagger_training_corpus():
    """Return the Floresta tagged sentences with punctuation tokens removed.

    Sentences that become empty after filtering are kept as empty lists,
    matching the original behavior.
    """
    return [
        [token for token in sentence if token[0].lower() not in string.punctuation]
        for sentence in floresta.tagged_sents()
    ]
def run(self, corpus=Corpus.FLORESTA, force=False):
    """Train a tagger on the selected corpus (Floresta or Mac-Morpho).

    :param corpus: which corpus to train on (a ``Corpus`` member).
    :param force: stored on ``self.should_force``; presumably controls
        retraining — confirm against ``train``.
    """
    self.should_force = force
    if corpus == Corpus.FLORESTA:
        print("\n##### Floresta Corpus #####")
        self.train("floresta", floresta.tagged_sents())
    elif corpus == Corpus.MAC_MORPHO:
        print("\n###### Mac Morpho Corpus #####")
        self.train("mac_morpho", mac_morpho.tagged_sents())
def train_tagger():
    """Train a bigram tagger on the tag-simplified Floresta corpus, backed
    off to a unigram tagger and finally to a default 'n' tagger."""
    print("Training taggers, please wait...")
    # Lowercase words and simplify the Floresta tags; drop empty sentences.
    corpus = [
        [(word.lower(), simplify_tag(tag)) for word, tag in sentence]
        for sentence in floresta.tagged_sents()
        if sentence
    ]
    default = nltk.DefaultTagger("n")
    unigram = nltk.UnigramTagger(corpus, backoff=default)
    return nltk.BigramTagger(corpus, backoff=unigram)
def train_pos_tagger(): tsents = floresta.tagged_sents() #Tira informações desnecessárias que acompanham as tags tsents = [[(w, simplify_tag(t)) for (w, t) in sent] for sent in tsents if sent] #Pega as 7 mil primeiras sentenças do floresta train_data = tsents[:7000] #Pega as 2266 sentenças finais do floresta test_data = tsents[7000:] tnt_pos_tagger = tnt.TnT() tnt_pos_tagger.train(train_data) #Descomente as duas linhas abaixo se tiver curiosidade de avaliar o desempenho do classificador treinado #res = tnt_pos_tagger.evaluate(test_data) #print('Desempenho do tnt_pos_tagger: ', res) return tnt_pos_tagger
def normalize(s, punctuation="!?.:;,()[] "):
    """Decode a Latin-1 byte string and strip surrounding whitespace and
    punctuation (Python 2: ``str.decode``)."""
    s = s.decode("iso-8859-1")
    s = s.strip()
    s = s.strip(punctuation)
    return s


# Python 2 script: tag a Portuguese sentence with unigram and bigram
# taggers trained on the Floresta corpus.
with open("sentence-pt") as content_file:
    content = content_file.read()

content = normalize(content)
tokens = word_tokenize(content)
sents = floresta.tagged_sents()

# Train a unigram tagger on the whole corpus and tag the input tokens.
uni_tag = ut(sents)
print uni_tag.tag(tokens)

# Split corpus into training and testing set.
train = int(len(sents) * 90 / 100)  # 90%

# Train a bigram tagger with only training data.
bi_tag = bt(sents[:train])

# Evaluates on testing data remaining 10%
# NOTE(review): ``train + 1`` skips one sentence between the train and
# test splits — looks like an off-by-one; confirm before reuse.
bi_tag.evaluate(sents[train + 1 :])

# Using the tagger.
print bi_tag.tag(tokens)
from nltk.corpus import floresta
import nltk

# Collect every POS tag occurring in the Floresta corpus and print the
# 50 most frequent ones.
tags = [tag for sent in floresta.tagged_sents() for (_word, tag) in sent]
freq = nltk.FreqDist(tags)
print(freq.most_common(50))
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='simple'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red
def get_training():
    """Split the Floresta corpus into a training set (all but the first ten
    sentences) and a target/test set (the first ten).

    :return: tuple ``(training, target)`` of tagged-sentence lists.
    """
    # Load the corpus once: the original called ``floresta.tagged_sents()``
    # twice, re-reading the treebank for each slice.
    sents = floresta.tagged_sents()
    training = sents[10:]
    target = sents[:10]
    return training, target
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(simplify_tags=True), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red
def get_dataset():
    """Download the Floresta corpus (no-op if cached) and return its
    sentences as lists of (lowercased word, simplified tag) pairs,
    skipping empty sentences."""
    nltk.download("floresta")
    dataset = []
    for sentence in floresta.tagged_sents():
        if sentence:
            dataset.append(
                [(word.lower(), simplify_tag(tag)) for word, tag in sentence]
            )
    return dataset
def main() -> None:
    """
    Program entry point.

    Main execution flow of the POS tagger:
    - load corpus data
    - split data into train/testing sets
    - handle unknown words in both sets
    - train tagger (calculate HMM's word emission and tag transition probabilities)
    - test tagger (Viterbi algorithm, backtracing and accuracy measure)

    :return: None.
    """
    parse_command_line_arguments()

    if config.corpus == "brown":  # Default corpus.
        # Only need to do this once.
        # download_brown_corpus()
        # Retrieve tagged sentences from the Brown corpus
        tagged_sentences = brown.tagged_sents(tagset='universal')
        print("Corpus used: Brown Corpus (universal tagset)")
        if config.debug:
            print_corpus_information(brown, "Brown Corpus")
    elif config.corpus == "floresta":
        # Only need to do this once.
        # download_floresta_corpus()
        tagged_sentences = floresta.tagged_sents()
        print("Corpus used: Floresta Treebank")
        if config.debug:
            # BUG FIX: this previously printed information about the Brown
            # corpus; Floresta is the corpus actually in use here.
            print_corpus_information(floresta, "Floresta Treebank")

    # Start measuring runtime.
    start_time = time.time()

    # Split data into a training and a testing set (default split 95/5 sentences).
    training_set, testing_set = split_train_test_data(tagged_sentences)
    if config.debug:
        print_number_of_sentences(training_set, "training dataset")
        print_number_of_sentences(testing_set, "testing dataset")

    # Replace infrequent words with special 'UNK' tags.
    training_words = extract_words(training_set)
    unique_training_words = remove_list_duplicates(training_words)
    training_set = handle_unknown_words(training_set, unique_training_words,
                                        is_training_set=True)
    testing_set = handle_unknown_words(testing_set, unique_training_words,
                                       is_training_set=False)

    # Store all tags from the training dataset in an ordered list (and a
    # de-duplicated copy).
    training_tags = extract_tags(training_set)
    unique_training_tags = remove_list_duplicates(training_tags)

    # Train the POS tagger by generating the tag transition and word
    # emission probability matrices of the HMM.
    tag_transition_probabilities, emission_probabilities = train_tagger(
        training_set, training_tags)

    # Test the POS tagger on the testing data using the Viterbi back-tracing
    # algorithm.
    test_tagger(testing_set, unique_training_tags, tag_transition_probabilities,
                emission_probabilities)

    print_runtime(round(time.time() - start_time, 2))  # Record and print runtime.
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='simple'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = '#FFF' #white #Colour of highlighted results
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(simplify_tags=True), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = '#FFF' #white #Colour of highlighted results
# (see http://www.nltk.org/howto/portuguese_en.html)
import nltk
from nltk.corpus import floresta
from tqdm import tqdm  # progress bar for the corpus pass below.


def extract_pos(tag):
    """Extract the syntactic-class portion of a Floresta tag, dropping the
    parts of the annotation that are irrelevant in this context."""
    return tag.split('+')[1] if '+' in tag else tag


# Original tagged sentences from the corpus.
sentences = floresta.tagged_sents()

# Build tagged_sentences from the corpus with the tags already reduced to
# their grammatical classes.
tagged_sentences = []
print('Extraindo classes gramaticais das tags do córpus:')
for sentence in tqdm(sentences):
    tagged_sentences.append(
        [(word.lower(), extract_pos(tag)) for word, tag in sentence]
    )

# Split the corpus into a training set and a test set.
train = tagged_sentences[100:]
test = tagged_sentences[:100]
from nltk.tag import hmm
from nltk.util import unique_list
from nltk.probability import *
from nltk import ConditionalProbDist
from nltk import ConditionalFreqDist
from collections import Counter
from HMM import *

# Python 2 script: load train/test splits from four tagged corpora
# (Brown, CoNLL-2000, Alpino, Floresta) for HMM tagger experiments.
# The corpus objects are assumed to come in via ``from HMM import *``.

# Load the Training and Test Sentences
print("Downloading Training Sentences from Corpus")
# First 10,000 sentences of each corpus are used for training.
trainingSentences_brown = brown.tagged_sents(tagset="universal")[:10000]
trainingSentences_conll2000 = conll2000.tagged_sents()[:10000]
trainingSentences_alpino = alpino.tagged_sents()[:10000]
trainingSentences_floresta = floresta.tagged_sents()[:10000]
print "Done!"

print("Downloading Test Sentences from Corpus")
# The following 500 sentences of each corpus are held out for testing.
testSentences_brown = brown.tagged_sents(tagset="universal")[10000:10500]
testSentences_conll2000 = conll2000.tagged_sents()[10000:10500]
testSentences_alpino = alpino.tagged_sents()[10000:10500]
testSentences_floresta = floresta.tagged_sents()[10000:10500]
print "Done!"

# Extracts words and tags from Sentences
# NOTE(review): this definition is truncated in the visible source — the
# body ends at the loop header below.
def extractWords_and_Tags(sentences):
    words = {}
    tags = {}
    for sentence in sentences:
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = '#FFF' #white #Colour of highlighted results
"English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="simple" ), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey
"English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( tagset="universal" ), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( tagset="universal" ), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( tagset="universal" ), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( files="hindi.pos", tagset="universal" ), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( tagset="universal" ), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( tagset="universal" ), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents( tagset="universal" ), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white
'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red
#!/usr/bin/env python # -*- coding: utf-8 -*- import nltk from nltk.corpus import floresta import cPickle FILENAME = "txts/floresta_trigram.pos" def simplify_tag(t): if '+' in t: return t[t.index('+')+1:] else: return t tsents = floresta.tagged_sents() tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] train = tsents[100:] test = tsents[:100] tagger0 = nltk.DefaultTagger('n') tagger1 = nltk.UnigramTagger(train, backoff=tagger0) tagger2 = nltk.BigramTagger(train, backoff=tagger1) tagger = nltk.TrigramTagger(train, backoff=tagger2) tagger.evaluate(test) with open(FILENAME, 'wb') as outFile: cPickle.dump(tagger, outFile, -1)
"English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="universal"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="universal"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="universal"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="universal"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="universal"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results
def __create_tagger():
    """Build a bigram tagger trained on the Floresta corpus, backing off to
    a unigram tagger trained on the Mac-Morpho corpus."""
    unigram_backoff = UnigramTagger(nltk.corpus.mac_morpho.tagged_sents())
    return nltk.BigramTagger(floresta.tagged_sents(), backoff=unigram_backoff)