def main():
    treebank_tagged_sents = TreebankNoTraces()  # Remove trace tokens.
    training_set = treebank_tagged_sents[:3000]  # This is the train-test split that we will use.
    test_set = treebank_tagged_sents[3000:]

    vocabulary = makeVocab(training_set)
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    # Print the first sentence of each data set.
    print " ".join(untag(training_set_prep[0]))  # See nltk.tag.util module.
    print " ".join(untag(test_set_prep[0]))

    # Estimate Bigram HMM from the training set, report level of ambiguity.
    bigram_hmm = BigramHMM()
    bigram_hmm.Train(training_set_prep)
    print "Percent tag ambiguity in training set is %.2f%%." % bigram_hmm.ComputePercentAmbiguous(training_set_prep)
    print "Joint probability of the first sentence is %s." % bigram_hmm.JointProbability(training_set_prep[0])

    # Implement the most common class baseline. Report accuracy of the predicted tags.
    test_set_predicted_baseline = MostCommonClassBaseline(training_set_prep, test_set_prep)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)

    # Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep)
    print "--- Bigram HMM accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
def replace_names(text, method="fake", replacechar="_"):
    newlines = []
    names = []
    fakenames = []
    print(st.tag_sents(text))
    lines = text.splitlines()
    for line in tqdm(lines):
        tagline = st.tag(line.split())
        for i, tag in enumerate(tagline):
            if tag[1] == "PERSON":
                newtag = ["", ""]
                word, classification = tag
                if method == "fake":
                    fake = getFakeFirstName()
                    fakenames.append(fake)
                    names.append(word)
                    if tag[0] in names:
                        newtag[0] = fakenames[names.index(word)]
                        print(newtag[0])
                    else:
                        newtag[0] = fakenames[i]
                else:
                    newtag[0] = replace_with_char(word, replacechar)
                newtag[1] = classification
                newtag = tuple(newtag)
                tagline[i] = newtag
        newline = " ".join(untag(tagline))
        newlines.append(newline)
    formatted = "\n".join(newlines)
    newlines.clear()
    fakenames.clear()
    return (formatted, names)
def replace_names_nltk(text, method="fake", replacechar="_"):
    newlines = []
    names = []
    fakenames = []
    lines = nltk.word_tokenize(text, preserve_line=True)
    tagline = nltk.pos_tag(lines)
    namedEnt = nltk.ne_chunk(tagline, binary=False)
    tree = namedEnt.pos()
    for i, tag in enumerate(tree):
        if "PERSON" in tag:
            print(tag[0][0])
            newtag = ["", ""]
            if method == "fake":
                fake = getFakeFirstName()
                names.append(tag)
                fakenames.append(fake)
                if tag[0] in names:
                    newtag[0] = fakenames[names.index(tag[0])]
                else:
                    newtag[0] = fake
            else:
                newtag[0] = replace_with_char(tag[0][0], replacechar)
            newtag[1] = "0"
            newtag = tuple(newtag)
            tagline[i] = newtag
    newline = d.detokenize(untag(tagline))
    # remove whitespace around single quotes with regex
    subbed = re.sub(r'( ’ )', "'", newline)
    newlines.append(subbed)
    formatted = "\n".join(newlines)
    return (formatted, names)
def appenddata(sent_tag):
    X, y = [], []
    for tagged in sent_tag:
        X.append([feature(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
def chunk(self, sentence):
    tagged_tree = self.parse(pos_tag(word_tokenize(sentence.lower())))
    chunks = []
    for subtree in tagged_tree.subtrees(filter=tree_filter):
        chunks.append(untag(subtree.leaves()))
    max_length = 0
    for i in range(len(chunks)):
        if len(chunks[i]) > max_length:
            chunk = chunks[i]
            max_length = len(chunks[i])
    output = ''
    if len(chunks) > 0:
        for i in range(len(chunk)):
            if not chunk[i] == '.' and not chunk[i] == ',' and not i == 0:
                output = output + ' ' + chunk[i]
            else:
                output = output + chunk[i]
        index = sentence.lower().find(output)
        output = sentence[index:len(output) + index]
    return output
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
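# Added illustration (not from the original sources): a minimal sketch of how a
# transform_to_dataset-style helper like the one above is typically driven.
# `features` here is a hypothetical stand-in for the feature extractor those
# snippets assume; nltk.corpus.treebank and nltk.tag.util.untag are the only
# real APIs used.
from nltk.corpus import treebank
from nltk.tag.util import untag

def features(sentence, index):
    # Simple per-token features: the word plus a little positional context.
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'prev_word': '' if index == 0 else sentence[index - 1],
    }

def transform_to_dataset(tagged_sentences):
    # Same shape as the helper above: one list of feature dicts and one list
    # of tag sequences per tagged sentence.
    X, y = [], []
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y

X, y = transform_to_dataset(treebank.tagged_sents()[:10])
print(len(X), len(y))  # 10 feature-dict sequences, 10 tag sequences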
def UDuntagged(path):
    newfile = open('UDuntagged.txt', 'w')
    sents = [untag(sent) for sent in read_tagged_sents(path)]
    for sent in sents:
        for word in sent:
            newfile.write(word + ' ')
        newfile.write('\n')
def transform_to_dataset(tagged_sentences):
    x, y = [], []
    for tagged in tagged_sentences:
        x.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return x, y
def evaluate(self, gold):
    "overriding evaluate from nltk.TaggerI, it seems to have a bug"
    tagged_sents = [list(s) for s in self.tag_sents(untag(sent) for sent in gold)]
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    return accuracy(gold_tokens, test_tokens)
def main():
    treebank_tagged_sents = TreebankNoTraces()
    training_set = treebank_tagged_sents[:3000]
    test_set = treebank_tagged_sents[3000:]

    # 1. Preprocessing:
    vocabulary = Set_up_volcabulary(training_set)
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    # Print the first sentence of each data set.
    print "--- First sentence of each data set after preprocessing ---"
    print " ".join(untag(training_set_prep[0]))
    print " ".join(untag(test_set_prep[0]))
    print "\n"

    # 2. Implement the most common class baseline. Report accuracy of the predicted tags.
    test_set_predicted_baseline = MostCommonClassBaseline(training_set_prep, test_set_prep)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)
    print "\n"

    # Estimate Bigram HMM from the training set, report level of ambiguity.
    bigram_hmm = BigramHMM()
    bigram_hmm.Train(training_set_prep)
    print "--- Training ---"
    print "Percent tag ambiguity(tokens) in training set is %.2f%%." % bigram_hmm.ComputePercentAmbiguous(training_set_prep)
    print "For comparison, percent tag ambiguity(words) in training set is %.2f%%." % bigram_hmm.ComputePercentAmbiguous1(vocabulary)
    print "Joint probability of the first sentence is %s." % bigram_hmm.JointProbability(training_set_prep[0])
    print "\n"

    # Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep)
    print "--- Testing ---"
    print "--- Bigram HMM accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
    print "\n"

    # Confusion matrix
    print "--- Confusion matrix ---"
    ConfusionMatrix(test_set_prep, test_set_predicted_bigram_hmm)
def splitWordsTagsTestingNoDelim(self):
    sentences = []
    tags = []
    for s in self.check_sents:
        # untag the sentence, and append it to the list of sentences
        sentences.append(untag(s))
        tags.append([t for (_, t) in s])  # add all tags in a sentence to a list of tags
    return sentences, tags
def BuildTrainVocab(training):
    first_occur = set()
    s_occur = set()
    for sent in training:
        for token in untag(sent):
            if (token not in s_occur and token in first_occur):
                s_occur.add(token)
            if (token not in first_occur):
                first_occur.add(token)
    return s_occur
def main():
    character, pinyin = readCorpus('C:\Users\wruan02\Documents\GitHub\Comp150NLP')
    token_char = tokenizeCharacter(character[0:])
    token_pinyin = tokenizePinyin(pinyin[0:])
    dataset = organizeSentence(token_char, token_pinyin)
    training_set = dataset[0:500]
    test_set = dataset[501:]

    # Get the vocabulary first (from the raw training set; it has not been preprocessed yet).
    vocabulary, labelset = getVoc(training_set)
    print len(vocabulary)
    print len(labelset)

    """ Transform the data sets by eliminating unknown words and adding sentence boundary tokens. """
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    """ Print the first sentence of each data set. """
    # print training_set_prep[0]
    print " ".join(untag(training_set_prep[0]))  # See nltk.tag.util module.
    print " ".join(untag(test_set_prep[0]))
    print test_set_prep[0]

    bigram_hmm = BigramHMM(vocabulary, labelset)
    bigram_hmm.Train(training_set_prep)

    """ Implement the most common class baseline. Report accuracy of the predicted tags. """
    test_set_predicted_baseline = MostCommonClassBaseline(training_set_prep, test_set_prep, vocabulary, labelset)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)
    print test_set_prep[0]
    print test_set_predicted_baseline[0]

    """ Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags. """
    test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep)
    print "--- Bigram HMM accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
def RemoveWords_by_tag(text):
    remove_tag_list = ['JJ', 'JJR', 'JJS', 'RBR', 'RBS']
    token = ToktokTokenizer()
    words = token.tokenize(text)
    words_tagged = nltk.pos_tag(words)
    # Keep only the words whose tag is not in the list of categories to remove.
    filtered = untag([w for w in words_tagged if not w[1] in remove_tag_list])
    return ' '.join(map(str, filtered))
def count_phrases(corpus, tagger, chunker):
    cfd = probability.ConditionalFreqDist()
    for sent in corpus.sents():
        tree = chunker.parse(tagger.tag(sent))
        for sub in tree.subtrees():
            if sub.node == 'S':
                continue
            words = untag(sub.leaves())
            if len(words) >= 2:
                cfd[sub.node].inc(' '.join(words))
    return cfd
def extract_address(chunker, sentence):
    """ returns all addresses in sentence """
    def tree_filter(tree):
        return GPE_TAG == tree.label()

    tagged_tree = get_tagged_sentence(chunker, sentence)
    addresses = list()
    for subtree in tagged_tree.subtrees(filter=tree_filter):
        addresses.append(untag(subtree.leaves()))
    return addresses
def evaluate(self, gold):
    '''
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using the tagger,
    then compute the accuracy score.

    :param gold: the gold-standard tagged sentences
    :return: the accuracy score
    '''
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    return accuracy(gold_tokens, test_tokens)
def pass_to_dataframe(taggedSentences):
    wordList = []  # X
    tagList = []   # Y
    for tagged in taggedSentences:
        wordList.append([feature_extraction_function(untag(tagged), index) for index in range(len(tagged))])
        tagList.append([tag for _, tag in tagged])
    return wordList, tagList
def metrics(self, gold, printout=True, confusion_matrix=False, oov=True):
    '''
    More sophisticated evaluation method that gives more numbers.

    :param gold: The sentences to use for testing
    :type gold: [[(str, str)]]
    :param printout: Should I print the results or just return them?
    :type printout: bool
    :param confusion_matrix: Should I create a Confusion Matrix?
    :type confusion_matrix: bool
    :param oov: Should the out of vocabulary words be calculated
    :type oov: bool
    :return: (acc, prec, rec, fsc, aov, None) the first five are the accuracy,
        precision, recall, fscore, and out of vocabulary words. The last one is
        the Confusion Matrix if requested, else None
    :rtype: (double, double, double, double, double, ConfusionMatrix or None)
    '''
    tagger_out = self._tagger.tag_sents(untag(sent) for sent in gold)
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagger_out, [])
    gold_tokens_set = set(gold_tokens)
    test_tokens_set = set(test_tokens)
    gold_tags = [t for (_, t) in gold_tokens]
    test_tags = [t for (_, t) in test_tokens]

    # calculate out of vocabulary words
    if oov:
        d = {word: True for (word, _) in reduce(lambda a, b: a + b, self._tagged_sents, [])}
        aov = reduce(lambda a, b: a + 1 if not b in d else a, [w for (w, _) in gold_tokens], 0)
        aov = (aov * 100.0) / len(gold_tokens)
    else:
        aov = '-1'

    acc = accuracy(gold_tokens, test_tokens)
    prc = precision(gold_tokens_set, test_tokens_set)
    rec = recall(gold_tokens_set, test_tokens_set)
    fms = f_measure(gold_tokens_set, test_tokens_set)
    cfm = None
    if confusion_matrix:
        cfm = ConfusionMatrix(gold_tags, test_tags)

    if printout:
        print("accuracy: " + str(acc))
        print("precision: " + str(prc))
        print("recall: " + str(rec))
        print("f-score: " + str(fms))
        print("out of vocabulary: " + str(aov) + " %")
        if confusion_matrix:
            print(cfm)

    return acc, prc, rec, fms, aov, cfm
def transform_to_dataset(tagged_sentences):
    """
    transform list of tagged sentences to list of untagged sentences and list of tags

    :param tagged_sentences: list of sentences, each contains tuples of (word, tag)
    :return: list of sentences, list of sentences tags
    the weights are currently ignored but still maintained
    """
    X, y = [], []
    for tagged, weight in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
def count_phrases(corpus, tagger, chunker):
    cfd = probability.ConditionalFreqDist()
    for sent in corpus.sents():
        tree = chunker.parse(tagger.tag(sent))
        for sub in tree.subtrees():
            if sub.node == "S":
                continue
            words = untag(sub.leaves())
            if len(words) >= 2:
                cfd[sub.node].inc(" ".join(words))
    return cfd
def evaluate(self, gold):
    """
    Score the accuracy of the tagger against the gold standard.
    Strip the tags from the gold standard text, retag it using
    the tagger, then compute the accuracy score.

    :type gold: list(list(tuple(str, str)))
    :param gold: The list of tagged sentences to score the tagger on.
    :rtype: float
    """
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    return accuracy(gold_tokens, test_tokens)
def transform_to_dataset(tagged_sentences, args):
    X, y = [], []
    for tagged in tagged_sentences:
        # sent = 'CNN reported that Republican leader Bill Frist should have what is known to be dangerous.'
        # token = nlp.word_tokenize(sent)
        token = untag(tagged)
        sent = " ".join(token)
        pos = nlp.pos_tag(sent)
        pos_tag = copy.deepcopy(pos)
        tagged = tupple_allign(tagged, pos_tag)
        token = [x[0] for x in pos]
        assert len(tagged) == len(pos)
        dparse = nlp.dependency_parse(sent)
        word_lem = [stemmer.stem(x) for x in token]
        X.append([features(sent, word_lem, token, pos, dparse, index, args) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
def _confusion_cached(self, gold):
    """
    Inner function used after ``gold`` is converted to a
    ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on
    creating a ConfusionMatrix.

    :param gold: The list of tagged sentences to run the tagger with,
        also used as the reference values in the generated confusion matrix.
    :type gold: tuple(tuple(tuple(str, str)))
    :rtype: ConfusionMatrix
    """
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = [token for _word, token in chain.from_iterable(gold)]
    test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)]
    return ConfusionMatrix(gold_tokens, test_tokens)
def extract_address(chunker, sentence):
    """ returns all addresses in sentence """
    def tree_filter(tree):
        return GPE_TAG == tree.label()

    tagged_tree = get_tagged_sentence(chunker, sentence)
    addresses = list()
    realAddresses = []
    for subtree in tagged_tree.subtrees(filter=tree_filter):
        addresses.append(untag(subtree.leaves()))
    for address in addresses:
        try:
            houseNum = int(address[0])
            realAddresses.append(address)
        except ValueError:
            continue
    return {"ADDRESSES": addresses, "PARSEDADDRESSES": realAddresses}
def tokenizacion(archivo):
    # Tokens
    palabras = nltk.word_tokenize(archivo.read())
    palabraSucias = []
    for palabra in palabras:
        palabraSucias.append(str2tuple(palabra))
    # print(palabraSucias)
    textoLimpio = untag(palabraSucias)
    # Remove the punctuation marks (single-character tokens).
    textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]
    # for texto in textoLimpio:
    #     if (texto == '``'):
    #         textoLimpio = textoLimpio.remove(texto)
    # Lowercase the remaining words; str.lower() returns a new string, so the
    # result must be assigned back rather than called in a bare loop.
    textoLimpio = [palabra.lower() for palabra in textoLimpio]
    return textoLimpio
def person_connotation(tweet, name):
    """
    Decide whether a person is talked favorably about or not, based on the
    tone of the sentences in which their name appears
    """
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    mentions = 0
    # analyze each sentence talking about `name` person
    for s in twtcontent:
        tags = get_tweet_tags(s)
        # if the name appears in the tagged sentence, get its tone
        if (name, 'NNP') in tags:
            sentence = util.untag(tags)
            scores = tweet_connotation(' '.join(sentence))
            # add it up to the overall tweet's tone
            for i, z in enumerate(scores):
                overall[z] += scores[z]
            mentions += 1
    # averaging all sentences' scores. don't wanna divide by zero now do we
    if mentions != 0:
        for v in overall:
            overall[v] = round(overall[v] / mentions, 3)
    return overall
def viterbi(self, sequence):
    """
    Runs the viterbi algorithm and returns the best labeling of the
    partially unlabeled sequence provided

    The viterbi modification shown here only predicts the hidden state if
    the state associated with the sequence is BLANK, otherwise the hidden
    state is given to us

    Note this implementation doesn't consider all states for an obs, just
    the ones that were seen associated with it at train time

    Args:
        sequence -- labeled sequence
    """
    viterbi = defaultdict(lambda: defaultdict(float))
    backpointer = defaultdict(lambda: defaultdict(str))
    most_common_state = self.state_counts.most_common(1)[0]

    # init, generalized to account for start token padding
    for i in range(1, self.n):
        viterbi[START_TOKEN][i] = 1.0
        backpointer[START_TOKEN][i] = (START_TOKEN,) * (self.n - 1)
    prev_states = [(START_TOKEN,) * (self.n - 1)]
    allinf = True  # if all log probs at time step are -inf
    t = len(sequence)

    # recursion
    for i in range(self.n, t):
        allinf = True
        obs = sequence[i - 1][0]
        potential_state = sequence[i - 1][1]
        # determine whether we need to attempt to label
        if potential_state == BLANK_TOKEN:
            # add most common tag in case no states for tag,
            # also ensures prev not empty
            states = self.B[obs].keys() + [most_common_state]
        else:
            # if we don't need to label, only give one option
            states = [potential_state]
        for q in states:
            bp, max_prob = self.__viterbi_prob(viterbi, obs, q, prev_states, i)
            viterbi[q][i] = max_prob
            backpointer[q][i] = bp
            if max_prob != float("-inf"):  # received non -inf log prob
                allinf = False
        # if state prob is 0 for all obs, reset viterbi matrix at time i
        if allinf:
            for q in states:
                viterbi[q][i] = 0.0
        # get list of obs from previous obs state
        prev_states = [tuple(backpointer[s][i][1:]) + (s,) for s in states]

    # finalize
    bp, max_prob = self.__viterbi_prob(viterbi, sequence[t - 1][0], END_TOKEN, prev_states, t)
    viterbi[END_TOKEN][t] = max_prob
    backpointer[END_TOKEN][t] = bp

    labeled_seq = zip(untag(sequence), self.__follow_backpointers(backpointer, t))
    return labeled_seq
def main():
    character, pinyin = readCorpus('C:\Users\wruan02\Documents\GitHub\Comp150NLP')
    token_char = tokenizeCharacter(character[0:])
    token_pinyin = tokenizePinyin(pinyin[0:])
    # print len(token_char)
    # print len(token_pinyin)
    # token_pinyin.pop(45730)
    # print len(token_pinyin)
    # print token_pinyin[45700:45734]
    # characterPrint(token_char[45700:45734])
    dataset = organizeSentence(token_char, token_pinyin)
    print len(dataset)

    """ Supervised Learning """
    training_set = copy.deepcopy(dataset[0:1000])
    test_set = copy.deepcopy(dataset[1001:])

    # get vocabulary first!
    vocabulary, labelset = getVoc(training_set)
    # print len(vocabulary)
    # print len(labelset)

    """ Transform the data sets by eliminating unknown words and adding sentence boundary tokens. """
    training_set_prep = PreprocessText(training_set, vocabulary)
    test_set_prep = PreprocessText(test_set, vocabulary)

    # supervised with laplace smoothing
    bigram_hmm_laplace = BigramHMM()
    bigram_hmm_laplace.supervisedLearning(training_set_prep, "Laplace")

    # supervised with no smoothing in learning probabilities,
    # but use most common to smooth in Viterbi
    bigram_hmm_mostcommon = BigramHMM()
    bigram_hmm_mostcommon.supervisedLearning(training_set_prep, "no smoothing")
    bigram_hmm_mostcommon.getMostCommon(training_set_prep)

    """ Implement the most common class baseline. Report accuracy of the predicted tags. """
    test_set_predicted_baseline = MostCommonClassBaseline(training_set_prep, test_set_prep)
    print "--- Most common class baseline accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_baseline)
    # print test_set_prep[0]
    # print test_set_predicted_baseline[0]

    """ Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags. """
    test_set_predicted_bigram_hmm_laplace = bigram_hmm_laplace.Test(test_set_prep, "Laplace")
    test_set_predicted_bigram_hmm_mostcommon = bigram_hmm_mostcommon.Test(test_set_prep, "Most Common")
    print "--- Bigram HMM with most common accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm_mostcommon)
    print "--- Bigram HMM with Laplace accuracy ---"
    ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm_laplace)
    # print test_set_predicted_bigram_hmm[0]
    # print " ".join(untag(test_set_predicted_bigram_hmm[20]))

    print "\n"
    print "A sequence of Pinyin words:"
    print " ".join(untag(test_set_prep[25]))
    print "Common baseline results:"
    displayTag(test_set_predicted_baseline[25])
    print "HMM with most common results:"
    displayTag(test_set_predicted_bigram_hmm_mostcommon[25])
    print "HMM with laplace result:"
    displayTag(test_set_predicted_bigram_hmm_laplace[25])

    # """ Semi_supervised Learning starts here
    # """
    #
    # training_set_tagged = copy.deepcopy(dataset[0:800])
    # training_set_untagged = copy.deepcopy(dataset[801:1000])
    # test_set = copy.deepcopy(dataset[1001:])
    #
    # # get vocabulary first!
    # vocabulary, labelset = getVoc(training_set)
    #
    # """ Transform the data sets by eliminating unknown words and adding sentence boundary tokens.
    # """
    # training_set_tagged_prep = PreprocessText(training_set_tagged, vocabulary)
    # training_set_untagged_prep = PreprocessText(training_set_untagged, vocabulary)
    # test_set_prep = PreprocessText(test_set, vocabulary)
    #
    # training_with_tag = training_set_tagged_prep
    # training_without_tag = []
    # for sent in training_set_untagged_prep:
    #     temp = []
    #     for tup in sent:
    #         temp.append(tup[0])
    #     training_without_tag.append(temp)
    #
    # bigram_hmm_semi = BigramHMM()
    # bigram_hmm_semi.semisupervisedLearning(training_with_tag, training_without_tag, "Laplace")
    # bigram_hmm_semi.getMostCommon(training_with_tag)
    #
    # """ Implement the most common class baseline. Report accuracy of the predicted tags.
    # """
    # test_set_predicted_baseline = MostCommonClassBaseline(training_set_tagged_prep, test_set_prep)
    # print "--- Most common class baseline accuracy ---"
    # ComputeAccuracy(test_set_prep, test_set_predicted_baseline)
    #
    # """ Use the Bigram HMM to predict tags for the test set. Report accuracy of the predicted tags.
    # """
    # test_set_predicted_bigram_hmm = bigram_hmm.Test(test_set_prep, "Laplace")
    # print "--- Bigram HMM accuracy ---"
    # ComputeAccuracy(test_set_prep, test_set_predicted_bigram_hmm)
    # print test_set_predicted_bigram_hmm[0]
    # print " ".join(untag(test_set_predicted_bigram_hmm[20]))
    # print "Common baseline results:"
    # displayTag(test_set_predicted_baseline[25])
    # print "HMM result:"
    # displayTag(test_set_predicted_bigram_hmm[25])

    """ Print the first sentence of each data set. """
    print '\n'
    print "A sequence of Pinyin words:"
    print " ".join(untag(training_set_prep[0]))  # See nltk.tag.util module.
    print "It's corresponding Chinese characters:"
    displayTag(training_set_prep[0])
    print "A sequence of Pinyin words:"
    print " ".join(untag(test_set_prep[1]))
    print "It's corresponding Chinese characters:"
    displayTag(test_set_prep[1])
import nltk
from nltk.tag.util import str2tuple
from nltk.tag.util import untag
from nltk.tag.util import tuple2str

textoSucio = 'It/pps recommended/vbd that/cs Fulton/np legislators/nns act/vb ``/`` to/to have/hv these/dts laws/nns studied/vbn and/cc revised/vbn to/in the/at end/nn of/in modernizing/vbg and/cc improving/vbg them/ppo ' '/' ' ./.'

palabras = nltk.word_tokenize(textoSucio)
palabraSucias = []
for palabra in palabras:
    palabraSucias.append(str2tuple(palabra))
# print(palabraSucias)

textoLimpio = untag(palabraSucias)
textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]
for texto in textoLimpio:
    if (texto == '``'):
        textoLimpio.remove(texto)

# stopwords = set(nltk.corpus.stopwords.words('english'))  # StopWords configuration
# textoLimpio = [palabra for palabra in textoLimpio if palabra not in stopwords]
print(textoLimpio)
def evaluate(self, gold):
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = list(itertools.chain(*gold))
    print(json.dumps(gold_tokens))
    print(len(tagged_sents), len(gold_tokens))
    return accuracy(gold_tokens, tagged_sents)


if __name__ == '__main__':
    sents = treebank.tagged_sents()
    PT = PerceptronTagger()

    print("Timing NLTK ...")
    pt_times = []
    for _ in range(5):
        now = time.time()
        PT.tag_sents(untag(sent) for sent in sents)
        pt_times.append(time.time() - now)
    pt_time = round(sum(pt_times) / len(pt_times), 3)

    '''NOTE: Moved to tag_test.go
    print("Timing prose ...")
    acc = round(APTagger().evaluate(sents), 3)
    ap_time = round(sum(AP_TIME) / len(AP_TIME), 3)
    '''

    print("Evaluating accuracy ...")
    headers = ['Library', 'Accuracy', '5-Run Average (sec)']
    table = [
        ['NLTK', round(PT.evaluate(sents), 3), pt_time],
        # ['`prose`', acc, ap_time]
    ]
    print(tabulate(table, headers, tablefmt='pipe'))
def Viterbi(self, sent):
    # viterbi is a list of dictionaries that map each tag t to the probability
    # of the best tag sequence that ends in t
    viterbi = []
    # backpointer is a list of dictionaries that map each tag t to the previous
    # tag in the best tag sequence
    backpointer = []

    first_viterbi = {}
    first_backpointer = {}
    for tag in self.dictionary[sent[1][0]]:
        if tag == start_token:
            continue
        # we start with the first meaningful word in the sentence, since every
        # sentence starts with <S>
        first_viterbi[tag] = self.log_transitions.get(
            (start_token, tag), -float("inf")) + self.log_emissions.get(
            (sent[1][0], tag), -float("inf"))
        first_backpointer[tag] = start_token
    viterbi.append(first_viterbi)
    backpointer.append(first_backpointer)

    for word_index in range(2, len(sent) - 1):
        this_viterbi = {}
        this_backpointer = {}
        prev_viterbi = viterbi[-1]
        for cur_tag in self.dictionary[sent[word_index][0]]:
            if cur_tag == start_token:
                continue
            best_pre_tag = None
            best_prob = -float("inf")
            for pre_tag in self.dictionary[sent[word_index - 1][0]]:
                this_prob = prev_viterbi.get(pre_tag, -float("inf")) + \
                    self.log_transitions.get((pre_tag, cur_tag), -float("inf"))
                if this_prob > best_prob:
                    best_pre_tag = pre_tag
                    best_prob = this_prob
            this_viterbi[cur_tag] = prev_viterbi.get(best_pre_tag, -float("inf")) + \
                self.log_transitions.get((best_pre_tag, cur_tag), -float("inf")) + \
                self.log_emissions.get((sent[word_index][0], cur_tag), -float("inf"))
            this_backpointer[cur_tag] = best_pre_tag
        viterbi.append(this_viterbi)
        backpointer.append(this_backpointer)

    # Done with all meaningful words in the sentence. Now calculate the
    # probability of each tag followed by </S>.
    prev_viterbi = viterbi[-1]
    best_pre_tag = None
    best_prob = -float("inf")
    for pre_tag in self.dictionary[sent[-2][0]]:
        this_prob = prev_viterbi.get(pre_tag, -float("inf")) + \
            self.log_transitions.get((pre_tag, end_token), -float("inf"))
        if this_prob > best_prob:
            best_pre_tag = pre_tag
            best_prob = this_prob
    log_prob_tag_seq = prev_viterbi.get(best_pre_tag, -float("inf")) + \
        self.log_transitions.get((best_pre_tag, end_token), -float("inf"))
    prob_tag_seq = exp(log_prob_tag_seq)

    # Get the best tag sequence
    if prob_tag_seq == 0.0:
        return []  # All branches are 0, so skip this sentence
    else:
        best_tag_seq = [end_token, best_pre_tag]
        backpointer.reverse()
        cur_best_tag = best_pre_tag
        for bp in backpointer:
            best_tag_seq.append(bp[cur_best_tag])
            cur_best_tag = bp[cur_best_tag]
        best_tag_seq.reverse()
        sent_predicted = zip(untag(sent), best_tag_seq)
        return sent_predicted
def all_sents(self):
    '''set class's attribute "sents"'''
    self.sents = [untag(i) for i in self.tagged_sents]
def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
    """
    Trains the Brill tagger on the corpus *train_sents*,
    producing at most *max_rules* transformations, each of which
    reduces the net number of errors in the corpus by at least
    *min_score*, and each of which has accuracy not lower than
    *min_acc*.

    #imports
    >>> from nltk.tbl.template import Template
    >>> from nltk.tag.brill import Pos, Word
    >>> from nltk.tag import RegexpTagger, BrillTaggerTrainer

    #some data
    >>> from nltk.corpus import treebank
    >>> training_data = treebank.tagged_sents()[:100]
    >>> baseline_data = treebank.tagged_sents()[100:200]
    >>> gold_data = treebank.tagged_sents()[200:300]
    >>> testing_data = [untag(s) for s in gold_data]

    >>> backoff = RegexpTagger([
    ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    ... (r'.*able$', 'JJ'),                # adjectives
    ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
    ... (r'.*ly$', 'RB'),                  # adverbs
    ... (r'.*s$', 'NNS'),                  # plural nouns
    ... (r'.*ing$', 'VBG'),                # gerunds
    ... (r'.*ed$', 'VBD'),                 # past tense verbs
    ... (r'.*', 'NN')                      # nouns (default)
    ... ])

    >>> baseline = backoff #see NOTE1
    >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
    0.2450142...

    #templates
    >>> Template._cleartemplates() #clear any templates created in earlier tests
    >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

    #construct a BrillTaggerTrainer
    >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

    >>> tagger1 = tt.train(training_data, max_rules=10)
    TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
    Finding initial useful rules...
        Found 845 useful rules.
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      47  63  16 161  | NN->IN if Pos:NNS@[-1]
      33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
      22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
      17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger1.rules()[1:3]
    (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

    >>> train_stats = tagger1.train_stats()
    >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

    >>> tagger1.print_template_statistics(printunused=False)
    TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
    TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
    #ID | Score (train) |  #Rules     | Template
    --------------------------------------------
    001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
    000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
    <BLANKLINE>
    <BLANKLINE>

    >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
    0.43996...

    >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
    >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
    ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
    ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
    True

    >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

    # a high-accuracy tagger
    >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
    TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
    Finding initial useful rules...
        Found 845 useful rules.
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
      19  19   0   6  | NN->VB if Pos:TO@[-1]
      18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
      18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
    0.44159544...
    >>> tagger2.rules()[2:4]
    (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

    # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
    # with a RegexpTagger only as backoff. For instance,
    # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
    # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
    # between python versions. The simplistic backoff above is a workaround to make doctests
    # get consistent input.

    :param train_sents: training data
    :type train_sents: list(list(tuple))
    :param max_rules: output at most max_rules rules
    :type max_rules: int
    :param min_score: stop training when no rules better than min_score can be found
    :type min_score: int
    :param min_acc: discard any rule with lower accuracy than min_acc
    :type min_acc: float or None
    :return: the learned tagger
    :rtype: BrillTagger
    """
    # FIXME: several tests are a bit too dependent on tracing format
    # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

    # Basic idea: Keep track of the rules that apply at each position.
    # And keep track of the positions to which each rule applies.

    # Create a new copy of the training corpus, and run the
    # initial tagger on it.  We will progressively update this
    # test corpus to look more like the training corpus.
    test_sents = [list(self._initial_tagger.tag(untag(sent)))
                  for sent in train_sents]

    # Collect some statistics on the training process
    trainstats = {}
    trainstats['min_acc'] = min_acc
    trainstats['min_score'] = min_score
    trainstats['tokencount'] = sum(len(t) for t in test_sents)
    trainstats['sequencecount'] = len(test_sents)
    trainstats['templatecount'] = len(self._templates)
    trainstats['rulescores'] = []
    trainstats['initialerrors'] = sum(
        tag[1] != truth[1]
        for paired in zip(test_sents, train_sents)
        for (tag, truth) in zip(*paired)
    )
    trainstats['initialacc'] = 1 - trainstats['initialerrors'] / trainstats['tokencount']
    if self._trace > 0:
        print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
              "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

    # Initialize our mappings.  This will find any errors made
    # by the initial tagger, and use those to generate repair
    # rules, which are added to the rule mappings.
    if self._trace:
        print("Finding initial useful rules...")
    self._init_mappings(test_sents, train_sents)
    if self._trace:
        print(("    Found %d useful rules." % len(self._rule_scores)))

    # Let the user know what we're up to.
    if self._trace > 2:
        self._trace_header()
    elif self._trace == 1:
        print("Selecting rules...")

    # Repeatedly select the best rule, and add it to `rules`.
    rules = []
    try:
        while (len(rules) < max_rules):
            # Find the best rule, and add it to our rule list.
            rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
            if rule:
                rules.append(rule)
                score = self._rule_scores[rule]
                trainstats['rulescores'].append(score)
            else:
                break  # No more good rules left!

            # Report the rule that we found.
            if self._trace > 1:
                self._trace_rule(rule)

            # Apply the new rule at the relevant sites
            self._apply_rule(rule, test_sents)

            # Update _tag_positions[rule.original_tag] and
            # _tag_positions[rule.replacement_tag] for the affected
            # positions (i.e., self._positions_by_rule[rule]).
            self._update_tag_positions(rule)

            # Update rules that were affected by the change.
            self._update_rules(rule, train_sents, test_sents)

    # The user can cancel training manually:
    except KeyboardInterrupt:
        print("Training stopped manually -- %d rules found" % len(rules))

    # Discard our tag position mapping & rule mappings.
    self._clean()
    trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
    trainstats['finalacc'] = 1 - trainstats['finalerrors'] / trainstats['tokencount']
    # Create and return a tagger from the rules we found.
    return BrillTagger(self._initial_tagger, rules, trainstats)
def processContent():
    for item in text4:
        grammar = r"""
            ProperNoun: {<NNP|NNS>}
            Verb: {<BT|VBD>|<VBD>|<BT>*<VBD>}
            OtherNouns: {<IN><NP>|*<NNS|NN>*|<PRP\$>?<NN>+|<PRP\$>?}
            jj: {*<JJ>*}
            Pronoun: {<PRP>|<PRP\$>|<PRO>}
            NonlivingPronoun: {<NLPRO>}
            """
        train_sents = [
            [('it', 'NLPRO')],
            [('walk', 'NN')],
            [('discovers', 'VBD')],
            [('finds', 'VBD')],
            [('meets', 'VBD')],
        ]
        tokenized = nltk.word_tokenize(item)
        tagger = nltk.UnigramTagger(train_sents, backoff=default_tagger)
        i = tagger.tag(tokenized)
        cp = nltk.RegexpParser(grammar)
        result = cp.parse(i)
        G = nx.DiGraph()
        Gc = nx.DiGraph()
        Gcc = nx.DiGraph()
        flag = 0
        for a in result:
            if type(a) is nltk.Tree:
                if (a.label() == 'ProperNoun'):
                    cur = a.leaves()
                    ProperNoun.append(cur)
                    if flag == 0:
                        n1 = a.leaves()
                        un1 = untag(n1)
                    else:
                        n2 = a.leaves()
                        un2 = untag(n2)
                        print(un1, uv, un2)
                        G.add_node(tuple(un1))
                        G.add_node(tuple(un2))
                        G.add_edge(tuple(un1), tuple(un2), label=tuple(uv))
                        n1 = n2
                        un1 = untag(n1)
                        flag = 0
                if (a.label() == 'Noun2'):
                    cur2 = a.leaves()
                    noun.append(cur2)
                    if flag == 0:
                        n1 = a.leaves()
                        un1 = untag(n1)
                    else:
                        n2 = a.leaves()
                        un2 = untag(n2)
                        print(un1, uv, un2)
                        Gcc.add_node(tuple(un1))
                        Gcc.add_node(tuple(un2))
                        Gcc.add_edge(tuple(un1), tuple(un2), label=tuple(uv))
                        flag = 0
                if (a.label() == 'OtherNouns'):
                    cur1 = a.leaves()
                    noun.append(cur1)
                    if flag == 0:
                        n1 = a.leaves()
                        un1 = untag(n1)
                    else:
                        n2 = a.leaves()
                        un2 = untag(n2)
                        print(un1, uv, un2)
                        Gc.add_node(tuple(un1))
                        Gc.add_node(tuple(un2))
                        Gc.add_edge(tuple(un1), tuple(un2), label=tuple(uv))
                        flag = 0
                        n1 = n2
                        un1 = untag(n1)
                if (a.label() == 'Pronoun'):
                    # print('Pronoun', a.leaves(), 'stands for', cur)
                    ProperNoun.append(cur)
                    if flag == 0:
                        n1 = cur
                        un1 = untag(n1)
                        # print('n1', n1)
                    else:
                        n2 = cur
                        un2 = untag(n2)
                        print('pronoun::', un1, uv, un2)
                        G.add_node(tuple(un1))
                        G.add_node(tuple(un2))
                        G.add_edge(tuple(un1), tuple(un2), label=tuple(uv))
                        flag = 0
                        n1 = n2
                        un1 = untag(n1)
                if (a.label() == 'NonlivingPronoun'):
                    # print('Pronoun', a.leaves(), 'stands for', cur1)
                    noun.append(cur1)
                    if flag == 0:
                        n1 = cur1
                        un1 = untag(n1)
                    else:
                        n2 = cur1
                        un2 = untag(n2)
                        print(un1, uv, un2)
                        G.add_node(tuple(un1))
                        G.add_node(tuple(un2))
                        G.add_edge(tuple(un1), tuple(un2), label=tuple(uv))
                        flag = 0
                        n1 = n2
                        un1 = untag(n1)
                if (a.label() == 'Verb'):
                    flag = 1
                    v = a.leaves()
                    uv = untag(v)
                    VerbList.append(v)
        print('List of living nouns')
        print(ProperNoun)
        print('List of Non living nouns')
        print(noun)
        print('List of verbs')
        print(VerbList)
        graph_pos = nx.spring_layout(G)
        graph_pos = nx.spring_layout(Gc)
        graph_pos = nx.spring_layout(Gcc)
        nx.draw_networkx_nodes(G, graph_pos, node_size=3000, alpha=0.3, node_color='red', node_shape='o')
        nx.draw_networkx_nodes(Gc, graph_pos, node_size=2000, alpha=0.3, node_color='green', node_shape='o')
        nx.draw_networkx_nodes(Gcc, graph_pos, node_size=1000, alpha=0.3, node_color='yellow', node_shape='o')
        nx.draw_networkx_edges(G, graph_pos, width=1, alpha=0.3, edge_color='blue')
        nx.draw_networkx_labels(G, graph_pos, font_size=10, font_family='sans-serif')
        nx.draw_networkx_edges(Gc, graph_pos, width=1, alpha=0.3, edge_color='blue')
        nx.draw_networkx_labels(G, graph_pos, font_size=10, font_family='sans-serif')
        nx.draw_networkx_edges(Gcc, graph_pos, width=1, alpha=0.3, edge_color='blue')
        nx.draw_networkx_labels(G, graph_pos, font_size=10, font_family='sans-serif')
        nx.draw_networkx_edge_labels(G, graph_pos, font_size=10, label_pos=0.3)
        nx.draw_networkx_edge_labels(Gc, graph_pos, font_size=10, label_pos=0.3)
        nx.draw_networkx_edge_labels(Gcc, graph_pos, font_size=10, label_pos=0.3)
        plt.show()
        result.draw()
from nltk.corpus import treebank
from nltk.tag.util import untag

sentences = treebank.tagged_sents()
text = []
for s in sentences:
    text.append(' '.join(untag(s)))
print(' '.join(text))
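# Added for reference (not part of the original collection): a minimal,
# self-contained demonstration of the nltk.tag.util helpers that the snippets
# in this collection rely on.
from nltk.tag.util import str2tuple, tuple2str, untag

tagged = [str2tuple(t) for t in 'The/DT dog/NN barked/VBD ./.'.split()]
assert tagged[0] == ('The', 'DT')
assert untag(tagged) == ['The', 'dog', 'barked', '.']
assert tuple2str(tagged[0]) == 'The/DT'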
def evaluate(self, gold):
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = list(itertools.chain(*gold))
    return accuracy(gold_tokens, tagged_sents)
def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
    """
    Trains the Brill tagger on the corpus *train_sents*,
    producing at most *max_rules* transformations, each of which
    reduces the net number of errors in the corpus by at least
    *min_score*, and each of which has accuracy not lower than
    *min_acc*.

    #imports
    >>> from nltk.tbl.template import Template
    >>> from nltk.tag.brill import Pos, Word
    >>> from nltk.tag import RegexpTagger, BrillTaggerTrainer

    #some data
    >>> from nltk.corpus import treebank
    >>> training_data = treebank.tagged_sents()[:100]
    >>> baseline_data = treebank.tagged_sents()[100:200]
    >>> gold_data = treebank.tagged_sents()[200:300]
    >>> testing_data = [untag(s) for s in gold_data]

    >>> backoff = RegexpTagger([
    ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    ... (r'.*able$', 'JJ'),                # adjectives
    ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
    ... (r'.*ly$', 'RB'),                  # adverbs
    ... (r'.*s$', 'NNS'),                  # plural nouns
    ... (r'.*ing$', 'VBG'),                # gerunds
    ... (r'.*ed$', 'VBD'),                 # past tense verbs
    ... (r'.*', 'NN')                      # nouns (default)
    ... ])

    >>> baseline = backoff #see NOTE1
    >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
    0.2450142...

    #templates
    >>> Template._cleartemplates() #clear any templates created in earlier tests
    >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

    #construct a BrillTaggerTrainer
    >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

    >>> tagger1 = tt.train(training_data, max_rules=10)
    TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
    Finding initial useful rules...
        Found 845 useful rules.
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      47  63  16 161  | NN->IN if Pos:NNS@[-1]
      33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
      22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
      17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger1.rules()[1:3]
    (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

    >>> train_stats = tagger1.train_stats()
    >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

    >>> tagger1.print_template_statistics(printunused=False)
    TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
    TRAIN (   2417 tokens) initial  1775 0.2656 final:  1269 0.4750
    #ID | Score (train) |  #Rules     | Template
    --------------------------------------------
    001 |   305   0.603 |   7   0.700 | Template(Pos([-1]),Word([0]))
    000 |   201   0.397 |   3   0.300 | Template(Pos([-1]))
    <BLANKLINE>
    <BLANKLINE>

    >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
    0.43996...

    >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
    >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
    ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
    ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
    True

    >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

    # a high-accuracy tagger
    >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
    TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
    Finding initial useful rules...
        Found 845 useful rules.
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
      19  19   0   6  | NN->VB if Pos:TO@[-1]
      18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
      18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger2.evaluate(gold_data)  # doctest: +ELLIPSIS
    0.44159544...
    >>> tagger2.rules()[2:4]
    (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

    # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
    # with a RegexpTagger only as backoff. For instance,
    # >>> baseline = UnigramTagger(baseline_data, backoff=backoff)
    # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
    # between python versions. The simplistic backoff above is a workaround to make doctests
    # get consistent input.

    :param train_sents: training data
    :type train_sents: list(list(tuple))
    :param max_rules: output at most max_rules rules
    :type max_rules: int
    :param min_score: stop training when no rules better than min_score can be found
    :type min_score: int
    :param min_acc: discard any rule with lower accuracy than min_acc
    :type min_acc: float or None
    :return: the learned tagger
    :rtype: BrillTagger
    """
    # FIXME: several tests are a bit too dependent on tracing format
    # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates

    # Basic idea: Keep track of the rules that apply at each position.
    # And keep track of the positions to which each rule applies.

    # Create a new copy of the training corpus, and run the
    # initial tagger on it.  We will progressively update this
    # test corpus to look more like the training corpus.
    test_sents = [list(self._initial_tagger.tag(untag(sent)))
                  for sent in train_sents]

    # Collect some statistics on the training process
    trainstats = {}
    trainstats['min_acc'] = min_acc
    trainstats['min_score'] = min_score
    trainstats['tokencount'] = sum(len(t) for t in test_sents)
    trainstats['sequencecount'] = len(test_sents)
    trainstats['templatecount'] = len(self._templates)
    trainstats['rulescores'] = []
    trainstats['initialerrors'] = sum(
        tag[1] != truth[1]
        for paired in zip(test_sents, train_sents)
        for (tag, truth) in zip(*paired)
    )
    trainstats['initialacc'] = 1 - trainstats['initialerrors'] / trainstats['tokencount']
    if self._trace > 0:
        print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; "
              "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

    # Initialize our mappings.  This will find any errors made
    # by the initial tagger, and use those to generate repair
    # rules, which are added to the rule mappings.
    if self._trace:
        print("Finding initial useful rules...")
    self._init_mappings(test_sents, train_sents)
    if self._trace:
        print(("    Found %d useful rules." % len(self._rule_scores)))

    # Let the user know what we're up to.
    if self._trace > 2:
        self._trace_header()
    elif self._trace == 1:
        print("Selecting rules...")

    # Repeatedly select the best rule, and add it to `rules`.
    rules = []
    try:
        while (len(rules) < max_rules):
            # Find the best rule, and add it to our rule list.
            rule = self._best_rule(train_sents, test_sents, min_score, min_acc)
            if rule:
                rules.append(rule)
                score = self._rule_scores[rule]
                trainstats['rulescores'].append(score)
            else:
                break  # No more good rules left!

            # Report the rule that we found.
            if self._trace > 1:
                self._trace_rule(rule)

            # Apply the new rule at the relevant sites
            self._apply_rule(rule, test_sents)

            # Update _tag_positions[rule.original_tag] and
            # _tag_positions[rule.replacement_tag] for the affected
            # positions (i.e., self._positions_by_rule[rule]).
            self._update_tag_positions(rule)

            # Update rules that were affected by the change.
            self._update_rules(rule, train_sents, test_sents)

    # The user can cancel training manually:
    except KeyboardInterrupt:
        print("Training stopped manually -- %d rules found" % len(rules))

    # Discard our tag position mapping & rule mappings.
    self._clean()
    trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
    trainstats['finalacc'] = 1 - trainstats['finalerrors'] / trainstats['tokencount']
    # Create and return a tagger from the rules we found.
    return BrillTagger(self._initial_tagger, rules, trainstats)
def train(self, train_sents, max_rules=200, min_score=2, min_acc=None):
    """
    Trains the Brill tagger on the corpus *train_sents*,
    producing at most *max_rules* transformations, each of which
    reduces the net number of errors in the corpus by at least
    *min_score*, and each of which has accuracy not lower than
    *min_acc*.

    #imports
    >>> from nltk.tbl.template import Template
    >>> from nltk.tag.brill import Pos, Word
    >>> from nltk.tag import RegexpTagger
    >>> from nltk.tag.brill_trainer_orig import BrillTaggerTrainer

    #some data
    >>> from nltk.corpus import treebank
    >>> training_data = treebank.tagged_sents()[:100]
    >>> baseline_data = treebank.tagged_sents()[100:200]
    >>> gold_data = treebank.tagged_sents()[200:300]
    >>> testing_data = [untag(s) for s in gold_data]

    >>> backoff = RegexpTagger([
    ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
    ... (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    ... (r'.*able$', 'JJ'),                # adjectives
    ... (r'.*ness$', 'NN'),                # nouns formed from adjectives
    ... (r'.*ly$', 'RB'),                  # adverbs
    ... (r'.*s$', 'NNS'),                  # plural nouns
    ... (r'.*ing$', 'VBG'),                # gerunds
    ... (r'.*ed$', 'VBD'),                 # past tense verbs
    ... (r'.*', 'NN')                      # nouns (default)
    ... ])

    >>> baseline = backoff #see NOTE1
    >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS
    0.2450142...

    #templates
    >>> Template._cleartemplates() #clear any templates created in earlier tests
    >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]

    #construct a BrillTaggerTrainer
    >>> tt = BrillTaggerTrainer(baseline, templates, trace=3)

    >>> tagger1 = tt.train(training_data, max_rules=10)
    TBL train (orig) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      47  63  16 161  | NN->IN if Pos:NNS@[-1]
      33  33   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | IN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | IN->, if Pos:NNS@[-1] & Word:,@[0]
      22  27   5  24  | NN->-NONE- if Pos:VBD@[-1]
      17  17   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger1.rules()[1:3]
    (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]))

    >>> train_stats = tagger1.train_stats()
    >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]]

    ##FIXME: the following test fails -- why?
    #
    #>>> tagger1.print_template_statistics(printunused=False)
    #TEMPLATE STATISTICS (TRAIN)  2 templates, 10 rules)
    #TRAIN (   3163 tokens) initial  2358 0.2545 final:  1719 0.4565
    ##ID | Score (train) |  #Rules     | Template
    #--------------------------------------------
    #001 |   404   0.632 |   7   0.700 | Template(Pos([-1]),Word([0]))
    #000 |   235   0.368 |   3   0.300 | Template(Pos([-1]))
    #<BLANKLINE>
    #<BLANKLINE>

    >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS
    0.43996...

    >>> (tagged, test_stats) = tagger1.batch_tag_incremental(testing_data, gold_data)
    >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'),
    ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'),
    ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
    True

    >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']]
    [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]]

    ##a high-accuracy tagger
    >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99)
    TBL train (orig) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99)
    <BLANKLINE>
               B      |
       S   F   r   O  |        Score = Fixed - Broken
       c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
       o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
       r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
       e   d   n   r  |  e
    ------------------+-------------------------------------------------------
     132 132   0   0  | AT->DT if Pos:NN@[-1]
      85  85   0   0  | NN->, if Pos:NN@[-1] & Word:,@[0]
      69  69   0   0  | NN->. if Pos:NN@[-1] & Word:.@[0]
      51  51   0   0  | NN->IN if Pos:NN@[-1] & Word:of@[0]
      36  36   0   0  | NN->TO if Pos:NN@[-1] & Word:to@[0]
      26  26   0   0  | NN->. if Pos:NNS@[-1] & Word:.@[0]
      24  24   0   0  | NN->, if Pos:NNS@[-1] & Word:,@[0]
      19  19   0   6  | NN->VB if Pos:TO@[-1]
      18  18   0   0  | CD->-NONE- if Pos:NN@[-1] & Word:0@[0]
      18  18   0   0  | NN->CC if Pos:NN@[-1] & Word:and@[0]

    >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS
    0.44159544...
    >>> tagger2.rules()[2:4]
    (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')]))

    #NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger,
    #with a RegexpTagger only as backoff. For instance,
    #>>> baseline = UnigramTagger(baseline_data, backoff=backoff)
    #However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results
    #between python versions. The simplistic backoff above is a workaround to make doctests
    #get consistent input.

    :param train_sents: training data
    :type train_sents: list(list(tuple))
    :param max_rules: output at most max_rules rules
    :type max_rules: int
    :param min_score: stop training when no rules better than min_score can be found
    :type min_score: int
    :param min_acc: discard any rule with lower accuracy than min_acc
    :type min_acc: float or None
    :return: the learned tagger
    :rtype: BrillTagger
    """
    # Create a new copy of the training corpus, and run the
    # initial tagger on it.  We will progressively update this
    # test corpus to look more like the training corpus.
    test_sents = [self._initial_tagger.tag(untag(sent))
                  for sent in train_sents]

    trainstats = {}
    trainstats['min_acc'] = min_acc
    trainstats['min_score'] = min_score
    trainstats['tokencount'] = sum(len(t) for t in test_sents)
    trainstats['sequencecount'] = len(test_sents)
    trainstats['templatecount'] = len(self._templates)
    trainstats['rulescores'] = []
    trainstats['initialerrors'] = sum(tag[1] != truth[1]
                                      for paired in zip(test_sents, train_sents)
                                      for (tag, truth) in zip(*paired))
    trainstats['initialacc'] = 1 - trainstats['initialerrors'] / trainstats['tokencount']
    if self._trace > 0:
        print("TBL train (orig) (seqs: {sequencecount}; tokens: {tokencount}; "
              "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats))

    if self._trace > 2:
        self._trace_header()

    # Look for useful rules.
    rules = []
    try:
        while len(rules) < max_rules:
            (rule, score, fixscore) = self._best_rule(test_sents, train_sents, min_acc=min_acc)
            if rule is None or score < min_score:
                if self._trace > 1:
                    print('Insufficient improvement; stopping')
                break
            else:
                # Add the rule to our list of rules.
                rules.append(rule)
                trainstats['rulescores'].append(score)
                # Use the rules to update the test corpus.  Keep
                # track of how many times the rule applied (k).
                k = 0
                for sent in test_sents:
                    k += len(rule.apply(sent))
                # Display trace output.
                if self._trace > 1:
                    self._trace_rule(rule, score, fixscore, k)
    # The user can also cancel training manually:
    except KeyboardInterrupt:
        print("Training stopped manually -- %d rules found" % len(rules))

    trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores'])
    trainstats['finalacc'] = 1 - trainstats['finalerrors'] / trainstats['tokencount']

    # Create and return a tagger from the rules we found.
    return BrillTagger(self._initial_tagger, rules, trainstats)
def evaluate(self, gold):
    tagged_sents = self.batch_tag([untag(sent) for sent in gold])
    gold_tokens = sum(gold, [])
    test_tokens = sum(tagged_sents, [])
    return accuracy(gold_tokens, test_tokens)
def evaluate(self, gold):
    tagged_sents = self.tag_sents(untag(sent) for sent in gold)
    gold_tokens = list(chain(*gold))
    test_tokens = list(chain(*tagged_sents))
    return accuracy(gold_tokens, test_tokens)
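# Added usage sketch (not from the original sources): the evaluate() pattern
# shown above applied to a stock NLTK tagger. The 3000-sentence split is
# illustrative only.
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]
tagger = UnigramTagger(train_sents)
print(tagger.evaluate(test_sents))  # tag accuracy on the held-out sentences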