def main():
    files = ["wsj_0156.pos", "wsj_0160.pos", "wsj_0163.pos", "wsj_0165.pos",
             "wsj_0167.pos", "wsj_0170.pos", "wsj_0175.pos", "wsj_0187.pos",
             "wsj_0195.pos", "wsj_0196.pos"]
    test_trees = treebank_chunk.chunked_sents(files)
    NP_rules = ["NP , NP CC NP", "NP CC NP", "NP IN NP", "NP TO NP",
                "NP NP", "NP NN NP", "NP , NP ,", "RB VBN NP"]
    # NP_rules = ["DT NN", "JJ NN", "DT NP"]
    # These alternative rules can be tried to show that multiple kinds of
    # chunking work.

    # Create Rule objects
    rules = []
    for ruleString in NP_rules:
        newRule = Rule(ruleString, "NP")
        rules.append(newRule)

    myChunks = open("superchunks.txt", "a")
    for tree in test_trees:
        for rule in rules:
            tree = rule.chunk(tree, rule, 15)
        printTree(tree, myChunks)
    myChunks.close()
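# main() assumes a Rule class and a printTree helper defined elsewhere in
# the module. A minimal sketch of what they might look like (hypothetical,
# not the original definitions; the real Rule.chunk presumably also honours
# the max-match limit passed as 15 above):
from nltk.tree import Tree

class Rule:
    """Wraps a space-separated tag pattern, e.g. "NP CC NP"."""
    def __init__(self, ruleString, label):
        self.pattern = ruleString.split()
        self.label = label

    def chunk(self, tree, rule, limit):
        # Wrap the first run of children matching the pattern in a new
        # subtree labelled self.label; children are matched by subtree
        # label or by the POS tag of a (word, tag) leaf.
        labels = [c.label() if isinstance(c, Tree) else c[1] for c in tree]
        n = len(self.pattern)
        for start in range(len(labels) - n + 1):
            if labels[start:start + n] == self.pattern:
                children = list(tree)
                children[start:start + n] = [Tree(self.label,
                                                  children[start:start + n])]
                return Tree(tree.label(), children)
        return tree

def printTree(tree, out):
    out.write(str(tree) + "\n")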
def train_parser():
    """Train and evaluate simple chunkers."""
    # Simple chunker that extracts NNP (proper noun) sequences
    def mySimpleChunker():
        grammar = 'NP: {<NNP>+}'
        return nltk.RegexpParser(grammar)

    # Extracts nothing; used only to verify that the evaluation runs
    def test_nothing(data):
        cp = nltk.RegexpParser("")
        print(cp.evaluate(data))

    # Test the mySimpleChunker() function
    def test_mySimpleChunker(data):
        schunker = mySimpleChunker()
        print(schunker.evaluate(data))

    datasets = [
        conll2000.chunked_sents('test.txt', chunk_types=['NP']),
        treebank_chunk.chunked_sents(),
    ]

    # Compute chunker accuracy on the first 50 IOB-tagged sentences
    for dataset in datasets:
        test_nothing(dataset[:50])
        print('---------------------')
        test_mySimpleChunker(dataset[:50])
        print()
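# As a quick sanity check of the {<NNP>+} grammar outside the evaluation
# loop, a single tagged sentence can be parsed directly (output shown as a
# comment, assuming standard NLTK RegexpParser behaviour):
import nltk

cp = nltk.RegexpParser('NP: {<NNP>+}')
sent = [('Pierre', 'NNP'), ('Vinken', 'NNP'), ('will', 'MD'), ('join', 'VB')]
print(cp.parse(sent))
# -> (S (NP Pierre/NNP Vinken/NNP) will/MD join/VB)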
def statistical_reqs_chunk(tagged_reqs, ids):
    train_chunks = treebank_chunk.chunked_sents()[:100]
    chunker = TagChunker(train_chunks)
    terms = []
    term_index = []
    for i, t in enumerate(tagged_reqs):
        s = chunker.parse(t)
        for c in s:
            # Skip plain (word, tag) tuples; keep only NP subtrees
            if not isinstance(c, tuple):
                if c.label() == 'NP':
                    term = []
                    for tagged_word in c:
                        # Drop determiners and possessive pronouns
                        if (tagged_word[1] != 'DT') and (tagged_word[1] != 'PRP$'):
                            term = term + [tagged_word[0]]
                    terms.append(term)
                    term_index.append(i)
    return terms, term_index
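# Hypothetical usage sketch: statistical_reqs_chunk expects POS-tagged
# requirement sentences; the ids argument is accepted but unused in the
# fragment above. The sentences and expected output here are illustrative.
import nltk

reqs = ["The system shall log every failed login attempt.",
        "Users must reset their password every 90 days."]
tagged_reqs = [nltk.pos_tag(nltk.word_tokenize(r)) for r in reqs]
terms, term_index = statistical_reqs_chunk(tagged_reqs, list(range(len(reqs))))
print(terms)       # e.g. [['system'], ['login', 'attempt'], ...]
print(term_index)  # index of the sentence each term came from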
def ch07_16a_penn_treebank():
    from nltk.corpus import treebank_chunk
    for sent in treebank_chunk.chunked_sents("wsj_0001.pos"):
        print("sent=", sent)
        print("chunk2brackets=", _chunk2brackets(sent))
        print("chunk2iob=", _chunk2iob(sent))
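# _chunk2brackets and _chunk2iob are private helpers not shown in this
# fragment. Plausible minimal versions (assumptions, not the original
# definitions): the first flattens the chunk tree to one bracketed line,
# the second emits (word, POS, IOB) triples via nltk.chunk.tree2conlltags.
import nltk

def _chunk2brackets(sent):
    # One-line bracketed string form of the chunk tree
    return sent.pformat(margin=10 ** 6)

def _chunk2iob(sent):
    # (word, POS, IOB-chunk-tag) triple for each token
    return nltk.chunk.tree2conlltags(sent)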
""" Converting chunks to trees and tree manipulation """ # pylint: disable=C0103 import re from nltk.corpus import treebank_chunk, treebank from nltk.tree import Tree # Sample treebank tree tb_tree = treebank_chunk.chunked_sents()[0] print(f"Treebank = {tb_tree}\n") # Naive way joined = " ".join([word for word, _ in tb_tree.leaves()]) print(f"Naive join: {joined}\n") # Using regexp punct_re = re.compile(r'\s([,\.;\?])') def chunk_tree_to_sent(tree, concat=' '): """ Converts a chunk tree to a sentence """ sentence = concat.join([word for word, _ in tree.leaves()]) return re.sub(punct_re, r'\g<1>', sentence) joined = chunk_tree_to_sent(tb_tree) print(f"Regexp join: {joined}\n")
A version of this article appears in print on September 16, 2014, on page
A11 of the New York edition with the headline: U.S. Airstrikes Hit Targets
Near Baghdad Held by ISIS.
"""

# Split into sentences using the Punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
article_sent = tokenizer.tokenize(article)

# Tokenize words
word_tokenizer = TreebankWordTokenizer()
word_list = [word_tokenizer.tokenize(sent) for sent in article_sent]

# Train POS tagger and evaluate its accuracy
test_sents = treebank.tagged_sents()[3000:]
test_chunks = treebank_chunk.chunked_sents()[3000:]
conll_test = conll2000.chunked_sents('test.txt')

train_new_tagger = False
if train_new_tagger:
    train_sents = treebank.tagged_sents()[:3000]

    # Create a dictionary of the most frequent words from the treebank
    print("creating dictionary from treebank")
    model = word_tag_model(treebank.words(), treebank.tagged_words())

    # Keep the tagger's default for chaining purposes
    print("Training tagger")
    backoff = DefaultTagger('NN')
    nt = NamesTagger(backoff=backoff)
    # taggers = [UnigramTagger, BigramTagger, TrigramTagger]
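# word_tag_model is not defined in this fragment; the usual cookbook-style
# implementation (an assumption here) maps each of the most frequent words
# to its most frequent tag:
from nltk.probability import FreqDist, ConditionalFreqDist

def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)
    most_freq = (word for word, count in fd.most_common(limit))
    return dict((word, cfd[word].max()) for word in most_freq)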
train_sents = treebank.tagged_sents()
tagger = UnigramTagger(train_sents)

# Import replacers
from replacers import RegexReplacer
from replacers import AntonymReplacer
replacer = RegexReplacer()

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")

from random import shuffle

# Import chunkers
import chunkers
from nltk.corpus import treebank_chunk
chunker = chunkers.TagChunker(treebank_chunk.chunked_sents())

max_key = 100

customstopwords = stopwords.words('english')
customstopwords.remove("up")
customstopwords.remove("down")
customstopwords += ['s&p500', 'federal', 'united', 'states', 'investors',
                    'reserve', 'average', 'nikkei', 'end', 'index', 'market',
                    'cent', 'wall', 'street', 'year', 'years', 'industrial',
                    'dow', 'jones', 'it', 'closing', 'closed', 'saw', 'months',
                    'nasdaq', 'trading', 'us', 'day', 'chase', 'mortgage']

# Load positive tweets into a list
p = open('postweets2.txt', 'r')
postxt = p.readlines()

# Load negative tweets into a list
        self.tagger = tagger

    def parse(self, tokens):
        """Parse a tagged sentence into chunks"""
        if not tokens:
            return None
        (words, tags) = zip(*tokens)
        gen_chunks = self.tagger.tag(tags)
        wtc = zip(words, gen_chunks)
        return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

# Separating data and getting chunker accuracy
train_ck = treebank_chunk.chunked_sents()[:3000]
test_ck = treebank_chunk.chunked_sents()[3000:]
train_conll = conll2000.chunked_sents("train.txt")
test_conll = conll2000.chunked_sents("test.txt")

# With unigram and bigram taggers
chunker = TagChunker(train_ck)
score = chunker.evaluate(test_ck)
print(f"Accuracy of tag chunker on treebank: {score.accuracy()}")

# Saving pickle
with open('pickles/chunkers/tag_chunker_treebank.pickle', 'wb') as file:
    pickle.dump(chunker, file)

chunker = TagChunker(train_conll)
score = chunker.evaluate(test_conll)
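# Once trained, the chunker can be applied to any POS-tagged sentence; a
# quick check (assuming NLTK's punkt and tagger models are installed):
import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumped."))
print(chunker.parse(tagged))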
import nltk
from nltk.corpus import treebank_chunk

print(treebank_chunk.chunked_sents()[1])
treebank_chunk.chunked_sents()[1].draw()
tree.draw()

t = create_sentence_tree(sentence)
print(t)
pt = process_sentence_tree(t)
pt  # inspect the processed tree (interactive session)
print_sentence_tree(t)
visualize_sentence_tree(t)

from nltk.corpus import treebank_chunk
data = treebank_chunk.chunked_sents()
train_data = data[:4000]
test_data = data[4000:]
print(train_data[7])

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag
tagged_simple_sent = tag(simple_sentence)
print(tagged_simple_sent)

chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
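# With the grammar defined, the natural next step is to build a
# RegexpParser and parse the tagged sentence (the output shown is
# approximate and depends on the tags pattern.en assigns):
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print(c)
# -> roughly (S (NP the/DT quick/JJ fox/NN) jumped/VBD over/IN
#             (NP the/DT lazy/JJ dog/NN))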
from nltk.chunk import ChunkParserI, tree2conlltags as to_tags
from nltk.corpus import treebank_chunk, conll2000
from nltk.tag import UnigramTagger, BigramTagger

def tag_chunks(chunk_sents):
    tag_sents = [to_tags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in tag_sents]

CHUNKS = tag_chunks(treebank_chunk.chunked_sents()) + \
         tag_chunks(conll2000.chunked_sents())
TAGGER = BigramTagger(CHUNKS, backoff=UnigramTagger(CHUNKS))

class ChunkTagger(ChunkParserI):
    def parse(self, tokens):
        (tokens, tags) = zip(*tokens)
        chunks = TAGGER.tag(tags)
        return [(token, chunk[1]) for (token, chunk) in zip(tokens, chunks)]
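# Hypothetical usage sketch: this parse returns flat (word, IOB-tag) pairs
# rather than a tree, so any POS-tagged sentence works. The expected output
# shown is illustrative.
chunker = ChunkTagger()
tagged = [('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'), ('barked', 'VBD')]
print(chunker.parse(tagged))
# e.g. [('the', 'B-NP'), ('little', 'I-NP'), ('dog', 'I-NP'), ('barked', 'O')]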
def update_trans_freqs(trans_freqs, tag_seq):
    tags = ["START"]
    tags.extend(tag_seq.split(" "))
    tags.append("END")
    bigrams = nltk.bigrams(tags)
    for bigram in bigrams:
        row = index_of(bigram[0])
        col = index_of(bigram[1])
        trans_freqs[row, col] += 1

# Generate phrases as a sequence of (normalized) POS tags and
# transition probabilities across POS tags.
tag_map = normalize_ptb_tags()
np_fd = nltk.FreqDist()
trans_freqs = np.zeros((len(NORMTAGS) + 2, len(NORMTAGS) + 2))
for tree in treebank_chunk.chunked_sents():
    chunks = []
    get_chunks(tree, "NP", chunks)
    for chunk in chunks:
        tagged_poss = [tagged_word[1] for tagged_word in chunk]
        normed_tags = []
        for tagged_pos in tagged_poss:
            try:
                normed_tags.append(tag_map[tagged_pos])
            except KeyError:
                # Fall back to the "other" tag for unknown POS tags
                normed_tags.append("OT")
        np_fd[" ".join(normed_tags)] += 1

with open("../../data/brown_dict/np_tags.csv", 'w') as fout:
    for tag_seq in np_fd.keys():
        fout.write("%s\t%d\n" % (tag_seq, np_fd[tag_seq]))
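# get_chunks and index_of come from elsewhere in the project; a minimal
# sketch of get_chunks (an assumption, not the original) that collects the
# (word, tag) leaves of every subtree with the given label:
def get_chunks(tree, label, chunks):
    for subtree in tree.subtrees(lambda t: t.label() == label):
        chunks.append(subtree.leaves())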
def tree2iob(x, prefix="O", label="", super_prefix="O", super_label="",
             issuperchunk=lambda tree: tree.label() == "SNP",
             issentence=lambda tree: tree.label() == "S"):
    """Given a tree containing chunks and superchunks, yield tuples of the
    form (word, POS-tag, chunk-IOB-tag, superchunk-IOB-tag)."""
    if isinstance(x, Tree):
        if issuperchunk(x):
            super_prefix = "B-"
            super_label = x.label()
        elif not issentence(x):
            prefix = "B-"
            label = x.label()
        for child in x:
            for tag in tree2iob(child, prefix, label, super_prefix,
                                super_label, issuperchunk, issentence):
                yield tag
            # After the first child, B- prefixes become I-
            if prefix == "B-":
                prefix = "I-"
            if super_prefix == "B-":
                super_prefix = "I-"
    else:
        yield (x[0], x[1], prefix + label, super_prefix + super_label)

if __name__ == "__main__":
    from nltk.corpus import treebank_chunk as corpus

    # Ensure that str2tree correctly parses the string representations
    # of all trees in the chunked Treebank.
    for i, t in enumerate(corpus.chunked_sents()):
        assert str2tree(str(t)) == t, "incorrect parse for sentence %d" % i
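# str2tree is defined elsewhere; one plausible reconstruction (an
# assumption, not the original) round-trips the printed tree by splitting
# each "word/TAG" leaf back into a (word, tag) tuple:
from nltk.tree import Tree

def str2tree(s):
    return Tree.fromstring(s, read_leaf=lambda leaf: tuple(leaf.rsplit('/', 1)))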
def evaluate_chunker():
    train_chunks = treebank_chunk.chunked_sents()[:3000]
    test_chunks = treebank_chunk.chunked_sents()[3000:]
    chunker = TagChunker(train_chunks)
    score = chunker.evaluate(test_chunks)
    print(score.accuracy())
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cl in tagger_classes:
        backoff = cl(train_sents, backoff=backoff)
    return backoff

def conll_tag_chunks(chunk_sents):
    tagged_sents = [nltk.chunk.tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

class TagChunker(nltk.chunk.ChunkParserI):
    def __init__(self, train_chunks,
                 tagger_classes=[UnigramTagger, BigramTagger]):
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = backoff_tagger(train_sents, tagger_classes)

    def parse(self, tagged_sent):
        if not tagged_sent:
            return None
        (words, tags) = zip(*tagged_sent)
        chunks = self.tagger.tag(tags)
        wtc = zip(words, chunks)
        return nltk.chunk.conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])

# To see how good this is
def evaluate_chunker():
    train_chunks = treebank_chunk.chunked_sents()[:3000]
    test_chunks = treebank_chunk.chunked_sents()[3000:]
    chunker = TagChunker(train_chunks)
    score = chunker.evaluate(test_chunks)
    print(score.accuracy())

# Initialize chunker
train_chunks = treebank_chunk.chunked_sents()
chunker = TagChunker(train_chunks)
import nltk
from nltk.corpus import treebank_chunk

print(treebank_chunk.chunked_sents()[1].leaves())
print(treebank_chunk.chunked_sents()[1].pos())
print(treebank_chunk.chunked_sents()[1].productions())
print(nltk.corpus.treebank.tagged_words())
    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
                 **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector
        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tagged_sent):
        if not tagged_sent:
            return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])

if __name__ == '__main__':
    import nltk
    from nltk.corpus import treebank_chunk, conll2000

    conll_train = conll2000.chunked_sents('train.txt')
    train_chunks = treebank_chunk.chunked_sents()

    a = ClassifierChunker(conll_train)
    sentence = "I am a boy."
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    print(tagged)
    print(a.parse(tagged))
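# chunk_trees2train_chunks and prev_next_pos_iob come from the surrounding
# package; the conventional definitions (assumed here, not the originals)
# pair each (word, tag) with its IOB tag for training, and expose the
# neighbouring POS tags plus the previous IOB decision as features:
from nltk.chunk.util import tree2conlltags

def chunk_trees2train_chunks(chunk_sents):
    # ((word, tag), iob) training pairs from chunk trees
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]

def prev_next_pos_iob(tokens, index, history):
    # Features: current word/POS plus neighbouring POS and previous IOB
    word, pos = tokens[index]
    if index == 0:
        prevword, prevpos, previob = ('<START>',) * 3
    else:
        prevword, prevpos = tokens[index - 1]
        previob = history[index - 1]
    if index == len(tokens) - 1:
        nextword, nextpos = ('<END>',) * 2
    else:
        nextword, nextpos = tokens[index + 1]
    return {'word': word, 'pos': pos,
            'prevpos': prevpos, 'nextpos': nextpos,
            'prevpos+previob': '%s+%s' % (prevpos, previob)}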
""" William Tarimo PA3: Superchunking - creating larger constituents from chunked data COSI 114 3/25/2013 """ from nltk.tree import Tree from nltk.corpus import treebank_chunk lines = [line[:-1]+'.pos' for line in open('indices.txt','r').readlines()] dev_test = treebank_chunk.chunked_sents(lines) test_sentences =treebank_chunk.chunked_sents(['wsj_0156.pos','wsj_0160.pos',\ 'wsj_0163.pos','wsj_0165.pos','wsj_0167.pos','wsj_0170.pos','wsj_0175.pos',\ 'wsj_0187.pos','wsj_0195.pos','wsj_0196.pos']) sentence = treebank_chunk.chunked_sents('wsj_0154.pos')[0] def super_chunk(tagged_sentence): """Takes in a POS-tagged sentence and returns a super-chunked tree""" #groups identical consecutives POS tokens to ease parsing sentence = merge_sentence(tagged_sentence) #gets indices (start,end) of all chunks from all qualifying rules matches = [rule.match(sentence) for rule in rules if rule.match(sentence)] #Removes overlapping chunks by deleting the right overlapping chunk(s) matches = [range(s,e+1) for (s,e) in [val for subl in matches for val in subl]] for i in range(len(matches)-1,0,-1): if set(matches[i]).intersection(set([val for subl in matches[:i] for val in subl])): junk = matches.pop(i) matches = [(item[0],item[-1]) for item in matches] matches = [actual_index(start,end,sentence) for (start,end) in matches]