Example #1
def main():
	files = ["wsj_0156.pos",
		"wsj_0160.pos",
		"wsj_0163.pos",
		"wsj_0165.pos",
		"wsj_0167.pos",
		"wsj_0170.pos",
		"wsj_0175.pos",
		"wsj_0187.pos",
		"wsj_0195.pos",
		"wsj_0196.pos"]
	test_trees = treebank_chunk.chunked_sents(files)
	
	NP_rules = ["NP , NP CC NP", "NP CC NP", "NP IN NP", "NP TO NP", "NP NP", "NP NN NP", "NP , NP ,", "RB VBN NP"]
	#NP_rules = ["DT NN", "JJ NN", "DT NP"] #these rules can be tried to show that multiple kinds of chunking work 
	rules = []
	##create rule objects
	for ruleString in NP_rules:
		newRule = Rule(ruleString, "NP")
		rules.append(newRule)

	myChunks = open("superchunks.txt","a")
	for tree in test_trees:
		for rule in rules:
			tree = rule.chunk(tree, rule, 15)
		printTree(tree, myChunks)
Example #2
def train_parser():
    """
    Train the chunker
    """

    # Simple chunker that extracts NNP (proper noun) sequences
    def mySimpleChunker():
        grammar = 'NP: {<NNP>+}'
        return nltk.RegexpParser(grammar)

    # Extracts nothing; only used to verify that the algorithm runs
    def test_nothing(data):
        cp = nltk.RegexpParser("")
        print(cp.evaluate(data))

    # Test the mySimpleChunker() function
    def test_mySimpleChunker(data):
        schunker = mySimpleChunker()
        print(schunker.evaluate(data))

    datasets = [
        conll2000.chunked_sents('test.txt', chunk_types=['NP']),
        treebank_chunk.chunked_sents(),
    ]

    # Compute chunker accuracy on the first 50 IOB-tagged sentences
    for dataset in datasets:
        test_nothing(dataset[:50])
        print('---------------------')
        test_mySimpleChunker(dataset[:50])
        print()
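As a quick aside, a minimal standalone sketch of what the NNP grammar above extracts; the tagged sentence here is invented for illustration:

import nltk

cp = nltk.RegexpParser('NP: {<NNP>+}')
tagged = [("John", "NNP"), ("Smith", "NNP"), ("visited", "VBD"), ("Paris", "NNP")]
print(cp.parse(tagged))
# (S (NP John/NNP Smith/NNP) visited/VBD (NP Paris/NNP))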
Example #3
def statistical_reqs_chunk(tagged_reqs, ids):
    train_chunks = treebank_chunk.chunked_sents()[:100]
    chunker = TagChunker(train_chunks)
    terms = []
    term_index = []
    for i, t in enumerate(tagged_reqs):
        s = chunker.parse(t)
        for c in s:
            if not isinstance(c, tuple):
                if c.label() == 'NP':
                    term = []
                    for tagged_word in c:
                        if tagged_word[1] not in ('DT', 'PRP$'):
                            term.append(tagged_word[0])
                    terms.append(term)
                    term_index.append(i)
    return terms, term_index
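A hypothetical call might look like the sketch below. TagChunker is assumed to be the project's tag-based chunker (a compatible definition appears in Example #15), nltk is assumed to be imported, the requirement sentence is made up, and ids is not used in the fragment shown, so None is passed here.

reqs = ["The system shall encrypt all stored passwords."]
tagged_reqs = [nltk.pos_tag(nltk.word_tokenize(r)) for r in reqs]
terms, term_index = statistical_reqs_chunk(tagged_reqs, ids=None)
print(terms)  # noun-phrase terms with DT/PRP$ words filtered out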
Example #4
def ch07_16a_penn_treebank():
    from nltk.corpus import treebank_chunk
    for sent in treebank_chunk.chunked_sents("wsj_0001.pos"):
        print "sent=", sent
        print "chunk2brackets=", _chunk2brackets(sent)
        print "chunk2iob=", _chunk2iob(sent)
Example #5
"""
    Converting chunks to trees and tree manipulation
"""
# pylint: disable=C0103

import re
from nltk.corpus import treebank_chunk, treebank
from nltk.tree import Tree

# Sample treebank tree
tb_tree = treebank_chunk.chunked_sents()[0]
print(f"Treebank = {tb_tree}\n")

# Naive way
joined = " ".join([word for word, _ in tb_tree.leaves()])
print(f"Naive join: {joined}\n")

# Using regexp
punct_re = re.compile(r'\s([,\.;\?])')


def chunk_tree_to_sent(tree, concat=' '):
    """
        Converts a chunk tree to a sentence
    """
    sentence = concat.join([word for word, _ in tree.leaves()])
    return re.sub(punct_re, r'\g<1>', sentence)


joined = chunk_tree_to_sent(tb_tree)
print(f"Regexp join: {joined}\n")
Example #6
A version of this article appears in print on September 16, 2014, on page A11 of the New York edition with the headline: U.S. Airstrikes Hit Targets Near Baghdad Held by ISIS. 
"""
# split to sentences
# using punkt
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
article_sent = tokenizer.tokenize(article)

# tokenize words
word_tokenizer = TreebankWordTokenizer()
word_list = [word_tokenizer.tokenize(sent) for sent in article_sent]

# train pos tagger
# evaluate accuracy
test_sents = treebank.tagged_sents()[3000:]
test_chunks = treebank_chunk.chunked_sents()[3000:]
conll_test = conll2000.chunked_sents('test.txt')

train_new_tagger = False 
if train_new_tagger:
  train_sents = treebank.tagged_sents()[:3000]
  #create dictionary from treebank of most frequent words
  print("creating dictionary from treebank")
  model = word_tag_model(treebank.words(), treebank.tagged_words())
  
  #keeping tagger default for chaining purposes
  print("Training tagger")
  
  backoff= DefaultTagger('NN')
  nt = NamesTagger(backoff=backoff)
  #taggers = [UnigramTagger, BigramTagger, TrigramTagger]
train_sents = treebank.tagged_sents()
tagger = UnigramTagger(train_sents)
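# Optional sanity check (illustrative addition, not part of the original flow):
# evaluate() is the classic NLTK scoring call, renamed accuracy() in newer releases.
print("unigram tagger accuracy:", tagger.evaluate(test_sents))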

#Importing replacers
from replacers import RegexReplacer
from replacers import AntonymReplacer
replacer = RegexReplacer()
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")
from random import shuffle


#Importing Chunkers
import chunkers
from nltk.corpus import treebank_chunk
chunker=chunkers.TagChunker(treebank_chunk.chunked_sents())


max_key=100
customstopwords = stopwords.words('english')
customstopwords.remove("up")
customstopwords.remove("down")
customstopwords += ['s&p500','federal','united','states','investors', 'reserve', 'average', 'nikkei' ,'end',
                   'index','market','cent','wall','street','year','years','industrial',
                   'dow','jones','it','closing','closed','saw','months','nasdaq','trading','us','day','chase','mortgage']

#Load positive tweets into a list
p = open('postweets2.txt', 'r')
postxt = p.readlines()

#Load negative tweets into a list
Example #8
            self.tagger = tagger

    def parse(self, tokens):
        """
            Parse sentence to chunks
        """
        if not tokens:
            return None
        (words, tags) = zip(*tokens)
        gen_chunks = self.tagger.tag(tags)
        wtc = zip(words, gen_chunks)
        return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])


# Separating data and getting chunker accuracy
train_ck = treebank_chunk.chunked_sents()[:3000]
test_ck = treebank_chunk.chunked_sents()[3000:]
train_conll = conll2000.chunked_sents("train.txt")
test_conll = conll2000.chunked_sents("test.txt")

# With unigram and bigram taggers
chunker = TagChunker(train_ck)
score = chunker.evaluate(test_ck)
print(f"Accuracy of tag chunker on treebank: {score.accuracy()}")

# Saving pickle
with open('pickles/chunkers/tag_chunker_treebank.pickle', 'wb') as file:
    pickle.dump(chunker, file)

chunker = TagChunker(train_conll)
score = chunker.evaluate(test_conll)
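# Presumably reported the same way as the treebank chunker above:
print(f"Accuracy of tag chunker on conll2000: {score.accuracy()}")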
import nltk
from nltk.corpus import treebank_chunk
print(treebank_chunk.chunked_sents()[1])
treebank_chunk.chunked_sents()[1].draw()
    tree.draw()
    
    
t = create_sentence_tree(sentence)
print t

pt = process_sentence_tree(t)
pt

print_sentence_tree(t)
visualize_sentence_tree(t)



from nltk.corpus import treebank_chunk
data = treebank_chunk.chunked_sents()
train_data = data[:4000]
test_data = data[4000:]
print train_data[7]

simple_sentence = 'the quick fox jumped over the lazy dog'

from nltk.chunk import RegexpParser
from pattern.en import tag

tagged_simple_sent = tag(simple_sentence)
print tagged_simple_sent

chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
Example #11
from nltk.chunk import ChunkParserI, tree2conlltags as to_tags
from nltk.corpus import treebank_chunk, conll2000
from nltk.tag import UnigramTagger, BigramTagger


def tag_chunks(chunk_sents):
    tag_sents = [to_tags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in chunk_tags] for chunk_tags in tag_sents]


CHUNKS = tag_chunks(treebank_chunk.chunked_sents()) + \
         tag_chunks(conll2000.chunked_sents())
TAGGER = BigramTagger(CHUNKS, backoff=UnigramTagger(CHUNKS))


class ChunkTagger(ChunkParserI):
    def parse(self, tokens):
        (tokens, tags) = zip(*tokens)
        chunks = TAGGER.tag(tags)
        return [(token, chunk[1]) for (token, chunk) in zip(tokens, chunks)]
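A minimal usage sketch (the POS-tagged input is hand-made for illustration; note that this parse() returns (word, IOB-chunk-tag) pairs rather than a tree):

chunk_tagger = ChunkTagger()
print(chunk_tagger.parse([("the", "DT"), ("little", "JJ"), ("dog", "NN"), ("barked", "VBD")]))
# e.g. [('the', 'B-NP'), ('little', 'I-NP'), ('dog', 'I-NP'), ('barked', 'O')]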
Example #12
def update_trans_freqs(trans_freqs, tag_seq):
    tags = ["START"]
    tags.extend(tag_seq.split(" "))
    tags.append("END")
    bigrams = nltk.bigrams(tags)
    for bigram in bigrams:
        row = index_of(bigram[0])
        col = index_of(bigram[1])
        trans_freqs[row, col] += 1
    
# generate phrases as a sequence of (normalized) POS tags and
# transition probabilities across POS tags.
tag_map = normalize_ptb_tags()
np_fd = nltk.FreqDist()
trans_freqs = np.zeros((len(NORMTAGS) + 2, len(NORMTAGS) + 2))
for tree in treebank_chunk.chunked_sents():
    chunks = []
    get_chunks(tree, "NP", chunks)
    for chunk in chunks:
        tagged_poss = [tagged_word[1] for tagged_word in chunk]
        normed_tags = []
        for tagged_pos in tagged_poss:
            try:
                normed_tags.append(tag_map[tagged_pos])
            except KeyError:
                normed_tags.append("OT")
        np_fd.inc(" ".join(normed_tags))
        
fout = open("../../data/brown_dict/np_tags.csv", 'wb')
for tag_seq in np_fd.keys():
    fout.write("%s\t%d\n" % (tag_seq, np_fd[tag_seq]))
Example #13
def tree2iob(x, prefix="O", label="", super_prefix="O", super_label="",
             issuperchunk=lambda tree: tree.node=="SNP",
             issentence=lambda tree: tree.node=="S"):
    """Given a tree containing chunks and superchunks, yield tuples of the
    form (word, POS-tag, chunk-IOB-tag, superchunk-IOB-tag)."""
    if isinstance(x, Tree):
        if issuperchunk(x):
            super_prefix = "B-"
            super_label = x.node
        elif not issentence(x):
            prefix = "B-"
            label = x.node
        for child in x:
            for tag in tree2iob(child, prefix, label,
                                super_prefix, super_label,
                                issuperchunk, issentence):
                yield tag
            if prefix == "B-": prefix = "I-"
            if super_prefix == "B-": super_prefix = "I-"
    else:
        yield (x[0], x[1], prefix+label, super_prefix+super_label)

if __name__ == "__main__":
    from nltk.corpus import treebank_chunk as corpus

    # Ensure that str2tree correctly parses the string representations
    # of all trees in the chunked Treebank.
    for i, t in enumerate(corpus.chunked_sents()):
        assert str2tree(str(t)) == t, "incorrect parse for sentence %d" % i
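
    # Illustrative only: print the (word, POS, chunk-IOB, superchunk-IOB) rows
    # for the first corpus sentence. This module targets the old NLTK API in
    # which trees expose .node rather than .label().
    for row in tree2iob(corpus.chunked_sents()[0]):
        print(row)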
Example #14
File: Chunker.py  Project: jcccf/cs4740
def evaluate_chunker():
  train_chunks = treebank_chunk.chunked_sents()[:3000]
  test_chunks = treebank_chunk.chunked_sents()[3000:]
  chunker = TagChunker(train_chunks)
  score = chunker.evaluate(test_chunks)
  print score.accuracy()
Example #15
File: Chunker.py  Project: jcccf/cs4740
  for cl in tagger_classes:
     backoff = cl(train_sents, backoff=backoff)
  return backoff

def conll_tag_chunks(chunk_sents):
  tagged_sents = [nltk.chunk.tree2conlltags(tree) for tree in chunk_sents]
  return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

class TagChunker(nltk.chunk.ChunkParserI):
  def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]):
    train_sents = conll_tag_chunks(train_chunks)
    self.tagger = backoff_tagger(train_sents, tagger_classes)
    
  def parse(self, tagged_sent):
    if not tagged_sent: return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    wtc = itertools.izip(words, chunks)
    return nltk.chunk.conlltags2tree([(w,t,c) for (w,(t,c)) in wtc])

# To see how good this is
def evaluate_chunker():
  train_chunks = treebank_chunk.chunked_sents()[:3000]
  test_chunks = treebank_chunk.chunked_sents()[3000:]
  chunker = TagChunker(train_chunks)
  score = chunker.evaluate(test_chunks)
  print score.accuracy()
  
# Initialize chunker
train_chunks = treebank_chunk.chunked_sents()
chunker = TagChunker(train_chunks)
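
# Quick usage sketch (sentence invented for illustration, and nltk is assumed
# to be imported at the top of this module): parse() expects a POS-tagged
# sentence and returns an NLTK chunk tree.
print chunker.parse(nltk.pos_tag(nltk.word_tokenize("The little dog barked at the cat")))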
import nltk
from nltk.corpus import treebank_chunk
print(treebank_chunk.chunked_sents()[1].leaves())
print(treebank_chunk.chunked_sents()[1].pos())
print(treebank_chunk.chunked_sents()[1].productions())
print(nltk.corpus.treebank.tagged_words())
def evaluate_chunker():
    train_chunks = treebank_chunk.chunked_sents()[:3000]
    test_chunks = treebank_chunk.chunked_sents()[3000:]
    chunker = TagChunker(train_chunks)
    score = chunker.evaluate(test_chunks)
    print score.accuracy()
    tree.draw()
    
    
Example #19
    def __init__(self,
                 train_sents,
                 feature_detector=prev_next_pos_iob,
                 **kwargs):
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                                            feature_detector=feature_detector,
                                            **kwargs)

    def parse(self, tagged_sent):
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        return conlltags2tree([(w, t, c) for ((w, t), c) in chunks])


if __name__ == '__main__':
    from nltk.corpus import treebank_chunk, conll2000
    import nltk
    from nltk import tokenize

    conll_train = conll2000.chunked_sents('train.txt')
    train_chunks = treebank_chunk.chunked_sents()
    a = ClassifierChunker(conll_train)
    sentence = "I am a boy."
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    print(tagged)
    print(a.parse(tagged))
Example #20
"""
William Tarimo
PA3: Superchunking - creating larger constituents from chunked data
COSI 114
3/25/2013
"""

from nltk.tree import Tree
from nltk.corpus import treebank_chunk
lines = [line[:-1]+'.pos' for line in open('indices.txt','r').readlines()]
dev_test = treebank_chunk.chunked_sents(lines)
test_sentences =treebank_chunk.chunked_sents(['wsj_0156.pos','wsj_0160.pos',\
    'wsj_0163.pos','wsj_0165.pos','wsj_0167.pos','wsj_0170.pos','wsj_0175.pos',\
    'wsj_0187.pos','wsj_0195.pos','wsj_0196.pos'])

sentence = treebank_chunk.chunked_sents('wsj_0154.pos')[0]

def super_chunk(tagged_sentence):
    """Takes in a POS-tagged sentence and returns a super-chunked tree"""
    #groups identical consecutive POS tokens to ease parsing
    sentence = merge_sentence(tagged_sentence)
    #gets indices (start,end) of all chunks from all qualifying rules
    matches = [rule.match(sentence) for rule in rules if rule.match(sentence)]
    
    #Removes overlapping chunks by deleting the right overlapping chunk(s)
    matches = [range(s,e+1) for (s,e) in [val for subl in matches for val in subl]]
    for i in range(len(matches)-1,0,-1):
        if set(matches[i]).intersection(set([val for subl in matches[:i] for val in subl])):
            junk = matches.pop(i)
    matches = [(item[0],item[-1]) for item in matches]
    matches = [actual_index(start,end,sentence) for (start,end) in matches]
Example #21
def ch07_16a_penn_treebank():
  from nltk.corpus import treebank_chunk
  for sent in treebank_chunk.chunked_sents("wsj_0001.pos"):
    print "sent=", sent
    print "chunk2brackets=", _chunk2brackets(sent)
    print "chunk2iob=", _chunk2iob(sent)