def parse(self, question):
    pos_tags = [pos for (word, pos) in question]
    tagged_pos_tags = self.tagger.tag(pos_tags)
    chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
    conlltags = [(word, pos, chunktag)
                 for ((word, pos), chunktag) in zip(question, chunktags)]
    print(conlltags)
    return conlltags2tree(conlltags)
Example #2
 def parse(self, sentence):
     pos_tags = [pos for (word, pos) in sentence]
     tagged_pos_tags = self.tagger.tag(pos_tags)
     chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
     conlltags = [(word, pos, chunktag)
                  for ((word, pos), chunktag) in zip(sentence, chunktags)]
     return conlltags2tree(conlltags)
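
For context, parse methods like the two above normally live in a ChunkParserI subclass whose self.tagger maps POS tags to IOB chunk tags. A minimal sketch in the style of the NLTK book's unigram chunker (the training-data plumbing here is illustrative):

import nltk
from nltk.chunk.util import tree2conlltags, conlltags2tree

class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # extract (POS, IOB-chunk) pairs from the training chunk trees
        train_data = [[(t, c) for (w, t, c) in tree2conlltags(sent)]
                      for sent in train_sents]
        # learn the most frequent chunk tag for each POS tag
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return conlltags2tree(conlltags)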
Example #3
File: Printer.py  Project: magicisland/tpd
def update(self, data):
    self.chunks = {}
    try:
        feature = data
        chunks = feature['chunked']
        tree = conlltags2tree(chunks)

        for chunk_name in self.target_chunks:
            succeeded_chunk = self.getChunk(tree, chunk_name)
            if succeeded_chunk:
                if chunk_name not in self.chunks:
                    self.chunks[chunk_name] = succeeded_chunk

        if "LOCATION" in str(self.chunks):
            print("-" * 120)

            print(colored("\n[TWEET ORIGINAL]", 'yellow'))
            print(feature['original'])

            for key in self.chunks:
                print(colored('[<<FRASES EXTRAÍDAS>>]:', 'blue'))
                msg = "<<Frase: " + key + " >>"
                print(colored(msg, 'green'))
                self.iprint(self.chunks[key], key)
                print(" ")

            # pause briefly so the results can be read
            sleep(4.0)

    except Exception:
        pass
Example #4
def parse(self, tagged_sent):
    '''Parse tagged tokens into a parse Tree of chunks.'''
    if not tagged_sent: return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll triples for tree parsing
    return conlltags2tree([(w, t, c) for (w, (t, c)) in zip(words, chunks)])
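
The zip(*tagged_sent) idiom above transposes a list of (word, tag) pairs into parallel word and tag sequences; a quick illustration:

tagged_sent = [('the', 'DT'), ('book', 'NN')]
(words, tags) = zip(*tagged_sent)
# words == ('the', 'book')
# tags  == ('DT', 'NN')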
Example #5
 def parse(self, sentence):
     pos_tags = [pos for (word,pos) in sentence]
     tagged_pos_tags = self.tagger.tag(pos_tags)
     chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
     conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                  in zip(sentence, chunktags)]
     return conlltags2tree(conlltags)
Example #6
    def parse(self, sentence):
        # classify chunks for the list of (word, tag) pairs
        chunked_sents = self.tagger.tag(sentence)

        # convert to tree
        return conlltags2tree([(word, tag, chunk)
            for ((word, tag), chunk) in chunked_sents])
Example #7
 def parse(self, sentence):  # [_code-unigram-chunker-parse]
     pos_tags = [pos for (word, pos) in sentence]
     tagged_pos_tags = self.tagger.tag(pos_tags)
     chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
     conlltags = [(word, pos, chunktag)
                  for ((word, pos), chunktag) in zip(sentence, chunktags)]
     #print "input to conlltags", conlltags
     return conlltags2tree(conlltags)
Example #8
 def parse(self, tokens):
     """
         Parse sentence into chunks
     """
     if not tokens:
         return None
     chunked = self.tagger.tag(tokens)
     return conlltags2tree([(w, t, c) for ((w, t), c) in chunked])
Example #9
def parse(self, tagged_sent):
    '''Parse tagged tokens into a parse Tree of chunks.'''
    if not tagged_sent: return None
    (words, tags) = zip(*tagged_sent)
    chunks = self.tagger.tag(tags)
    # create conll triples for tree parsing
    wtc = zip(words, chunks)
    return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
Example #10
    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets)
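
The reshaping documented above is easy to trace on a toy input (the values are illustrative):

from nltk.chunk.util import conlltags2tree

chunks = [(('the', 'DT'), 'B-NP'), (('book', 'NN'), 'I-NP')]
iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
# [('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')]
print(conlltags2tree(iob_triplets))
# (S (NP the/DT book/NN))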
Example #11
 def parse(self, tagged_sent):
     """Parsed tagged tokens into parse Tree of chunks"""
     if not tagged_sent:
         return None
     (words, tags) = zip(*tagged_sent)
     chunks = self.tagger.tag(tags)
     # create conll str for tree parsing
     wtc = zip(words, chunks)
     return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
Example #12
 def parse(self, tagged_sent):
     # import here rather than at module top so this module loads even if pattern isn't installed
     from pattern.en import parse
     s = ' '.join([word for word, tag in tagged_sent])
     # not tokenizing ensures that the number of tagged tokens returned is
     # the same as the number of input tokens
     sents = parse(s, tokenize=False).split()
     if not sents: return None
     return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])
Example #13
 def parse(self, sentence):
     tokenized = nltk.pos_tag(nltk.word_tokenize(sentence))
     pos_tags = [pos for (_, pos) in tokenized]
     tagged_pos_tags = self.tagger.tag(pos_tags)
     chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
     conlltags = [
         (word, pos, chunktag) for ((word, pos), chunktag) in zip(tokenized, chunktags)
     ]
     return conlltags2tree(conlltags)
Example #14
def parse(self, tagged_sent):
    # import here rather than at module top so this module loads even if pattern isn't installed
    from pattern.en import parse
    s = ' '.join([word for word, tag in tagged_sent])
    # not tokenizing ensures that the number of tagged tokens returned is
    # the same as the number of input tokens
    sents = parse(s, tokenize=False).split()
    if not sents: return None
    return conlltags2tree([(w, t, c) for w, t, c, p in sents[0]])
Example #15
 def parse(self, tagged_sentence):
   if not tagged_sentence: 
       return None
   pos_tags = [tag for word, tag in tagged_sentence]
   chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
   chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
   wpc_tags = [(word, pos_tag, chunk_tag) for ((word, pos_tag), chunk_tag)
                    in zip(tagged_sentence, chunk_tags)]
   return conlltags2tree(wpc_tags)
Example #17
 def parse(self, tokens):
     """
          Parse sentence into chunks
     """
     if not tokens:
         return None
     (words, tags) = zip(*tokens)
     gen_chunks = self.tagger.tag(tags)
     wtc = zip(words, gen_chunks)
     return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
Example #18
    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]

        # return both the raw tags and the parse tree
        response_dict = dict()
        response_dict["tags"] = conlltags
        response_dict["tree"] = conlltags2tree(conlltags)
        return response_dict
Example #19
	def parse(self, tagged_sent):
		iobs = []
		in_person = False
		
		for word, tag in tagged_sent:
			if word in self.name_set and in_person:
				iobs.append((word, tag, 'I-PERSON'))
			elif word in self.name_set:
				iobs.append((word, tag, 'B-PERSON'))
				in_person = True
			else:
				iobs.append((word, tag, 'O'))
				in_person = False
		
		return conlltags2tree(iobs)
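
On a tagged sentence whose tokens are in the gazetteer, the IOB triples built above produce a PERSON subtree. A hypothetical trace, assuming name_set contains 'John' and 'Smith':

from nltk.chunk.util import conlltags2tree

iobs = [('John', 'NNP', 'B-PERSON'),
        ('Smith', 'NNP', 'I-PERSON'),
        ('slept', 'VBD', 'O')]
print(conlltags2tree(iobs))
# (S (PERSON John/NNP Smith/NNP) slept/VBD)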
Example #20
    def parse(self, tagged_sentence):

        if not tagged_sentence:
            return None

        # Separate out POS tags from the sentence
        pos_tags = [tag for word, tag in tagged_sentence]

        # Use the chunk tagger to get IOB Tags (Chunk Tags) for the sentence, using the POS Tags of its words as input
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)

        chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]

        # Combine the obtained IOB tags with the words and POS tags to form (word, POS, chunk) triples
        wpc_tags = [(word, pos_tag, chunk_tag)
                    for ((word, pos_tag),
                         chunk_tag) in zip(tagged_sentence, chunk_tags)]

        # Build the shallow parse tree from the sentence's (word, POS, chunk) triples
        return conlltags2tree(wpc_tags)
Example #21
def update(self, data):
    self.chunks = {}
    try:
        feature = data
        chunks = feature['chunked']
        tree = conlltags2tree(chunks)

        for chunk_type in self.target_chunks:
            succeeded_chunk = self.getChunk(tree, chunk_type)
            if succeeded_chunk:
                if chunk_type not in self.chunks:
                    self.chunks[chunk_type] = succeeded_chunk

        if self.to_show in str(self.chunks):
            print("-" * 120)
            self.pretty_print(feature, self.chunks)
            sleep(self.sleep_time)

    except Exception as e:
        print(str(e))
Example #22
File: main.py  Project: wszamotula/NL_AHK
def cross_val(tag_file):
    """Perform leave one out cross validation for a NER chunker given a tag file"""
    print("\nRunning cross validation score on data set from " + tag_file +
          ":")
    reader = read_gmb(tag_file)
    data = list(reader)
    random.shuffle(data)
    acc = 0
    script_cor = 0
    for i in range(len(data)):
        test_sample = data[i]
        training_samples = data[:]
        del training_samples[i]
        chunker = NamedEntityChunker(training_samples)
        score = chunker.evaluate(
            [conlltags2tree([(w, t, iob) for (w, t), iob in test_sample])])
        acc += score._tags_correct / score._tags_total
        if score._tags_correct == score._tags_total:
            script_cor += 1
    print("Overall tagging accuracy: {0:.2f}%".format(acc / len(data) * 100))
    print("Percentage of scripts correct: {0:.2f}%".format(script_cor /
                                                           len(data) * 100))
    return
Example #23
    def parse(self, text, conlltags=True):
        """
        Given a text, applies tokenization, part of speech tagging and the
        gazetteer words with their tags. Returns an conll tree.

        :param text: The text to parse
        :type text: str
        :param conlltags:
        :type conlltags:
        :return: An conll tree
        :rtype:
        """
        # apply the regular expressions and find all the
        # gazetteer words in text
        for prog, tag in self.progs:
            words_found = set(prog.findall(text))  # keep the unique words
            if len(words_found) > 0:
                for word in words_found:  # words_found may be more than one
                    self.words.append(word)  # keep the words
                    self.iobtags.append(tag)  # and their tag

        # find the gazetteer phrase with the most words;
        # its length sets the lookahead window
        for word in self.words:  # don't care about tags now
            nwords = word.count(' ')
            if nwords > self.lookahead:
                self.lookahead = nwords

        # tokenize and apply part-of-speech tagging
        tagged_sent = self.pos_tag(self.tokenize(text))
        # find the iob tags
        iobs = self.iob_tags(tagged_sent)

        if conlltags:
            return conlltags2tree(iobs)
        else:
            return iobs
Example #24
 def parse(self, sentence):
     tagged_sents = self.tagger.tag(sentence)
     conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
     return conlltags2tree(conlltags)
Example #25
 def parse(self, sentence):
     tagged_sents = self.tagger.tag(sentence)
     conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
     return conlltags2tree(conlltags)
Example #26
	def parse(self, tagged_sent):
		if not tagged_sent: return None
		chunks = self.tagger.tag(tagged_sent)
		return conlltags2tree([(w,t,c) for ((w,t),c) in chunks])
Example #27
def ieer_chunked_sents(tag=nltk.tag.pos_tag):
	for doc in ieer.parsed_docs():
		tagged = ieertree2conlltags(doc.text, tag)
		yield conlltags2tree(tagged)
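
A hedged usage sketch for this generator; it assumes the ieer corpus is installed (nltk.download('ieer')) and that ieertree2conlltags is defined in the same module:

for i, tree in enumerate(ieer_chunked_sents()):
    print(tree)  # one chunk tree per IEER document
    if i >= 2:   # inspect only the first few
        break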
Example #28
	def parse(self, tagged_sent):
		iobs = self.iob_locations(tagged_sent)
		return conlltags2tree(iobs)
Example #29
 def parse(self, tagged_sent):
     if not tagged_sent: return None
     (words, tags) = zip(*tagged_sent)
     chunks = self.tagger.tag(tags)
     wtc = zip(words, chunks)
     return conlltags2tree([(w, t, c) for (w, (t, c)) in wtc])
print(rc.evaluate(test_data))


from nltk.chunk.util import tree2conlltags, conlltags2tree

train_sent = train_data[7]
print(train_sent)

wtc = tree2conlltags(train_sent)
print(wtc)

tree = conlltags2tree(wtc)
print(tree)


def conll_tag_chunks(chunk_sents):
    # convert chunk trees into (POS, IOB) pairs for tagger training
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

def combined_tagger(train_data, taggers, backoff=None):
    # chain the taggers so each one backs off to the previously trained one
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
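
These two helpers are typically wired together to train the tag-sequence chunker that the parse methods above rely on; a minimal sketch, assuming the conll2000 corpus has been downloaded:

from nltk.corpus import conll2000

# (POS, IOB) training pairs extracted from the CoNLL-2000 chunk trees
train_chunks = conll_tag_chunks(conll2000.chunked_sents('train.txt'))

# BigramTagger that backs off to a UnigramTagger
tag_chunker = combined_tagger(train_chunks, [UnigramTagger, BigramTagger])

The resulting tag_chunker plays the role of self.tagger in the parse methods shown earlier.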
def parse(self, tagged_sent):
    # parses a tagged sentence and identifies location chunks
    iobs = self.iob_locations(tagged_sent)
    return conlltags2tree(iobs)
Example #33
def read_data(filename):
    # reconstructed header (assumption): the original snippet begins
    # mid-function; these initializations are inferred from the code below
    tweets = []
    iob_pos_tweet = []
    counter = 0
    prev_line = prev_tokens = None
    with open(filename) as f:
        for line in f:
            if line.isspace():
                if not iob_pos_tweet:
                    # print(prev_line)
                    # print(counter)
                    print(prev_tokens)
                    continue
                    raise ValueError('tweet empty')  # unreachable: the continue above skips it
                tweets.append(iob_pos_tweet)
                iob_pos_tweet = []
            else:
                line = line.strip()
                tokens = line.split()
                if not tokens:
                    raise ValueError('tokens empty')
                iob_pos_tweet.append(((tokens[0], tokens[1]), tokens[2]))
                prev_tokens = tokens
            prev_line = line
            counter = counter + 1
    return tweets

#research_project/nltk/
training_data = read_data('train.txt')
chunker = NamedEntityChunker(training_data)
test_data = read_data('test.txt')
score = chunker.evaluate([conlltags2tree([(w, t, iob) for (w, t), iob in iobs]) for iobs in test_data])
print('precision: ', score.precision())
print('recall: ', score.recall())
print('f1: ', score.f_measure())
#tweets = [pos_tag(t) for t in tweets]
#sent = nltk.corpus.treebank.tagged_sents()[22]
#tweets = [nltk.ne_chunk(t) for t in tweets]
Example #34
'''
Created on Jul 20, 2015

@author: dongx
'''
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tree import Tree

iob = tree2conlltags(Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]))
tree = conlltags2tree([('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')])

print("--------convertion between iob and tree---------------------")
print(iob)
print(tree)
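
For reference, the two print calls above should show the round trip, approximately:

# [('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')]
# (S (NP the/DT book/NN))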