def __init__(self, depth, train_sents):
    global maxDepth
    maxDepth = depth
    self.dicTree = DefDict(1)
    # Train only on tokens outside any chunk (IOB tag "O").
    for s in train_sents:
        for (w, p, c) in tree2conlltags(s):
            if c == "O":
                self.train(w)
def getChunk(self, tree, target_token):
    # Collect the words of every subtree whose label contains target_token.
    target = []
    for subtree in tree.subtrees(filter=lambda t: target_token.lower() in t.label().lower()):
        word, post, iob_chunk = zip(*tree2conlltags(subtree))
        word = self.toString(word)
        target.append(word)
    return target
def __init__(self, POS):
    '''
    @param POS: the POS tagger is passed through
    '''
    # Train a trigram tagger that maps POS-tag sequences to IOB chunk tags.
    train_sents = conll2000.chunked_sents()
    train_data = [[(t, c) for w, t, c in tree2conlltags(sent)] for sent in train_sents]
    self.T = nltk.TrigramTagger(train_data)
    self.Tagger = POS
    self.tmp = []
def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to a list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.

    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]

    Source: https://github.com/japerk/nltk-trainer/blob/master/nltk_trainer/chunking/chunkers.py
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
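A minimal sketch of how these (tag, chunk_tag) pairs are typically consumed, assuming the conll2000 corpus has been downloaded via nltk.download('conll2000'); the n-gram tagger here "tags" POS sequences with IOB chunk labels rather than tagging words:

import nltk
from nltk.corpus import conll2000

train_data = conll_tag_chunks(conll2000.chunked_sents('train.txt'))
# Layer a bigram tagger over a unigram backoff; the tokens being tagged
# are POS tags and the predicted labels are IOB chunk tags.
u_tagger = nltk.UnigramTagger(train_data)
ub_tagger = nltk.BigramTagger(train_data, backoff=u_tagger)
print(ub_tagger.tag(['DT', 'NN', 'VBZ', 'JJ']))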
def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print(chunks)
        if ind == 10:
            break
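For orientation, a self-contained sketch of what tree2conlltags yields on ne_chunk output, assuming the standard NLTK models (punkt, the POS tagger, the maxent NE chunker) are installed; the exact entity labels depend on the bundled classifier:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.chunk.util import tree2conlltags

tagged = pos_tag(word_tokenize("Barack Obama visited London."))
print(tree2conlltags(ne_chunk(tagged)))
# Produces (word, pos, iob) triples whose IOB labels carry the entity
# type, e.g. 'B-PERSON' or 'B-GPE'; tokens outside any entity get 'O'.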
def __init__(self, train_sents):
    tag_sents = [tree2conlltags(sent) for sent in train_sents]
    train_chunks = [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]
    train_set = []
    for tagged_sent in train_chunks:
        untagged_sent = nltk.tag.untag(tagged_sent)
        history = []
        # Build one feature set per token, feeding in the IOB tags
        # predicted so far as history.
        for i, (word, tag) in enumerate(tagged_sent):
            featureset = self.featx(untagged_sent, i, history)
            train_set.append((featureset, tag))
            history.append(tag)
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)
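The featx callback above receives the untagged sentence, the current index, and the IOB history; a hypothetical minimal feature extractor in the same style (the name and feature set are illustrative, not from the original):

def npchunk_features(sentence, i, history):
    # sentence is [(word, pos), ...]; history holds the IOB tags assigned so far.
    word, pos = sentence[i]
    prevtag = history[-1] if history else '<START>'
    return {'word': word, 'pos': pos, 'prevtag': prevtag}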
def test_interactive(self):
    docs = self.source.find_clean(batch_size=1000)
    tagger = ngrams.make_backoff_tagger()
    print()
    for ind, doc in docs:
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print("CHUNKS" + str(chunks))
            print("NE" + str(cnll.get_ne(chunks)))
            print("NOUNS" + str(cnll.get_nouns(chunks)))
        if ind == 10:
            break
def fix(self, feature):
    cleanSentence = feature
    tree = None
    try:
        grammar_pattern_to_clean = r'_.*'  # separator character between levels within a single token
        clean_pattern = ''
        modified_chunk_pattern = r'.*_'
        words, post, iobs = zip(*feature)
        wiobs = tuple(w + "_" + iob for w, iob in zip(words, iobs))
        # The sentences to parse now pair each word with its IOB tag
        # instead of its POS tag.
        sentence = list(zip(words, wiobs))
        tree = self.postChunker.parse(sentence)
        # Convert from tree back to a list of tuples.
        loc_tags = tree2conlltags(flatten_deeptree(tree))
        cleanSentence = cleanIobs(words, post, loc_tags, grammar_pattern_to_clean,
                                  modified_chunk_pattern, clean_pattern)
    except Exception:
        # On any parse error, fall back to the unmodified sentence.
        pass
    return cleanSentence
def clean_dict(doc, tagger=nltk.pos_tag):
    """
    Processes NLP features from cleansed_text. All other functions wrap
    this one. Serves to act as the NLP front end for reddit corpus parsing.

    Dictionaries and JSON strings are accepted; a dictionary containing the
    additional information is returned. The processing done here represents
    the general annotations. The following are the new fields added to the
    dictionary. Classifiers will work to modify or wrap these methods.

    ::

        {
            conlltags : [[(word, pos, BIO)]],
            nouns : [word],
            named_entities : [[word, pos, BIO]],
            cleansed_text : [[word]]
        }

    :param doc: dictionary of reddit corpus.
    :type doc: dict
    :param tagger: A pos tagger.
    :type tagger: Tagger
    :returns: dict
    """
    if "_id" in doc:
        del doc["_id"]
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        # Use the injected tagger so callers can swap in their own.
        tagged_sent = tagger(sent) or []
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
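A hedged usage sketch (the pos and cnll helpers are project-specific; the assumption that cleansed_text holds raw text for pos.tokenize_sents to split is inferred from the docstring, not confirmed by it):

# Hypothetical input document; clean_dict mutates and returns it.
doc = {"_id": "abc123", "cleansed_text": "The fox jumped over the dog."}
annotated = clean_dict(doc)
print(annotated["conlltags"])       # [[(word, pos, BIO), ...]]
print(annotated["nouns"])           # flat list of nouns
print(annotated["named_entities"])  # flat list of named-entity triples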
def extract(self, text):
    """Extract single-noun terms and noun-phrase terms from raw text."""
    tokens = self.tokenizer.tokenize(text)
    tagged_terms = self.tagger.tag(tokens)
    terms = {}
    np_terms = {}
    noun_phrases = [node for node in self.np_finder.parse(tagged_terms)
                    if not isinstance(node, tuple)]
    for node in noun_phrases:
        coll_tag = tree2conlltags(node)
        if len(coll_tag) > 1:
            mterm = [term.lower() for (term, tag, temp) in coll_tag if len(term) > 1]
            mterm = ' '.join(mterm)
            self._add(mterm, np_terms)
        for (term, tag, temp) in coll_tag:
            if tag.startswith('N') and len(term) > 1:
                if tag in ['NNS', 'NNPS']:
                    term = singularize(term)
                self._add(term.lower(), terms)
    # Copy the keys so entries can be deleted while iterating.
    for term in list(terms.keys()):
        if not self.filter(term, terms[term]):
            del terms[term]
    for term in list(np_terms.keys()):
        if not self.filter(term, np_terms[term]):
            del np_terms[term]
    return (terms, np_terms)
def extract(self, text, locale='en'):
    """Locale-aware variant: look up the tokenizer and tagger utilities by language."""
    tokenizer = queryUtility(ITokenizer, name=locale)
    tagger = queryUtility(IPOSTagger, name=locale)
    if not tagger or not tokenizer:
        # Non-supported language
        return
    tokens = tokenizer.tokenize(text)
    tagged_terms = tagger.tag(tokens)
    terms = {}
    np_terms = {}
    noun_phrases = [node for node in tagger.np_grammar.parse(tagged_terms)
                    if not isinstance(node, tuple)]
    for node in noun_phrases:
        coll_tag = tree2conlltags(node)
        if len(coll_tag) > 1:
            mterm = [term.lower() for (term, tag, temp) in coll_tag if len(term) > 1]
            mterm = ' '.join(mterm)
            if mterm:
                self._add(mterm, np_terms)
        for (term, tag, temp) in coll_tag:
            if tag.startswith('N') and len(term) > 1:
                term = tagger.normalize(term, tag)
                self._add(term.lower(), terms)
    # Copy the keys so entries can be deleted while iterating.
    for term in list(terms.keys()):
        if not self.filter(term, terms[term]):
            del terms[term]
    for term in list(np_terms.keys()):
        if not self.filter(term, np_terms[term]):
            del np_terms[term]
    return (terms, np_terms)
def main():
    wsjsubset = open("../corpus/wsjsubset", 'r').readlines()
    genia = open("../corpus/genia", 'r').readlines()
    txt_esp1 = open("../corpus/espanol1", 'r').readlines()
    txt_esp2 = open("../corpus/espanol2", 'r').readlines()
    words = []
    postag = []
    chunktag = []
    for line in txt_esp1:
        if len(line.split()) > 0:
            words.append(line.split()[0])
            postag.append(line.split()[1])
            chunktag.append(line.split()[2])
    postag_nltk = pos_tag(words)
    chunktag_nltk = tree2conlltags(chunker.parse(postag_nltk))
    print(chunktag_nltk)
    # Count NP chunk tags to score the NLTK chunker against the gold annotations.
    cant_nominales_nltk = 0
    cant_nominales_gold = 0
    cant_nominales_hit = 0
    for i in range(len(chunktag_nltk)):
        if chunktag_nltk[i][2] in ['I-NP', 'B-NP']:
            cant_nominales_nltk += 1
            if chunktag_nltk[i][2] == chunktag[i]:
                cant_nominales_hit += 1
        if chunktag[i] in ['I-NP', 'B-NP']:
            cant_nominales_gold += 1
    precision = cant_nominales_hit / float(cant_nominales_nltk)
    recall = cant_nominales_hit / float(cant_nominales_gold)
    print("Precision: ", precision)
    print("Recall: ", recall)
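The scoring above is token-level precision and recall over NP tags; a worked example with hypothetical counts:

# Suppose the chunker predicts 120 NP tokens, the gold standard contains
# 100, and 90 predictions match the gold tag exactly:
#   precision = 90 / 120 = 0.75   (fraction of predictions that are right)
#   recall    = 90 / 100 = 0.90   (fraction of gold NP tokens recovered)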
def __init__(self, featuremap, train_sents):
    tagged_sents = [[((w, t), c) for (w, t, c) in tree2conlltags(sent)]
                    for sent in train_sents]
    self.tagger = _ConsecutiveNPChunkTagger(featuremap, tagged_sents)
def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def chunk_trees2train_chunks(chunk_sents):
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]
c = rc.parse(tagged_sentence)
print(c)
print(rc.evaluate(test_data))

from nltk.chunk.util import tree2conlltags, conlltags2tree

train_sent = train_data[7]
print(train_sent)

wtc = tree2conlltags(train_sent)
print(wtc)
tree = conlltags2tree(wtc)
print(tree)

def conll_tag_chunks(chunk_sents):
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
'''
Created on Jul 20, 2015

@author: dongx
'''
import nltk
from nltk.corpus.reader import ConllChunkCorpusReader
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tree import Tree
from nltk.corpus import treebank
from nltk.corpus import conll2000

# Round-trip between the tree and IOB (CoNLL) representations of a chunk.
iob = tree2conlltags(Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])]))
tree = conlltags2tree([('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')])

print("--------conversion between iob and tree---------------------")
print(iob)
print(tree)
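For reference, with a standard NLTK install the round trip above should print roughly:

# [('the', 'DT', 'B-NP'), ('book', 'NN', 'I-NP')]
# (S (NP the/DT book/NN))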
def chunk_trees2train_chunks(chunk_sents):
    """
    Convert (word, pos, iob) tuples to ((word, pos), iob).
    """
    tag_sents = [tree2conlltags(sent) for sent in chunk_sents]
    return [[((w, t), c) for (w, t, c) in sent] for sent in tag_sents]
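A short usage sketch, assuming the conll2000 corpus is available; the nested-pair shape matches what NLTK's classifier-based tagger trainers expect, i.e. (token, tag) pairs where the "token" is itself a (word, pos) pair:

from nltk.corpus import conll2000

train_chunks = chunk_trees2train_chunks(conll2000.chunked_sents('train.txt')[:1])
print(train_chunks[0][:2])
# e.g. [(('Confidence', 'NN'), 'B-NP'), (('in', 'IN'), 'B-PP')]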
from nltk.chunk.util import tree2conlltags
from nltk import ne_chunk
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        # Flush the accumulated batch every 1000 documents.
        if (ind + 1) % 1000 == 0:
            annotated.insert(buf)
            buf = []
    if buf:
        annotated.insert(buf)
def conll_tag_chunks(chunk_sents):
    """
    Extracts a list of tuples (pos, iob) from a list of trees.
    """
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]