def BP_tree_to_nltk_tree(tree):
    root = Tree(str(tree.keys), children=[])
    if isinstance(tree, (BPnode, Node)):
        for child in tree.children:
            root.append(BP_tree_to_nltk_tree(child))
    return root

def wsjtree2pos(wsj_corpus_path):
    print("Reading in corpus...", file=sys.stderr)
    sentences = []
    for d in os.listdir(wsj_corpus_path):
        if os.path.isdir(wsj_corpus_path + "/" + d) and d != "CVS" and int(d) < 8:
            for f in os.listdir(wsj_corpus_path + "/" + d):
                if f.endswith(".mrg"):
                    fname = wsj_corpus_path + "/" + d + "/" + f
                    with open(fname, "r") as tree_f:
                        tree_string = ""
                        for line in tree_f:
                            if line.strip():
                                if line.startswith("( (") or line.startswith("(("):
                                    if tree_string:
                                        # Tree.fromstring replaces the old Tree(str) constructor (NLTK >= 3)
                                        tr = Tree.fromstring(tree_string)
                                        sentences.append(tr.pos())
                                    tree_string = line.strip()
                                else:
                                    tree_string += line.strip()
                        if tree_string:
                            tr = Tree.fromstring(tree_string)
                            sentences.append(tr.pos())
    return sentences

def attach_tree(head, dep, attachment, chain, indexes, flag, coindex=None):
    # head, dep: trees; flag: 'right'/'left'
    """Attach dep's projection chain to head's projection chain."""
    if isinstance(coindex, int):  # handle coindex tag
        label = attachment['label2']
        offset = attachment['offset2']
        dep = Tree(dep.label(), ['*-' + str(coindex)])
    else:
        label = attachment['label']
        offset = attachment['offset']
    l_index = [l[0] for l in chain[0]].index(label)
    count = sum([l[1] for l in chain[0]][:l_index + 1]) - offset
    if flag == 'right':
        a_index = indexes[count - 1] + 1
    elif flag == 'left':
        a_index = indexes[count - 1]
        indexes[count - 1] += 1
    else:
        raise ValueError("Invalid flag: %r" % flag)
    if head.label() == 'PRN':
        s = 'head[0]'
    else:
        s = 'head'
    for i in range(count - 1):
        s += '[' + str(indexes[i]) + ']'
    eval(s + '.insert(' + str(a_index) + ', dep)')  # insert() vs pop()
    if 'f_tag' in attachment:
        if attachment['f_tag'] not in {'PRD', 'PRDs'}:
            eval(s + '.set_label(' + s + '.label() + "-" + attachment["f_tag"])')
        else:
            s += '[' + str(indexes[count - 1]) + ']'
            eval(s + '.set_label(' + s + '.label() + "-" + attachment["f_tag"])')
    return head, indexes

def __init__(self, node, children, parent_node=None, rel=None, attrs=None, head=None):
    self.parent_node = parent_node
    self.rel = rel
    self.attrs = attrs
    self.head = head
    Tree.__init__(self, node, children)

def parse_tree(self, text, binary=False, preprocessed=False):
    nlp_output = self.nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,parse',
        'outputFormat': 'json',
        'parse.binaryTrees': 'true'
    })
    if isinstance(nlp_output, str):
        nlp_output = json.loads(nlp_output, strict=False)
    if len(nlp_output['sentences']) > 1:
        # merge trees from sentences
        tree_string = "(Top "
        for s in nlp_output['sentences']:
            p_tree = Tree.fromstring(s['parse'])
            tree_string += str(p_tree[0])
        tree_string += ")"
        merged_tree = Tree.fromstring(tree_string)
    else:
        # no merging required
        merged_tree = Tree.fromstring(nlp_output['sentences'][0]['parse'])
        # remove root
        merged_tree = merged_tree[0]
    if binary:
        nltk.treetransforms.chomsky_normal_form(merged_tree)
    if preprocessed:
        merged_tree = preprocess_parse_tree(merged_tree)
    return merged_tree

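# Usage sketch for parse_tree above (hedged: assumes self.nlp is a
# pycorenlp.StanfordCoreNLP client and that a CoreNLP server is running on
# localhost:9000; the wrapper object itself is hypothetical):
#
#     from pycorenlp import StanfordCoreNLP
#     wrapper.nlp = StanfordCoreNLP('http://localhost:9000')
#     tree = wrapper.parse_tree("It rains. It pours.", binary=True)
#     tree.pretty_print()  # both sentences merged under a single (Top ...) node
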
def parser_output_to_parse_deriv_trees(output):
    lines = output.strip().split("\n")
    deriv_tree_lines = lines[::2]
    parse_tree_lines = lines[1::2]
    parse_trees = [Tree.fromstring(line.replace('\x06', 'epsilon_'))
                   for line in parse_tree_lines if line != '']
    deriv_trees = [Tree.fromstring(line) for line in deriv_tree_lines if line != '']
    return parse_trees, deriv_trees

def munge(t):
    if isinstance(t, Tree):
        toks = t.leaves()
        t = Tree(t.label(), [munge(child) for child in t])
        setattr(t, "tokens", toks)
        return t
    else:
        return Tree(t, [])

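# Quick check of munge (a sketch, not part of the original code): leaves are
# replaced by empty subtrees while each node keeps its original token span in
# a .tokens attribute.
if __name__ == "__main__":
    m = munge(Tree.fromstring("(S (NP (D the) (N cat)) (V sat))"))
    print(m.tokens)     # ['the', 'cat', 'sat']
    print(m[0].tokens)  # ['the', 'cat']
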
def removeNounMods(tree):
    tree_str = tsurgeon.remove_internal_mods(tree)
    if tree_str != '':
        tree = Tree.fromstring(tree_str)
    tree_str = tsurgeon.remove_participle_mods(tree)
    if tree_str != '':
        tree = Tree.fromstring(tree_str)
    return tree

def parse(self, tagged_sent):
    """Parse a tagged sentence.

    tagged_sent -- the tagged sentence (a list of pairs (word, tag)).
    """
    t = Tree(self.start, [Tree(tag, [word]) for word, tag in tagged_sent])
    t.chomsky_normal_form(factor='left', horzMarkov=0)
    return t

def _get_tense(cls, parse, token_indices, use_gold=False):
    if len(token_indices) == 1:
        return 'one_token'
    # Tree.fromstring replaces the old Tree(str) constructor (NLTK >= 3)
    parse_tree = Tree.fromstring(parse['parsetree'])
    start_index = min(token_indices)
    end_index = max(token_indices) + 1
    tree_position = parse_tree.treeposition_spanning_leaves(start_index, end_index)
    arg_subtree = parse_tree[tree_position]
    return cls._recurse_search_tag(arg_subtree, ['VP'], [])

def test_lbranch_parse(self):
    model = LBranch([], 'S')  # empty training set
    trees = [model.parse(s) for s in self.tagged_sents]
    trees2 = [
        Tree.fromstring(
            "(S (S|<> (S|<> (S|<> (D El) (N gato)) (V come)) (N pescado)) (P .))"),
        Tree.fromstring(
            "(S (S|<> (S|<> (S|<> (D La) (N gata)) (V come)) (N salmón)) (P .))"),
    ]
    self.assertEqual(trees, trees2)

def test_flat_parse(self):
    model = Flat([], 'S')  # empty training set
    trees = [model.parse(s) for s in self.tagged_sents]
    trees2 = [
        Tree.fromstring("(S (D El) (N gato) (V come) (N pescado) (P .))"),
        Tree.fromstring("(S (D La) (N gata) (V come) (N salmón) (P .))"),
    ]
    self.assertEqual(trees, trees2)

def tags2tree(sentence, root_label='S', strict=False):
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith('B'):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I'):
            if (len(tree) == 0 or not isinstance(tree[-1], Tree)
                    or tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == 'O':
            tree.append((word, postag))
        else:
            raise ValueError("Bad tag %r" % chunktag)
    return tree

def conlltags2tree(sentence, chunk_types=('NP', 'PP', 'VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree) == 0 or not isinstance(tree[-1], Tree)
                    or tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == 'O':
            tree.append((word, postag))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree

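# Quick check of conlltags2tree on a hand-built IOB sequence (a sketch, not
# part of the original code):
if __name__ == "__main__":
    iob = [("He", "PRP", "B-NP"), ("saw", "VBD", "O"),
           ("the", "DT", "B-NP"), ("dog", "NN", "I-NP")]
    t = conlltags2tree(iob)
    print(t.leaves())                           # [('He', 'PRP'), ('saw', 'VBD'), ('the', 'DT'), ('dog', 'NN')]
    print([st.label() for st in t.subtrees()])  # ['S', 'NP', 'NP']
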
def __build_tree(self, node_num):
    word_tuple = self.words[node_num]
    tree_node = Tree(word_tuple[1], [])
    node_dependencies = self.dependencies.get(node_num)
    if node_dependencies is not None:
        for dependency in node_dependencies:
            dependency_node = self.__build_tree(dependency[0])
            tree_node.append(dependency_node)
    return tree_node

def __str2BguTree(self, text):
    lines = text.split('\n')
    tree = Tree('s', [])
    for line in lines:
        if line == '':
            continue
        mlist = line.split("\t")
        word = mlist[0]
        raw = mlist[1]
        tree.append((word, bguTag(raw)))
    return tree

def extractParticiple(tree):
    part_mod = tsurgeon.hasParticipleMod(tree)
    if part_mod != '':
        subject = tsurgeon.findSubject(tree)
        subject_words = Tree.fromstring(subject).leaves()
        part_tree = Tree.fromstring(part_mod)
        part_words = part_tree.leaves()
        # Ignoring inflection
        result_words = subject_words + ['is'] + part_words[1:]
        sentence = ' '.join(result_words).strip() + '.'
        return sentence

def _strip_functional_tags(self, tree: Tree) -> None:
    """
    Removes all functional tags from constituency labels in an NLTK tree.
    We also strip off anything after a =, - or | character, because these
    are functional tags which we don't want to use.

    This modification is done in-place.
    """
    clean_label = tree.label().split("=")[0].split("-")[0].split("|")[0]
    tree.set_label(clean_label)
    for child in tree:
        if not isinstance(child[0], str):
            self._strip_functional_tags(child)

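# Example of the stripping above (a sketch): a label like "NP-SBJ=1" becomes
# "NP". Shown as a comment because the method recurses through self and so
# needs its owning class:
#
#     t = Tree.fromstring("(S (NP-SBJ=1 (NNP Ms.) (NNP Haag)) (VP (VBZ plays)))")
#     self._strip_functional_tags(t)  # in place
#     t[0].label()                    # -> "NP"
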
def postag_tree(tree):
    # Part-of-speech tagging.
    words = tree.leaves()
    tag_iter = (pos for (word, pos) in pos_tag(words))
    newtree = Tree('S', [])
    for child in tree:
        if isinstance(child, Tree):
            newtree.append(Tree(child.label(), []))
            for subchild in child:
                newtree[-1].append((subchild, next(tag_iter)))
        else:
            newtree.append((child, next(tag_iter)))
    return newtree

def visualize_sentence_tree(sentence_tree):
    processed_tree = process_sentence_tree(sentence_tree)
    processed_tree = [
        Tree(item[0], [Tree(x[1], [x[0]]) for x in item[1]])
        for item in processed_tree
    ]
    tree = Tree('S', processed_tree)
    tree.draw()

def _add_entity(t, tpl, entity_type):
    """Does the work of adding the entity-type node."""
    parent_positions = []
    parents = []
    first_parent_position = t.leaf_treeposition(tpl[0])[:-1]
    first_grandparent_position = first_parent_position[:-1]
    for i in range(tpl[0], tpl[-1]):
        parent_position = t.leaf_treeposition(i)[:-1]
        parent = t[parent_position]
        parent_positions.append(parent_position)
        parents.append(parent)
    if 'parent_position' in locals():  # i.e., the loop above ran at least once
        grandparent_position = parent_position[:-1]
        grandparent = t[grandparent_position]
        if grandparent_position == first_grandparent_position:
            # augment the nodes ONLY if every token in the mention has the same grandparent
            # i.e., if 'Barack Hussein Obama' is one NP, replace it with
            # (NP (E-PER (NNP Barack) (NNP Hussein) (NNP Obama)))
            # but if we have "National Rifle" in one NP and "Association" in another NP,
            # we don't bother adding E-ORG at all
            # (hopefully that doesn't exclude too many trees)
            aug_node = 'E-' + entity_type
            new_tree = Tree(aug_node, parents)
            if len(parent_positions) > 1:
                if parent_positions[-1][-1] != len(grandparent.leaves()) - 1:
                    # the last member of the tuple is NOT the rightmost child:
                    # giving up on slices; collecting all of gp's children,
                    # then adding them back
                    new_leaves = new_tree.leaves()
                    new_kids = []
                    for kid in grandparent:
                        if kid[0] not in new_leaves:
                            new_kids.append(kid)
                        elif kid[0] == new_leaves[0]:
                            new_kids.append(new_tree)
                        else:
                            pass
                    # .label() replaces the deprecated .node attribute (NLTK >= 3)
                    new_grandparent = Tree(grandparent.label(), new_kids)
                    ggparent = t[grandparent_position[:-1]]
                    ggparent[grandparent_position[-1]] = new_grandparent
                else:
                    # it is the rightmost child
                    grandparent[parent_positions[0][-1]:len(grandparent.leaves())] = [new_tree]
            else:
                # one-word node
                grandparent[parent_positions[0][-1]] = new_tree

def test_productions(self):
    t = Tree.fromstring(
        """
        (S
            (NP (Det el) (Noun gato))
            (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
        )
        """)
    # Bugfix relative to the official test: pass start='S'
    model = UPCFG([t], start='S')
    prods = model.productions()
    prods2 = [
        ProbabilisticProduction(N('S'), [N('NP'), N('VP')], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Det'), N('Noun')], prob=0.5),
        ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
        ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
        ProbabilisticProduction(N('VP'), [N('Verb'), N('NP')], prob=1.0),
        ProbabilisticProduction(N('Verb'), ['Verb'], prob=1.0),
        ProbabilisticProduction(N('NP'), [N('Noun'), N('Adj')], prob=0.5),
        ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
    ]
    self.assertEqual(set(prods), set(prods2))

def read_segtree_file(fn):
    """reads a string representing a discourse tree (from the seg. annotation)
    and returns a list of its child tree objects"""
    with codecs.open(fn, 'r', 'utf-8') as f:
        s = f.read()
        text_tree = Tree.fromstring(s, read_leaf=prefix_number_seg_token)
        return [segment for segment in text_tree]

def test_tree4():
    annotator = Annotator()
    # scratch sentences; only the last assignment is actually parsed
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy."
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."
    sent = "B.S. in Computer Science , a related degree or its equivalent"
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."
    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print()
    print(tree_str)
    tree = Tree.fromstring(tree_str)[0]
    print()
    print("Root label =", tree.label())
    tree.draw()

def test():
    """Do some tree drawing tests."""
    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        print()
        print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank
    for n in [0, 1440, 1591, 2771, 2170]:
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    print(TreePrettyPrinter(tree).text(nodedist=2))

    tree = Tree.fromstring(
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    print_tree('Discontinuous tree', tree, sentence, nodedist=2)

def merge_tree_nnps(tree):
    """
    Takes a parse tree and merges any consecutive leaf nodes that come from NNPs.
    For example, a segment of:
        (NP (JJ old) (NNP Pierre) (NNP Vinken))
    becomes:
        (NP (JJ old) (NNP PierreVinken))
    """
    # require a parented tree to get a subtree's tree position
    p = ParentedTree.convert(tree)

    # iterate subtrees of height 3; this is where NPs leading to NNPs
    # leading to lexicalizations will be
    for s in p.subtrees(filter=lambda s: s.height() == 3):
        # merge NNPs in the list representation of this tree's children: [(POS, word), ...]
        new_noun_phrase = merge_tagged_nnps([(c.label(), c[0]) for c in s])
        child_str = " ".join("(%s %s)" % (pos, word) for pos, word in new_noun_phrase)
        # create new subtree with merged NNPs
        new_s = ParentedTree.fromstring("(%s %s)" % (s.label(), child_str))
        # replace old subtree with new subtree
        p[s.treeposition()] = new_s
    return Tree.convert(p)

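# Runnable sketch of merge_tree_nnps (assumes the merge_tagged_nnps helper,
# defined elsewhere, joins consecutive (NNP, word) pairs into one):
if __name__ == "__main__":
    t = Tree.fromstring("(S (NP (JJ old) (NNP Pierre) (NNP Vinken)) (VBD smiled))")
    print(merge_tree_nnps(t))
    # expected: (S (NP (JJ old) (NNP PierreVinken)) (VBD smiled))
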
def rulelogic(sentence):
    leaves_list = []
    output = nlp.annotate(sentence, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    parsetree = output['sentences'][0]['parse']
    for i in Tree.fromstring(parsetree).subtrees():
        if i.label() == 'PRP':
            leaves_list.append(i.leaves())
        if i.label() == 'VBP' or i.label() == 'VBZ':
            leaves_list.append(i.label())
    if (any("We" in x for x in leaves_list) or any("I" in x for x in leaves_list)
            or any("You" in x for x in leaves_list)
            or any("They" in x for x in leaves_list)) and any(
                "VBZ" in x for x in leaves_list):
        print("Alert:\nPlease check subject and verb in the sentence.\n"
              "You may have a plural subject and a singular verb.")
    elif (any("He" in x for x in leaves_list) or any("She" in x for x in leaves_list)
            or any("It" in x for x in leaves_list)) and any(
                "VBP" in x for x in leaves_list):
        print("Alert:\nPlease check subject and verb in the sentence.\n"
              "You may have a singular subject and a plural verb.")
    else:
        print("The sentence looks correct.")

def extract_entities(pos_server, assimilator, mode, text, link):
    """
    Extract tokens in the buckets of nouns and other entities

    pos_server: part of speech tagger address
    assimilator: assimilator address
    mode: metadata or content
    """
    import json

    content = get_assimilator_data(mode=mode, assimilator=assimilator,
                                   text=text, link=link)
    if mode == "meta":
        yield json.dumps(json.loads(content.decode()), indent=4)
    else:
        from .semantic_parser import read_dep
        from nltk.tree import Tree

        concept_map = {}
        pos_generator = process_pos(pos_server, content=content)
        for line in pos_generator:
            data = json.loads(line.decode())
            tree = Tree.fromstring(data['tree'])
            tokens = read_dep(tree)
            yield tokens

def yngve_redux(treestring):
    """For the given parse-tree string, return the word count and the
    Yngve score."""
    tree = Tree.fromstring(treestring)
    total = float(calc_yngve_score(tree, 0))
    words = float(get_word_score(tree))
    return [total, words]

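# Usage sketch (calc_yngve_score and get_word_score are defined elsewhere in
# the same module):
#
#     total, words = yngve_redux("(S (NP (PRP I)) (VP (VBP agree)))")
#     mean_yngve = total / words  # average Yngve depth per word
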
def initialize_edu_data(edus):
    '''
    Create a representation of the list of EDUs that make up the input.
    '''
    wnum = 0  # counter for distance features
    res = []
    for edu_index, edu in enumerate(edus):
        # lowercase all words
        edu_words = [x[0].lower() for x in edu]
        edu_pos_tags = [x[1] for x in edu]

        # make a dictionary for each EDU
        new_tree = Tree.fromstring('(text)')
        new_tree.append('{}'.format(edu_index))
        tmp_item = {"head_idx": wnum,
                    "start_idx": wnum,
                    "end_idx": wnum,
                    "nt": "text",
                    "head": edu_words,
                    "hpos": edu_pos_tags,
                    "tree": new_tree}
        wnum += 1
        res.append(tmp_item)
    return res

def tree(bracketed):
    t = Tree.fromstring(bracketed)
    t.draw()

def normalize_leaves(self, tree):
    tree = Tree.fromstring(tree)
    for pos in tree.treepositions('leaves'):
        tree[pos] = self.stemmer.stem(tree[pos]).lower()
    return str(tree).replace("\n", "")

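# Hedged usage sketch for normalize_leaves (assumes self.stemmer is an NLTK
# PorterStemmer; the owning class is not shown in the snippet):
#
#     from nltk.stem import PorterStemmer
#     self.stemmer = PorterStemmer()
#     self.normalize_leaves("(S (NP (NNS Dogs)) (VP (VBP run)))")
#     # -> "(S (NP (NNS dog)) (VP (VBP run)))"
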
def extractSubConjuncts(tree):
    (sub1, sub2) = tsurgeon.extract_sub_conjuncts(tree)
    return (Tree.fromstring(sub1), Tree.fromstring(sub2))

from collections import Counter
from nltk.tree import Tree

def p_rule(rule_list):
    # Reconstructed scaffolding (this snippet originally began mid-function):
    # rule_head / rules are assumed to be Counters over rule heads and rules.
    rule_head = Counter()
    rules = Counter()
    rules_with_p = []
    for r in rule_list:
        rule_head.update({r[0]: 1})
        rules.update({r: 1})
    for r in rules.keys():
        p = float("{0:.2f}".format(float(rules[r]) / float(rule_head[r[0]])))
        if len(r) == 3:
            r = r[0], (r[1], r[2])
        rules_with_p.append((r[0], r[1], p))
    return rules_with_p

# read rules
with open('CNF_rule.txt', 'r') as file:
    rule = []
    for line in file:
        print(line)
        line = line.strip('\n')
        t = Tree.fromstring(line)
        rule += convert_rule(t)  # convert_rule is defined elsewhere

# add rules with probability
p_rules = p_rule(rule)
with open('rule.txt', 'w') as file_rule:
    for rules in p_rules:
        for term in rules:
            if type(term) is list or type(term) is tuple:
                for p in term:
                    file_rule.write(p)
                    file_rule.write(' ')
            else:
                file_rule.write(str(term))
                file_rule.write(' ')
        file_rule.write('\n')

def select(self, tree):
    if tree is None:
        raise ValueError("Parse tree not available")
    return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])

def dertotree(derivation):
    return Tree(derivation.entity,
                [dertotree(d) for d in derivation.daughters
                 if type(d) != UdfTerminal])

def _make_tree(self, result):
    return Tree.fromstring(result)

def read_features(self, flag):
    all_data = self.read_json(
        path.join(self.data_dir, flag + '.stanford.json'))
    all_feature_data = []
    for data in all_data:
        sentence_len = 0
        sentence_feature = []
        sentence = ''
        words = []
        index = []
        sentences = data['sentences']
        for sentence in sentences:
            tokens = sentence['tokens']
            for token in tokens:
                feature_dict = {}
                feature_dict['word'] = token['originalText']
                words.append(token['word'].replace('\xa0', ''))
                # sentence += token['word']
                start_index = token['characterOffsetBegin']
                end_index = token['characterOffsetEnd']
                feature_dict['char_index'] = [
                    i for i in range(start_index, end_index)
                ]
                feature_dict['length'] = sentence_len + len(sentence)
                feature_dict['pos'] = token['pos']
                sentence_feature.append(feature_dict)
            # df = df.append([{'word': ' ', 'pos': ' '}], ignore_index=True)
            deparse = sentence['basicDependencies']
            for dep in deparse:
                dependent_index = dep['dependent'] - 1
                sentence_feature[dependent_index]['dep'] = dep['dep']
                sentence_feature[dependent_index][
                    'governed_index'] = dep['governor'] - 1
            c_parse = Tree.fromstring(sentence['parse'].replace('\xa0', ''))
            current_index = 0
            for s in c_parse.subtrees(lambda t: t.label() in chunk_pos):
                leaves = s.leaves()
                if len(leaves) == 0:
                    continue
                node = s.label()
                index = words[current_index:].index(leaves[0]) + current_index
                current_index = index
                for i, leaf in enumerate(leaves):
                    if 'chunk_tags' not in sentence_feature[index + i]:
                        sentence_feature[index + i]['chunk_tags'] = []
                    sentence_feature[index + i]['chunk_tags'].append({
                        'chunk_tag': node,
                        'height': 0,
                        'range': [index, index + len(leaves) - 1]
                    })
                    for chunk_tag in sentence_feature[index + i]['chunk_tags']:
                        chunk_tag['height'] += 1
        for token in sentence_feature:
            if 'chunk_tags' not in token:
                token['chunk_tags'] = [{
                    'chunk_tag': 'ROOT',
                    'height': 1,
                    'range': [0, len(sentence_feature) - 1]
                }]
        all_feature_data.append(sentence_feature)
    return all_feature_data

def bin_question_extract(self, tree):
    t = Tree.fromstring(tree)
    t_pos = t.pos()
    return t_pos

def reparse_tree(self, line):
    ptree = Tree.fromstring(line)
    leaves = ptree.leaves()

def draw_text_trees(text):
    tree = Tree.fromstring(str(text))
    return svgling.draw_tree(tree)

#!/usr/bin/env python
import sys
from nltk.tree import Tree

print(r"\documentclass[10pt]{article}")
print(r"\usepackage[landscape]{geometry}")
print(r"\usepackage{tikz-qtree}")
print(r"\begin{document}")
for line in sys.stdin:
    tree = Tree.fromstring(line.rstrip())
    print(r"\begin{tikzpicture}[scale=.5]")
    print(tree.pprint_latex_qtree())
    print(r"\end{tikzpicture}")
    print()
print(r"\end{document}")

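# Usage sketch (the script name trees2qtree.py is hypothetical): feed one
# bracketed tree per line on stdin and compile the output with pdflatex:
#
#     echo "(S (NP (DT the) (NN cat)) (VP (VBD sat)))" | ./trees2qtree.py > trees.tex
#     pdflatex trees.tex
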
def generate_data(self, corpus, pairtypes=("mirna", "protein")):
    if os.path.isfile(self.temp_dir + self.modelname + ".txt"):
        os.remove(self.temp_dir + self.modelname + ".txt")
    xerrors = 0
    for sentence in corpus.get_sentences("goldstandard"):
        doc_lines = []
        pcount = 0
        logging.info("{}".format(sentence.sid))
        sentence_entities = [entity for entity in sentence.entities.elist["goldstandard"]]
        # logging.debug("sentence {} has {} entities ({})".format(sentence.sid, len(sentence_entities), len(sentence.entities.elist["goldstandard"])))
        for pair in itertools.combinations(sentence_entities, 2):
            if (pair[0].type == pairtypes[0] and pair[1].type == pairtypes[1]) or \
                    (pair[1].type == pairtypes[0] and pair[0].type == pairtypes[1]):
                # logging.debug(pair)
                if pair[0].type == pairtypes[0]:
                    e1id = pair[0].eid
                    e2id = pair[1].eid
                else:
                    e1id = pair[1].eid
                    e2id = pair[0].eid
                    pair = (pair[1], pair[0])
                pid = sentence.did + ".p" + str(pcount)
                """if sid1 != sid2:
                    sentence1 = corpus.documents[did].get_sentence(sid1)
                    tree1 = self.mask_entity(sentence1, Tree.fromstring(sentence1.parsetree), pair[0], "candidate1")
                    sentence2 = corpus.documents[did].get_sentence(sid2)
                    tree2 = self.mask_entity(sentence2, Tree.fromstring(sentence2.parsetree), pair[1], "candidate2")
                    tree = self.join_trees(tree1, tree2)
                else:"""
                sentence1 = corpus.documents[sentence.did].get_sentence(pair[0].sid)
                if sentence1.parsetree == "SENTENCE_SKIPPED_OR_UNPARSABLE":
                    logging.info("skipped {}=>{} on sentence {}-{}".format(
                        pair[0].text, pair[1].text, sentence1.sid, sentence1.text))
                    continue
                tree = Tree.fromstring(sentence1.parsetree)
                if "candidate1" in sentence1.parsetree:
                    logging.info(sentence1.parsetree)
                tree = self.mask_entity(sentence1, tree, pair[0], "candidate1")
                tree = self.mask_entity(sentence1, tree, pair[1], "candidate2")
                # if tree[0] != '(':
                #     tree = '(S (' + tree + ' NN))'  # this depends on the version of nltk
                tree, found = self.get_path(tree)
                # if len(docs[sid][ddi.SENTENCE_ENTITIES]) > 20:
                #     line = "1 |BT| (ROOT (NP (NN candidatedrug) (, ,) (NN candidatedrug))) |ET|"
                #     xerrors += 1
                # else:
                #     tree = self.normalize_leaves(tree)
                line = self.get_svm_train_line(tree, pair)
                if pair[1].eid not in pair[0].targets:
                    line = '-' + line
                self.pids[pid] = pair
                doc_lines.append(line)
                pcount += 1
        logging.debug("writing {} lines to file...".format(len(doc_lines)))
        with codecs.open(self.temp_dir + self.modelname + ".txt", 'a', "utf-8") as train:
            for l in doc_lines:
                train.write(l)
    logging.info("wrote {}{}.txt".format(self.temp_dir, self.modelname))

def demo():
    import random

    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0, 999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    t = Tree.fromstring('''
    (S (NP the very big cat)
       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')

    tc = TreeWidget(cf.canvas(), t, draggable=1,
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc, 10, 10)

    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text, font=big),
                         fill='green')

    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')

    treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    def color(node):
        node['color'] = '#%04d00' % random.randint(0, 9999)

    def color2(treeseg):
        treeseg.label()['fill'] = '#%06d' % random.randint(0, 9999)
        treeseg.label().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2] + 10, 10)

    tree3 = Tree.fromstring('''
    (S (NP this tree) (AUX was)
       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
                              tree_xspace=2, tree_width=2)
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)

    def orientswitch(treewidget):
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """
Try clicking, right clicking, and dragging
different elements of each of the trees.
The top-left tree is a TreeWidget built from
a Tree.  The top-right is a TreeWidget built
from a Tree, using non-default widget
constructors for the nodes & leaves (BoxWidget
and OvalWidget).  The bottom-left tree is
built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)

    tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
                     line_color='brown2', roof_color='brown2',
                     node_font=('helvetica', -12, 'bold'),
                     node_color='brown4', orientation='horizontal')
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()

import json
from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
from pprint import pprint

class StanfordNLP:
    def __init__(self):
        self.server = ServerProxy(JsonRpc20(),
                                  TransportTcpIp(addr=("127.0.0.1", 8080)))

    def parse(self, text):
        return json.loads(self.server.parse(text))

nlp = StanfordNLP()
result = nlp.parse("Hello world! It is so beautiful.")
pprint(result)

from nltk.tree import Tree
# Tree.fromstring replaces the deprecated Tree.parse (NLTK >= 3)
tree = Tree.fromstring(result['sentences'][0]['parsetree'])
pprint(tree)

def select(self, tree):
    if tree is None:
        raise ValueError('Parse tree not available')
    return Tree('*SPLIT*', [p.select(tree) for p in self.pieces])

import sys
import re
from nltk.tree import Tree

trees = list()
tree = ''
for line in sys.stdin:
    if line[0] == '(':
        if tree != '':
            trees.append(tree)
            tree = ''
    tree += line
if tree != '':
    trees.append(tree)

for i, t in enumerate(trees):
    tree = Tree.fromstring(t)
    # remove punctuation
    for sub in tree.subtrees():
        remove = list()
        for n, child in enumerate(sub):
            if isinstance(child, str):
                # the quote characters were garbled in the original; this raw
                # triple-quoted pattern restores the apparent intent
                if re.match(r'''^(\.|,|\?|!|;|:|'|''|`|``|"|-[LR][RSC]B-|-|--)$''',
                            child):
                    remove.append(n)
                    # del sub[n]
        for n in sorted(remove, reverse=True):
            del sub[n]
    # sys.stderr.write(str(len(tree.leaves())) + ' ')
    # remove brackets with one item
    for sub in tree.subtrees():

import sys
from nltk.tree import Tree

fi = open(sys.argv[1]).readlines()
for line in fi:
    line = line.strip()
    if line == '':
        continue
    # Tree.fromstring replaces the old Tree(str) constructor (NLTK >= 3)
    print(' '.join(Tree.fromstring(line).leaves()))

def get_features(self, doc):
    for sent in sent_tokenize(doc['text']):
        sent = sent.lower()
        if self.feats == 'WordNgram':
            tokens = word_tokenize(sent)
            for n in range(1, 3):
                if len(tokens) < n:
                    sent_ngrams = ngrams(tokens, len(tokens))
                else:
                    sent_ngrams = ngrams(tokens, n)
                for ngram in sent_ngrams:
                    yield ngram
        elif self.feats == 'CharNgram':
            chrs = [c for c in sent]
            for n in range(1, 7):
                sent_ngrams = ngrams(chrs, n)
                for ngram in sent_ngrams:
                    yield ngram
        elif self.feats == 'PosNgram':
            token = word_tokenize(sent)
            tagged = pos_tag(token)
            tags = []
            for tagtoken in tagged:
                tags.append(tagtoken[1])
            for n in range(1, 5):
                taggrams = ngrams(tags, n)
                for ngram in taggrams:
                    yield ngram
        elif self.feats == 'ProdRules':
            parse = list(parser.raw_parse(sent))
            parse2 = [''.join(str(tree)) for tree in parse]
            parse3 = ''.join(parse2)
            ptree = Tree.fromstring(parse3)
            for rule in ptree.productions():
                yield rule
        elif self.feats == 'FunctWordsSkipgram':
            skip = []
            tokens = wordpunct_tokenize(sent)
            for token in tokens:
                if token in funct_words:
                    skip.append(token)
            skipgrams = ngrams(skip, 2)
            for ngram in skipgrams:
                yield ngram
        elif self.feats == "ContentSkipGram":
            skip = []
            tokens = wordpunct_tokenize(sent)
            for token in tokens:
                if token not in funct_words:
                    skip.append(token)
            skipgrams = ngrams(skip, 2)
            for ngram in skipgrams:
                yield ngram
        elif self.feats == 'FunctWordCount':
            frequency = defaultdict(int)
            tokens = wordpunct_tokenize(sent)
            for token in tokens:
                if token in funct_words:
                    frequency[token] += 1
            functwordfreqs = []
            for funct_word in funct_words:
                functwordfreqs.append(frequency[funct_word])
            # note: returning inside this generator stops iteration after
            # the first sentence
            return functwordfreqs
        elif self.feats == 'OverallFunctWordCount':
            functcount = 0
            tokens = wordpunct_tokenize(sent)
            for token in tokens:
                if token in funct_words:
                    functcount += 1
            return functcount
        elif self.feats == 'Dependency':
            result = dependency_parser.raw_parse(sent)
            for dep in result:
                triples = list(dep.triples())
                for triple in triples:
                    trip = triple[0][1] + '.' + triple[1] + '.' + triple[2][1]
                    yield trip

# coding=utf8
import json
from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
from pprint import pprint

class StanfordNLP:
    def __init__(self):
        self.server = ServerProxy(JsonRpc20(),
                                  TransportTcpIp(addr=("127.0.0.1", 8080)))

    def parse(self, text):
        return json.loads(self.server.parse(text))

nlp = StanfordNLP()
# result = nlp.parse(u"Hello world! It is so beautiful.")
result = nlp.parse(u"今天天气真不错啊!")
pprint(result)

from nltk.tree import Tree
tree = Tree.fromstring(result['sentences'][0]['parsetree'])
# pprint(tree)
tree.pretty_print()

'''
Insert empty trees for empty sentences
'''
import sys
import argparse
from util import tokenize_words
from nltk.tree import Tree

def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('trees', type=argparse.FileType('r'),
                        help='File with parse trees')
    parser.add_argument('sentences', type=argparse.FileType('r'),
                        help='File with original sentences')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    # itertools.izip is Python 2 only; the builtin zip is lazy in Python 3
    for tree, sentence in zip(args.trees, args.sentences):
        parse = Tree.fromstring(tree)
        words = tokenize_words(sentence)
        if len(parse.leaves()) != len(words):
            print("Parse tree does not match sentence!")
            print(parse.leaves())
            print(words)

def flatten_deeptree(tree):
    return Tree(tree.label(), flatten_childtrees([c for c in tree]))

def chomsky_normal_form(
    tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"
):
    # assume all subtrees have homogeneous children
    # assume all terminals have no siblings

    # A semi-hack to have elegant looking code below. As a result,
    # any subtree with a branching factor greater than 999 will be incorrectly truncated.
    if horzMarkov is None:
        horzMarkov = 999

    # Traverse the tree depth-first keeping a list of ancestor nodes to the root.
    # I chose not to use the tree.treepositions() method since it requires
    # two traversals of the tree (one to get the positions, one to iterate
    # over them) and node access time is proportional to the height of the node.
    # This method is 7x faster which helps when parsing 40,000 sentences.
    nodeList = [(tree, [tree.label()])]
    while nodeList != []:
        node, parent = nodeList.pop()
        if isinstance(node, Tree):

            # parent annotation
            parentString = ""
            originalNode = node.label()
            if vertMarkov != 0 and node != tree and isinstance(node[0], Tree):
                parentString = "%s<%s>" % (parentChar, "-".join(parent))
                node.set_label(node.label() + parentString)
                parent = [originalNode] + parent[: vertMarkov - 1]

            # add children to the agenda before we mess with them
            for child in node:
                nodeList.append((child, parent))

            # chomsky normal form factorization
            if len(node) > 2:
                childNodes = [child.label() for child in node]
                nodeCopy = node.copy()
                node[0:] = []  # delete the children

                curNode = node
                numChildren = len(nodeCopy)
                for i in range(1, numChildren - 1):
                    if factor == "right":
                        # create new head
                        newHead = "%s%s<%s>%s" % (
                            originalNode,
                            childChar,
                            "-".join(childNodes[i : min([i + horzMarkov, numChildren])]),
                            parentString,
                        )
                        newNode = Tree(newHead, [])
                        curNode[0:] = [nodeCopy.pop(0), newNode]
                    else:
                        newHead = "%s%s<%s>%s" % (
                            originalNode,
                            childChar,
                            "-".join(childNodes[max([numChildren - i - horzMarkov, 0]) : -i]),
                            parentString,
                        )
                        newNode = Tree(newHead, [])
                        curNode[0:] = [newNode, nodeCopy.pop()]

                    curNode = newNode

                curNode[0:] = [child for child in nodeCopy]

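# Quick demonstration of the factorization above on a toy tree (a sketch, not
# part of the original code):
if __name__ == "__main__":
    t = Tree.fromstring("(S (A a) (B b) (C c) (D d))")
    chomsky_normal_form(t, factor="right", horzMarkov=2)
    print(t)
    # (S (A a) (S|<B-C> (B b) (S|<C-D> (C c) (D d))))
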
def join_trees(self, tree1, tree2):
    ptree = Tree("ROOTROOT", [tree1, tree2])
    return ptree

def map(tree, fn):
    # NOTE: shadows the builtin map(). Leaves (plain strings) are returned
    # unchanged, since they have no label to transform; without this guard
    # the recursion fails on string leaves.
    if not isinstance(tree, Tree):
        return tree
    mapped_children = [map(child, fn) for child in tree]
    mapped_label = fn(tree.label())
    new_label = tree.label() if mapped_label is None else mapped_label
    return Tree(new_label, mapped_children)

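# Example (a sketch): uppercase every nonterminal label while leaving the
# words themselves alone.
if __name__ == "__main__":
    t = Tree.fromstring("(s (np (d the) (n cat)) (v sat))")
    print(map(t, str.upper))  # (S (NP (D the) (N cat)) (V sat))
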
def flatten_deeptree(tree):
    """
    Flattens a deep tree
    """
    return Tree(tree.label(), flatten_childtrees([child for child in tree]))

def divide_chemical_expression(s1, s2, ignore_state=False):
    '''Compare two chemical expressions for equivalence up to a multiplicative factor:

    - If they are not the same chemicals, return False.
    - If they are the same, "divide" s1 by s2 and return a factor x such that
      s1 / s2 == x, as a Fraction object.
    - If ignore_state is True, ignore phases when doing the comparison.

    Examples:

    divide_chemical_expression("H2O", "3H2O") -> Fraction(1, 3)
    divide_chemical_expression("3H2O", "H2O") -> 3  # actually Fraction(3, 1), but compares == to 3.
    divide_chemical_expression("2H2O(s) + 2CO2", "H2O(s)+CO2") -> 2
    divide_chemical_expression("H2O(s) + CO2", "3H2O(s)+2CO2") -> False

    Implementation sketch:
        - extract factors and phases to standalone lists,
        - compare expressions without factors and phases,
        - divide the lists of factors elementwise and check that every
          quotient is equal,
        - return the result of the factor division
    '''
    # parsed final trees
    treedic = {}
    treedic['1'] = _get_final_tree(s1)
    treedic['2'] = _get_final_tree(s2)

    # strip phases and factors; collect factors in list
    # (.label() replaces the deprecated .node attribute, NLTK >= 3)
    for i in ('1', '2'):
        treedic[i + ' cleaned_mm_list'] = []
        treedic[i + ' factors'] = []
        treedic[i + ' phases'] = []
        for el in treedic[i].subtrees(filter=lambda t: t.label() == 'multimolecule'):
            count_subtree = [t for t in el.subtrees() if t.label() == 'count']
            group_subtree = [t for t in el.subtrees() if t.label() == 'group']
            phase_subtree = [t for t in el.subtrees() if t.label() == 'phase']
            if count_subtree:
                if len(count_subtree[0]) > 1:
                    treedic[i + ' factors'].append(
                        int(count_subtree[0][0][0]) / int(count_subtree[0][2][0]))
                else:
                    treedic[i + ' factors'].append(int(count_subtree[0][0][0]))
            else:
                treedic[i + ' factors'].append(1.0)
            if phase_subtree:
                treedic[i + ' phases'].append(phase_subtree[0][0])
            else:
                treedic[i + ' phases'].append(' ')
            treedic[i + ' cleaned_mm_list'].append(
                Tree('multimolecule', [Tree('molecule', group_subtree)]))

    # order of factors and phases must mirror the order of multimolecules,
    # use 'decorate, sort, undecorate' pattern
    treedic['1 cleaned_mm_list'], treedic['1 factors'], treedic['1 phases'] = zip(
        *sorted(zip(treedic['1 cleaned_mm_list'], treedic['1 factors'], treedic['1 phases'])))
    treedic['2 cleaned_mm_list'], treedic['2 factors'], treedic['2 phases'] = zip(
        *sorted(zip(treedic['2 cleaned_mm_list'], treedic['2 factors'], treedic['2 phases'])))

    # check if expressions are correct without factors
    if not _check_equality(treedic['1 cleaned_mm_list'], treedic['2 cleaned_mm_list']):
        return False

    # phases are ruled by the ignore_state flag
    if not ignore_state:  # phases matter
        if treedic['1 phases'] != treedic['2 phases']:
            return False

    if any(map(lambda x, y: x / y - treedic['1 factors'][0] / treedic['2 factors'][0],
               treedic['1 factors'], treedic['2 factors'])):
        # factors are not proportional
        return False
    else:
        # return ratio
        return Fraction(treedic['1 factors'][0] / treedic['2 factors'][0])

def load_ace_file(textfile, fmt):
    print(f"  - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT>
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = {typ for (s, e, typ) in entities}

    # Binary distinction (NE or not NE)
    if fmt == "binary":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree("NE", text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == "multiclass":
        i = 0
        toks = Tree("S", [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(word_tokenize(text[i:s]))
            toks.append(Tree(typ, text[s:e].split()))
            i = e
        toks.extend(word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError("bad fmt value")

def make_tree(self, result):
    return Tree.fromstring(result["parse"])

def test_parsed_sents(self):
    parsed_sents = conll2007.parsed_sents('esp.train')[0]
    self.assertEqual(
        parsed_sents.tree(),
        Tree('fortaleció', [
            Tree('aumento', [
                'El',
                Tree('del', [
                    Tree('índice', [
                        Tree('de', [Tree('desempleo', ['estadounidense'])])
                    ])
                ])
            ]),
            'hoy',
            'considerablemente',
            Tree('al', [
                Tree('euro', [
                    Tree('cotizaba', [
                        ',', 'que',
                        Tree('a', [Tree('15.35', ['las', 'GMT'])]),
                        'se',
                        Tree('en', [
                            Tree('mercado', [
                                'el',
                                Tree('de', ['divisas']),
                                Tree('de', ['Fráncfort'])
                            ])
                        ]),
                        Tree('a', ['0,9452_dólares']),
                        Tree('frente_a', [
                            ',',
                            Tree('0,9349_dólares', [
                                'los',
                                Tree('de', [Tree('mañana', ['esta'])])
                            ])
                        ])
                    ])
                ])
            ]),
            '.'
        ]))