def main():
    # Read one bracketed parse tree per stdin line and report corpus-level
    # averages of several syntactic complexity metrics: Yngve depth,
    # Frazier score, node count, and mean sentence length (in words).
    sents = 0
    words_tot = 0
    yngve_tot = 0
    frazier_tot = 0
    nodes_tot = 0
    for line in sys.stdin:
        if line.strip() == "":
            continue  # skip blank separator lines
        t = Tree.parse(line)
        words = calc_words(t)
        words_tot += words
        sents += 1
        # Each metric is normalized per word before being accumulated, so
        # the *_tot variables are sums of per-sentence averages
        # (macro-averaging, not micro-averaging).
        yngve = calc_yngve(t, 0)
        yngve_avg = float(yngve)/words
        yngve_tot += yngve_avg
        nodes = calc_nodes(t)
        nodes_avg = float(nodes)/words
        nodes_tot += nodes_avg
        frazier = calc_frazier(t, 0, "")
        frazier_avg = float(frazier)/words
        frazier_tot += frazier_avg
        # print "Sentence=%d\twords=%d\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words, yngve_avg, frazier_avg, nodes_avg)
    # Average over all sentences seen (raises ZeroDivisionError on empty
    # input -- unchanged from the original behavior).
    yngve_avg = float(yngve_tot)/sents
    frazier_avg = float(frazier_tot)/sents
    nodes_avg = float(nodes_tot)/sents
    words_avg = float(words_tot)/sents
    print "Total\tsents=%d\twords=%f\tyngve=%f\tfrazier=%f\tnodes=%f" % (sents, words_avg, yngve_avg, frazier_avg, nodes_avg)
def load_ace_file(textfile, fmt):
    """Generator: read an ACE text file plus its ``.tmx.rdc.xml``
    annotation file and yield one flat ``nltk.Tree`` rooted at 'S' whose
    named-entity spans are wrapped in subtrees.

    fmt='binary'     -> every NE subtree is labelled 'NE'
    fmt='multiclass' -> each NE subtree is labelled with its entity type
    Any other value raises ValueError.
    """
    print(' - %s' % os.path.split(textfile)[1])
    annfile = textfile+'.tmx.rdc.xml'
    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )
    # Read the text file, and mark the entities.
    text = open(textfile).read()
    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)
    # Blank out anything before/after <TEXT>; replacement length is padded
    # so character offsets from the annotation file stay valid.
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)
    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)
    entity_types = set(typ for (s,e,typ) in entities)
    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks
    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks
    else:
        raise ValueError('bad fmt value')
def test_current_production(self):
    """current_production() should return the top-level production of a
    parsed tree."""
    want = Production(Nonterminal("S"), [Nonterminal("sentence")])
    cases = [
        (""" (S (sentence (type_1_sentence_coord_1 (type_1_sentence_coord_2 (type_2_sentence (THERE There) (AUX is) (Noun_Phrase (det (DET an)) (Noun_w_support (Adj_phrase (Adj_core (JJ small)) (AND and) (Adj_phrase (Adj_core (JJ red)))) (Noun_Count (NN apple))))))) (PERIOD .))) """, want),
    ]
    for treetext, expected in cases:
        parsed = Tree.parse(treetext)
        self.assertEqual(expected, current_production(parsed))
def parse_trees(self, flatten=False):
    """Return one parse Tree per sentence in ``self.result``.

    When *flatten* is true, every tree is post-processed with
    flatten_deeptree() before being returned.
    """
    parsed = [Tree.parse(s['parsetree']) for s in self.result['sentences']]
    if flatten:
        parsed = [flatten_deeptree(tree) for tree in parsed]
    return parsed
def loadHeadTrees(self, filename):
    """Load head-annotated trees (ps2ds output), one per line.

    Each parsed tree is re-wrapped under a 'TOP' node so its shape
    matches the original (un-annotated) tree structure.
    """
    source = codecs.open(filename, 'r', 'utf-8')
    wrapped = []
    for line in source.readlines():
        parsed = Tree.parse(line)
        wrapped.append(Tree('TOP', [parsed]))
    return wrapped
def get_semantics_from_parse_tree(parse_tree_string):
    """Take a string representing the parse tree as input, and print the
    semantic parse. The result list consists of a list of tuples, with each
    tuple containing the VerbNet frame and its associated tree."""
    parse_tree = Tree.parse(parse_tree_string)
    # parse_tree.draw()
    # Split into clauses, then rewrite each passive clause as active.
    split_clause_dict = split_clauses(parse_tree)
    for key, (clause, conjunction) in split_clause_dict.items():
        activized_clause = activize_clause(clause)
        split_clause_dict[key] = (activized_clause, conjunction)
    result_list = []
    for position, (clause, conjunction) in split_clause_dict.items():
        # Conjunction strings are interleaved into the result list between
        # the frames they join.
        split_tree_dict = split_conjunctions(clause)
        if conjunction != "":
            result_list.append(conjunction)
        # NOTE(review): the inner `conjunction` rebinds (shadows) the outer
        # loop variable of the same name.
        for split, (split_tree, conjunction) in split_tree_dict.items():
            if conjunction != "":
                result_list.append(conjunction)
            for tree in split_tree:
                # Normalize the clause into canonical declarative order.
                tree = existential_there_insertion(tree)
                tree = invert_clause(tree)
                tree = wh_movement(tree)
                tree.draw()
                # Regex for finding verbs
                verb_finder = re.compile(r"(?<=VB[ DGNPZ]) *\w*(?=\))")
                # Get the lemma of the verb for searching verbnet
                verbs = (word.strip().lower() for word in
                         verb_finder.findall(str(tree)))
                for verb in verbs:
                    lemmatized_verb = lemmatizer.lemmatize(verb, "v")
                    vfo_list = create_VerbFrameObjects(lemmatized_verb)
                    match_list = []
                    # Try every VerbNet frame for this verb, keep all that
                    # match the tree, then pick the best one.
                    for vfo in vfo_list:
                        match = vfo.match_parse(tree)
                        if match:
                            match_list.append(match)
                    best_match = pick_best_match(match_list)
                    if not best_match is None:
                        result_list.append((best_match, tree))
    return result_list
def _parse_trees_output(output_):
    """Split *output_* on blank lines and parse each chunk as one Tree.

    Assumes every tree (including the last) is terminated by an empty
    line; a trailing chunk without one is not flushed.
    """
    trees = []
    pending = []
    for raw in output_.splitlines(False):
        if raw:
            pending.append(raw)
        else:
            trees.append(Tree.parse('\n'.join(pending)))
            pending = []
    return trees
def _parse(self, t):
    """Parse the normalized tree string *t* into a Tree.

    On a 'mismatched parens' ValueError, retries with 1-4 extra closing
    parens appended; as a last resort returns a flat Tree('S', ...) over
    the tagged tokens so callers always get *some* tree back.
    """
    try:
        return Tree.parse(self._normalize(t))
    except ValueError, e:
        sys.stderr.write("Bad tree detected; trying to recover...\n")
        # Try to recover, if we can:
        if e.args == ('mismatched parens',):
            for n in range(1, 5):
                try:
                    v = Tree.parse(self._normalize(t+')'*n))
                    sys.stderr.write(" Recovered by adding %d close "
                                     "paren(s)\n" % n)
                    return v
                except ValueError:
                    pass
        # Try something else:
        sys.stderr.write(" Recovered by returning a flat parse.\n")
        #sys.stderr.write(' '.join(t.split())+'\n')
        return Tree('S', self._tag(t))
def load_parse_doc(parse_path):
    """Read one parse tree per non-empty line of *parse_path*."""
    resolved = os.path.abspath(parse_path)
    parses = []
    with open(resolved, 'r') as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            parses.append(Tree.parse(stripped))
    return parses
def build_tagged_sents(files):
    """
    Build the corpus of tagged sentences from the files of the
    sequoia corpus: one (token, tag) list per tree line.
    """
    tagged = []
    for path in files:
        stream = codecs.open(path, "r", "utf-8")
        tagged.extend(Tree.parse(line).pos() for line in stream)
        stream.close()
    return tagged
def _load_sent_token(self):
    """Populate self.tokens and self.sents from the cTAKES XML root.

    Walks every textspan.Sentence element, scans its character range for
    token elements, builds a Token per hit, then wraps them in a
    Sentence whose parse tree is read from self.p_fp (presumably one
    tree line per sentence -- TODO confirm against the file format).
    """
    print "Loading sentences and tokens..."
    sent_elmts = self.c_root.findall(CTAKES_PREFIX + 'textspan.Sentence')
    t_counter = 0
    for sent_elmt in sent_elmts:
        sent_begin = int(sent_elmt.get('begin'))
        sent_end = int(sent_elmt.get('end'))
        sent_num = int(sent_elmt.get('sentenceNumber'))
        cursor = sent_begin
        sent_span = []
        token_offset = 0
        # Advance character-by-character until a token starts at `cursor`,
        # then jump past that token.
        while cursor < sent_end:
            buf = self._find_token_elmt_with_attrib_of_val('begin', cursor)
            if len(buf) == 0:
                cursor = cursor + 1
                continue
            elif len(buf) > 1:
                # Ambiguous annotation: abort the whole load.
                print 'More than one token appear to begin at ' + str(cursor) + \
                      '\nLoading ctakes xml file terminated'
                return
            else:
                token_elmt = buf[0]
                t = Token(self.ds_id + '_t_' + str(t_counter))
                # Tag looks like '....FooToken'; strip the 'Token' suffix.
                t.type = token_elmt.tag.split('.')[-1][:-5]
                # skipping 'newline' token when counting up tid
                t_num = int(token_elmt.get('tokenNumber')) - sent_num
                if t_num != t_counter:
                    print 'CAUTION: t_num does not equal to counter t_counter'
                t.offset = token_offset
                t.begin = int(token_elmt.get('begin'))
                t.end = int(token_elmt.get('end'))
                t.pos = token_elmt.get('partOfSpeech')
                t.n_form = token_elmt.get('normalizedForm')
                #t.c_form = token_elmt.get('canonicalForm')
                #t.cap = int(token_elmt.get('capitalization'))
                #t.num_p = int(token_elmt.get('numPosition'))
                self.tokens.append(t)
                sent_span.append(t)
                cursor = t.end + 1
                token_offset = token_offset + 1
                t_counter += 1
        s = Sentence(self.ds_id + '_s_' + str(sent_num))
        s.span = sent_span
        s.num = sent_num
        #s.begin = sent_begin
        #s.end = sent_end
        s.parse = Tree.parse(self.p_fp.next())
        # Back-link every token to its owning sentence.
        for t in s.span:
            t.sent = s
        self.sents.append(s)
    return
def __init__(self, json_file):
    """Build the object from an open JSON file handle.

    Every top-level JSON key becomes an attribute; 'goldparse' is then
    additionally parsed into self.spantree (a SpanTree, converted in
    place) and the string attribute is replaced by an nltk Tree.
    """
    data = json.load(json_file)
    for k, v in data.iteritems():
        self.__setattr__(k, v)
    self.__raw_data = data # for future reference
    #print data
    self.spantree = SpanTree.parse(self.goldparse)
    self.spantree.convert()
    self.goldparse = Tree.parse(self.goldparse)
    # Token lists derived from the raw strings.
    self.text = data['text'].split()
    self.treebank_sentence = data['treebank_sentence'].split()
def findAmbiguities(self, line):
    """Parse *line* and return the highest score produced by
    exploreSubTree over the top-level subtrees of every sentence."""
    parsed = self.parse(line)
    #if 'coref' in result:
    #    return 1
    score = 0
    for sentence in parsed['sentences']:
        tree = Tree.parse(sentence['parsetree'])
        # Since tree[0] is a S
        for child in tree:
            score = max(score, self.exploreSubTree(child))
    return score
def findAmbiguities(self, line):
    """Return the maximum exploreSubTree score across all top-level
    subtrees of each parsed sentence of *line* (0 when none score)."""
    result = self.parse(line)
    #if 'coref' in result:
    #    return 1
    scores = [0]
    for i in range(len(result['sentences'])):
        parsetree = Tree.parse(result['sentences'][i]['parsetree'])
        # Since tree[0] is a S
        scores.extend(self.exploreSubTree(st) for st in parsetree)
    return max(scores)
def read(klass, path=KNOWLEDGE_PATH):
    """Alternate constructor: load the verb-frame knowledge base from a
    verbframes JSON file and instantiate the class from it.

    Raises Exception when no path is configured.
    """
    if not path:
        raise Exception("Specify a path to the verbframes.json as $WIMKB")
    with open(path, 'rb') as kbfile:
        kb = json.load(kbfile, encoding="utf8")
    kwargs = {}
    for entry in kb['frames']:
        name = entry['frame']
        for mapping in entry['mappings']:
            # Tag each mapping with its owning frame and materialize the
            # string tree representations as real Tree objects.
            mapping['frame'] = name
            mapping['verbmap'] = Tree.parse(mapping['verbmap'])
            if 'parse' in mapping:
                mapping['parse'] = Tree.parse(mapping['parse'])
        kwargs[name] = entry['mappings']
    return klass(**kwargs)
def tag_ptree(ptree, coreflist):
    """Tags given parse tree with coreferences

    Args:
        ptree: string, parenthesized str represenation of parse tree
        coreflist: list of tuples, [('1', {'text': 'dog', 'ref': None})]

    Returns:
        string, tagged parse tree

    >>> ptree = '(S NP( (NN He)) VP( (V ran)))'
    >>> coreflist = [('1', {'text': 'He', 'ref': None})]
    >>> tag_ptree(ptree, coreflist)
    '(S NP( COREF_TAG_1( (NN He))) VP( (V ran)))'

    """
    # Verbose-regex template; %s is filled with a per-coref sub-pattern.
    pattern = r"""(?P<lp>\(?\s*)     # left parenthesis
                  (?P<tg>[a-zA-Z$]+)?     # POS tag
                  (?P<data>\s*%s)     # subtree of tag
                  (?P<rp>(?:\s*\))*)     # right parenthesis
               """
    # Process corefs right-to-left so earlier replacements do not shift
    # the positions of later ones.
    for cid, coref in coreflist[::-1]:
        words = ''.join(word_tokenize(coref['text']))
        nltktree = Tree.parse(ptree)
        nltktree.reverse() # perform search right to left
        data = None
        for subtree in nltktree.subtrees(): # BFS
            if ''.join(subtree.leaves()) == words: # equal ignoring whitespace
                data = subtree.pprint()
                break
        # If found via breadth-first search of parse tree
        if data:
            ptree = ptree.replace(data, '( COREF_TAG_%s%s)' % (cid, data))
        else: # Try finding via regex matching instead
            dpattern = r'\s*'.join([r'\(\s*[a-zA-Z$]+\s+%s\s*\)' % word
                                    for word in word_tokenize(coref['text'])])
            found = re.findall(pattern % dpattern, ptree, re.X)
            if found:
                repl = '%s%s ( COREF_TAG_%s%s) %s' % (found[0][0], found[0][1],
                                                      cid, found[0][2],
                                                      found[0][3])
                ptree = re.sub(pattern % dpattern, repl, ptree, 1, re.X)
    return ptree
def parseQuestion(self, text): question = Question() print "RECEIVED DATA IS\n" + text wordList = nltk.word_tokenize(text) i = 0 tokens = list() for word in wordList: print "WORD: "+str(word) if not str(word).strip() is "" and not str(word).strip() is "." and not str(word).strip() is "?" and not str(word).strip() is "!" and not str(word).strip() is ",": tokens.append(word) i+=1 print tokens question.setTokens(tokens) result = self.parse(text) tree = Tree.parse(result['sentences'][0]['parsetree']) print TreeUtils.findPocs(tree)
def create_trees_nltk(filename):
    """Read blank-line-separated trees from *filename*, parse each with
    Tree.parse, convert them to Chomsky normal form in place, and return
    the list of trees.

    Improvements: the file is closed even if reading raises (context
    manager) and normalization iterates the trees directly instead of an
    index loop.
    """
    with open(filename, "r") as f:
        response = f.readlines()
    valid_tree_texts = []
    tree_text = ''
    for line in response:
        line = line.strip()
        if line == "":
            # A blank line terminates the current tree text.
            valid_tree_texts.append(tree_text)
            tree_text = ""
        else:
            tree_text += line + " "
    trees = [Tree.parse(text) for text in valid_tree_texts]
    for tree in trees:
        tree.chomsky_normal_form()  # in-place CNF conversion
    return trees
def create_trees_nltk(filename):
    """Read trees (separated by blank lines) from *filename* and return
    them converted to Chomsky normal form."""
    handle = open(filename, "r")
    lines = handle.readlines()
    handle.close()
    chunks = []
    current = ''
    for raw in lines:
        stripped = raw.strip()
        if stripped:
            current += stripped + " "
        else:
            # Blank line closes the tree accumulated so far.
            chunks.append(current)
            current = ""
    parsed = [Tree.parse(chunk) for chunk in chunks]
    for idx in range(len(parsed)):
        parsed[idx].chomsky_normal_form()
    return parsed
def test_nltk_trees(parsed_text):
    """Parse a Stanford-parser bracketed string, binarize it in place
    (Chomsky normal form), and open an interactive drawing of it.

    Example of parsed_text, stanford parser output :
    (ROOT (S (ADVP (RB However)) (NP (NP (DT the) (NNS talks)) (, ,)
    (VP (VBN hosted) (PP (IN by) (NP (NNP Douglas) (NNP Hurd)))) (, ,))
    (VP (VBD ended) (PP (IN in) (NP (NN stalemate)))) (. .)))
    """
    tree = Tree.parse(parsed_text)
    tree.chomsky_normal_form()
    tree.draw()
def _process_parse(parse, coreflist):
    """Tag the first sentence's parse tree with corefs and return a
    (ptree, lexicon, dependencies, rawtext) tuple, or None when *parse*
    contains no sentences.

    Args:
        parse: list of stanford corenlp parsed sentences
        coreflist: list of coreferences from tagged xml
    """
    sentences = parse.get('sentences')
    if not sentences:
        return None
    first = sentences[0]
    tagged = tag_ptree(first['parsetree'], coreflist)
    ptree = Tree.parse(tagged)
    words = [(w[0], w[1]) for w in first['words']]
    depends = [(d[0], d[1], d[2]) for d in first['dependencies']]
    return ptree, words, depends, first['text']
def read_trees(filename, treelist, check=True):
    """Append whitespace-normalized tree strings from *filename* onto
    *treelist*.

    A new tree starts at every line beginning with '('; other non-blank
    lines are continuations joined onto the current tree. When *check*
    is true, every collected tree must round-trip through Tree.parse or
    an AssertionError identifies the offending file/index/tree.
    """
    pending = []

    def flush(chunk):
        # Collapse all runs of whitespace left by the line join.
        treelist.append(re.sub('\s+', ' ', ' '.join(chunk)))

    for raw in open(filename):
        if not raw.strip():
            continue
        if raw.startswith("(") and pending:
            flush(pending)
            pending = []
        pending.append(raw.rstrip())
    if pending:
        flush(pending)
    if check:
        for idx, tree in enumerate(treelist):
            try:
                t = Tree.parse(tree)
                s = " ".join(t.leaves())
            except ValueError:
                assert False, "f: %s, i: %s, t: %s" % (filename, idx, tree)
def test_nltk_trees(self): parsed_text = """ (S (NP (PRP He)) (VP (VBZ reckons) (SBAR (S (NP (DT the) (JJ current) (NN account) (NN deficit)) (VP (MD will) (VP (VB narrow) (PP (TO to) (NP (QP (RB only) (# #) (CD 1.8) (CD billion)))) (PP (IN in) (NP (NNP September)))))))) (. .)) """ # parsed_text = """(S # (S # (NP # (NP (JJS Most)) # (PP (IN of) # (NP (DT the) (NN commodity) (NN traffic)))) # (VP (VBD was) # (ADJP (RP off)))) # (, ,) # (NP (DT the) (NN company)) # (VP (VBD said)) # (. .)) """ # """(S # (NP (DT The) (NN cat)) # (VP (VBD sat) # (PP (IN on) # (NP (DT a) (NN mat)))) # (. .))""" nltree = Tree.parse(parsed_text) nltree.chomsky_normal_form() nltree.draw()
def main():
    """Draw every bracketed tree read from stdin, one tree per line."""
    for raw in sys.stdin:
        Tree.parse(raw).draw()
# sys.exit() mode = 0 parse = "" first_tree = True for line in sys.stdin: # print 'mode:', mode line = line[:-1] # remove newline if line == "Leaves:": assert mode == -2 if mode == -2: t = Tree.parse(parse) assert t if not first_tree: print "" first_tree = False print t.pprint() parse = "" mode = 0 continue if line.startswith("Tree:"): mode -= 1 elif line == "-----": mode -= 1 else: assert abs(mode) < 3
def demo():
    """Interactive CanvasFrame demo: builds several TreeWidgets with
    different styles and widget constructors, wires up click handlers,
    and runs the Tk mainloop."""
    import random
    def fill(cw):
        cw['fill'] = '#%06d' % random.randint(0,999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    # Default-styled tree.
    t = Tree.parse(''' (S (NP the very big cat) (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''')
    tc = TreeWidget(cf.canvas(), t, draggable=1,
                    node_font=('helvetica', -14, 'bold'),
                    leaf_font=('helvetica', -12, 'italic'),
                    roof_fill='white', roof_color='black',
                    leaf_color='green4', node_color='blue2')
    cf.add_widget(tc,10,10)

    # Custom node/leaf widget constructors.
    def boxit(canvas, text):
        big = ('helvetica', -16, 'bold')
        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green')
    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan')

    treetok = Tree.parse('(S (NP this tree) (VP (V is) (AdjP shapeable)))')
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    # Click handlers that recolor nodes/segments at random.
    def color(node):
        node['color'] = '#%04d00' % random.randint(0,9999)
    def color2(treeseg):
        treeseg.node()['fill'] = '#%06d' % random.randint(0,9999)
        treeseg.node().child()['color'] = 'white'

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)

    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2]+10, 10)

    # Tree built via tree_to_treesegment.
    tree3 = Tree.parse(''' (S (NP this tree) (AUX was) (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''')
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4',
                              tree_xspace=2, tree_width=2)
    tc3['draggable'] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3]+10)

    def orientswitch(treewidget):
        # Toggle horizontal/vertical layout, relabelling the leaf that
        # displays the current orientation.
        if treewidget['orientation'] == 'horizontal':
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical')
            treewidget.collapsed_tree().subtrees()[3].set_text('vertical')
            treewidget['orientation'] = 'vertical'
        else:
            treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal')
            treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal')
            treewidget.collapsed_tree().subtrees()[3].set_text('horizontal')
            treewidget['orientation'] = 'horizontal'

    text = """ Try clicking, right clicking, and dragging different elements of each of the trees. The top-left tree is a TreeWidget built from a Tree. The top-right is a TreeWidget built from a Tree, using non-default widget constructors for the nodes & leaves (BoxWidget and OvalWidget). The bottom-left tree is built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10)

    # Horizontal tree whose click handler flips its orientation.
    tree4 = Tree.parse('(S (NP this tree) (VP (V is) (Adj horizontal)))')
    tc4 = TreeWidget(cf.canvas(), tree4, draggable=1,
                     line_color='brown2', roof_color='brown2',
                     node_font=('helvetica', -12, 'bold'),
                     node_color='brown4', orientation='horizontal')
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)

    # Run mainloop
    cf.mainloop()
def demo():
    """Interactive CanvasFrame demo (black-formatted variant): builds
    several TreeWidgets with different styles and widget constructors,
    wires up click handlers, and runs the Tk mainloop."""
    import random

    def fill(cw):
        cw["fill"] = "#%06d" % random.randint(0, 999999)

    cf = CanvasFrame(width=550, height=450, closeenough=2)

    # Default-styled tree.
    t = Tree.parse(
        """ (S (NP the very big cat) (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
    )
    tc = TreeWidget(
        cf.canvas(),
        t,
        draggable=1,
        node_font=("helvetica", -14, "bold"),
        leaf_font=("helvetica", -12, "italic"),
        roof_fill="white",
        roof_color="black",
        leaf_color="green4",
        node_color="blue2",
    )
    cf.add_widget(tc, 10, 10)

    # Custom node/leaf widget constructors.
    def boxit(canvas, text):
        big = ("helvetica", -16, "bold")
        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")

    def ovalit(canvas, text):
        return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")

    treetok = Tree.parse("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)

    # Click handlers that recolor nodes/segments at random.
    def color(node):
        node["color"] = "#%04d00" % random.randint(0, 9999)

    def color2(treeseg):
        treeseg.node()["fill"] = "#%06d" % random.randint(0, 9999)
        treeseg.node().child()["color"] = "white"

    tc.bind_click_trees(tc.toggle_collapsed)
    tc2.bind_click_trees(tc2.toggle_collapsed)
    tc.bind_click_nodes(color, 3)
    tc2.expanded_tree(1).bind_click(color2, 3)
    tc2.expanded_tree().bind_click(color2, 3)
    paren = ParenWidget(cf.canvas(), tc2)
    cf.add_widget(paren, tc.bbox()[2] + 10, 10)

    # Tree built via tree_to_treesegment.
    tree3 = Tree.parse(
        """ (S (NP this tree) (AUX was) (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
    )
    tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2)
    tc3["draggable"] = 1
    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)

    def orientswitch(treewidget):
        # Toggle horizontal/vertical layout, relabelling the leaf that
        # displays the current orientation.
        if treewidget["orientation"] == "horizontal":
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
            treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
            treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
            treewidget["orientation"] = "vertical"
        else:
            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
            treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
            treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
            treewidget["orientation"] = "horizontal"

    text = """ Try clicking, right clicking, and dragging different elements of each of the trees. The top-left tree is a TreeWidget built from a Tree. The top-right is a TreeWidget built from a Tree, using non-default widget constructors for the nodes & leaves (BoxWidget and OvalWidget). The bottom-left tree is built from tree_to_treesegment."""
    twidget = TextWidget(cf.canvas(), text.strip())
    textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)

    # Horizontal tree whose click handler flips its orientation.
    tree4 = Tree.parse("(S (NP this tree) (VP (V is) (Adj horizontal)))")
    tc4 = TreeWidget(
        cf.canvas(),
        tree4,
        draggable=1,
        line_color="brown2",
        roof_color="brown2",
        node_font=("helvetica", -12, "bold"),
        node_color="brown4",
        orientation="horizontal",
    )
    tc4.manage()
    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
    tc4.bind_click(orientswitch)
    tc4.bind_click_trees(tc4.toggle_collapsed, 3)
    # Run mainloop
    cf.mainloop()
fout = open('tree.tex', 'w') print >> fout, r'''\documentclass[tikz]{standalone} \usepackage{CJKutf8} \usepackage{color} \usepackage{tikz} \usepackage{tikz-qtree} \thispagestyle{empty} \begin{document} \begin{CJK}{UTF8}{gbsn} \begin{tikzpicture}''' f = open(parse_file) for i, s in enumerate(f): if i == line_num: s = s.replace('$', '\$') tree = Tree.parse(s) if flag == '0': h = tree.height() print >> fout, '''\\begin{{scope}}[frontier/.style={{distance from root={}}}]\n'''.format( h * 28) for pos in tree.treepositions('leaves'): tree[pos] = r'\edge[dotted]; {' + tree[pos] + '}' idx = 0 for line in tree.pprint_latex_qtree().split('\n'): if ';' in line: line = line.replace('{', '\\node(n{}) {{'.format(idx)).replace( '}', '};').replace('%', '\%') idx += 1 print >> fout, line for i in range(idx):
TreeView(*trees).mainloop() return ##////////////////////////////////////////////////////// ## Demo Code ##////////////////////////////////////////////////////// import random if __name__ == '__main__': def fill(cw): cw['fill'] = '#%06d' % random.randint(0,999999) cf = CanvasFrame(width=550, height=450, closeenough=2) tree = Tree.parse(''' (S (NP the very big cat) (VP (Adv sorta) (V saw) (NP (Det the) (N dog)))) ''', leafparser = lambda t: Token(TEXT=t)) tc = TreeWidget(cf.canvas(), tree, draggable=1, node_font=('helvetica', -14, 'bold'), leaf_font=('helvetica', -12, 'italic'), roof_fill='white', roof_color='black', leaf_color='green4', node_color='blue2') cf.add_widget(tc,10,10) def boxit(canvas, text): big = ('helvetica', -16, 'bold') return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green') def ovalit(canvas, text): return OvalWidget(canvas, TextWidget(canvas, text),
# Emit a standalone TikZ/tikz-qtree LaTeX document drawing the parse
# tree found at line `line_num` of `parse_file`.
# NOTE(review): this is an excerpt -- `parse_file`, `line_num`, `flag`
# come from surrounding code, and the statements that close the
# scope/tikzpicture/document appear to continue beyond this chunk.
fout = open('tree.tex','w')
print >>fout,r'''\documentclass[tikz]{standalone}
\usepackage{CJKutf8}
\usepackage{color}
\usepackage{tikz}
\usepackage{tikz-qtree}
\thispagestyle{empty}
\begin{document}
\begin{CJK}{UTF8}{gbsn}
\begin{tikzpicture}'''
f = open(parse_file)
for i,s in enumerate(f):
    if i == line_num:
        s = s.replace('$','\$')  # escape the TeX math character
        tree = Tree.parse(s)
        if flag == '0':
            h = tree.height()
            # Pin all leaves at a common depth so terminals line up.
            print >>fout,'''\\begin{{scope}}[frontier/.style={{distance from root={}}}]\n'''.format(h*28)
            for pos in tree.treepositions('leaves'):
                tree[pos] = r'\edge[dotted]; {' + tree[pos] + '}'
            idx = 0
            for line in tree.pprint_latex_qtree().split('\n'):
                if ';' in line:
                    # Name each leaf node n0, n1, ... so it can be
                    # referenced when drawing the index row below.
                    line = line.replace('{','\\node(n{}) {{'.format(idx)).replace('}','};').replace('%','\%')
                    idx += 1
                print >>fout,line
            for i in range(idx):
                print >>fout,'\draw (n{} |- 0,{}pt) node {{{}}};'.format(i,-h*28-10,i)
        else:
            print >>fout,r'\begin{scope}'
#!/usr/bin/python from nltk.tree import Tree import sys # A program to display parse trees (in Penn treebank format) with NLTK # # To install NLTK on ubuntu: sudo apt-get install python-nltk for line in sys.stdin: t = Tree.parse(line) t.draw()
# Attach the PTB gold parse for one sentence (identified by the
# wsj_<doc>.<sent> ids embedded in the json filename) to a json document
# and write it back out.
parser = argparse.ArgumentParser()
parser.add_argument('ptb', action='store', help="ptb.json file")
parser.add_argument('json', action='store', help="json input file")
parser.add_argument('jsonout', action='store', help="json output file")
parser.add_argument('-verbose', action='store_true')
arguments = parser.parse_args(sys.argv[1:])

treebank = json.load(open(arguments.ptb))
docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', arguments.json).groups()
#print treebank.keys()
#print docId
#int(docId)
sentNr = int(sentNr)
data = json.load(open(arguments.json))
if arguments.verbose:
    from nltk.tree import Tree
    # NOTE(review): the trailing ", data['text']" is a no-op tuple
    # expression -- data['text'] is evaluated and discarded, so the text
    # itself is never actually written to stderr. Looks like a bug.
    sys.stderr.write("text:\n"), data['text']
    sys.stderr.write("%s\n" %(treebank[docId][sentNr]))
    t = Tree.parse(treebank[docId][sentNr])
    sys.stderr.write("%s\n" %(" ".join(t.leaves())))
assert docId in treebank
#print treebank[docId]
assert int(sentNr) < len(treebank[docId])
data['ptbparse'] = treebank[docId][sentNr]
json.dump(data, open(arguments.jsonout, 'w'), indent=2, sort_keys=True)
#!/usr/bin/env python # check if parse from .onf is equal to parse obtained from penn treebank import sys from collections import defaultdict from itertools import imap, izip import json import re from nltk.tree import Tree if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument('json', action='store', help="json input file") arguments = parser.parse_args(sys.argv[1:]) data = json.load(open(arguments.json)) ptb = Tree.parse(data['ptbparse']) onf = Tree.parse(data['goldparse']) equal = ptb[0].pprint() == onf[0].pprint() if not equal: print "0 parses from pbt and .onf differ in %s" %arguments.json if equal: print "1 parses from pbt and .onf do NOT differ in %s" %arguments.json #print ptb[0].pprint() #print onf[0].pprint()
are brackets annotated. Export the content as a regular annotated corpus
for pos tagging learning.
"""
import sys, codecs

from nltk.tree import Tree

def treeSentenceToTuples(sent):
    """
    Emit token/POS pairs for a sentence, dropping escaped bracket tokens.

    :param sent: a Tree representing a sentence
    :type sent: nltk.tree.Tree
    """
    return [u"%s/%s"%(t,p) for t,p in sent.pos() if not t in ["-LRB-", "-RRB-"]]

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print "Usage:\n\t%s <destination> <corpus>" % sys.argv[0]
        sys.exit(-1)
    dest = sys.argv[1]
    fout = codecs.open(dest, "w", "utf-8")
    # One output line of "token/POS token/POS ..." per input tree line.
    for fname in sys.argv[2:]:
        fin = codecs.open(fname, "r", "utf-8")
        for line in fin:
            t = Tree.parse(line)
            tokens = treeSentenceToTuples(t)
            fout.write(u" ".join(tokens))
            fout.write("\n")
        fin.close()
    fout.close()
import json # from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp import jsonrpclib from pprint import pprint class StanfordNLP: def __init__(self, port_number=8080): self.server = jsonrpclib.Server("http://localhost:%d" % port_number) def parse(self, text): return json.loads(self.server.parse(text)) nlp = StanfordNLP() result = nlp.parse("Hello world! It is so beautiful.") pprint(result) from nltk.tree import Tree tree = Tree.parse(result['sentences'][0]['parsetree']) pprint(tree)
def process_file(json_filename, nb):
    """Merge NomBank predicate/argument data (from *nb*) into one
    sentence's json file, remapping PTB argument spans onto the
    OntoNotes tokenization (which differs in spacing and traces), and
    write the file back in place."""
    docId, sentNr = re.search(r'wsj_(\d+).(\d+).json', json_filename).groups()
    sentNr = int(sentNr)
    data = json.load(open(json_filename))
    data['nom'] = []
    # index adjustments for consistency with ontonotes parses
    ptb_tree = Tree.parse(data['ptbparse'])
    ptbstring = tree_to_string(ptb_tree) # wrap traces
    onftree = Tree.parse(data['goldparse'])
    onfstring = tree_to_string(onftree) # wrap traces
    raw_onfstring = tree_to_string(onftree, wrap_traces=False)
    ptbstring_tok = add_spaces(ptbstring, onfstring)
    tokenize_offsets = split_offsets(ptbstring, ptbstring_tok)
    trace_offsets = Offset(ptbstring_tok.split(), onfstring.split(), ignore_braces=True)
    #print ptbstring
    #print ptbstring_tok
    #print onfstring
    #print tokenize_offsets
    #print trace_offsets
    pt = SpanTree.parse(data['ptbparse'])
    for nb_data in nb[docId][sentNr]:
        args = nb_data['args']
        # TODO: arguments that are chains or concatenations of multiple nodes
        new_args = []
        for pos, role in args:
            words, start, end = [], None, None
            leaf_id, depth = pt.parse_pos(pos)
            if leaf_id != None and depth != None:
                treepos = pt.get_treepos(leaf_id, depth)
                # Follow trace chains (e.g. *T*-1) to the realized node.
                while is_trace(pt[treepos]):
                    trace_id = int(pt[treepos].leaves()[0].split('-')[-1])
                    print 'looking for trace', trace_id
                    tracepos = pt.find_trace(trace_id)
                    if tracepos != None:
                        print 'trace %s found! Here:', tracepos
                        print pt[tracepos].pprint()
                        treepos = tracepos
                    else:
                        break # could not follow trace
                words = pt[treepos].leaves()
                start, end = span_from_treepos(pt, treepos)
                #print start, end,
                # adjust of different tokenization
                assert start in tokenize_offsets
                start = min(tokenize_offsets[start])
                assert end in tokenize_offsets
                end = max(tokenize_offsets[end])
                # adjust of inserted traces in ontonotes
                start = trace_offsets.map_to_longer(start)
                end = trace_offsets.map_to_longer(end)
                #print '->', start, end
            phrase = ''
            if words:
                phrase = ' '.join(raw_onfstring.split()[start:end+1])
            new_args.append( [role, pos, start, end, phrase] )
        nb_data['args'] = new_args
        data['nom'].append(nb_data)
        #print nb_data
    json.dump(data, open(json_filename, 'w'), indent=2, sort_keys=True)