def match(self, tree):
    # Match questions of the form "When did/was <NP> <VP>?":
    #   ROOT -> SBARQ -> (WHADVP (WRB when)) (SQ (VBD ...) (NP ...) (VP ...))
    # IndexError doubles as the control-flow signal: it is raised explicitly
    # on label mismatches AND occurs naturally on out-of-range child access,
    # so any non-matching tree falls through to `return []`.
    try:
        if tree.label() != 'ROOT': raise IndexError
        if tree[0].label() != 'SBARQ': raise IndexError
        if tree[0][0][0].label() != 'WRB': raise IndexError
        if tree[0][0][0][0].lower() != 'when': raise IndexError
        if tree[0][1].label() != 'SQ': raise IndexError
        if tree[0][1][0].label() != 'VBD': raise IndexError
        if tree[0][1][1].label() != 'NP': raise IndexError
        if tree[0][1][2].label() != 'VP': raise IndexError
        part = Pattern.Part()
        # Round-trip through str()/fromstring() to build detached copies of
        # the subtrees rather than aliasing nodes of the input tree.
        part.object = ParentedTree.fromstring(str(tree[0][1][1]))
        # The "property" is a synthetic VP: WRB + VBD + original VP.
        part.property = ParentedTree.fromstring(str(Tree('VP', [
            Tree.fromstring(str(tree[0][0][0])),
            Tree.fromstring(str(tree[0][1][0])),
            Tree.fromstring(str(tree[0][1][2]))
        ])))
        return [part]
    except IndexError:
        # Tree shape did not match the pattern.
        return []
def test_Date(self):
    """A (date y m d) argument is converted into a DATE constituent node."""
    dcs_tree = Tree.fromstring(
        '(fb:soccer.football_team_management_tenure.to (date 2004 -1 -1))')
    expected = Tree.fromstring(
        '(ID fb:soccer.football_team_management_tenure.to (DATE 2004_-1_-1))')
    self.assertEqual(expected, dcs2constituent(dcs_tree)[0])
def read_treefile(hyptreefile, reftreefile):
    """Score each hypothesis parse tree against each reference tree.

    The reference file has one tree per (non-empty) line; the hypothesis
    file contains trees that may span several lines, with blank lines as
    separators.

    Fixes: the original left both file handles open and built each
    reference string through a needless one-element list.

    Returns:
        dict: hypothesis word string (space-joined) -> similarity score.
    """
    scoredic = {}
    # One reference tree per non-empty line.
    with codecs.open(reftreefile, "r", encoding='utf-8') as rfile:
        rtreel = [Tree.fromstring(line.strip())
                  for line in rfile if line.strip() != ""]
    # Hypothesis trees are blank-line-separated blocks of lines.
    htreel = []
    senl = []
    with codecs.open(hyptreefile, "r", encoding='utf-8') as hfile:
        for line in hfile:
            if line.strip() != "":
                senl.append(line.strip())
            else:
                htreel.append(Tree.fromstring(" ".join(senl)))
                senl = []
    # NOTE(review): as in the original, a final block not followed by a
    # blank line is silently dropped -- confirm this is intended.
    for r in rtreel:
        for h in htreel:
            score, hword, rword = score_similarity(h, r)
            scoredic[" ".join(hword)] = score
    return scoredic
def _try_parse_tree(row):
    """Parse a bracketed tree line, tolerating one stray bracket.

    Tries the raw line, then the line minus its last character (extra
    closing bracket), then the line plus ')' (missing closing bracket).
    Returns the parsed Tree, or None if every attempt fails.
    """
    for candidate in (row, row[:-1], row + ')'):
        try:
            return Tree.fromstring(candidate)
        except ValueError:
            # Tree.fromstring signals malformed input with ValueError;
            # the original code used bare `except:` which hid real bugs.
            continue
    return None


def calculate_attributes(parsed_text):
    """Collect tree heights and non-lexical production counts.

    Args:
        parsed_text: newline-separated bracketed parse trees; empty lines
            and 'SENTENCE_SKIPPED_OR_UNPARSABLE' markers are skipped.

    Returns:
        (heights, productions): list of per-sentence tree heights and a
        dict mapping str(production) -> occurrence count.
    """
    prods = {}
    heights = []
    sents = []
    for row in parsed_text.split('\n'):
        if row == '' or row == 'SENTENCE_SKIPPED_OR_UNPARSABLE':
            continue
        tree = _try_parse_tree(row)
        if tree is None:
            # Unparseable even after bracket repair: report and skip,
            # matching the original behaviour.
            print(row)
        else:
            sents.append(tree)
    for tree in sents:
        heights.append(tree.height())
        for production in tree.productions():
            if production.is_nonlexical():
                prods[production] = prods.get(production, 0) + 1
    # Guard against an empty corpus so downstream stats don't divide by 0.
    if len(heights) == 0:
        print('Empty height', flush=True)
        heights.append(10)
    return (heights, {str(key): prods[key] for key in prods})
def read_treefile(hyptreefile, reftreefile):
    """Score every hypothesis tree against every reference tree.

    Reference file: one tree per non-empty line.  Hypothesis file: trees
    spanning multiple lines, delimited by blank lines.

    Fixes: closes both files via context managers (the original leaked
    the handles) and drops the redundant single-element list per line.

    Returns:
        dict: space-joined hypothesis words -> similarity score.
    """
    scoredic = {}
    with codecs.open(reftreefile, "r", encoding='utf-8') as rfile:
        rtreel = [Tree.fromstring(line.strip())
                  for line in rfile if line.strip() != ""]
    htreel = []
    senl = []
    with codecs.open(hyptreefile, "r", encoding='utf-8') as hfile:
        for line in hfile:
            if line.strip() != "":
                senl.append(line.strip())
            else:
                htreel.append(Tree.fromstring(" ".join(senl)))
                senl = []
    # NOTE(review): a trailing hypothesis block with no final blank line is
    # dropped, exactly as in the original -- confirm this is intended.
    for r in rtreel:
        for h in htreel:
            score, hword, rword = score_similarity(h, r)
            scoredic[" ".join(hword)] = score
    return scoredic
def _load_cached(self, domain):
    """Load cached (input, tree) example pairs for *domain*.

    Reads `<pcache>/<domain>.train.json` and `<pcache>/<domain>.test.json`
    (relative to this module), parsing each serialized tree string.

    Fixes: the original opened both JSON files without ever closing them;
    context managers guarantee the handles are released.

    Returns:
        (trainexamples, testexamples): lists of (x, Tree) pairs.
    """
    base = os.path.join(os.path.dirname(__file__), self._pcache)
    with open(os.path.join(base, f"{domain}.train.json"), "r") as f:
        train_cached = ujson.load(f)
    trainexamples = [(x, Tree.fromstring(y)) for x, y in train_cached]
    with open(os.path.join(base, f"{domain}.test.json"), "r") as f:
        test_cached = ujson.load(f)
    testexamples = [(x, Tree.fromstring(y)) for x, y in test_cached]
    print("loaded from cache")
    return trainexamples, testexamples
def get_pprint(self, tree):
    """Return the pretty-printed rendering of *tree* as a string.

    Fixes: drops the pointless `content = ""` variable that was passed to
    StringIO (an empty initial value is the default).
    """
    stream = io.StringIO()
    # Some issue with the tree structure causes a problem with pprint,
    # so round-trip through str() and re-parse a clean copy first.
    Tree.fromstring(str(tree)).pretty_print(stream=stream)
    return stream.getvalue()
def old_main():
    """Sweep the DifferenceTeacher parameter over 0..9, learn a model for
    each setting, and plot generalization against the parameter value.

    Side effects: reads 'output_mla_manual2.txt', prints progress, shows a
    matplotlib window, then terminates the process via exit().
    """
    g = []  # generalization score per parameter value
    x = range(10)
    for param in x:
        d = DifferenceTeacher(param)
        cmp = TreeComparator(0, 20, 20)
        d.setTreeComparator(cmp)
        # NOTE(review): this handle is re-opened every iteration and never
        # closed -- consider hoisting the load out of the loop.
        mla = open('output_mla_manual2.txt')
        mla_list = json.load(mla)
        di = mla_list['cogs_dict']['reverse_dict']
        # JSON keys are strings; the teacher expects int keys.
        di = {int(key): di[key] for key in di}
        mla_list = [(Tree.fromstring(tup[0]), Tree.fromstring(tup[1])) for tup in mla_list['trees']]
        for tree, weights in mla_list:
            update_weights(weights)
            d.addPositiveExample(tree, weights)
        c = learn(d, di)
        p, nt = measure_generalization([tup[0] for tup in mla_list], c)
        g.append(p)
        # Dump the learned model only for two spot-check parameter values.
        if param == 2 or param == 8:
            print(c)
        print("param: {0}, generalization: {1}, nt: {2}".format(param, p, nt))
    plt.plot(x, g)
    plt.show()
    exit()
def get_json(input_file, output_file, parser, count):
    """Convert blocks of (pairID, sentence1, sentence2, gold_label) lines
    into one JSON object per line in *output_file*.

    Each block starts at a non-blank line (the pair ID); the next three
    lines are consumed with readline().  Both sentences are parsed with
    *parser* and their plain and binary parse strings stored.

    Fixes: uses context managers so the files are closed even if parsing
    raises (the original's explicit close() calls were skipped on error).

    Note: *count* is incremented locally but never returned, so the
    caller's value is unchanged -- kept for interface compatibility.
    """
    with open(input_file, 'r') as f, open(output_file, 'w') as f1:
        for line in f:
            if line == "\n":
                continue
            test = {}
            test["pairID"] = line.strip()
            test["sentence1"] = f.readline().strip()
            parse_string = remove_formatting(
                parser.raw_parse(test["sentence1"]))
            test["sentence1_parse"] = parse_string
            test["sentence1_binary_parse"] = format_binary_tree(
                Tree.fromstring(parse_string))
            test["sentence2"] = f.readline().strip()
            parse_string = remove_formatting(
                parser.raw_parse(test["sentence2"]))
            test["sentence2_parse"] = parse_string
            test["sentence2_binary_parse"] = format_binary_tree(
                Tree.fromstring(parse_string))
            test["gold_label"] = f.readline().strip()
            test = json.dumps(test)
            print(test)
            f1.write(test)
            f1.write("\n")
            count = count + 1
def test_Unary(self):
    """A unary DCS application becomes a flat (ID pred arg) constituent."""
    dcs_tree = Tree.fromstring(
        '(!fb:tv.tv_series_episode.writer fb:en.straight_and_true)')
    expected = Tree.fromstring(
        '(ID !fb:tv.tv_series_episode.writer fb:en.straight_and_true)')
    self.assertEqual(expected, dcs2constituent(dcs_tree)[0])
def test_AndNoParentPredicate(self):
    """An (and ...) without a parent predicate yields nested ID nodes."""
    dcs_tree = Tree.fromstring(
        '(and fb:en.doom (fb:cvg.computer_videogame.gameplay_modes fb:en.multiplayer_game))'
    )
    expected = Tree.fromstring(
        '(ID fb:en.doom (ID fb:cvg.computer_videogame.gameplay_modes fb:en.multiplayer_game))')
    self.assertEqual(expected, dcs2constituent(dcs_tree)[0])
def parser_output_to_parse_deriv_trees(output):
    """Split interleaved parser output into (parse_trees, deriv_trees).

    Even-indexed lines hold derivation trees, odd-indexed lines parse
    trees; empty lines are ignored.
    """
    all_lines = output.strip().split("\n")
    parse_trees = []
    for raw in all_lines[1::2]:
        if raw != '':
            # \x06 is the parser's epsilon marker; rename it for NLTK.
            parse_trees.append(Tree.fromstring(raw.replace('\x06', 'epsilon_')))
    deriv_trees = []
    for raw in all_lines[::2]:
        if raw != '':
            deriv_trees.append(Tree.fromstring(raw))
    return parse_trees, deriv_trees
def is_tree(line):
    """Simple `oracle` to see if line is a tree."""
    assert isinstance(line, str), line
    try:
        Tree.fromstring(line)
    except ValueError:
        # Tree.fromstring rejects malformed bracketings with ValueError.
        return False
    return True
def test_Count(self):
    """A (count ...) DCS form becomes an (ID COUNT ...) constituent."""
    dcs_tree = Tree.fromstring(
        '(count (!fb:military.armed_force.units fb:en.u_army))')
    expected = Tree.fromstring(
        '(ID COUNT (ID !fb:military.armed_force.units fb:en.u_army))')
    self.assertEqual(expected, dcs2constituent(dcs_tree)[0])
def main():
    """Translate two sample sentences with the Pokémon rule set."""
    rules = loadrules("pokemon.yaml")
    sentences = [
        "(S let me show you my Pokémon)",
        "(S let me show you my cats)",
    ]
    for sentence in sentences:
        translate(Tree.fromstring(sentence), rules)
def test_Number(self):
    """A (number v unit) argument converts into a NUMBER node."""
    dcs_tree = Tree.fromstring(
        '(fb:government.us_president.presidency_number (number 22.0 fb:en.unitless))'
    )
    expected = Tree.fromstring(
        '(ID fb:government.us_president.presidency_number (NUMBER 22.0 fb:en.unitless))')
    self.assertEqual(expected, dcs2constituent(dcs_tree)[0])
def test_evalb_correctly_scores_identical_trees(self):
    """Identical trees must score perfect recall/precision/F1."""
    gold = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    predicted = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    scorer = EvalbBracketingScorer()
    scorer([gold], [predicted])
    metrics = scorer.get_metric()
    assert metrics["evalb_recall"] == 1.0
    assert metrics["evalb_precision"] == 1.0
    assert metrics["evalb_f1_measure"] == 1.0
def test_evalb_correctly_calculates_bracketing_metrics_over_multiple_trees(self):
    """Metrics aggregate over the whole batch, not per-tree averages."""
    imperfect = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
    gold = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    scorer = EvalbBracketingScorer()
    scorer([imperfect, gold], [gold, gold])
    metrics = scorer.get_metric()
    assert metrics["evalb_recall"] == 0.875
    assert metrics["evalb_precision"] == 0.875
    assert metrics["evalb_f1_measure"] == 0.875
def my_is_tree_same(str_input1, str_input2):
    """Return True iff two bracketed tree strings denote the same tree.

    Both inputs are parsed and re-serialized with my_oneline() so that
    whitespace/formatting differences do not affect the comparison.

    Fixes: replaces the `if x == y: return True / else: return False`
    anti-idiom with a direct boolean return.
    """
    normalized1 = my_oneline(Tree.fromstring(str_input1))
    normalized2 = my_oneline(Tree.fromstring(str_input2))
    return normalized1 == normalized2
def test_evalb_correctly_scores_imperfect_trees(self):
    # A changed constituency label (VP ...) should affect the scores,
    # while a changed POS tag (NP dog) should have no effect.
    predicted = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
    gold = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    scorer = EvalbBracketingScorer()
    scorer([predicted], [gold])
    metrics = scorer.get_metric()
    assert metrics["evalb_recall"] == 0.75
    assert metrics["evalb_precision"] == 0.75
    assert metrics["evalb_f1_measure"] == 0.75
def preprocess_eval(self):
    """Load ground-truth and predicted trees and store their productions.

    Appends each tree's production list to self.ground_truth /
    self.predicted.  self.lines_1 is kept as an attribute (the stripped
    predicted-file lines) for interface compatibility.

    Fixes: iterates the line lists directly instead of the
    `for i in range(len(...))` anti-pattern, and uses comprehensions in
    place of `list(map(lambda ...))`.
    """
    with open(self.ground_truth_path) as f:
        gold_lines = [line.rstrip() for line in f.readlines()]
    for line in gold_lines:
        self.ground_truth.append(Tree.fromstring(line).productions())
    with open(self.predicted_path) as f:
        self.lines_1 = [line.rstrip() for line in f.readlines()]
    for line in self.lines_1:
        self.predicted.append(Tree.fromstring(line).productions())
def tree_from_string(s):
    """Parse a bracketed tree string.

    The string is first normalized with tree_str_post_process(); if that
    version fails to parse for any reason, the raw string is used instead.
    """
    try:
        processed = tree_str_post_process(s)
        tree_line = Tree.fromstring(processed)
    except Exception:
        # Post-processing produced something unparseable; fall back to
        # the untouched input.
        tree_line = Tree.fromstring(s)
    return tree_line
def test_JoinAnd(self):
    """A join over an (and ...) flattens into sibling ID children."""
    dcs_tree = Tree.fromstring((
        '(!fb:education.academic_post.institution'
        ' (and (fb:education.academic_post.person fb:en.marshall_hall)'
        ' (fb:education.academic_post.position_or_title fb:en.professor)))'
    ))
    expected = Tree.fromstring((
        '(ID !fb:education.academic_post.institution'
        ' (ID fb:education.academic_post.person fb:en.marshall_hall)'
        ' (ID fb:education.academic_post.position_or_title fb:en.professor))'))
    self.assertEqual(expected, dcs2constituent(dcs_tree)[0])
def test_evalb_with_terrible_trees_handles_nan_f1(self):
    # evalb reports NaN f1 when precision and recall are both zero;
    # the scorer must map that to 0.0 instead of propagating NaN.
    terrible = Tree.fromstring(u"(PP (VROOT (PP That) (VROOT (PP could) "
                               u"(VROOT (PP cost) (VROOT (PP him))))) (PP .))")
    gold = Tree.fromstring(u"(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    scorer = EvalbBracketingScorer()
    scorer([terrible], [gold])
    metrics = scorer.get_metric()
    assert metrics[u"evalb_recall"] == 0.0
    assert metrics[u"evalb_precision"] == 0.0
    assert metrics[u"evalb_f1_measure"] == 0.0
def parse_treestr(self, treestr):
    """Parse one tree string, normalize its root, CNF-convert, and record it.

    If the parsed tree is not already rooted at ROOT_NODE_NAME it is
    wrapped under a fresh root node.  The (possibly wrapped) tree is
    converted to Chomsky normal form, its start symbol counted in
    self.starts, and the tree handed to self.traverse_tree.
    """
    tree = Tree.fromstring(treestr.strip())
    if tree.label() != ROOT_NODE_NAME:
        # Wrap the tree under the canonical root label.
        wrapper = Tree.fromstring("({})".format(ROOT_NODE_NAME))
        wrapper.insert(0, tree)
        tree = wrapper
    tree.chomsky_normal_form()
    self.starts[tree.label()] += 1
    self.traverse_tree(tree)
def evaluate(gold_str_list: list, pred_str_list: list):
    """Evaluate predicted parse trees against ground-truth trees.

    :param gold_str_list: list of ground-truth tree strings
    :param pred_str_list: list of predicted tree strings
    :return: evaluation result string
    """
    assert len(gold_str_list) == len(pred_str_list)
    parse = Tree.fromstring
    gold_trees = [parse(s) for s in gold_str_list]
    pred_trees = [parse(s) for s in pred_str_list]
    return MyEvaluation.evaluate_trees(gold_trees, pred_trees)
def test_evalb_with_terrible_trees_handles_nan_f1(self):
    # Zero precision and recall make evalb emit NaN f1; the scorer must
    # convert that to 0.0 (guarding the zero division).
    terrible = Tree.fromstring("(PP (VROOT (PP That) (VROOT (PP could) "
                               "(VROOT (PP cost) (VROOT (PP him))))) (PP .))")
    gold = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    scorer = EvalbBracketingScorer()
    scorer([terrible], [gold])
    metrics = scorer.get_metric()
    assert metrics["evalb_recall"] == 0.0
    assert metrics["evalb_precision"] == 0.0
    assert metrics["evalb_f1_measure"] == 0.0
def main():
    """Translate two sample English trees into German."""
    rules = loadrules("german.yaml")
    samples = [
        # I like eating / Ich esse gern
        "(S (NP (PRP I)) (VP (VB like) (VBG eating)))",
        # I am hungry / Ich habe Hunger
        "(S (NP (PRP I)) (VP (VB am) (JJ hungry)))",
    ]
    for sample in samples:
        translate(Tree.fromstring(sample), rules)
def load_gold_tree(json_path):
    """Read one JSON record per line and map doc_id -> gold tree.

    Prefers the 'labelled_attachment_tree' field, falling back to
    'attach_tree' when it is absent.
    """
    trees = {}
    with open(json_path) as f:
        for line in f:
            record = json.loads(line.strip())
            if 'labelled_attachment_tree' in record:
                tree_str = record['labelled_attachment_tree']
            else:
                tree_str = record['attach_tree']
            trees[record['doc_id']] = Tree.fromstring(tree_str)
    return trees
def test_JoinAnd(self):
    """constituent2dcs folds sibling ID children back into an (and ...)."""
    constituent_tree = Tree.fromstring((
        '(ID !fb:education.academic_post.institution'
        ' (ID fb:education.academic_post.person fb:en.marshall_hall)'
        ' (ID fb:education.academic_post.position_or_title fb:en.professor))'
    ))
    expected = Tree.fromstring((
        '(!fb:education.academic_post.institution'
        ' (and (fb:education.academic_post.person fb:en.marshall_hall)'
        ' (fb:education.academic_post.position_or_title fb:en.professor)))'
    ))
    self.assertEqual(expected, constituent2dcs(constituent_tree)[0])
def verify_f1(path):
    """Compare each sentence's bracketing against a balanced binary tree
    of the same length and return (mean F1, number of scores).

    Python 2 code (print statements).
    NOTE(review): the `print t1` / `sys.exit(0)` below look like debugging
    leftovers -- everything after sys.exit(0) is dead code, so the
    function currently exits the process on the first line of input.
    """
    f1_list = []
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            try:
                line = line.encode('UTF-8')
            except UnicodeError as e:
                print "ENCODING ERROR:", line, e
                line = "{}"  # fall back to an empty JSON object
            loaded_example = json.loads(line)
            t1 = Tree.fromstring(loaded_example['sentence1_parse'])
            l1 = len(t1.leaves())
            t1 = tree2list(t1)
            t2 = Tree.fromstring(loaded_example['sentence2_parse'])
            l2 = len(t2.leaves())
            t2 = tree2list(t2)
            # Balanced binary trees of matching lengths for comparison.
            bt1 = get_balanced_tree(l1)
            bt2 = get_balanced_tree(l2)
            print t1
            t1 = get_brackets(t1)[0]
            print t1
            sys.exit(0)  # NOTE(review): debug exit -- nothing below runs
            t2 = get_brackets(t2)[0]
            bt1 = get_brackets(bt1)[0]
            bt2 = get_brackets(bt2)[0]
            f1 = compute_f1(t1 & bt1, t1, bt1)
            f1_list.append(f1)
            f1 = compute_f1(t2 & bt2, t2, bt2)
            f1_list.append(f1)
    return sum(f1_list) / len(f1_list), len(f1_list)
def tree_from_string(tree_string):
    """Parse a bracketed tree string, preferring its post-processed form.

    Falls back to the raw string when post-processing breaks parsing; if
    that also fails, the offending string is reported and the error
    re-raised.
    """
    try:
        tree = Tree.fromstring(tree_str_post_process(tree_string))
    except Exception:
        # Post-processed form was unparseable; retry with the raw input.
        try:
            tree = Tree.fromstring(tree_string)
        except Exception as err:
            print(f'ERROR: unable to parse the tree')
            print(tree_string)
            raise err
    return tree
def __visualize(self, gold_tree: str, parsed_tree: str):
    """Draw the gold and parsed trees on the canvas, parsed below gold.

    Any previously drawn widgets are destroyed first.  Both inputs are
    bare bracketings, so an outer pair of parentheses is added before
    parsing.

    Fixes: uses the idiomatic (and identity-correct) `is not None` test
    instead of `!= None`.
    """
    if self.GOLD_tc is not None:
        self.CANVAS.destroy_widget(self.GOLD_tc)
        self.CANVAS.destroy_widget(self.PARSED_tc)
    GOLD = Tree.fromstring('(' + gold_tree + ')')
    PARSED = Tree.fromstring('(' + parsed_tree + ')')
    self.GOLD_tc = TreeWidget(self.CANVAS.canvas(), GOLD)
    self.PARSED_tc = TreeWidget(self.CANVAS.canvas(), PARSED)
    self.CANVAS.add_widget(self.GOLD_tc, 0, 0)
    # Offset the parsed tree so it renders underneath the gold tree.
    self.CANVAS.add_widget(self.PARSED_tc, 0, self.GOLD_tc.height() + 10)
    self.CANVAS.pack(expand=True)
def treebank_bracket_parse(t):
    """Parse a treebank-style bracketed string into a Tree.

    First attempts Tree.fromstring with empty-top-bracket removal; on
    IndexError falls back to stripping the outermost bracket pair and
    using the legacy bracket_parse API.
    """
    try:
        return Tree.fromstring(t, remove_empty_top_bracketing=True)
    except IndexError:
        # in case it's the real treebank format,
        # strip first and last brackets before parsing
        return tree.bracket_parse(t.strip()[1:-1])
def testConvert(self):
    """convert() emits the expected dot-language graph, disambiguating
    the second NP node as NP_1.

    Fixes: the local result variable shadowed the builtin `str`; renamed
    to `dot_output`.
    """
    sample_tree = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    converter = DotLanguageConverter()
    dot_output = converter.convert(sample_tree)
    expected_tree_string = ("digraph parse_tree {\n"
                           "\t\"S\" [label=\"S\"];\n"
                           "\t\"NP\" [label=\"NP\"];\n"
                           "\t\"S\"-> \"NP\";\n"
                           "\t\"I\" [label=\"I\"];\n"
                           "\t\"NP\"-> \"I\";\n"
                           "\t\"VP\" [label=\"VP\"];\n"
                           "\t\"S\"-> \"VP\";\n"
                           "\t\"V\" [label=\"V\"];\n"
                           "\t\"VP\"-> \"V\";\n"
                           "\t\"saw\" [label=\"saw\"];\n"
                           "\t\"V\"-> \"saw\";\n"
                           "\t\"NP_1\" [label=\"NP\"];\n"
                           "\t\"VP\"-> \"NP_1\";\n"
                           "\t\"him\" [label=\"him\"];\n"
                           "\t\"NP_1\"-> \"him\";\n"
                           "}")
    self.assertEqual(dot_output, expected_tree_string)
def calc(param):
    """Resolve the pronoun param[2] using the parse trees in file param[1].

    Personal pronouns are resolved with Hobbs' algorithm; reflexives with
    resolve_reflexive.  The pronoun is located in the last tree of the
    file.

    Returns:
        (tree, antecedent_subtree) for a recognized pronoun; implicitly
        returns None when the pronoun is neither personal nor reflexive.

    Fixes: removed commented-out debug print code.
    """
    p = ["He", "he", "Him", "him", "She", "she", "Her", "her", "It", "it",
         "They", "they"]
    r = ["Himself", "himself", "Herself", "herself", "Itself", "itself",
         "Themselves", "themselves"]
    fname = param[1]
    pro = param[2]
    with open(fname) as f:
        sents = f.readlines()
    trees = [Tree.fromstring(s) for s in sents]
    # Position of the pronoun in the final sentence; drop the leaf index
    # to get the position of its preterminal node.
    pos = get_pos(trees[-1], pro)
    pos = pos[:-1]
    if pro in p:
        tree, pos = hobbs(trees, pos)
        return tree, tree[pos]
    elif pro in r:
        tree, pos = resolve_reflexive(trees, pos)
        return tree, tree[pos]
    # NOTE: unrecognized pronouns fall through and yield None.
def syntactic_parse_features(paragraph, parse):
    """ Returns the count for the usage of S, SBAR units in the syntactic
    parse, plus statistics about the height of the trees """
    KEPT_FEATURES = ['S', 'SBAR']
    # Tally the label of every subtree head and record each tree's height.
    head_counts = Counter()
    tree_heights = []
    for tree_string in parse:
        parsed = Tree.fromstring(tree_string)
        for subtree in parsed.subtrees():
            head_counts[subtree.label()] += 1
        tree_heights.append(parsed.height())
    # Keep only the heads listed in KEPT_FEATURES.
    features = Counter({"syntactic_head_" + label: count
                        for label, count in head_counts.items()
                        if label in KEPT_FEATURES})
    # Height statistics over all trees in the paragraph.
    features["tree_height_mean"] = np.mean(tree_heights)
    features["tree_height_median"] = np.median(tree_heights)
    features["tree_height_max"] = np.max(tree_heights)
    features["tree_height_min"] = np.min(tree_heights)
    features["tree_height_spread"] = np.max(tree_heights) - np.min(tree_heights)
    return Counter(features)
def main(argv):
    """Command-line entry point for pronoun resolution.

    Usage: script (demo | <parse_file> <pronoun>).  Personal pronouns are
    resolved with Hobbs' algorithm, reflexives with resolve_reflexive.
    Python 2 code (print statements).
    """
    if len(sys.argv) == 2 and argv[1] == "demo":
        demo()
    else:
        if len(sys.argv) > 3 or len(sys.argv) < 2:
            print "Enter the file and the pronoun to resolve."
        elif len(sys.argv) == 3:
            # Personal vs. reflexive pronoun inventories.
            p = ["He", "he", "Him", "him", "She", "she", "Her", "her",
                 "It", "it", "They", "they"]
            r = ["Himself", "himself", "Herself", "herself", "Itself",
                 "itself", "Themselves", "themselves"]
            fname = sys.argv[1]
            pro = sys.argv[2]
            with open(fname) as f:
                sents = f.readlines()
            trees = [Tree.fromstring(s) for s in sents]
            # Locate the pronoun in the last sentence; drop the leaf index
            # to get its preterminal position.
            pos = get_pos(trees[-1], pro)
            pos = pos[:-1]
            if pro in p:
                tree, pos = hobbs(trees, pos)
                for t in trees:
                    print t, '\n'
                print "Proposed antecedent for '"+pro+"':", tree[pos]
            elif pro in r:
                tree, pos = resolve_reflexive(trees, pos)
                for t in trees:
                    print t, '\n'
                print "Proposed antecedent for '"+pro+"':", tree[pos]
def test_construct_tree_from_spans_handles_nested_labels(self):
    # A merged label like "S-NP" must be split into nested (S (NP ...)).
    span_labels = {(0, 1): 'D', (1, 2): 'N', (0, 2): 'S-NP'}
    words = ["the", "dog"]
    tree = self.model.construct_tree_from_spans(span_labels, words)
    correct_tree = Tree.fromstring("(S (NP (D the) (N dog)))")
    assert tree == correct_tree
def __render_tree(self):
    """Parse the output text area's contents as a tree and draw it."""
    raw = self.output_text_area.get("1.0", END)
    # Newlines would confuse the bracket parser; collapse them first.
    tree = Tree.fromstring(raw.replace("\n", ""))
    tree.draw()
def extract_trees(filename="./out/toy_pcfg2.gen"):
    """Parse one bracketed tree per line of *filename*.

    Fixes: closes the file via a context manager (the original handle was
    only closed implicitly) and replaces the append loop with a
    comprehension.

    Returns:
        list: the parsed Tree objects, in file order.
    """
    with open(filename) as fh:
        return [Tree.fromstring(line) for line in fh]
def pprint(self, **kwargs):
    """Pretty-print this tree by round-tripping through nltk.Tree.

    Requires the nltk module.  Keyword arguments are forwarded to
    nltk.Tree.pprint.  See http://www.nltk.org/_modules/nltk/tree.html.
    """
    from nltk import Tree as NLTKTree
    return NLTKTree.fromstring(self.ptb()).pprint(**kwargs)
def process_sentence(sentence):
    """Parse *sentence* with CoreNLP and extract its PP phrases.

    Returns an empty dict for an empty sentence.
    """
    global corenlp
    if len(sentence) == 0:
        return {}
    parsed = json.loads(corenlp.parse(sentence))
    tree = Tree.fromstring(parsed['sentences'][0]['parsetree'])
    return extract_phrases(tree, 'PP')
def set_parse_info(self, tokens, pos_tag, parse_string, dependency_tree):
    """Store parse artefacts and index the constituency tree's leaves.

    Each leaf of the parsed constituency tree is replaced by its 1-based
    token index so tree positions line up with token numbering.

    Fixes: uses plain item assignment instead of calling the
    `__setitem__` dunder method directly.
    """
    self.tokens = tokens
    self.pos_tag = pos_tag
    self.dependency_tree = dependency_tree
    self.parse_tree = Tree.fromstring(parse_string)
    for i in range(len(self.parse_tree.leaves())):
        self.parse_tree[self.parse_tree.leaf_treeposition(i)] = i + 1
def test_tree_construction_with_too_few_spans_creates_trees_with_depth_one_word_nodes(self):
    # Only the partial tree (S (NP (D the) (N dog)) is recoverable from
    # these spans; decoding must attach the remaining words directly to
    # the root with XX POS-tag labels, since the right-hand-side splits
    # are absent from the span dict.
    span_labels = {(0, 1): 'D', (1, 2): 'N', (0, 2): 'NP', (0, 5): 'S'}
    words = ["the", "dog", "chased", "the", "cat"]
    tree = self.model.construct_tree_from_spans(span_labels, words)
    correct_tree = Tree.fromstring("(S (NP (D the) (N dog)) (XX chased) (XX the) (XX cat))")
    assert tree == correct_tree
def test_construct_tree_from_spans(self):
    # Target: (S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))
    span_labels = {(0, 1): 'D', (1, 2): 'N', (0, 2): 'NP',
                   (2, 3): 'V', (3, 4): 'D', (4, 5): 'N',
                   (3, 5): 'NP', (2, 5): 'VP', (0, 5): 'S'}
    words = ["the", "dog", "chased", "the", "cat"]
    tree = self.model.construct_tree_from_spans(span_labels, words)
    correct_tree = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
    assert tree == correct_tree
def latex(self):
    """Returns a representation of the tree compatible with the LaTeX
    qtree package. Requires the nltk module. See
    http://www.nltk.org/_modules/nltk/tree.html.

    Fixes: '\\[' / '\\]' were written as non-raw literals, which are
    invalid escape sequences (DeprecationWarning in Python 3); raw
    strings produce the same characters explicitly.
    """
    from nltk import Tree as NLTKTree
    # Escape square brackets so qtree doesn't treat them as structure.
    string = self.ptb().replace('[', r'\[').replace(']', r'\]')
    tree = NLTKTree.fromstring(string)
    latex = tree.pformat_latex_qtree()
    # Restore literal parentheses from their PTB escapes.
    return latex.replace('-LRB-', '(').replace('-RRB-', ')')
def add_top_to_tree(treebank_file):
    """Wrap each tree in *treebank_file* under a TOP node and print the
    flattened result (Python 2: print statement)."""
    f = open(treebank_file, "r")
    root_set = set([])  # NOTE(review): never used in this function
    for sentence in f:
        t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        # Re-root the tree under a synthetic TOP node.
        top_node = Tree("TOP", [])
        top_node.append(t)
        print NewTree.flat_print(top_node)
    f.close()
def determiner_usage(paragraph, parse, verbose=False):
    """ Gets the count of times determiners are used per context.

    Central insight: Differences in determiner usage reflect whether the
    author assumes the noun phrase is a category that is coherent within
    common knowledge. For instance:
    * We observe _the depravity_ of our age
    * Poverty, hunger, mental illness - they were _the inevitable result_
      of life in this world.
    * the Reagan administration is testing _the gullibility_ of world opinion
    * Abortion threatens _the moral and Christian character_ of this nation

    In all of these (pulled from low-complexity paragraphs), the author is
    assuming/presupposing a state of the world without having established
    its truth. Note that the presence of a determiner in the subject
    doesn't have this meaning as often, because of the discourse rule that
    subjects are almost always *already* shared common knowledge -- we
    would expect subjects to often begin with "the" because they almost
    always have been introduced in a prior sentence.

    Returns: dict, potentially with keys of:
        old info (for # times the determiner is in the subject, e.g.,
            "The man wore a hat" -- the man was probably already
            introduced earlier in the paragraph)
        knowledge assumed (for # times the determiner assumes facts,
            e.g., "Some man wore the hat" -- err, what hat?)
        SBAR (for # times a determiner was accompanied by an SBAR, e.g.,
            "I saw the man wearing the hat" -- the man both assumed and
            explained in terms of the knowledge assumed, "the hat")

    Notes:
    - Dependency parse may be better (cleaner and more accurate)
    - Might benefit from excluding proper noun phrases like country names
    (Python 2 code: print statements.)
    """
    DETERMINER_LIST = ["the", "The"]
    features = Counter()
    for t_string in parse:
        t = Tree.fromstring(t_string)
        sent_not_shown = True  # show each sentence at most once in verbose mode
        for pos in t.treepositions('postorder'):
            if t[pos] in DETERMINER_LIST:
                # Grandparent span of the determiner: the phrase it modifies.
                phrase_of_interest = " ".join(t[pos[:-2]].leaves())
                # Walk up the tree until a context match is found.
                while len(pos):
                    match = utils_parsing.check_for_match(t, pos)
                    if match:
                        features["determiner_"+match] += 1
                        if verbose:
                            if sent_not_shown:
                                print " ".join(t.leaves())
                                sent_not_shown = False
                            print "'%s' -- %s" % (phrase_of_interest, match)
                        break
                    pos = pos[:-1]
    return features
def gen_root(treebank_file):
    """Print the set of distinct root labels found in *treebank_file*
    (Python 2: print statement)."""
    # if you use unicode here, there is a bug...
    f = open(treebank_file, "r")
    root_set = set([])
    for sentence in f:
        t = Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        root = t.label()
        root_set.add(root)
    f.close()
    for r in root_set:
        print r
def drawrst(strtree, fname):
    """ Draw RST tree into a file """
    if not fname.endswith(".ps"):
        fname += ".ps"
    frame = CanvasFrame()
    widget = TreeWidget(frame.canvas(), Tree.fromstring(strtree))
    frame.add_widget(widget, 10, 10)  # (10,10) offsets
    frame.print_to_file(fname)
    frame.destroy()
def process_sentence(sentence):
    """Parse *sentence* with CoreNLP.

    Returns (pos_tags, lowercased_tokens); an empty dict for an empty
    sentence.
    """
    global corenlp
    if len(sentence) == 0:
        return {}
    parsed = json.loads(corenlp.parse(sentence))
    tree = Tree.fromstring(parsed['sentences'][0]['parsetree'])
    tags = [tagged[1] for tagged in tree.pos()]
    words = [leaf.lower() for leaf in tree.leaves()]
    return tags, words
def _get_arg_product_rules(self, a_doc_id, a_arg, a_rel, a_parses):
    """Extract syntactic production rules for the given arg.

    Args:
        a_doc_id (str): id of the document
        a_arg (str): argument to extract productions for
        a_rel (dict): discourse relation to extract features for
        a_parses (dict): parsed sentences

    Returns:
        set: set of syntactic productions

    (Python 2 code: uses dict.iteritems(); the print() call presumably
    relies on a module-level `from __future__ import print_function` --
    confirm.)
    """
    ret = set()
    # obtain token indices for each arg sentence
    snt_id = None
    snt2tok = self._get_snt2tok(a_rel[a_arg][TOK_LIST])
    # obtain set of leaves corresponding to that argument
    arg_leaves = set()
    subt_leaves = set()
    processed_leaves = set()
    itree = itree_str = inode_path = None
    for snt_id, toks in snt2tok.iteritems():
        itree_str = a_parses[a_doc_id][SENTENCES][snt_id][PARSE_TREE]
        itree = Tree.fromstring(itree_str)
        if not itree.leaves():
            print("Invalid parse tree for sentence {:d}".format(snt_id),
                  file=sys.stderr)
            continue
        # obtain all terminal syntactic nodes from the arg
        for itok in toks:
            inode_path = itree.leaf_treeposition(itok)
            arg_leaves.add(itree[inode_path])
        # check all subtrees (not efficient, but easy to implement)
        for s_t in itree.subtrees():
            subt_leaves.update(s_t.leaves())
            # Only collect productions from subtrees fully inside the arg
            # that contribute leaves not yet accounted for.
            if subt_leaves.issubset(arg_leaves) and \
                    not subt_leaves.issubset(processed_leaves):
                ret.update(str(p) for p in itree.productions()
                           if any(is_nonterminal(n) for n in p.rhs()))
                processed_leaves.update(subt_leaves)
            subt_leaves.clear()
            # Stop early once every arg leaf has been covered.
            if processed_leaves == arg_leaves:
                break
        arg_leaves.clear()
        processed_leaves.clear()
    return ret
def tag_phrase_tree(treebank_file, corpus):
    """Re-tag each tree's preterminals with the POS tags stored in
    *corpus* (indexed [sentence][token][1]) and print the flattened tree
    to stdout.  Progress is reported to stderr every 10 sentences.
    (Python 2 code: xrange/unicode.)
    """
    f = codecs.open(treebank_file, "r", "utf-8")
    s_ind = -1  # sentence index, kept in sync with `corpus`
    for sentence in f:
        s_ind += 1
        if s_ind % 10 == 0:
            sys.stderr.write(unicode(s_ind) + u"\n")  # progress marker
        tree = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        # Preterminals are the height-2 subtrees (POS tag over a word).
        preterminals = [t for t in tree.subtrees(lambda t: t.height() == 2)]
        for i in xrange(len(preterminals)):
            preterminals[i].set_label(corpus[s_ind][i][1])
        sys.stdout.write(NewTree.flat_print(tree) + u"\n")
def draw_tree(tree_string):
    """Render a square-bracketed tree string to 'tree.ps'.

    Currently disabled: raises NotImplementedError before doing anything.
    The body below is kept for when the function is re-enabled.

    Fixes: `cf.destroy` was referenced but never called (missing
    parentheses), which would have leaked the Tk canvas frame.
    """
    raise NotImplementedError()
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget
    cf = CanvasFrame()
    # Input uses square brackets; convert to the round-bracket format
    # that Tree.fromstring expects.
    tree = Tree.fromstring(tree_string.replace('[', '(').replace(']', ')'))
    cf.add_widget(TreeWidget(cf.canvas(), tree), 10, 10)
    cf.print_to_file('tree.ps')
    cf.destroy()
def __to_dot(self):
    """Convert the tree in the output text area to dot language and show
    the result in the dot text area.

    Fixes: removed a redundant trailing `pass` statement.
    """
    raw = self.output_text_area.get("1.0", END)
    # Collapse newlines so the bracket parser sees one string.
    tree = Tree.fromstring(raw.replace("\n", ""))
    converter = DotLanguageConverter()
    dotstring = converter.convert(tree)
    # Replace any previous contents of the dot text area.
    self.output_dot_text_area.delete(1.0, END)
    self.output_dot_text_area.insert(INSERT, dotstring)