def match(self, tree):
        """Match a "when <VBD> <NP> <VP>" question and turn it into one pattern part.

        The tree must be ROOT -> SBARQ, the first grandchild of SBARQ must be
        a WRB whose first leaf is "when" (case-insensitive), and the second
        child of SBARQ must be an SQ whose first three children are labelled
        VBD, NP and VP.

        Returns:
            A one-element list with a ``Pattern.Part`` whose ``object`` is the
            NP re-parsed from its string form and whose ``property`` is a new
            VP holding the WRB, VBD and VP subtrees; ``[]`` when the tree does
            not have this shape (a missing child counts as a mismatch).
        """
        try:
            # ``or`` short-circuits, so the checks run top to bottom and stop
            # at the first one that fails.
            shape_differs = (
                tree.label() != 'ROOT'
                or tree[0].label() != 'SBARQ'
                or tree[0][0][0].label() != 'WRB'
                or tree[0][0][0][0].lower() != 'when'
                or tree[0][1].label() != 'SQ'
                or tree[0][1][0].label() != 'VBD'
                or tree[0][1][1].label() != 'NP'
                or tree[0][1][2].label() != 'VP'
            )
            if shape_differs:
                return []

            part = Pattern.Part()
            noun_phrase = tree[0][1][1]
            part.object = ParentedTree.fromstring(str(noun_phrase))

            # Each piece is round-tripped through its string form before
            # being placed under the new VP node.
            pieces = [
                Tree.fromstring(str(tree[0][0][0])),
                Tree.fromstring(str(tree[0][1][0])),
                Tree.fromstring(str(tree[0][1][2])),
            ]
            part.property = ParentedTree.fromstring(str(Tree('VP', pieces)))

            return [part]
        except IndexError:
            # an expected child does not exist
            return []
Пример #2
0
 def test_Date(self):
     """A ``date`` argument is expected to become one DATE leaf joined with '_'."""
     source = Tree.fromstring(
         '(fb:soccer.football_team_management_tenure.to (date 2004 -1 -1))')
     actual = dcs2constituent(source)[0]
     expected = Tree.fromstring(
         '(ID fb:soccer.football_team_management_tenure.to (DATE 2004_-1_-1))')
     self.assertEqual(expected, actual)
def read_treefile(hyptreefile, reftreefile):
    """Score every hypothesis tree against every reference tree.

    The reference file holds one bracketed tree per non-blank line.  The
    hypothesis file holds trees that may span several lines; trees are
    separated by blank lines.

    Args:
        hyptreefile: path of the UTF-8 hypothesis tree file.
        reftreefile: path of the UTF-8 reference tree file.

    Returns:
        dict mapping the space-joined hypothesis words returned by
        ``score_similarity`` to the score.  When several references are
        present, the score against the last reference is the one kept.
    """
    # One reference tree per non-blank line.  Blank lines are skipped
    # instead of re-appending the previous tree (or failing on an unbound
    # name when the file starts with a blank line).
    with codecs.open(reftreefile, "r", encoding='utf-8') as rfile:
        rtreel = [Tree.fromstring(line.strip())
                  for line in rfile if line.strip() != ""]

    # Hypothesis trees: accumulate lines until a blank separator.
    htreel = []
    senl = []
    with codecs.open(hyptreefile, "r", encoding='utf-8') as hfile:
        for line in hfile:
            stripped = line.strip()
            if stripped != "":
                senl.append(stripped)
            elif senl:
                # only flush when something was collected, so consecutive
                # blank lines do not try to parse an empty string
                htreel.append(Tree.fromstring(" ".join(senl)))
                senl = []
    # Flush the final tree when the file does not end with a blank line.
    if senl:
        htreel.append(Tree.fromstring(" ".join(senl)))

    # loop and score
    scoredic = {}
    for r in rtreel:
        for h in htreel:
            score, hword, _rword = score_similarity(h, r)
            scoredic[" ".join(hword)] = score

    return scoredic
Пример #4
0
def calculate_attributes(parsed_text):
    """Collect tree heights and non-lexical production counts from parser output.

    Args:
        parsed_text: newline-separated bracketed trees.  Empty rows and rows
            equal to 'SENTENCE_SKIPPED_OR_UNPARSABLE' are ignored.

    Returns:
        Tuple ``(heights, productions)``: ``heights`` is the list of tree
        heights (``[10]`` when no tree could be read) and ``productions``
        maps ``str(production)`` to its number of occurrences, counting only
        non-lexical productions.
    """
    sents = []
    for row in parsed_text.split('\n'):
        if row == '' or row == 'SENTENCE_SKIPPED_OR_UNPARSABLE':
            continue
        tree = _parse_row_with_bracket_repair(row)
        if tree is None:
            # still unparsable after both repairs: report the row and skip it
            print(row)
        else:
            sents.append(tree)

    # prods contains each production rule and the number of occurrences it has
    prods = {}
    # heights contains the height of each sentence syntax tree
    heights = []
    for tree in sents:
        heights.append(tree.height())
        for production in tree.productions():
            if production.is_nonlexical():
                prods[production] = prods.get(production, 0) + 1

    # Fall back to a placeholder height when nothing was parsed.
    if not heights:
        print('Empty height', flush=True)
        heights.append(10)
    return (heights, {str(key): count for key, count in prods.items()})


def _parse_row_with_bracket_repair(row):
    """Parse ``row`` as-is, then minus its last character, then with ')' appended.

    Returns the first tree that parses, or ``None`` when all three attempts
    fail.  Only ``ValueError`` (a malformed bracketing) is treated as a parse
    failure, so interrupts and unrelated errors are no longer swallowed.
    """
    for candidate in (row, row[:-1], row + ')'):
        try:
            return Tree.fromstring(candidate)
        except ValueError:
            continue
    return None
Пример #5
0
def read_treefile(hyptreefile,reftreefile):
    """Score every hypothesis tree against every reference tree.

    The reference file holds one bracketed tree per non-blank line.  The
    hypothesis file holds trees that may span several lines; trees are
    separated by blank lines.

    Args:
        hyptreefile: path of the UTF-8 hypothesis tree file.
        reftreefile: path of the UTF-8 reference tree file.

    Returns:
        dict mapping the space-joined hypothesis words returned by
        ``score_similarity`` to the score.  When several references are
        present, the score against the last reference is the one kept.
    """
    # One reference tree per non-blank line.  Blank lines are skipped
    # instead of re-appending the previous tree (or failing on an unbound
    # name when the file starts with a blank line).
    with codecs.open(reftreefile,"r",encoding='utf-8') as rfile:
        rtreel = [Tree.fromstring(line.strip())
                  for line in rfile if line.strip() != ""]

    # Hypothesis trees: accumulate lines until a blank separator.
    htreel = []
    senl = []
    with codecs.open(hyptreefile,"r",encoding='utf-8') as hfile:
        for line in hfile:
            stripped = line.strip()
            if stripped != "":
                senl.append(stripped)
            elif senl:
                # only flush when something was collected, so consecutive
                # blank lines do not try to parse an empty string
                htreel.append(Tree.fromstring(" ".join(senl)))
                senl = []
    # Flush the final tree when the file does not end with a blank line.
    if senl:
        htreel.append(Tree.fromstring(" ".join(senl)))

    # loop and score
    scoredic = {}
    for r in rtreel:
        for h in htreel:
            score,hword,_rword = score_similarity(h,r)
            scoredic[" ".join(hword)] = score

    return scoredic
Пример #6
0
 def _load_cached(self, domain):
     """Load the cached train and test examples for ``domain``.

     Reads ``<domain>.train.json`` and ``<domain>.test.json`` from the
     ``self._pcache`` directory next to this module.  Each file is expected
     to hold a list of pairs whose second item is a bracketed tree string.

     Returns:
         ``(trainexamples, testexamples)``, each a list of ``(x, Tree)`` pairs.
     """
     cache_dir = os.path.join(os.path.dirname(__file__), self._pcache)

     def _read_split(split):
         # ``with`` closes the handle, which was previously left open
         with open(os.path.join(cache_dir, f"{domain}.{split}.json"), "r") as fh:
             cached = ujson.load(fh)
         return [(x, Tree.fromstring(y)) for x, y in cached]

     trainexamples = _read_split("train")
     testexamples = _read_split("test")
     print("loaded from cache")
     return trainexamples, testexamples
Пример #7
0
 def get_pprint(self, tree):
     """Return the ``pretty_print`` rendering of ``tree`` as a string."""
     # Pretty-printing ``tree`` directly is reported to fail because of its
     # structure, so it is rebuilt from its string form first.
     buffer = io.StringIO("")
     rebuilt = Tree.fromstring(str(tree))
     rebuilt.pretty_print(stream=buffer)
     return buffer.getvalue()
Пример #8
0
def old_main():
    """Plot the generalization measure for teacher parameters 0..9, then exit.

    For each parameter a ``DifferenceTeacher`` is built, the tree pairs from
    'output_mla_manual2.txt' are added as positive examples, a result is
    learned and its generalization is measured.  The measurements are plotted
    against the parameter and the process exits once the plot is closed.
    """
    g = []
    x = range(10)
    for param in x:
        d = DifferenceTeacher(param)
        comparator = TreeComparator(0, 20, 20)
        d.setTreeComparator(comparator)
        # ``with`` closes the file; the handle used to leak on every iteration
        with open('output_mla_manual2.txt') as mla:
            mla_list = json.load(mla)
        di = mla_list['cogs_dict']['reverse_dict']
        # JSON object keys are strings; convert them back to ints
        di = {int(key): di[key] for key in di}
        mla_list = [(Tree.fromstring(tup[0]), Tree.fromstring(tup[1]))
                    for tup in mla_list['trees']]
        for tree, weights in mla_list:
            update_weights(weights)
            d.addPositiveExample(tree, weights)
        c = learn(d, di)
        p, nt = measure_generalization([tup[0] for tup in mla_list], c)
        g.append(p)
        if param == 2 or param == 8:
            print(c)
        print("param: {0}, generalization: {1}, nt: {2}".format(param, p, nt))
    plt.plot(x, g)
    plt.show()
    exit()
Пример #9
0
def get_json(input_file, output_file, parser, count):
    """Convert a plain-text sentence-pair file into JSON lines with parses.

    The input is read in groups of four lines: pair id, sentence 1,
    sentence 2 and gold label.  Lines consisting of a single newline between
    groups are skipped.  Each group is written to ``output_file`` as one JSON
    object per line and is also printed.

    Args:
        input_file: path of the text file to read.
        output_file: path of the JSON-lines file to write.
        parser: object whose ``raw_parse(sentence)`` result is passed to
            ``remove_formatting`` to obtain the parse string.
        count: starting value of a local record tally; the tally is not
            returned.
    """
    # ``with`` guarantees both files are closed even if parsing raises.
    with open(input_file, 'r') as f, open(output_file, 'w') as f1:

        def _add_sentence(record, key):
            # Read the next line as the sentence and attach both parse forms.
            record[key] = f.readline().strip()
            parse_string = remove_formatting(parser.raw_parse(record[key]))
            record[key + "_parse"] = parse_string
            record[key + "_binary_parse"] = format_binary_tree(
                Tree.fromstring(parse_string))

        for line in f:
            if line == "\n":
                continue
            test = {}
            test["pairID"] = line.strip()
            _add_sentence(test, "sentence1")
            _add_sentence(test, "sentence2")
            test["gold_label"] = f.readline().strip()
            test = json.dumps(test)
            print(test)
            f1.write(test)
            f1.write("\n")
            count = count + 1
Пример #10
0
 def test_Unary(self):
     """A unary DCS predicate is expected to be wrapped under an ID node."""
     source = Tree.fromstring(
         '(!fb:tv.tv_series_episode.writer fb:en.straight_and_true)')
     actual = dcs2constituent(source)[0]
     expected = Tree.fromstring(
         '(ID !fb:tv.tv_series_episode.writer fb:en.straight_and_true)')
     self.assertEqual(expected, actual)
Пример #11
0
 def test_AndNoParentPredicate(self):
     """A top-level ``and`` is expected to become nested ID nodes."""
     source = Tree.fromstring(
         '(and fb:en.doom (fb:cvg.computer_videogame.gameplay_modes fb:en.multiplayer_game))'
     )
     actual = dcs2constituent(source)[0]
     expected = Tree.fromstring(
         '(ID fb:en.doom (ID fb:cvg.computer_videogame.gameplay_modes fb:en.multiplayer_game))')
     self.assertEqual(expected, actual)
def parser_output_to_parse_deriv_trees(output):
    """Split interleaved parser output into parse trees and derivation trees.

    After stripping, lines at even indices are read as derivation trees and
    lines at odd indices as parse trees; empty lines are ignored.  In parse
    tree lines every '\\x06' character is replaced by 'epsilon_' before
    parsing.

    Returns:
        ``(parse_trees, deriv_trees)`` as two lists of trees.
    """
    rows = output.strip().split("\n")

    parse_trees = []
    for text in rows[1::2]:
        if text != '':
            parse_trees.append(Tree.fromstring(text.replace('\x06', 'epsilon_')))

    deriv_trees = []
    for text in rows[::2]:
        if text != '':
            deriv_trees.append(Tree.fromstring(text))

    return parse_trees, deriv_trees
Пример #13
0
def is_tree(line):
    """Simple `oracle`: True when ``line`` parses as a bracketed tree.

    A ``ValueError`` from ``Tree.fromstring`` means "not a tree"; ``line``
    must be a ``str`` (checked with an assertion).
    """
    assert isinstance(line, str), line
    try:
        Tree.fromstring(line)
    except ValueError:
        return False
    else:
        return True
Пример #14
0
 def test_Count(self):
     """A ``count`` wrapper is expected to become a COUNT child of an ID node."""
     source = Tree.fromstring(
         '(count (!fb:military.armed_force.units fb:en.u_army))')
     actual = dcs2constituent(source)[0]
     expected = Tree.fromstring(
         '(ID COUNT (ID !fb:military.armed_force.units fb:en.u_army))')
     self.assertEqual(expected, actual)
Пример #15
0
def main():
    """Translate two hard-coded sample trees using the rules in pokemon.yaml."""
    rules = loadrules("pokemon.yaml")
    samples = (
        "(S let me show you my Pokémon)",
        "(S let me show you my cats)",
    )
    trees = [Tree.fromstring(text) for text in samples]

    for tree in trees:
        translate(tree, rules)
Пример #16
0
 def test_Number(self):
     """A ``number`` argument is expected to be kept under an upper-cased NUMBER node."""
     source = Tree.fromstring(
         '(fb:government.us_president.presidency_number (number 22.0 fb:en.unitless))'
     )
     actual = dcs2constituent(source)[0]
     expected = Tree.fromstring(
         '(ID fb:government.us_president.presidency_number (NUMBER 22.0 fb:en.unitless))')
     self.assertEqual(expected, actual)
Пример #17
0
 def test_evalb_correctly_scores_identical_trees(self):
     """Scoring a tree against an identical tree must give 1.0 on every metric."""
     bracketing = "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))"
     first = Tree.fromstring(bracketing)
     second = Tree.fromstring(bracketing)
     scorer = EvalbBracketingScorer()
     scorer([first], [second])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 1.0
Пример #18
0
 def test_evalb_correctly_calculates_bracketing_metrics_over_multiple_trees(self):
     """One altered tree plus one identical tree must give 0.875 on every metric."""
     altered = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
     reference = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     scorer = EvalbBracketingScorer()
     scorer([altered, reference], [reference, reference])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 0.875
Пример #19
0
def my_is_tree_same(str_input1, str_input2):
    """Return True when both bracketed strings have the same ``my_oneline`` form.

    Each input is parsed into a tree and rendered with ``my_oneline``; the two
    renderings are then compared for equality.
    """
    rendered_first = my_oneline(Tree.fromstring(str_input1))
    rendered_second = my_oneline(Tree.fromstring(str_input2))
    return bool(rendered_first == rendered_second)
 def test_evalb_correctly_calculates_bracketing_metrics_over_multiple_trees(self):
     """One altered tree plus one identical tree must give 0.875 on every metric."""
     altered = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
     reference = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     scorer = EvalbBracketingScorer()
     scorer([altered, reference], [reference, reference])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 0.875
 def test_evalb_correctly_scores_identical_trees(self):
     """Scoring a tree against an identical tree must give 1.0 on every metric."""
     bracketing = "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))"
     first = Tree.fromstring(bracketing)
     second = Tree.fromstring(bracketing)
     scorer = EvalbBracketingScorer()
     scorer([first], [second])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 1.0
 def test_evalb_correctly_scores_imperfect_trees(self):
     """A single changed constituent label must give 0.75 on every metric."""
     # Changing a constituency label (VP ...) should affect the scores, but
     # changing a POS tag (NP dog) should have no effect.
     altered = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
     reference = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     scorer = EvalbBracketingScorer()
     scorer([altered], [reference])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 0.75
Пример #23
0
 def preprocess_eval(self):
     """Load gold and predicted trees and store their productions.

     Every line of ``self.ground_truth_path`` is right-stripped, parsed and
     its productions appended to ``self.ground_truth``.  The same is done
     for ``self.predicted_path`` into ``self.predicted``; the stripped
     predicted lines are also kept in ``self.lines_1``.
     """
     with open(self.ground_truth_path) as f:
         gold_lines = [raw.rstrip() for raw in f.readlines()]
     for text in gold_lines:
         self.ground_truth.append(Tree.fromstring(text).productions())

     with open(self.predicted_path) as f:
         self.lines_1 = [raw.rstrip() for raw in f.readlines()]
     for text in self.lines_1:
         self.predicted.append(Tree.fromstring(text).productions())
Пример #24
0
 def test_evalb_correctly_scores_imperfect_trees(self):
     """A single changed constituent label must give 0.75 on every metric."""
     # Changing a constituency label (VP ...) should affect the scores, but
     # changing a POS tag (NP dog) should have no effect.
     altered = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
     reference = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     scorer = EvalbBracketingScorer()
     scorer([altered], [reference])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 0.75
def tree_from_string(s):
    """Parse ``s`` into a tree, preferring its post-processed form.

    ``s`` is first run through ``tree_str_post_process`` and parsed.  If
    either step raises, the untouched ``s`` is parsed instead, and any error
    from that second attempt propagates to the caller.
    """
    try:
        return Tree.fromstring(tree_str_post_process(s))
    except Exception:
        # post-processing (or parsing its result) failed: retry without it
        return Tree.fromstring(s)
Пример #26
0
 def test_JoinAnd(self):
     """A join over an ``and`` is expected to flatten into sibling ID nodes."""
     source = Tree.fromstring((
         '(!fb:education.academic_post.institution'
         ' (and (fb:education.academic_post.person fb:en.marshall_hall)'
         ' (fb:education.academic_post.position_or_title fb:en.professor)))'
     ))
     actual = dcs2constituent(source)[0]
     expected = Tree.fromstring((
         '(ID !fb:education.academic_post.institution'
         ' (ID fb:education.academic_post.person fb:en.marshall_hall)'
         ' (ID fb:education.academic_post.position_or_title fb:en.professor))'
     ))
     self.assertEqual(expected, actual)
 def test_evalb_with_terrible_trees_handles_nan_f1(self):
     """Two trees with nothing in common must give 0.0 on every metric."""
     # When precision and recall are both zero, evalb reports a nan f1;
     # this verifies that the zero division is handled.
     unrelated = Tree.fromstring(u"(PP (VROOT (PP That) (VROOT (PP could) "
                                 u"(VROOT (PP cost) (VROOT (PP him))))) (PP .))")
     reference = Tree.fromstring(u"(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     scorer = EvalbBracketingScorer()
     scorer([unrelated], [reference])
     metrics = scorer.get_metric()
     for key in (u"evalb_recall", u"evalb_precision", u"evalb_f1_measure"):
         assert metrics[key] == 0.0
Пример #28
0
 def parse_treestr(self, treestr):
     """Parse one bracketed tree, normalise it and feed it to ``traverse_tree``.

     The tree is placed under a ``ROOT_NODE_NAME`` node unless it already
     has that label, converted in place to Chomsky normal form, counted in
     ``self.starts`` under its root label and finally traversed.
     """
     parsed = Tree.fromstring(treestr.strip())
     if parsed.label() == ROOT_NODE_NAME:
         tree = parsed
     else:
         # wrap the parsed tree under a fresh root node
         tree = Tree.fromstring("({})".format(ROOT_NODE_NAME))
         tree.insert(0, parsed)
     tree.chomsky_normal_form()
     self.starts[tree.label()] += 1
     self.traverse_tree(tree)
Пример #29
0
    def evaluate(gold_str_list: list, pred_str_list: list):
        """
        Parse both lists of bracketed tree strings and evaluate them pairwise.

        :param gold_str_list:   [str]   list of ground-truth tree strings
        :param pred_str_list:   [str]   list of predicted tree strings
        :return: the evaluation result returned by ``MyEvaluation.evaluate_trees``
        """
        assert len(gold_str_list) == len(pred_str_list)

        gold_trees = list(map(Tree.fromstring, gold_str_list))
        pred_trees = list(map(Tree.fromstring, pred_str_list))
        return MyEvaluation.evaluate_trees(gold_trees, pred_trees)
 def test_evalb_with_terrible_trees_handles_nan_f1(self):
     """Two trees with nothing in common must give 0.0 on every metric."""
     # When precision and recall are both zero, evalb reports a nan f1;
     # this verifies that the zero division is handled.
     unrelated = Tree.fromstring("(PP (VROOT (PP That) (VROOT (PP could) "
                                 "(VROOT (PP cost) (VROOT (PP him))))) (PP .))")
     reference = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     scorer = EvalbBracketingScorer()
     scorer([unrelated], [reference])
     metrics = scorer.get_metric()
     for key in ("evalb_recall", "evalb_precision", "evalb_f1_measure"):
         assert metrics[key] == 0.0
Пример #31
0
def main():
    """Translate two hard-coded English sample trees using the rules in german.yaml."""
    rules = loadrules("german.yaml")
    samples = (
        ## I like eating / Ich esse gern
        "(S (NP (PRP I)) (VP (VB like) (VBG eating)))",
        ## I am hungry / Ich habe Hunger
        "(S (NP (PRP I)) (VP (VB am) (JJ hungry)))",
    )
    trees = [Tree.fromstring(text) for text in samples]

    for tree in trees:
        translate(tree, rules)
Пример #32
0
def load_gold_tree(json_path):
    """Read gold trees from a JSON-lines file, keyed by document id.

    Every line is a JSON object with a 'doc_id' and a bracketed tree stored
    under 'labelled_attachment_tree' or, when that key is absent, under
    'attach_tree'.

    Returns:
        dict mapping each ``doc_id`` to its parsed tree.
    """
    trees = {}
    with open(json_path) as f:
        for line in f:
            data = json.loads(line.strip())
            doc_id = data['doc_id']
            # prefer the labelled tree when the record provides one
            if 'labelled_attachment_tree' in data:
                tree_key = 'labelled_attachment_tree'
            else:
                tree_key = 'attach_tree'
            trees[doc_id] = Tree.fromstring(data[tree_key])

    return trees
Пример #33
0
 def test_JoinAnd(self):
     """Sibling ID nodes are expected to convert back into a join over ``and``."""
     source = Tree.fromstring((
         '(ID !fb:education.academic_post.institution'
         ' (ID fb:education.academic_post.person fb:en.marshall_hall)'
         ' (ID fb:education.academic_post.position_or_title fb:en.professor))'
     ))
     expected = Tree.fromstring((
         '(!fb:education.academic_post.institution'
         ' (and (fb:education.academic_post.person fb:en.marshall_hall)'
         ' (fb:education.academic_post.position_or_title fb:en.professor)))'
     ))
     actual = constituent2dcs(source)[0]
     self.assertEqual(expected, actual)
Пример #34
0
def verify_f1(path):
    """Average the F1 of each sentence's brackets against a balanced tree's brackets.

    Reads ``path`` as UTF-8 JSON lines; each record supplies
    'sentence1_parse' and 'sentence2_parse'.  As written, the function is
    meant to return ``(mean F1, number of F1 values)``.

    NOTE(review): the ``sys.exit(0)`` below terminates the process after the
    first record has been printed, so the scoring code after it and the final
    ``return`` are currently unreachable.  It looks like leftover debugging;
    confirm before removing.  This block uses Python 2 print statements.
    """
    f1_list = []
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            try:
                line = line.encode('UTF-8')
            except UnicodeError as e:
                # fall back to an empty JSON object for undecodable lines
                print "ENCODING ERROR:", line, e
                line = "{}"
            loaded_example = json.loads(line)
            # parse each sentence, remember its leaf count, then convert it
            # with tree2list
            t1 = Tree.fromstring(loaded_example['sentence1_parse'])
            l1 = len(t1.leaves())
            t1 = tree2list(t1)
            t2 = Tree.fromstring(loaded_example['sentence2_parse'])
            l2 = len(t2.leaves())
            t2 = tree2list(t2)
            # print t1
            # print l1
            # print t2
            # print l2

            # comparison trees built from the leaf counts alone
            bt1 = get_balanced_tree(l1)
            bt2 = get_balanced_tree(l2)
            # print bt1
            # print bt2

            print t1
            t1 = get_brackets(t1)[0]
            print t1
            # NOTE(review): exits the whole process here; everything below
            # in this function never runs.
            sys.exit(0)

            t2 = get_brackets(t2)[0]
            bt1 = get_brackets(bt1)[0]
            bt2 = get_brackets(bt2)[0]

            # t1.add((0,l1))
            # bt1.add((0,l1))
            # t2.add((0,l2))
            # bt2.add((0,l2))

            # print t1
            # print t2
            # print bt1
            # print bt2

            # one F1 value per sentence, computed from the shared brackets
            f1 = compute_f1(t1 & bt1, t1, bt1)
            f1_list.append(f1)
            f1 = compute_f1(t2 & bt2, t2, bt2)
            f1_list.append(f1)

    return sum(f1_list) / len(f1_list), len(f1_list)
def tree_from_string(tree_string):
    """Parse ``tree_string`` into a tree, preferring its post-processed form.

    The string is first run through ``tree_str_post_process`` and parsed.  If
    that fails, the untouched string is parsed instead.  If the second
    attempt also fails, an error message and the offending string are printed
    and the exception is re-raised.
    """
    try:
        processed = tree_str_post_process(tree_string)
        return Tree.fromstring(processed)
    except Exception:
        # post-processing (or parsing its result) failed: retry without it
        try:
            return Tree.fromstring(tree_string)
        except Exception as err:
            print('ERROR: unable to parse the tree')
            print(tree_string)
            raise err
Пример #36
0
    def __visualize(self, gold_tree: str, parsed_tree: str):
        """Show the gold tree with the parsed tree below it on ``self.CANVAS``.

        Widgets from a previous call are destroyed first.  Both strings are
        wrapped in an extra pair of brackets before being parsed.

        :param gold_tree: bracketed gold tree without its outermost brackets
        :param parsed_tree: bracketed parsed tree without its outermost brackets
        """
        # identity test: ``!= None`` would invoke the widget's own comparison
        if self.GOLD_tc is not None:
            self.CANVAS.destroy_widget(self.GOLD_tc)
            self.CANVAS.destroy_widget(self.PARSED_tc)

        GOLD = Tree.fromstring('(' + gold_tree + ')')
        PARSED = Tree.fromstring('(' + parsed_tree + ')')

        self.GOLD_tc = TreeWidget(self.CANVAS.canvas(), GOLD)
        self.PARSED_tc = TreeWidget(self.CANVAS.canvas(), PARSED)

        self.CANVAS.add_widget(self.GOLD_tc, 0, 0)
        # place the parsed tree 10 units below the bottom of the gold tree
        self.CANVAS.add_widget(self.PARSED_tc, 0, self.GOLD_tc.height() + 10)
        self.CANVAS.pack(expand=True)
Пример #37
0
def treebank_bracket_parse(t):
    """Parse a bracketed treebank string, dropping an empty top bracketing.

    If ``Tree.fromstring`` raises ``IndexError``, the string is stripped, its
    first and last characters are removed and the remainder is handed to
    ``tree.bracket_parse``.
    """
    try:
        parsed = Tree.fromstring(t, remove_empty_top_bracketing=True)
    except IndexError:
        # in case it's the real treebank format,
        # strip first and last brackets before parsing
        inner = t.strip()[1:-1]
        parsed = tree.bracket_parse(inner)
    return parsed
    def testConvert(self):
        """Converting a small tree must yield the expected DOT source.

        The second NP node is expected to be emitted under the id "NP_1".
        """
        sample_tree = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
        dot_source = DotLanguageConverter().convert(sample_tree)

        expected_lines = [
            'digraph parse_tree {',
            '\t"S" [label="S"];',
            '\t"NP" [label="NP"];',
            '\t"S"-> "NP";',
            '\t"I" [label="I"];',
            '\t"NP"-> "I";',
            '\t"VP" [label="VP"];',
            '\t"S"-> "VP";',
            '\t"V" [label="V"];',
            '\t"VP"-> "V";',
            '\t"saw" [label="saw"];',
            '\t"V"-> "saw";',
            '\t"NP_1" [label="NP"];',
            '\t"VP"-> "NP_1";',
            '\t"him" [label="him"];',
            '\t"NP_1"-> "him";',
            '}',
        ]
        expected_tree_string = "\n".join(expected_lines)

        self.assertEqual(dot_source, expected_tree_string)
Пример #39
0
def calc(param):
    """Resolve the antecedent of a pronoun in the last sentence of a tree file.

    ``param[1]`` is the path of a file with one bracketed tree per line and
    ``param[2]`` is the pronoun.  Personal pronouns are resolved with
    ``hobbs`` and reflexives with ``resolve_reflexive``.

    Returns:
        ``(tree, tree[pos])`` as produced by the resolver, or ``None`` when
        the pronoun is in neither list.
    """
    personal = ("He", "he", "Him", "him", "She", "she", "Her",
                "her", "It", "it", "They", "they")
    reflexive = ("Himself", "himself", "Herself", "herself",
                 "Itself", "itself", "Themselves", "themselves")
    fname = param[1]
    pro = param[2]
    with open(fname) as f:
        sents = f.readlines()
    trees = [Tree.fromstring(s) for s in sents]
    # drop the last step of the position returned by get_pos
    pos = get_pos(trees[-1], pro)[:-1]

    if pro in personal:
        resolver = hobbs
    elif pro in reflexive:
        resolver = resolve_reflexive
    else:
        return None

    tree, pos = resolver(trees, pos)
    return tree, tree[pos]
Пример #40
0
def syntactic_parse_features(paragraph, parse):
  """ Returns the number of S and SBAR nodes in the syntactic parses,
  plus statistics (mean, median, max, min, spread) of the tree heights.

  ``parse`` is an iterable of bracketed tree strings; ``paragraph`` is not
  used by this function.  """
  KEPT_FEATURES = ['S', 'SBAR']

  # Count every subtree label and record the height of each tree
  label_counts = Counter()
  heights = []
  for tree_text in parse:
    tree = Tree.fromstring(tree_text)
    label_counts.update(st.label() for st in tree.subtrees())
    heights.append(tree.height())

  # Keep only the labels listed in KEPT_FEATURES
  features = Counter()
  for label in label_counts:
    if label in KEPT_FEATURES:
      features["syntactic_head_"+label] = label_counts[label]

  # Add in the features related to tree height
  features["tree_height_mean"] = np.mean(heights)
  features["tree_height_median"] = np.median(heights)
  tallest = np.max(heights)
  shortest = np.min(heights)
  features["tree_height_max"] = tallest
  features["tree_height_min"] = shortest
  features["tree_height_spread"] = tallest - shortest
  return Counter(features)
Пример #41
0
def main(argv):
    if len(sys.argv) == 2 and argv[1] == "demo":
        demo()
    else:
        if len(sys.argv) > 3 or len(sys.argv) < 2:
            print "Enter the file and the pronoun to resolve."
        elif len(sys.argv) == 3:
            p = ["He", "he", "Him", "him", "She", "she", "Her",
                "her", "It", "it", "They", "they"]
            r = ["Himself", "himself", "Herself", "herself",
                "Itself", "itself", "Themselves", "themselves"]
            fname = sys.argv[1]
            pro = sys.argv[2]
            with open(fname) as f:
                sents = f.readlines()
            trees = [Tree.fromstring(s) for s in sents]
            pos = get_pos(trees[-1], pro)
            pos = pos[:-1]
            if pro in p:
                tree, pos = hobbs(trees, pos)
                for t in trees:
                    print t, '\n'
                print "Proposed antecedent for '"+pro+"':", tree[pos]
            elif pro in r:
                tree, pos = resolve_reflexive(trees, pos)
                for t in trees:
                    print t, '\n'
                print "Proposed antecedent for '"+pro+"':", tree[pos]
Пример #42
0
 def test_construct_tree_from_spans_handles_nested_labels(self):
     """A span labelled "S-NP" must be expanded into nested (S (NP ...)) nodes."""
     labelled_spans = [((0, 1), 'D'), ((1, 2), 'N'), ((0, 2), 'S-NP')]
     words = ["the", "dog"]
     built = self.model.construct_tree_from_spans(dict(labelled_spans), words)
     expected = Tree.fromstring("(S (NP (D the) (N dog)))")
     assert built == expected
    def __render_tree(self):
        """Parse the contents of the output text area as a tree and draw it."""
        raw_text = self.output_text_area.get("1.0", END)
        # newlines are removed so the whole text is parsed as one line
        bracketing = raw_text.replace("\n", "")
        Tree.fromstring(bracketing).draw()
Пример #44
0
def extract_trees(filename="./out/toy_pcfg2.gen"):
    """Parse every line of ``filename`` as a bracketed tree and return the list."""
    with open(filename) as fh:
        return [Tree.fromstring(line) for line in fh]
Пример #45
0
 def pprint(self, **kwargs):
     """Return the result of NLTK's ``Tree.pprint`` for this tree.

     The tree is rebuilt from ``self.ptb()`` and all keyword arguments are
     forwarded to ``pprint``. Requires the nltk module. See
     http://www.nltk.org/_modules/nltk/tree.html."""
     from nltk import Tree as NLTKTree
     return NLTKTree.fromstring(self.ptb()).pprint(**kwargs)
def process_sentence(sentence):
    """Parse ``sentence`` with CoreNLP and extract its 'PP' phrases.

    Returns an empty dict for an empty sentence; otherwise the result of
    ``extract_phrases`` on the parse tree of the first sentence.
    """
    global corenlp
    if not len(sentence):
        return {}
    response = json.loads(corenlp.parse(sentence))
    bracketing = response['sentences'][0]['parsetree']
    return extract_phrases(Tree.fromstring(bracketing), 'PP')
Пример #47
0
 def set_parse_info(self, tokens, pos_tag, parse_string, dependency_tree):
     """Store the parse information and number the leaves of the parse tree.

     After ``parse_string`` has been parsed, every leaf of the tree is
     replaced by its 1-based position in the sentence.
     """
     self.tokens = tokens
     self.pos_tag = pos_tag
     self.dependency_tree = dependency_tree
     self.parse_tree = Tree.fromstring(parse_string)
     for index, _leaf in enumerate(self.parse_tree.leaves()):
         leaf_position = self.parse_tree.leaf_treeposition(index)
         self.parse_tree[leaf_position] = index + 1
Пример #48
0
 def test_tree_construction_with_too_few_spans_creates_trees_with_depth_one_word_nodes(self):
     """Words not covered by any span must hang off the root with an XX tag."""
     # Only a partial tree is given here: (S (NP (D the) (N dog)). Decoding
     # should recover it from the spans and attach every remaining word to the
     # root node with an XX POS tag, because the right-hand side splits do not
     # occur in the spans.
     labelled_spans = [((0, 1), 'D'), ((1, 2), 'N'), ((0, 2), 'NP'), ((0, 5), 'S')]
     words = ["the", "dog", "chased", "the", "cat"]
     built = self.model.construct_tree_from_spans(dict(labelled_spans), words)
     expected = Tree.fromstring("(S (NP (D the) (N dog)) (XX chased) (XX the) (XX cat))")
     assert built == expected
Пример #49
0
 def test_construct_tree_from_spans(self):
     """A complete set of labelled spans must rebuild the full tree."""
     # (S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))
     labelled_spans = [
         ((0, 1), 'D'), ((1, 2), 'N'), ((0, 2), 'NP'),
         ((2, 3), 'V'), ((3, 4), 'D'), ((4, 5), 'N'),
         ((3, 5), 'NP'), ((2, 5), 'VP'), ((0, 5), 'S'),
     ]
     words = ["the", "dog", "chased", "the", "cat"]
     built = self.model.construct_tree_from_spans(dict(labelled_spans), words)
     expected = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     assert built == expected
Пример #50
0
 def latex(self):
     """Returns a representation of the tree compatible with the
     LaTeX qtree package. Requires the nltk module. See
     http://www.nltk.org/_modules/nltk/tree.html.

     Square brackets in ``self.ptb()`` are backslash-escaped before parsing,
     and the '-LRB-' / '-RRB-' tokens in the result are replaced by literal
     parentheses."""
     from nltk import Tree as NLTKTree
     # Raw strings: '\[' and '\]' are invalid escape sequences in a normal
     # literal and trigger a warning on current Python versions.
     string = self.ptb().replace('[', r'\[').replace(']', r'\]')
     tree = NLTKTree.fromstring(string)
     latex = tree.pformat_latex_qtree()
     return latex.replace('-LRB-', '(').replace('-RRB-', ')')
Пример #51
0
def add_top_to_tree(treebank_file):
    f = open(treebank_file, "r")
    root_set = set([])
    for sentence in f:
        t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        top_node = Tree("TOP", [])
        top_node.append(t)
        print NewTree.flat_print(top_node)
    f.close()
Пример #52
0
def determiner_usage(paragraph, parse, verbose=False):
  """
  Gets the count of times determiners are used per context.

  Central insight:
     Differences in determiner usage reflect whether the author assumes
     the noun phrase is a category that is coherent within common knowledge. 
     For instance:
        * We observe _the depravity_ of our age
        * Poverty, hunger, mental illness - they were _the inevitable result_ 
          of life in this world.
        * the Reagan administration is testing _the gullibility_ of world opinion
        * Abortion threatens _the moral and Christian character_ of this nation
     In all of these (pulled from low-complexity paragraphs), the author
     is assuming/presupposing a state of the world without having established its truth.
     Note that the presence of a detreminer in the subject doesn't have this
     meaning as often, because of the discourse rule that subjects
     are almost always *already* shared common knowledge -- we would expect subjects
     to often begin with "the" because they almost always have been introduced in 
     a prior sentence.

  Returns:
    dict, potentially with keys of:
       old info (for # times the determiner is in the subject,
             e.g., "The man wore a hat" -- the man was probably already 
             introduced earlier in the paragraph)
       knowledge assumed (for # times the determine assumes facts,
             e.g., "Some man wore the hat" -- err, what hat?)
       SBAR (for # times a determiner was accompanied by an SBAR, 
             e.g., "I saw the man wearing the hat" -- the man both assumed 
             and explained in terms of the knowledge assumed, "the hat")

  Notes:
    - Dependency parse may be better (cleaner and more accurate)
    - Might benefit from excluding proper noun phrases like country names
  """
  DETERMINER_LIST = ["the", "The"]
  features = Counter()
  for t_string in parse:
    t = Tree.fromstring(t_string)
    sent_not_shown = True
    for pos in t.treepositions('postorder'):
      if t[pos] in DETERMINER_LIST:
        phrase_of_interest = " ".join(t[pos[:-2]].leaves())
        while len(pos):
          match = utils_parsing.check_for_match(t, pos)
          if match:
            features["determiner_"+match] += 1
            if verbose:
              if sent_not_shown:
                print " ".join(t.leaves())
                sent_not_shown = False
              print "'%s' -- %s" % (phrase_of_interest, match)
            break
          pos = pos[:-1]
  return features
Пример #53
0
def gen_root(treebank_file):
    """Print every distinct root label found in a treebank file.

    Each line of ``treebank_file`` is parsed as one bracketed tree (with an
    empty top bracketing removed) and its root label is collected.  The
    distinct labels are then printed one per line, in set iteration order.

    Args:
        treebank_file: path to a file holding one bracketed tree per line.
    """
    root_set = set()
    # if you use unicode here, there is a bug...
    # The context manager closes the file even when a line fails to parse.
    with open(treebank_file, "r") as f:
        for sentence in f:
            t = Tree.fromstring(sentence, remove_empty_top_bracketing=True)
            root_set.add(t.label())
    for r in root_set:
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(r)
Пример #54
0
def drawrst(strtree, fname):
    """Draw an RST tree into a PostScript file.

    Args:
        strtree: bracketed string representation of the tree, as accepted
            by ``Tree.fromstring``.
        fname: output file name; ``.ps`` is appended when the name does not
            already end with that suffix.
    """
    if not fname.endswith(".ps"):
        fname += ".ps"
    # Parse first so that a malformed tree string fails before any canvas
    # frame has been created.
    t = Tree.fromstring(strtree)
    cf = CanvasFrame()
    try:
        tc = TreeWidget(cf.canvas(), t)
        cf.add_widget(tc, 10, 10)  # (10,10) offsets
        cf.print_to_file(fname)
    finally:
        # Always tear the frame down, even if drawing or printing fails.
        cf.destroy()
def process_sentence(sentence):
    """Parse a sentence and return its POS tags and lowercased tokens.

    The sentence is sent to the module-level ``corenlp`` object; its reply is
    decoded as JSON and only the parse tree of the first sentence is used.

    Args:
        sentence: the text to parse; anything with a length of zero is
            treated as empty.

    Returns:
        For empty input, an empty dict.  Otherwise a 2-tuple
        ``(tags, tokens)``: ``tags`` holds the second element of every pair
        from ``tree.pos()`` and ``tokens`` holds the lowercased leaves.
    """
    if len(sentence) == 0:
        # NOTE: the empty case yields a dict, not a pair of lists.
        return {}

    response = json.loads(corenlp.parse(sentence))
    first_parse = response['sentences'][0]['parsetree']
    tree = Tree.fromstring(first_parse)

    tags = [tag for _token, tag in tree.pos()]
    tokens = [leaf.lower() for leaf in tree.leaves()]
    return tags, tokens
Пример #56
0
    def _get_arg_product_rules(self, a_doc_id, a_arg, a_rel, a_parses):
        """Collect syntactic production rules for one argument of a relation.

        Args:
          a_doc_id (str):
            id of the document
          a_arg (str):
            key of the argument in ``a_rel`` whose productions are wanted
          a_rel (dict):
            discourse relation holding the argument's token list
          a_parses (dict):
            parsed sentences, looked up by document id

        Returns:
          set:
            string forms of the productions that have at least one
            non-terminal on their right-hand side

        """
        productions = set()
        # group the argument's tokens by the sentence they belong to
        snt2tok = self._get_snt2tok(a_rel[a_arg][TOK_LIST])
        for snt_id, toks in snt2tok.iteritems():
            sentence = a_parses[a_doc_id][SENTENCES][snt_id]
            itree = Tree.fromstring(sentence[PARSE_TREE])
            if not itree.leaves():
                print("Invalid parse tree for sentence {:d}".format(snt_id),
                      file=sys.stderr)
                continue
            # leaf values of this sentence that belong to the argument
            arg_leaves = set(itree[itree.leaf_treeposition(itok)]
                             for itok in toks)
            # leaves already accounted for by an earlier matching subtree
            covered = set()
            # check all subtrees (not efficient, but easy to implement)
            for s_t in itree.subtrees():
                subt_leaves = set(s_t.leaves())
                if subt_leaves <= arg_leaves and not (subt_leaves <= covered):
                    # NOTE(review): productions are read from the whole
                    # sentence tree (``itree``), not from the matching
                    # subtree ``s_t`` -- confirm this is intended.
                    for prod in itree.productions():
                        if any(is_nonterminal(n) for n in prod.rhs()):
                            productions.add(str(prod))
                    covered |= subt_leaves
                # stop as soon as every argument leaf has been covered
                if covered == arg_leaves:
                    break
        return productions
Пример #57
0
def tag_phrase_tree(treebank_file, corpus):
    """Relabel the preterminals of each treebank tree and print the result.

    Every line of ``treebank_file`` is parsed as one bracketed tree.  The
    i-th preterminal (a subtree of height 2) of the tree on line ``s`` gets
    the label ``corpus[s][i][1]``, and the relabelled tree is written to
    standard output via ``NewTree.flat_print``, one tree per line.  The
    sentence index is written to standard error every tenth sentence.

    Args:
        treebank_file: path to a UTF-8 file with one bracketed tree per line.
        corpus: indexable collection such that ``corpus[s][i][1]`` is the
            new label for preterminal ``i`` of sentence ``s``.
    """
    # The context manager closes the file even if parsing or tagging fails.
    with codecs.open(treebank_file, "r", "utf-8") as f:
        for s_ind, sentence in enumerate(f):
            if s_ind % 10 == 0:
                sys.stderr.write(u"%d\n" % s_ind)
            tree = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
            # Materialise the preterminals before relabelling them.
            preterminals = list(tree.subtrees(lambda t: t.height() == 2))
            for i, preterminal in enumerate(preterminals):
                preterminal.set_label(corpus[s_ind][i][1])
            sys.stdout.write(NewTree.flat_print(tree) + u"\n")
Пример #58
0
def draw_tree(tree_string):
    """Draw a square-bracketed tree into ``tree.ps`` (currently disabled).

    The function raises as its first statement, so the drawing code that
    follows never runs; it is kept as the intended implementation.

    Args:
        tree_string: tree in bracketed notation using ``[`` and ``]``.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()

    # --- unreachable until the raise above is removed ---
    from nltk import Tree
    from nltk.draw.util import CanvasFrame
    from nltk.draw import TreeWidget

    cf = CanvasFrame()
    # Tree.fromstring is fed round brackets, so square ones are converted.
    tree = Tree.fromstring(tree_string.replace('[', '(').replace(']', ')'))
    cf.add_widget(TreeWidget(cf.canvas(), tree), 10, 10)
    cf.print_to_file('tree.ps')
    # Must be called: a bare ``cf.destroy`` only names the method.
    cf.destroy()
    def __to_dot(self):
        """Convert the tree in the output text area to DOT and display it.

        The content of ``output_text_area`` is read, stripped of newlines,
        parsed with ``Tree.fromstring`` and passed to a
        ``DotLanguageConverter``.  The converter's result then replaces the
        content of ``output_dot_text_area``.
        """
        # Newlines are removed so the widget content is parsed as one string.
        bracketed = self.output_text_area.get("1.0", END).replace("\n", "")
        tree = Tree.fromstring(bracketed)
        dot_source = DotLanguageConverter().convert(tree)

        # Replace whatever the DOT text area showed before.
        self.output_dot_text_area.delete(1.0, END)
        self.output_dot_text_area.insert(INSERT, dot_source)