Example #1
def read_treefile(hyptreefile,reftreefile):
    hfile = codecs.open(hyptreefile,"r",encoding='utf-8')
    rfile = codecs.open(reftreefile,"r",encoding='utf-8')
    scoredic = {}
    #store rtree into rtreelist, in case there is more than one reference
    rtreel = []
    for i in rfile:
        if i.strip() != "":
            rtree = Tree.fromstring(i.strip())
            rtreel.append(rtree)  # append inside the if, so blank lines are skipped
    #store hyptree into hyplist    
    htreel = []
    senl = []
    for i in hfile:
        if i.strip() != "":
            senl.append(i.strip())
        else:
            htreel.append(Tree.fromstring(" ".join(senl)))
            senl = []
            
    #loop and score
    for r in rtreel:
        for h in htreel:
            score,hword,rword= score_similarity(h,r)
            scoredic[" ".join(hword)] = score
            
    return scoredic     
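For context, this excerpt assumes the imports below (score_similarity is a project-specific helper defined elsewhere):

import codecs
from nltk import Tree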
Example #2
def _muc_read_text(s, top_node):
    # The tokenizer sometimes splits within coref tags.
    def __fix_tokenization(sents):
        for index in range(len(sents)):
            next = 1
            while sents[index].count('<COREF') != sents[index].count('</COREF>'):
                sents[index] += ' '
                sents[index] += sents[index + next]
                sents[index + next] = ''
                next += 1
        sents = filter(None, sents)
        return sents
    if s:
        tree = Tree(top_node, [])        
        if _MUC6_PARA_RE.match(s):
            for para in _MUC6_PARA_RE.findall(s):
                if para and para[0] and para[0].strip():
                    tree.append(Tree('P', []))
                    for sent in _MUC6_SENT_RE.findall(para[0]):
                        words = _MUC6_SENT_RE.match(sent[0]).group('sent').strip()
                        # There are empty sentences <s></s> in the MUC6 corpus.
                        if words:
                            tree[-1].append(_muc_read_words(words, 'S'))                
        elif _MUC7_PARA_RE.match(s):
            for para in _MUC7_PARA_SPLIT_RE.split(s):
                if para and para.strip():
                    tree.append(Tree('P', []))
                    for sent in __fix_tokenization(_SENT_TOKENIZER.tokenize(para)):
                        tree[-1].append(_muc_read_words(sent, 'S'))
        return tree
Example #3
    def test_pcfg(self):
        o = pcfg.PCFG()
        tree = Tree('S', (Tree('NP', ('foo',)), Tree('VP', ('bar',))))

        o.update_counts(tree)
        self.assertSetEqual(
                set([(p, 1) for p in tree.productions()]),
                set(o.production_counts.items()))
        self.assertSetEqual(set([(p.lhs(), 1) for p in tree.productions()]),
                set(o.lhs_counts.items()))
        o.update_counts(tree)

        tree = Tree('S', (Tree('VP', ('foo',)), Tree('NP', ('bar',))))
        o.update_counts(tree)
        o.update_counts(tree)
        self.assertEqual(6, len(o.production_counts))
        for count in o.production_counts.values():
            self.assertEqual(2, count)
        self.assertEqual(3, len(o.lhs_counts))
        for count in o.lhs_counts.values():
            self.assertEqual(4, count)

        o.compute_scores()
        for production, score in o.scored_productions.items():
            self.assertAlmostEqual(-0.69314718055, score, msg='%s' % production)
Example #4
    def match(self, tree):
        try:
            if tree.label() != 'ROOT':
                raise IndexError
            if tree[0].label() != 'SBARQ':
                raise IndexError
            if tree[0][0][0].label() != 'WRB':
                raise IndexError
            if tree[0][0][0][0].lower() != 'when':
                raise IndexError
            if tree[0][1].label() != 'SQ':
                raise IndexError
            if tree[0][1][0].label() != 'VBD':
                raise IndexError
            if tree[0][1][1].label() != 'NP':
                raise IndexError
            if tree[0][1][2].label() != 'VP':
                raise IndexError

            part = Pattern.Part()
            part.object = ParentedTree.fromstring(str(tree[0][1][1]))
            part.property = ParentedTree.fromstring(str(Tree('VP', [
                Tree.fromstring(str(tree[0][0][0])),
                Tree.fromstring(str(tree[0][1][0])),
                Tree.fromstring(str(tree[0][1][2]))
            ])))

            return [part]
        except IndexError:
            return []
Example #5
def parser_output_to_parse_deriv_trees(output):
    lines = output.strip().split("\n")
    deriv_tree_lines = lines[::2]
    parse_tree_lines = lines[1::2]

    parse_trees = [Tree.fromstring(line.replace('\x06', 'epsilon_')) for line in parse_tree_lines if line != '']
    deriv_trees = [Tree.fromstring(line) for line in deriv_tree_lines if line != '']
    return parse_trees, deriv_trees
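A quick sanity check of the line-interleaving assumption (derivation trees on even lines, parse trees on odd lines); the two-line output string here is made up:

output = "(D root)\n(S (NP foo) (VP bar))"
parse_trees, deriv_trees = parser_output_to_parse_deriv_trees(output)
print(parse_trees[0])  # (S (NP foo) (VP bar))
print(deriv_trees[0])  # (D root)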
Example #6
 def test_evalb_correctly_calculates_bracketing_metrics_over_multiple_trees(self):
     tree1 = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
     tree2 = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     evalb_scorer = EvalbBracketingScorer()
     evalb_scorer([tree1, tree2], [tree2, tree2])
     metrics = evalb_scorer.get_metric()
     assert metrics["evalb_recall"] == 0.875
     assert metrics["evalb_precision"] == 0.875
     assert metrics["evalb_f1_measure"] == 0.875
Example #7
def add_top_to_tree(treebank_file):
    f = open(treebank_file, "r")
    root_set = set([])
    for sentence in f:
        t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
        top_node = Tree("TOP", [])
        top_node.append(t)
        print(NewTree.flat_print(top_node))
    f.close()
Example #8
 def test_evalb_correctly_scores_identical_trees(self):
     tree1 = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     tree2 = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     evalb_scorer = EvalbBracketingScorer()
     evalb_scorer([tree1], [tree2])
     metrics = evalb_scorer.get_metric()
     assert metrics["evalb_recall"] == 1.0
     assert metrics["evalb_precision"] == 1.0
     assert metrics["evalb_f1_measure"] == 1.0
Example #9
File: world.py Project: pyknife/allennlp
    def _construct_node_from_actions(self,
                                     current_node: Tree,
                                     remaining_actions: List[List[str]],
                                     add_var_function: bool) -> List[List[str]]:
        """
        Given a current node in the logical form tree, and a list of actions in an action sequence,
        this method fills in the children of the current node from the action sequence, then
        returns whatever actions are left.

        For example, we could get a node with type ``c``, and an action sequence that begins with
        ``c -> [<r,c>, r]``.  This method will add two children to the input node, consuming
        actions from the action sequence for nodes of type ``<r,c>`` (and all of its children,
        recursively) and ``r`` (and all of its children, recursively).  This method assumes that
        action sequences are produced `depth-first`, so all actions for the subtree under ``<r,c>``
        appear before actions for the subtree under ``r``.  If there are any actions in the action
        sequence after the ``<r,c>`` and ``r`` subtrees have terminated in leaf nodes, they will be
        returned.
        """
        if not remaining_actions:
            logger.error("No actions left to construct current node: %s", current_node)
            raise ParsingError("Incomplete action sequence")
        left_side, right_side = remaining_actions.pop(0)
        if left_side != current_node.label():
            logger.error("Current node: %s", current_node)
            logger.error("Next action: %s -> %s", left_side, right_side)
            logger.error("Remaining actions were: %s", remaining_actions)
            raise ParsingError("Current node does not match next action")
        if right_side[0] == '[':
            # This is a non-terminal expansion, with more than one child node.
            for child_type in right_side[1:-1].split(', '):
                if child_type.startswith("'lambda"):
                    # We need to special-case the handling of lambda here, because it's handled a
                    # bit weirdly in the action sequence.  This is stripping off the single quotes
                    # around something like `'lambda x'`.
                    child_type = child_type[1:-1]
                child_node = Tree(child_type, [])
                current_node.append(child_node)  # you add a child to an nltk.Tree with `append`
                if not self.is_terminal(child_type):
                    remaining_actions = self._construct_node_from_actions(child_node,
                                                                          remaining_actions,
                                                                          add_var_function)
        elif self.is_terminal(right_side):
            # The current node is a pre-terminal; we'll add a single terminal child.  We need to
            # check first for whether we need to add a (var _) around the terminal node, though.
            if add_var_function and right_side in self._lambda_variables:
                right_side = f"(var {right_side})"
            if add_var_function and right_side == 'var':
                raise ParsingError('add_var_function was true, but action sequence already had var')
            current_node.append(Tree(right_side, []))  # you add a child to an nltk.Tree with `append`
        else:
            # The only way this can happen is if you have a unary non-terminal production rule.
            # That is almost certainly not what you want with this kind of grammar, so we'll crash.
            # If you really do want this, open a PR with a valid use case.
            raise ParsingError(f"Found a unary production rule: {left_side} -> {right_side}. "
                               "Are you sure you want a unary production rule in your grammar?")
        return remaining_actions
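A minimal self-contained sketch of the depth-first reconstruction the docstring describes, stripped of the lambda/var special cases (the action sequence and terminal set below are made up):

from nltk import Tree

def build(node, actions, terminals):
    left_side, right_side = actions.pop(0)
    assert left_side == node.label()
    if right_side.startswith('['):
        for child_type in right_side[1:-1].split(', '):
            child = Tree(child_type, [])
            node.append(child)
            if child_type not in terminals:
                actions = build(child, actions, terminals)
    else:
        node.append(right_side)  # pre-terminal: attach the terminal directly
    return actions

root = Tree('c', [])
build(root, [('c', '[<r,c>, r]'), ('<r,c>', 'f'), ('r', 'x')], terminals={'f', 'x'})
print(root)  # (c (<r,c> f) (r x))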
Example #10
def drawTrees(chart):
	for state in chart[-1]:
		if state.isParse(grammar):
			treeString = buildTreeString(state, '')
			tree = Tree.fromstring(treeString)  # Tree(s) was the NLTK 2 constructor
			print('Showing parse tree. Close window to continue.')
			tree.draw()
			ans = input('Do you want to see another parse tree?(y/n): ')
			if ans == 'n': return
	print('No more valid parses')
Example #11
File: sritg.py Project: macabot/SRITG
def extract_itg(alignments_file_name, parses_file_name, inv_extension):
    """Extract a inversion transduction grammar (ITG)
    from the given files.
    
    Keyword arguments:
    alignments_file_name -- name of file containing alignments
        between sentences in l1_file_name and l2_file_name
    parses_file_name -- name of file containing parse trees
        of the sentences in l1_file_name
    inv_extension -- extension denoting whether a node is inverted
        
    Returns a Counter of binary ITG rules and unary rules. Each ITG rule is 
    represented as the tuple (lhs, rhs), where rhs is a tuple of nodes."""
    binary_itg = Counter()
    unary_itg = Counter()
    num_lines = number_of_lines(parses_file_name)
    alignments_file = open(alignments_file_name)
    parses_file = open(parses_file_name)
    
    for i, l1_parse in enumerate(parses_file):
        if i % (num_lines // 100) == 0:  # "is 0" compared identity, not equality
            sys.stdout.write('\r%d%%' % (i * 100 // num_lines,))
            sys.stdout.flush()

        try: # TODO remove try/catch
            reordered_indexes = str_to_reordered_indexes(next(alignments_file))  # .next() was Python 2
            # remove outer brackets from Berkeley parse
            l1_parse = l1_parse.strip()
            l1_parse = l1_parse[1:len(l1_parse)-1]
            l1_parse = l1_parse.strip()
            parse_tree = Tree.fromstring(l1_parse)  # Tree(s) was NLTK 2; NLTK 3 parses via fromstring
            parse_forest = generate_forest(parse_tree, 
                reordered_indexes, inv_extension)
        except:
            error_log = open('error.log', 'a')
            error_log.write('%s -- in extract_itg/3\n' % time.asctime())
            error_log.write('line: %s\n' % i)
            error_log.write('%s\n' % l1_parse.strip())
            error_log.write('%s\n' % reordered_indexes)
            error_log.write('\n')
            error_log.close()
            print('Error in extract_itg/3. See error.log')
            raise

        binary_rules, unary_rules = extract_rules(parse_forest, 
                                                  parse_tree.leaves())
        for rule in binary_rules:
            binary_itg[rule] += 1

        for rule in unary_rules:
            unary_itg[rule] += 1

    alignments_file.close()
    parses_file.close()
    return binary_itg, unary_itg
Example #12
 def test_evalb_correctly_scores_imperfect_trees(self):
     # A change to a constituency label (VP ...) should affect scores, but a change
     # to a POS tag (NP dog) should have no effect.
     tree1 = Tree.fromstring("(S (VP (D the) (NP dog)) (VP (V chased) (NP (D the) (N cat))))")
     tree2 = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     evalb_scorer = EvalbBracketingScorer()
     evalb_scorer([tree1], [tree2])
     metrics = evalb_scorer.get_metric()
     assert metrics["evalb_recall"] == 0.75
     assert metrics["evalb_precision"] == 0.75
     assert metrics["evalb_f1_measure"] == 0.75
 def test_evalb_with_terrible_trees_handles_nan_f1(self):
     # If precision and recall are zero, evalb returns nan f1.
     # This checks that we handle the zero division.
     tree1 = Tree.fromstring("(PP (VROOT (PP That) (VROOT (PP could) "
                             "(VROOT (PP cost) (VROOT (PP him))))) (PP .))")
     tree2 = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
     evalb_scorer = EvalbBracketingScorer()
     evalb_scorer([tree1], [tree2])
     metrics = evalb_scorer.get_metric()
     assert metrics["evalb_recall"] == 0.0
     assert metrics["evalb_precision"] == 0.0
     assert metrics["evalb_f1_measure"] == 0.0
Example #14
def get_sentence_and_indexes(parsed_sentence):

  sentence_tree = Tree.fromstring(parsed_sentence)  # Tree(s)/.node are NLTK 2; use fromstring()/.label()
  if sentence_tree.label() == bitpar_top: #remove designated TOP-symbol
    sentence_tree = sentence_tree[0]
    
  rlist = [0]*len(sentence_tree.leaves())
  slist = [""]*len(sentence_tree.leaves())
  get_sentence_and_indexes_rec_helper(sentence_tree, rlist, slist)
  reordered_sentence = " ".join(slist)
  
  return reordered_sentence, rlist
Example #15
    def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library <https://allenai.github.io/hierplane/>`_.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {
                "word": span,
                "nodeType": label,
                "attributes": [label],
                "link": label
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to do span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                    "linkNameToLabel": LINK_TO_LABEL,
                    "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                    "text": span,
                    "root": hierplane_node
            }
        return hierplane_node
Example #16
def evaluate(fragments, sumfunc, condition, normalization, verbose=True, perbook=False, topfragments=False, breakdown=True, conftable=False):
	green = "\033[32m"; red = "\033[31m"; gray = "\033[0m" # ANSI codes
	names = set(map(getauthor, next(iter(fragments.values()))))  # .values()[0] was Python 2
	results = {}
	# heading
	if verbose and not perbook:
		print("\n &", 21 * " ", end=" ")
		print("&".join(a.rjust(16) for a in sorted(names)), end=" ")
		print("&\tguess &\t\t\tconfidence\\\\")
	prev = "foo.bar"
	# loop over texts to be classified
	for text in sorted(fragments):
		if perbook and getauthor(text) != getauthor(prev):
			print("\n &", 21 * " ", end=" ")
			print(" &".join("\\rotatebox{45}{%s}" % a.split(" - ")[-1].split(".")[0].replace("&","\\&") for a in sorted(fragments[text])), "\\\\")
		if verbose: print(text.split(" - ")[-1].split(".")[0][:25].replace("&","\\&").ljust(25), end=" ")
		inter = {}
		# loop over possible authors
		for author in sorted(fragments[text]):
			inter[author] = sum(map(sumfunc, filter(condition, fragments[text][author].items()))) / normalization(text, author)
		if verbose:
			for author in sorted(inter):
				if inter[author] == max(inter.values()): l, r = "\\textbf{", "}"
				else: l, r = "".ljust(8), " "
				if isinstance(inter[author], float): print(("& %s%.2f%s" % (l, inter[author], r)).rjust(16), end=" ")
				elif isinstance(inter[author], int): print(("& %s%d%s" % (l, inter[author], r)).rjust(16), end=" ")
				else: print("& %s%s%s" % (l, repr(inter[author]).rjust(8), r), end=" ")  # the format string was missing its third %s
		actualauthor = getauthor(text)
		guess = max(inter, key=inter.get)
		results.setdefault(actualauthor, []).append(guess)
		if verbose and not perbook:
			print("&", end=" ")
			print(green + "correct:" if getauthor(guess) == actualauthor else red + "wrong:  ", end=" ")
			print(getauthor(guess).ljust(10), gray, end=" ")
			try: confidence = (100 * (max(inter.values()) - sorted(inter.values())[-2]) / float(max(inter.values())))
			except ZeroDivisionError: confidence = 0.0
			except IndexError: confidence = 0.0
			print("& %s%5.2f%s " % ((red if confidence < 50 else green), confidence, gray))
		elif verbose: print("\\\\")
		prev = text
	if verbose: print()

	if topfragments: print("top fragments")
	for name in sorted(names) if topfragments else ():
		for text in sorted(fragments):
			if not getauthor(text) == name: continue
			print(text)
			for label in ("(ROOT", "(S ", "(NP ", "(VP ", "(PP "):
				guess = max(fragments[text], key=lambda x: sum(sumfunc(a) for a in fragments[text][x].items() if condition(a)) / norm(x))
				try:
					frag = max((a[0] for a in fragments[text][guess].items() if condition(a) and a[0].startswith(label)), key=lambda x: (sumfunc((x, fragments[text][guess][x])), fragments[text][guess][x]))
				except ValueError: pass
				else:
					f1 = Tree.fromstring(frag)  # Tree(s) was NLTK 2
					f2 = Tree.fromstring(frag)
					print("%2d" % fragments[text][guess][frag], " ".join(a.replace(" ", "_")[:-1] for a in re.findall(r" \)|[^ )]+\)", frag)), end=" ")
					try: f2.un_chomsky_normal_form()
					except Exception: print(f1.pformat(margin=9999, parens=("[", " ]")))
					else: print(f2.pformat(margin=9999, parens=("[", " ]")))
Example #17
def reduce_nps(sentence):
    """
    take any occurrences of NP trees that contain only one  NP tree and reduce them
    """
    res = Tree('S',[])
    for child in sentence:
        #print child
        if isinstance(child, Tree):
            #print len(child)

            if len(child) == 1:
                res.append(child[0])
                continue
        res.append(child)
    return res
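A quick usage sketch for the function above (the input tree is made up):

from nltk import Tree

sent = Tree('S', [Tree('NP', [Tree('NP', ['the', 'dog'])]), Tree('VP', ['chased', Tree('NP', ['it'])])])
print(reduce_nps(sent))  # (S (NP the dog) (VP chased (NP it)))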
Example #18
def calc(param):

    p = ["He", "he", "Him", "him", "She", "she", "Her",
        "her", "It", "it", "They", "they"]
    r = ["Himself", "himself", "Herself", "herself",
        "Itself", "itself", "Themselves", "themselves"]
    fname = param[1]
    pro = param[2]
    with open(fname) as f:
        sents = f.readlines()
    trees = [Tree.fromstring(s) for s in sents]
    pos = get_pos(trees[-1], pro)
    pos = pos[:-1]
    if pro in p:
        tree, pos = hobbs(trees, pos)
        #for t in trees:
        #    print t, '\n'        
        #print "Proposed antecedent for '"+pro+"':", tree[pos]
        return tree, tree[pos]
    elif pro in r:
        tree, pos = resolve_reflexive(trees, pos)
        #for t in trees:
        #    print t, '\n'
        #print "Proposed antecedent for '"+pro+"':", tree[pos] 
        return tree, tree[pos]  
Example #19
def parse_ccgbank_tree(s):
    # Tree.parse with parse_node/parse_leaf is the NLTK 2 API; NLTK 3 renamed it
    # to Tree.fromstring with read_node/read_leaf.
    t = Tree.fromstring(s,
                        read_node=parse_ccgbank_node,
                        read_leaf=parse_ccgbank_leaf,
                        node_pattern=ccgbank_node_pattern,
                        leaf_pattern=ccgbank_leaf_pattern)
    return excise_empty_nodes(t)
Example #20
    def test_simple_tags(self):
        grammar = "ANIMAL : {<ANIMAL>}"
        rp = ruleparser.RuleParser(grammar)

        expected = Tree.fromstring("(S el/DT (ANIMAL perro/NN/ANIMAL) ladra/VB al/DT (ANIMAL gato/NN/ANIMAL))", read_leaf=rp.from_string_token_to_tuple)
        result = rp.tag(self.text)
        self.assertEqual(result, expected)
Example #21
    def testConvert(self):
        sample_tree = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
        converter = DotLanguageConverter()
        result = converter.convert(sample_tree)  # avoid shadowing the builtin str

        expected_tree_string = ("digraph parse_tree {\n"
                                "\t\"S\" [label=\"S\"];\n"
                                "\t\"NP\" [label=\"NP\"];\n"
                                "\t\"S\"-> \"NP\";\n"
                                "\t\"I\" [label=\"I\"];\n"
                                "\t\"NP\"-> \"I\";\n"
                                "\t\"VP\" [label=\"VP\"];\n"
                                "\t\"S\"-> \"VP\";\n"
                                "\t\"V\" [label=\"V\"];\n"
                                "\t\"VP\"-> \"V\";\n"
                                "\t\"saw\" [label=\"saw\"];\n"
                                "\t\"V\"-> \"saw\";\n"
                                "\t\"NP_1\" [label=\"NP\"];\n"
                                "\t\"VP\"-> \"NP_1\";\n"
                                "\t\"him\" [label=\"him\"];\n"
                                "\t\"NP_1\"-> \"him\";\n"
                                "}")


        self.assertEqual(result, expected_tree_string)
Example #22
    def __render_tree(self):
        string = self.output_text_area.get("1.0", END)
        string = string.replace("\n", "")

        tree = Tree.fromstring(string)

        tree.draw()
Example #23
def extract_trees(filename="./out/toy_pcfg2.gen"):
    trees = []
    with open(filename) as fh:
        for line in fh:
            trees.append(Tree.fromstring(line))

    return trees
Example #24
File: delphin.py Project: ned2/typediff
 def pprint(self, **kwargs):
     """Returns a representation of the tree compatible with the LaTeX
     qtree package. Requires the nltk module. See
     http://www.nltk.org/_modules/nltk/tree.html."""
     from nltk import Tree as NLTKTree
     tree = NLTKTree.fromstring(self.ptb())
     return tree.pformat(**kwargs)  # pprint() prints (returns None) in NLTK 3; pformat() returns the string
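A hedged extra: the qtree compatibility the docstring mentions is served directly by NLTK's built-in helper (latex is a hypothetical method name):

 def latex(self):
     """Return LaTeX qtree markup using nltk.Tree.pformat_latex_qtree()."""
     from nltk import Tree as NLTKTree
     return NLTKTree.fromstring(self.ptb()).pformat_latex_qtree()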
Example #25
    def tag(self, input_tree):
        """
        Tag an input tree using the rules in parsed grammars.
        """
        #clean input tree:
        input_tree = self.clean(input_tree)
        
        text = self.from_tree_to_text(input_tree)
        #print "INPUT TEXT: "+text
        for rule in self.rules:
            rule_name = list(rule.keys())[0]  # dict views are not indexable in Python 3
            rule = list(rule.values())[0]

            matches = re.finditer(rule, text, re.I)
            for match in matches:
                match_text = match.group(rule_name)
                #strip spaces at the start and end of the matched text, so that
                #each <NAME> subtree is cleanly delimited in the resulting text
                #(optional surrounding spaces are not swallowed):
                match_text = match_text.strip()
                text = text.replace(match_text, "<"+rule_name+">")  # string.replace() is gone in Python 3
                #print "TEXT = "+text
                self.stack.append(match_text)

        #print "OUTPUT TEXT : "+text
        output_tree_str = "(S "+self.from_text_to_tree_str(text)+" )"
        #print "OUTPUT TREE STR: "+output_tree_str
        output_tree = Tree.fromstring(output_tree_str, read_leaf=self.from_string_token_to_tuple)
        return output_tree
Example #26
def main(argv):
    if len(sys.argv) == 2 and argv[1] == "demo":
        demo()
    else:
        if len(sys.argv) > 3 or len(sys.argv) < 2:
            print "Enter the file and the pronoun to resolve."
        elif len(sys.argv) == 3:
            p = ["He", "he", "Him", "him", "She", "she", "Her",
                "her", "It", "it", "They", "they"]
            r = ["Himself", "himself", "Herself", "herself",
                "Itself", "itself", "Themselves", "themselves"]
            fname = sys.argv[1]
            pro = sys.argv[2]
            with open(fname) as f:
                sents = f.readlines()
            trees = [Tree.fromstring(s) for s in sents]
            pos = get_pos(trees[-1], pro)
            pos = pos[:-1]
            if pro in p:
                tree, pos = hobbs(trees, pos)
                for t in trees:
                    print(t, '\n')
                print("Proposed antecedent for '"+pro+"':", tree[pos])
            elif pro in r:
                tree, pos = resolve_reflexive(trees, pos)
                for t in trees:
                    print(t, '\n')
                print("Proposed antecedent for '"+pro+"':", tree[pos])
 def test_construct_tree_from_spans_handles_nested_labels(self):
     # The tree construction should split the "S-NP" into (S (NP ...)).
     tree_spans = [((0, 1), 'D'), ((1, 2), 'N'), ((0, 2), 'S-NP')]
     sentence = ["the", "dog"]
     tree = self.model.construct_tree_from_spans({x:y for x, y in tree_spans}, sentence)
     correct_tree = Tree.fromstring("(S (NP (D the) (N dog)))")
     assert tree == correct_tree
Example #28
def syntactic_parse_features(paragraph, parse):
  """ Returns the count for the usage of S, SBAR units in the syntactic parse,
  plus statistics about the height of the trees  """
  KEPT_FEATURES = ['S', 'SBAR']

  # Increment the count for the part-of-speech of each head of phrase
  counts_of_heads = Counter()
  tree_heights = []
  for t_string in parse:  
    t = Tree.fromstring(t_string)
    for st in t.subtrees():
      counts_of_heads[st.label()] += 1
    tree_heights.append(t.height())

  # Keep only the head parts-of-speech that appear in KEPT_FEATURES
  features = dict(("syntactic_head_"+key, counts_of_heads[key]) for 
    key in counts_of_heads if key in KEPT_FEATURES)
  features = Counter(features)
  # Add in the features related to tree height
  features["tree_height_mean"] = np.mean(tree_heights)
  features["tree_height_median"] = np.median(tree_heights)
  features["tree_height_max"] = np.max(tree_heights)
  features["tree_height_min"] = np.min(tree_heights)
  features["tree_height_spread"] = np.max(tree_heights) - np.min(tree_heights)
  return Counter(features)
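A hypothetical call, with parse given as a list of bracketed parse strings (requires numpy as np, Counter, and nltk.Tree in scope):

feats = syntactic_parse_features("", ["(S (NP (D the) (N dog)) (VP (V barks)))"])
print(feats["syntactic_head_S"], feats["tree_height_max"])  # 1 4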
Example #29
File: wsj.py Project: davidswelt/dmvccm
def treebank_bracket_parse(t):
    try:
        return Tree.fromstring(t, remove_empty_top_bracketing=True)
    except IndexError:
        # in case it's the real treebank format,
        # strip first and last brackets before parsing
        return tree.bracket_parse(t.strip()[1:-1])
Example #30
File: util.py Project: OMARI1988/upparse
def negra_tree_iter(corpus_root):
  pieces = []
  for line in open(corpus_root):
    if line.startswith('%'):
      s = ''.join(pieces).strip()
      if len(s):
        yield Tree.fromstring(s)  # Tree.parse was the NLTK 2 name

      pieces = []

    else:
      pieces.append(line)

  if len(pieces):
    s = ''.join(pieces).strip()
    yield Tree.fromstring(s)
Example #31
    else:
        nonterm.append((lhs, rhses[0], rhses[1], math.log(prob)))

for line in open('../../data/wiki-en-short.tok'):
    print(line.strip('\n'))
    best_score = defaultdict(lambda: -1000000)
    best_edge = dict()
    words = line.strip('\n').split()
    for i in range(len(words)):
        for lhs, log_prob in preterm[words[i]]:
            best_score['{} {} {}'.format(lhs, i, i + 1)] = log_prob

    for j in range(2, len(words) + 1):
        for i in reversed(range(j - 1)):
            for k in range(i + 1, j):
                for sym, lsym, rsym, logprob in nonterm:
                    if '{} {} {}'.format(
                            lsym, i, k) in best_score and '{} {} {}'.format(
                                rsym, k, j) in best_score:
                        my_lp = best_score['{} {} {}'.format(
                            lsym, i, k)] + best_score['{} {} {}'.format(
                                rsym, k, j)] + logprob
                        if my_lp > best_score['{} {} {}'.format(sym, i, j)]:
                            best_score['{} {} {}'.format(sym, i, j)] = my_lp
                            best_edge['{} {} {}'.format(
                                sym, i, j)] = ('{} {} {}'.format(lsym, i, k),
                                               '{} {} {}'.format(rsym, k, j))
    tree = Tree.fromstring(
        print_tree('S 0 ' + str(len(words)), words, best_edge))
    print(tree)
Example #32
        sentence = temp_string

        matches = re.findall(grammar, sentence)
        sentence = re.sub(grammar, "| ", sentence)

        sentence = "(S " + sentence + ")"

        for i in range(len(matches)):
            temp_sentence = "(" + chunk_name + " "
            for j in range(len(matches[i])):
                temp_sentence += matches[i][j] + " "
            temp_sentence += ")"
            sentence = sentence.replace("|", temp_sentence, 1)
        print(sentence)

        return sentence


if __name__ == "__main__":
    parser = GrammarParse()
    # sentence = "The quick brown fox jumps over the lazy dog"
    sentence = "I was hugging an amazing spectacular dog"
    # result = parser.regExParse("NP", r"(\w*/DT)? ?(\w*/JJ)* ?(\w*/NN)", sentence)
    result = parser.regExParse(
        "VP", r"(\w*/VB\w*) ?(\w*/DT)? ?(\w*/JJ)* ?(\w*/NN) ?(\w*/RB\w?)?",
        sentence)

    tr = Tree.fromstring(result)
    print(tr)
    tr.draw()
Example #34
 def add_sibling(self, next_token):
     self.stack[-1].append(Tree(next_token, []))
Example #35
    def induce_CYK(self, sentence, show=True, beam_size=20):

        binaries = {}
        for lhs in self.proba_cfg.keys():
            for rhs in self.proba_cfg[lhs]:
                if not rhs in binaries.keys(): binaries[rhs] = set()
                binaries[rhs].add(lhs)

        lb = set([B[0] for B in binaries.keys()])
        rb = set([B[1] for B in binaries.keys()])
        bi = set(binaries.keys())
        axioms = self.axiomes
        #import pdb; pdb.set_trace()
        init_sentence = sentence
        sentence = self.oov.beam_search_decoder(sentence=init_sentence,
                                                show=show,
                                                beam_size=beam_size).split()
        n = len(sentence)

        hist_scores = [[dict() for i in range(n + 1)] for k in range(n + 1)]
        hist_track = [[dict() for i in range(n + 1)] for k in range(n + 1)]

        r_pos = [[set() for i in range(n + 1)] for k in range(n + 1)]
        l_pos = [[set() for i in range(n + 1)] for k in range(n + 1)]

        for i, word in enumerate(sentence):
            #word = word.lower()
            for A, words in self.proba_lexicons.items():
                if word in words.keys():
                    hist_scores[i][i + 1][A] = words[word]
                    if A in lb: l_pos[i][i + 1].add(A)
                    if A in rb: r_pos[i][i + 1].add(A)

        for window in range(2, n + 1):
            for start in range(n + 1 - window):
                end = start + window
                for split in range(start + 1, end):
                    left, right = hist_scores[start][split], hist_scores[
                        split][end]
                    l_int, r_int = l_pos[start][split] & lb, r_pos[split][
                        end] & rb
                    final_int = set(product(l_int, r_int)) & bi
                    for (B, C) in final_int:
                        for A in binaries[(B, C)]:
                            prob = left[B] * right[C] * self.proba_cfg[A][(B,
                                                                           C)]
                            if prob > hist_scores[start][end].get(A, 0.0):
                                hist_scores[start][end][A] = prob
                                hist_track[start][end][A] = (split, B, C)
                                if A in lb: l_pos[start][end].add(A)

                                if A in rb: r_pos[start][end].add(A)
        tree = self.get_tree(hist_track, hist_scores, 0, n,
                             init_sentence.split(), n, 'SENT')
        if tree == 'NOT IN GRAMMAR':
            ret = self.OOG(init_sentence.split()), 0
        else:
            tree = Tree.fromstring(tree)
            tree.un_chomsky_normal_form()
            ret = ' '.join(tree.pformat().split()), 1
        return ret
Example #36
def deep_copy_tree(tree):
    if type(tree) == Tree:
        return Tree(tree.label(), [deep_copy_tree(child) for child in tree])
    return tree
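A quick check that the copy is deep (the tree literal is made up):

from nltk import Tree

t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
c = deep_copy_tree(t)
c[0].set_label("X")
print(t[0].label())  # 'NP' -- mutating the copy leaves the original intact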
Example #37
def to_nltk_tree(node):
    if len(node.children) > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)
Example #38
 def convert_leaf_str_to_tree(_tree):
     if isinstance(_tree, str):
         return Tree(_tree, [])
     else:
         return Tree(_tree.label(), [convert_leaf_str_to_tree(_tree_e) for _tree_e in _tree])
Example #39
 def simplify_tree(t:Tree):
     if t.label() == "call":     # remove call, make first arg of it the parent of the other args
         assert(len(t[0]) == 0)
         # if not t[0].label().startswith("SW."):
         #     print(t)
         # assert(t[0].label().startswith("SW."))
         t.set_label(t[0].label())
         del t[0]
     elif t.label() == "string": # remove, annotate
         afterstring.update(set([tc.label() for tc in t]))
         assert(len(t) == 1)
         assert(len(t[0]) == 0)
         t.set_label(f"arg:{t[0].label()}")
         del t[0]
     if t.label().startswith("edu.stanford.nlp.sempre.overnight.SimpleWorld."):
         t.set_label("SW:" + t.label()[len("edu.stanford.nlp.sempre.overnight.SimpleWorld."):])
     if t.label() == "SW:getProperty":
         assert(len(t) == 2)
         if self.full_simplify:
             ret = simplify_tree(t[1])
             ret.append(simplify_tree(t[0]))
         else:
             children = [simplify_tree(te) for te in t]
             ret = t
             ret[:] = children
         return ret
     elif t.label() == "SW:singleton":
         assert(len(t) == 1)
         assert(len(t[0]) == 0)
         if not self.full_simplify:
             t[0].set_label(f"singleton:{t[0].label()}")
         return simplify_tree(t[0])
     elif t.label() == "SW:ensureNumericProperty":
         assert(len(t) == 1)
         # assert(len(t[0]) == 1)
         # t[0][0].set_label(f"numeric:{t[0][0].label()}")
         if self.full_simplify:
             ret = simplify_tree(t[0])
         else:
             ret = t
             ret[:] = [simplify_tree(te) for te in ret]
         return ret
     elif t.label() == "SW:ensureNumericEntity":
         assert(len(t) == 1)
         if self.full_simplify:
             ret = simplify_tree(t[0])
         else:
             ret = t
             ret[:] = [simplify_tree(te) for te in ret]
         return ret
     elif t.label() == "SW:aggregate":
         assert(len(t) == 2)
         ret = simplify_tree(t[0])
         assert(ret.label() in ["arg:avg", "arg:sum"])
         assert(len(ret) == 0)
         ret.set_label(f"agg:{ret.label()}")
         ret.append(simplify_tree(t[1]))
         return ret
     else:
         t[:] = [simplify_tree(tc) for tc in t]
         return t
Example #40
 def __call__(self, x:Tree):
     if self.p > self.rng.random_sample():
         return Tree(self.mask_symbol, [])
     else:
         return Tree(x.label(), [self(xe) for xe in x])
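A hypothetical harness for the transform above: the original __call__ is a method, so here it is wrapped in a small class whose rng is a numpy RandomState (matching the rng.random_sample() call). Note the leaves must themselves be Tree objects, as in Example #38:

import numpy as np
from nltk import Tree

class SubtreeMasker:
    def __init__(self, p, mask_symbol="@MASK@", seed=42):
        self.p = p
        self.mask_symbol = mask_symbol
        self.rng = np.random.RandomState(seed)

    def __call__(self, x: Tree):
        # replace each subtree with the mask symbol with probability p
        if self.p > self.rng.random_sample():
            return Tree(self.mask_symbol, [])
        return Tree(x.label(), [self(xe) for xe in x])

masker = SubtreeMasker(p=0.3)
print(masker(Tree('S', [Tree('a', []), Tree('b', [])])))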
Example #41
    """
    Recursively turn a tree into a binary tree.
    """
    if isinstance(tree, str):
        return tree
    elif len(tree) == 1:
        return binarize(tree[0])
    else:
        label = tree.label()
        return reduce(lambda x, y: Tree(label, (binarize(x), binarize(y))),
                      tree)
        #return reduce(lambda x, y: (binarize(x), binarize(y)), tree)


t = Tree.fromstring(
    '(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT an) (NN example)) (PP (IN of) (NP (NN tokenization)))))))'
)
bt = binarize(t)

'''
regex = re.compile('[%s]' % re.escape(string.punctuation))

def parse_sentence(sentence):
    nlp = StanfordCoreNLP(r'./stanford-corenlp-full-2018-02-27')
    sentence = regex.sub('', sentence)
Example #42
    def test_process(self):
        #Test for only stop words removal
        nltka = NLTK(stopwords_removal=True, url_tagging=True)
        nltka.set_lang("")
        self.assertEqual(
            nltka.process(
                "The striped bats are hanging on their feet for the best"),
            ["striped", "bats", "hanging", "feet", "best"])

        #Test for only stemming
        nltka.stemming = True
        nltka.stopwords_removal = False
        self.assertEqual(
            nltka.process(
                "My name is Francesco and I am a student at the University of the city of Bari"
            ), [
                "my", "name", "is", "francesco", "and", "i", "am", "a",
                "student", "at", "the", "univers", "of", "the", "citi", "of",
                "bari"
            ])
        nltka.stemming = False

        #Test for only lemmatization
        nltka.lemmatization = True
        self.assertEqual(
            nltka.process(
                "The striped bats are hanging on their feet for best"), [
                    "The", "strip", "bat", "be", "hang", "on", "their", "foot",
                    "for", "best"
                ])

        #Test for lemmatization with multiple whitespaces removal
        nltka.strip_multiple_whitespaces = True
        self.assertEqual(
            nltka.process(
                "The   striped  bats    are    hanging   on   their    feet   for  best"
            ), [
                "The", "strip", "bat", "be", "hang", "on", "their", "foot",
                "for", "best"
            ])

        #Test for lemmatization with multiple whitespaces removal and URL tagging
        nltka.url_tagging = True
        self.assertEqual(
            nltka.process(
                "The   striped http://facebook.com bats https://github.com   are   http://facebook.com hanging   on   their    feet   for  best  http://twitter.it"
            ), [
                "The", "strip", "<URL>", "bat", "<URL>", "be", "<URL>", "hang",
                "on", "their", "foot", "for", "best", "<URL>"
            ])

        # Test for lemmatization, multiple whitespaces removal, URL tagging and stemming
        nltka.stemming = True
        self.assertEqual(
            nltka.process(
                "The   striped http://facebook.com bats https://github.com   are   http://facebook.com hanging   on   their    feet   for  best  http://twitter.it"
            ), [
                "the", "strip", "<url>", "bat", "<url>", "be", "<url>", "hang",
                "on", "their", "foot", "for", "best", "<url>"
            ])

        # Test for lemmatization, multiple whitespaces removal, URL tagging, stemming, stop words removal
        nltka.stopwords_removal = True
        self.assertEqual(
            nltka.process(
                "The   striped http://facebook.com bats https://github.com   are   http://facebook.com hanging   on   their    feet   for  best  http://twitter.it"
            ), [
                "strip", "<url>", "bat", "<url>", "<url>", "hang", "foot",
                "best", "<url>"
            ])

        nltka.named_entity_recognition = True
        nltka.stopwords_removal = False
        nltka.stemming = False
        nltka.lemmatization = False
        result = nltka.process(
            "Facebook was fined by Hewlett Packard for spending 100€ to buy Cristiano Ronaldo from Juventus"
        )

        self.assertEqual(
            result,
            Tree('S', [
                Tree('PERSON', [('Facebook', 'NNP')]), ('was', 'VBD'),
                ('fined', 'VBN'), ('by', 'IN'),
                Tree('PERSON', [('Hewlett', 'NNP'), ('Packard', 'NNP')]),
                ('for', 'IN'), ('spending', 'VBG'), ('100€', 'CD'),
                ('to', 'TO'), ('buy', 'VB'),
                Tree('PERSON', [('Cristiano', 'NNP'), ('Ronaldo', 'NNP')]),
                ('from', 'IN'),
                Tree('GPE', [('Juventus', 'NNP')])
            ]))
Example #43
    type=int,
    default=1,
    help=
    "Number of processors to use, -1 means use all processors, in Windows, multiprocessing doesn't work, go for n_jobs = 1"
)

args = parser.parse_args()

print('Reading the training corpus :')

filename = args.data_file
corpus = read_corpus_pcfg(filename)

print('Binarizing the trees :')

trees = [Tree.fromstring(sentence) for sentence in corpus]
for tree in trees:
    tree.chomsky_normal_form(horzMarkov=2)
    tree.collapse_unary(True, True)

train_frac = args.train_frac
print('Training on %.2f %% of the data: ' % (100 * train_frac))

size = len(corpus)
train_size = int(train_frac * size)

train, test = corpus[:train_size], corpus[train_size:]
train_t, test_t = trees[:train_size], trees[train_size:]

if not args.test_mode:
    entername = 'frac_data_sentences.txt'
Example #44
 def parse(self, doc):
     batch = doc.to_batch(self.device)
     output = self.forward(batch)
     tree = output['tree'][0]
     tree = Tree.fromstring(tree)
     return tree
Example #45
def testing():
    converter = TreesConverter()
    t = Tree('?', [Tree('a', []), Tree('b', [])])
    converted = converter.convert_tree(t)
    backwards = converter.reverse_convert_tree(converted)
    print(t, converted, backwards)
Example #46
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_,
                    [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_
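A typical way to drive the converter above with spaCy (assumes the en_core_web_sm model is installed):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog")
for sent in doc.sents:
    tree = to_nltk_tree(sent.root)
    tree.pretty_print()  # the root has children here, so this is a Tree, not a bare str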
Example #47
from nltk import Tree
from nltk import induce_pcfg
import pickle
from nltk import Nonterminal

terminal_dict = {}
non_terminal_dict = {}
with open("TrainingTree.txt", 'r') as f:
    lines = f.readlines()
    lines = [line.replace('[','(').replace(']',')') for line in lines]
    rules = []
    for line in lines:
        t = Tree.fromstring(line)
        rules += t.productions()
    S = Nonterminal('S')
    grammar = induce_pcfg(S, rules)
    print(grammar)

with open("grammar.pkl", 'wb') as pickle_file:
    pickle.dump(grammar, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
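A hedged follow-up: the induced PCFG can drive NLTK's Viterbi parser, provided the grammar's terminals cover the input tokens (the sentence below is hypothetical):

from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
for tree in parser.parse("the dog barks".split()):
    tree.pretty_print()
    break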
Example #48
from nltk import Tree, RegexpChunkParser
from nltk.chunk import RegexpParser
from nltk.chunk.regexp import ChunkString, ChunkRule, ChinkRule

s = [('the', 'DT'), ('book', 'NN'), ('has', 'VBZ'), ('many', 'JJ'),
     ('chapters', 'NNS')]
# forth
chunker = RegexpParser(r'''
NP:
    {<DT><NN.*><.*>*<NN.*>}
    }<VB.*>{''')

print(chunker.parse(s))

# back
t = Tree('S', s)
cs = ChunkString(t)
print(cs)

ur = ChunkRule('<DT><NN.*><.*>*<NN.*>', 'chunk determiners and nouns')
ur.apply(cs)
print(cs)

ir = ChinkRule('<VB.*>', 'chink verbs')
ir.apply(cs)
print(cs)

print(cs.to_chunkstruct())
# cs.to_chunkstruct().draw()

chunker = RegexpChunkParser([ur, ir])
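Bundling the two rules into the parser object mirrors the manual ChunkString walk-through above; applied to a fresh copy of the tagged sentence it yields the same chunk structure:

print(chunker.parse(Tree('S', s)))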
Example #49
def get_labels_from_tree(x:Tree):
    ret = {x.label()}
    for child in x:
        ret |= get_labels_from_tree(child)
    return ret
Example #50
def is_proper_name(noun_phrase:nltk.Tree):
    for word, typ in noun_phrase.leaves():
        if (not word.istitle()) and (typ not in []):  # empty list: placeholder for POS tags exempt from the title-case check
            return False
    return True
Example #51
        else:
            ret.append(t)
        prev_t = t
    return ret


with gzip.open(sys.argv[1]) as f:
    while 1:
        line = f.readline()
        if not line:
            break
        line = line.strip()

        if MODE == HEAD:
            hl = remove_digits(fix_paren(line))
            headline = Tree.fromstring(hl).leaves()
            MODE = NEXT

        if MODE == TEXT:
            article_parse.append(remove_digits(fix_paren(line)))

        if MODE == SENT and re.match(r'<token id=\"[\d]+\">', line):
            words.append(f.readline().strip().replace("<word>", "").replace(
                "</word>", ""))
            lemmas.append(f.readline().strip().replace("<lemma>", "").replace(
                "</lemma>", ""))
            for _ in range(3):
                f.readline()
            ners.append(f.readline().strip().replace("<NER>",
                                                     "").replace("</NER>", ""))
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node),
                    [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)
Example #53
                                "??", "").replace("- ", "").replace(">>",
                                                                    "") + " "
                    sentences = all.replace("-", "").replace(";", ".").replace(
                        "?", ".").replace("!", ".").lower().split(".")
                    for sentence in sentences:
                        sentence = unicode(sentence, errors='ignore')
                        total += 1

                        output = nlp_stan.annotate(
                            sentence,
                            properties={
                                'annotators':
                                'tokenize,ssplit,pos,depparse,parse',
                                'outputFormat': 'json'
                            })
                        t = Tree.fromstring(output['sentences'][0]['parse'])
                        height = t.height() - 1

                        total_comp += height
                        doc = nlp(sentence)
                        sub_toks = [
                            tok for tok in doc if (tok.dep_ in SUBJECTS)
                        ]
                        obj_toks = [
                            tok for tok in doc if (tok.dep_ in OBJECTS)
                        ]
                        for sub in sub_toks:
                            sub = str(sub)
                            zir_sub = sub.split(" ")
                            for zir in zir_sub:
                                if zir in female_cat:
Example #54
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        parsed_child_nodes = [to_nltk_tree(child) for child in node.children]
        return Tree(node.orth_, parsed_child_nodes)
    else:
        return node.orth_
    def _rename_tags(self, tree):
        """
        Rename the tags and restructure the tree to cnf with
        Roark factorization.
        Find duplicates in leaves and make them unique.
        Add information about heads.
        """

        def _put_head_on_subtree(tree, leaf_nodes):
            """
            Put the head information on the correct subtree labels
            :count int: assign unique ident to duplicated leaves (this is needed later for correct parsing)
            """
            oldlabel = tree.label()
            simplified_label = re.split("-", oldlabel)[0]
            head, head_pos = None, None
            for subtree in tree:
                #subtree is a terminal
                if subtree[0] in leaf_nodes:
                    #create head and make head lowercase
                    temp_head, temp_head_pos = subtree[0].lower(), subtree.label()

                    #check whether the head should project higher up
                    if PHRASES.get(simplified_label) and temp_head_pos and temp_head_pos[0] in PHRASES[simplified_label]:
                        head, head_pos = temp_head, temp_head_pos

                    temp_newlabel = "{}|SPL{}#MID{}|SPL".format(temp_head_pos, temp_head, temp_head_pos)
                    subtree.set_label(temp_newlabel)

                #subtree is not a terminal
                else:
                    temp_head, temp_head_pos = _put_head_on_subtree(subtree, leaf_nodes)

                    if PHRASES.get(simplified_label) and temp_head_pos and temp_head_pos[0] in PHRASES[simplified_label]:
                        head, head_pos = temp_head, temp_head_pos

            #if we found a head in one of the subtrees
            if head:

                #create new label with head info
                newlabel = "{}|SPL{}#MID{}|SPL".format(oldlabel, head, head_pos)
                tree.set_label(newlabel)

            return head, head_pos

        def _recurse_tags(tree, parent, sibling, branches):

            #Obtain the terminal nodes of this tree
            leaves = tree.leaves()

            # Check if the queue of branches that need to be processed
            # is filled. If this is the case, processing the branches
            # takes top priority.

            current_label = tree.label()
            tree.set_label(current_label)

            for subtree in tree:
                try:
                    current_label = subtree.label()
                except AttributeError:
                    pass
                else:
                    subtree.set_label(current_label)


            if branches != []:
                newparent   = str(tree.label())
                newlabel    = "{}^{}".format(parent, sibling)
                lefttree    = branches.pop(0)

                # Construct both branch sides of the tree
                leftside  = _recurse_tags(tree, newparent, None, [])
                rightside = _recurse_tags(lefttree, newlabel,
                                          newparent, branches)

                return "({} {} {})".format(newlabel, leftside, rightside)
            # Else if the current rule in the tree maps to more than
            # 2 children, put the branches in a queue.
            elif len(tree) > 2:
                # branches on the queue are all branches that will be nested:
                # These are the 3rd+ branch in a tree. The new parent is the
                # current tree label.
                branches    = [tree[i] for i in range(2, len(tree))]
                newparent   = str(tree.label())
                newsibling  = str(tree[0].label())

                # The rightmost branch will be written like a normal tree.
                # The left side will get nested and labels will be rewritten.
                leftside  = _recurse_tags(tree[0], newparent, None, [])
                rightside = _recurse_tags(tree[1], newparent,
                                          newsibling, branches)

                # If the current node is on the left hand side or does
                # not have a parent, do not change the label. Else,
                # reformat the label to <parent>^<sibling>
                if parent is None or sibling is None:
                    return "({} {} {})".format(
                                str(tree.label()), leftside, rightside)
                else:
                    newlabel = "{}^{}".format(parent, sibling)
                    return "({}({} {} {}))".format(
                                newlabel, str(tree.label()),
                                leftside, rightside)
            # If a rule is binary, check for whether we are on
            # the right or left branch of the tree.
            elif len(tree) == 2:
                # If on the left branch, the new label will be the
                # current label of the tree.
                if sibling is None:
                    newlabel = str(tree.label())
                    newsibling = str(tree[0].label())

                    # Recursion, change tags of the subtrees
                    leftside  = _recurse_tags(tree[0], newlabel, None, [])
                    rightside = _recurse_tags(tree[1], newlabel, newsibling, [])

                    return "({} {} {})".format(newlabel, leftside, rightside)
                # If on the right branch, the new label will be
                # reformatted to <parent>^<sibling>, which will
                # then be rewritten as the current label.
                else:
                    newlabel = "{}^{}".format(parent, sibling)
                    newparent = str(tree.label())
                    newsibling = str(tree[0].label())

                    # Recursion, change tags of the subtrees
                    leftside  = _recurse_tags(tree[0], newparent, None, [])
                    rightside = _recurse_tags(tree[1], newparent, newsibling, [])
                    return "({} ({} {} {}))".format(
                                newlabel, newparent, leftside, rightside)
            # If a rule is unary, first check whether the rule leads
            # to a nonterminal symbol.
            else:
                # If the unary rule leads to a nonterminal symbol
                if tree[0] not in leaves:
                    subtree = _recurse_tags(tree[0], tree.label(), None, [])
                    if sibling is None:
                        return "({} {})".format(tree.label(), subtree)
                    else:
                        newlabel = "{}^{}".format(parent, sibling)
                        return "({} ({} {}))".format(
                                    newlabel, tree.label(), subtree)
                # Else if the unary rule leads to a terminal symbol
                elif sibling is None:
                    newlabel = tree.label()
                    new_word = tree[0].lower()
                    word_label = re.split(r"\|SPL", newlabel)[0]
                    # Apply the character-replacement table once up front.
                    subbed_word = pattern.sub(
                        lambda m: replacing[re.escape(m.group(0))], new_word)
                    lemma = lemmatize(subbed_word, word_label)
                    # Store the lemma, and the surface form too if they differ.
                    self._tagged_words.update([(lemma, word_label)])
                    if lemma != subbed_word:
                        self._tagged_words.update([(subbed_word, word_label)])
                    # Disambiguate repeated words with a #<n> suffix.
                    if new_word in set_of_dupls:
                        new_word = "".join([new_word, "#", str(unique_counts[0])])
                        unique_counts[0] += 1

                    return "({} {})".format(
                        pattern.sub(lambda m: replacing[re.escape(m.group(0))], newlabel),
                        pattern.sub(lambda m: replacing[re.escape(m.group(0))], new_word))
                else:
                    newlabel = "{}^{}".format(parent, sibling)
                    new_word = tree[0].lower()
                    # Split on |SPL because terminal nodes should not store
                    # info about lexical heads.
                    word_label = re.split(r"\|SPL", tree.label())[0]
                    subbed_word = pattern.sub(
                        lambda m: replacing[re.escape(m.group(0))], new_word)
                    lemma = lemmatize(subbed_word, word_label)
                    self._tagged_words.update([(lemma, word_label)])
                    if lemma != subbed_word:
                        self._tagged_words.update([(subbed_word, word_label)])
                    if new_word in set_of_dupls:
                        new_word = "".join([new_word, "#", str(unique_counts[0])])
                        unique_counts[0] += 1
                    return "({} ({} {}))".format(
                                newlabel, tree.label(),
                                pattern.sub(lambda m: replacing[re.escape(m.group(0))], new_word))

        # Start recursion
        leaf_nodes = tree.leaves()
        if self._lexicalized:
            _put_head_on_subtree(tree, leaf_nodes)

        # Words occurring more than once get a disambiguating #<n> suffix.
        temp_counter = Counter([x.lower() for x in leaf_nodes])
        set_of_dupls = {x for x in temp_counter if temp_counter[x] > 1}

        unique_counts = [0]

        string = _recurse_tags(tree, None, None, [])

        return Tree.fromstring(string)
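
For readers tracing the relabeling above: the core idea is that a right-hand
branch gets rewritten as <parent>^<sibling>. A minimal, self-contained sketch
of that annotation on a plain nltk.Tree (the annotate helper below is
illustrative only; it omits the lexicalization and character replacement of
the class method above):

from nltk import Tree

def annotate(tree, parent=None, sibling=None):
    # Toy parent^sibling relabeling for binary rules (illustrative only).
    if not isinstance(tree, Tree):
        return tree
    label = tree.label()
    if parent is not None and sibling is not None:
        label = "{}^{}".format(parent, sibling)
    if len(tree) == 2:
        left = annotate(tree[0], tree.label(), None)
        right = annotate(tree[1], tree.label(), tree[0].label())
        return Tree(label, [left, right])
    return Tree(label, [annotate(child, tree.label(), None) for child in tree])

t = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
print(annotate(t))  # the VP node is relabeled S^NP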
示例#56
0
        if len(item) < 1:
            i = i + 1
            sub_trees.append("")
        else:
            sub_trees[i] = sub_trees[i] + " " + item

    # Collapse internal whitespace and drop empty entries.
    sub_trees = [' '.join(item.split()) for item in sub_trees]
    sub_trees = [t for t in sub_trees if t != '']

    return sub_trees


target = "I know this has already been answered, but I wanted to share a potentially better looking way to call Popen via the use of from x import x and functions."

sub_tree = getSentenceRelations("S", target)

# raw_parse returns an iterator of nltk.Tree objects; take the first (best) parse.
root = next(parser.raw_parse(target))
root[0].pretty_print()

for item in sub_tree:
    tree = Tree.fromstring(item)
    tree[0].pretty_print()
示例#57
0
from nltk import RegexpParser, Tree
from pos_tagged_oz import pos_tagged_oz

# define adjective-noun chunk grammar here
chunk_grammar = "AN: {<JJ><NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# chunk the pos-tagged sentence at index 282 in pos_tagged_oz here
scaredy_cat = chunk_parser.parse(pos_tagged_oz[282])

print(scaredy_cat)

# pretty_print the chunked sentence here
Tree.fromstring(str(scaredy_cat)).pretty_print()
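
As a follow-up, the AN chunks can be pulled out of the parse result directly
with Tree.subtrees. A minimal sketch, with an inline tagged sentence standing
in for pos_tagged_oz[282] (the real corpus line is assumed, not shown):

from nltk import RegexpParser

# Stand-in for pos_tagged_oz[282]; the real corpus sentence is assumed.
tagged = [("the", "DT"), ("scaredy", "JJ"), ("cat", "NN"), ("ran", "VBD")]
chunked = RegexpParser("AN: {<JJ><NN>}").parse(tagged)

# Keep only the AN subtrees and reduce each to its (adjective, noun) words.
an_chunks = [tuple(word for word, tag in st.leaves())
             for st in chunked.subtrees(filter=lambda st: st.label() == "AN")]
print(an_chunks)  # [('scaredy', 'cat')]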
示例#58
0
from nltk import Tree

def generate_parse_tree(node):
    # node is a spaCy Token; build an nltk.Tree from its dependency children.
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_,
                    [generate_parse_tree(child) for child in node.children])
    else:
        return node.orth_
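
A possible driver, assuming spaCy and the en_core_web_sm model are installed
(this usage code is not part of the original snippet):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")
for sent in doc.sents:
    generate_parse_tree(sent.root).pretty_print()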
示例#59
0
import nltk
from nltk.tree import ParentedTree as PTree  # assumption: PTree is nltk's ParentedTree

def tree_to_ptree(tree: nltk.Tree):
    # Round-trip through the bracketed string form to attach parent pointers.
    tree_str = str(tree)
    ptree = PTree.fromstring(tree_str)
    return ptree
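
A brief usage sketch, assuming PTree is ParentedTree (which adds parent and
sibling accessors that a plain Tree lacks):

t = nltk.Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
pt = tree_to_ptree(t)
print(pt[0].parent().label())   # S
print(pt[0].right_sibling())    # (VP (VBD sat))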
def main():
    parser = argparse.ArgumentParser(
        description="ignore input; make a demo grammar that is compliant in form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input file (ignored)")
    parser.add_argument("--outfile",
                        "-o",
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file (grammar)")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    """
	#workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__), dir=os.getenv('TMPDIR', '/tmp'))

	def cleanwork():
		shutil.rmtree(workdir, ignore_errors=True)
	if args.debug:
		print(workdir)
	else:
		atexit.register(cleanwork)
	"""

    infile = prepfile(args.infile, 'r')
    outfile = prepfile(args.outfile, 'w')

    # S holds one bracketed tree string per input line (train.trees.pre.unk)
    S = [line for line in infile]

    from nltk import Tree
    from collections import defaultdict
    D = defaultdict(int)
    d = defaultdict(int)

    # for each tree in S, count rule and head occurrences
    for line in S:
        t = Tree.fromstring(line)

        # for each production in t
        for prod in t.productions():
            rule = str(prod)
            # D stores the count of each distinct rule
            D[rule] += 1
            # d stores the count of each rule's head (left-hand side)
            d[rule.split()[0]] += 1

    # Used to find the rule with the highest frequency
    W = []
    for w in sorted(D, key=D.get, reverse=True):
        W.append([w, D.get(w)])
    # Rule with max frequency
    #print(W[0])

    # stores a list of all distinct rules with their frequencies
    W_new = []
    for w in D:
        W_new.append([w, D.get(w)])

    # stores a list of all rules and their probabilities:
    # P(A -> beta) = count(A -> beta) / count(A)
    H = []
    # for each distinct rule
    for w in W_new:
        r = d.get(w[0].split()[0])
        H.append([w[0], round(float(D.get(w[0])) / r, 7)])

    # stores the grammar rules along with their probabilities
    cfg = []
    for i in H:
        rule = "{} # {}".format(i[0].replace("'", ""), i[1])
        #print(rule)
        cfg.append(rule)

    for i in cfg:
        outfile.write(i)
        outfile.write('\n')
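
For reference, the same relative-frequency estimate, P(A -> beta) =
count(A -> beta) / count(A), is what nltk's induce_pcfg computes. A minimal
sketch on two made-up trees standing in for the training file:

from nltk import Tree
from nltk.grammar import Nonterminal, induce_pcfg

# Two tiny example trees standing in for the real training data.
trees = [Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))"),
         Tree.fromstring("(S (NP (NN dogs)) (VP (VBD ran)))")]

productions = [p for t in trees for p in t.productions()]
grammar = induce_pcfg(Nonterminal("S"), productions)
for prod in grammar.productions():
    print(prod)  # each rule annotated with its relative-frequency probability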