Exemplo n.º 1
0
def compare_step(tree: Tree, template: Tree) -> bool:
    """Check whether ``tree`` matches ``template`` at this level.

    Rules, applied in order:

    * the bare string ``'*'`` as template matches anything;
    * a template node labelled ``'*'`` matches when ``compare_star_step``
      accepts ``tree`` for every child of that template node;
    * if either side is a leaf string, the two match only when equal;
    * otherwise the labels must be equal and *every* child of the template
      must match at least one candidate returned by ``get_node_by_label``.

    The previous implementation returned from inside the child loop during
    its first iteration, so only the first template child was ever checked.

    Args:
        tree: Tree node (or leaf string) being tested.
        template: Pattern node (or leaf string) to match against.

    Returns:
        ``True`` when ``tree`` satisfies ``template``, ``False`` otherwise.
        A labelled template without children never matches (unchanged from
        the previous behaviour).
    """
    if template == '*':
        return True
    if not isinstance(template, str) and template.label() == '*':
        # all() short-circuits exactly like the former ``res and ...`` chain.
        return all(compare_star_step(tree, temp_node) for temp_node in template)
    if isinstance(tree, str) or isinstance(template, str):
        return tree == template
    if tree.label() != template.label():
        return False

    matched_any = False
    for t_node in template:
        candidates = get_node_by_label(tree, t_node)
        # An empty candidate set makes any() False, i.e. "no match".
        if not any(compare_step(node, t_node) for node in candidates):
            return False
        matched_any = True
    return matched_any
Exemplo n.º 2
0
    def _get_gold_spans(
        self, tree: Tree, index: int, typed_spans: Dict[Tuple[int, int], str]
    ) -> int:
        """
        Walk an nltk `Tree` and record the label of every constituent span.

        Spans are inclusive `(start, end)` pairs. When several nested
        constituents cover the same span, their labels are joined with `-`
        in parent-child order, e.g. `(S (NP (D the) (N man)))` yields the
        label `S-NP` for the outer span.

        TODO(Mark): If we encounter a gold nested labelling at test time
        which we haven't encountered, we won't be able to run the model
        at all.

        # Parameters

        tree : `Tree`, required.
            The (sub)tree to extract spans from.
        index : `int`, required.
            Offset at which this subtree starts in the sentence.
        typed_spans : `Dict[Tuple[int, int], str]`, required.
            Mapping from span to label; it is updated in place.

        # Returns

        end : `int`
            The offset just past the last leaf covered by `tree`.
        """
        # A node whose first child is a string holds words only. Its label
        # is a POS tag, so no span is recorded; we only advance the offset
        # by the number of children.
        if isinstance(tree[0], str):
            return index + len(tree)

        cursor = index
        for subtree in tree:
            # Each recursive call fills typed_spans and reports where the
            # next sibling starts.
            cursor = self._get_gold_spans(subtree, cursor, typed_spans)

        inclusive_span = (index, cursor - 1)
        inner_label = typed_spans.get(inclusive_span)
        if inner_label is not None:
            # A descendant already claimed this span; children are visited
            # first, so the current (outer) label goes in front.
            typed_spans[inclusive_span] = tree.label() + "-" + inner_label
        else:
            typed_spans[inclusive_span] = tree.label()
        return cursor
Exemplo n.º 3
0
def remove_punctuation(tree: Tree) -> list:
    """Return a copy of ``tree`` without punctuation pre-terminals.

    A node with exactly one child that is not a ``Tree`` is treated as a
    pre-terminal: it is dropped (``None`` is returned) when its label is in
    ``_PUNCTUATION_TAGS`` and returned unchanged otherwise.  Any other node
    is rebuilt from the children that survive.

    Note that, despite the annotation, the result is a ``Tree`` or ``None``.
    """
    is_preterminal = len(tree) == 1 and not isinstance(tree[0], Tree)
    if is_preterminal:
        return None if tree.label() in _PUNCTUATION_TAGS else tree

    kept = []
    for child in tree:
        pruned = remove_punctuation(child)
        if pruned is not None:
            kept.append(pruned)
    return Tree(tree.label(), kept)
Exemplo n.º 4
0
def get_node_by_label(tree: Tree, template: Tree):
    """Select the children of ``tree`` that ``template`` may match.

    When ``template`` is a plain string or is labelled ``"*"`` the ``tree``
    object itself is returned (not a list).  Otherwise the result is a list
    of the non-string children of ``tree`` whose label equals the
    template's label, in their original order.
    """
    if type(template) == str or template.label() == "*":
        return tree
    wanted = template.label()
    return [
        node
        for node in tree
        if type(node) != str and node.label() == wanted
    ]
Exemplo n.º 5
0
            def tree2dict(tree: Tree):
                """Convert ``tree`` into nested dictionaries.

                Every node becomes ``{'type': <label>}``.  A ``TOKEN`` node
                additionally gets ``'id'`` (its first child converted with
                ``int``); any other node with children gets ``'children'``,
                where sub-trees are converted recursively and other items
                are kept unchanged.
                """
                node_type = tree.label()
                converted = []
                for item in tree:
                    if isinstance(item, Tree):
                        converted.append(tree2dict(item))
                    else:
                        converted.append(item)

                as_dict = {'type': node_type}
                if node_type == 'TOKEN':
                    as_dict['id'] = int(converted[0])
                elif converted:
                    as_dict['children'] = converted
                return as_dict
Exemplo n.º 6
0
def makeSyntacticTransfer(
        tree, rootPOS, firstChildPOS, secondChildPOS,
        translate):  #translate is boolean, True translates to Spanish
    """Reorder (and optionally translate) ``tree`` in place.

    Inside every node labelled ``rootPOS``, a child labelled
    ``secondChildPOS`` that directly follows a child labelled
    ``firstChildPOS`` is swapped with it.  The function then recurses into
    the children of any node whose height is greater than 2.

    For nodes of height 2 or less, when ``translate`` is true, the first
    leaf is looked up in ``translationsToSpanish`` and, if present, a new
    node carrying the translated word is built.

    Args:
        tree: Tree node to process; modified in place.
        rootPOS: Label of the nodes whose children may be swapped.
        firstChildPOS: Label of the child expected first in the source order.
        secondChildPOS: Label of the child expected second in the source order.
        translate: When true, leaf words are replaced by their translation.

    Returns:
        The (possibly replaced) node for nodes of height <= 2 when
        ``translate`` is true; ``None`` in every other case.
    """
    numChildren = len(tree)
    if tree.label() == rootPOS:
        previous_was_first = False
        for index in range(numChildren):
            if tree[index].label() == secondChildPOS and previous_was_first:
                tree[index - 1], tree[index] = tree[index], tree[index - 1]
            # Deliberately re-read after a possible swap: the moved
            # firstChildPOS node now sits at ``index`` and may be swapped
            # again with the next child.
            previous_was_first = tree[index].label() == firstChildPOS
    if tree.height() > 2:
        for index in range(numChildren):
            newTree = makeSyntacticTransfer(tree[index], rootPOS,
                                            firstChildPOS, secondChildPOS,
                                            translate)
            if newTree is not None:
                tree[index] = newTree
    elif translate:
        word = tree.leaves()[0]
        # ``dict.has_key`` was removed in Python 3; ``in`` works everywhere.
        if word in translationsToSpanish:
            tree = Tree(tree.label(), [translationsToSpanish[word]])
        return tree
Exemplo n.º 7
0
    def uncollapse(self, tree):
        """Rebuild ``tree`` with collapsed unary chains expanded again.

        String children are copied unchanged.  For any other child, nodes
        labelled ``SeqTree.EMPTY_LABEL`` are skipped by descending into
        their last child while they are non-empty.  A label containing
        ``'+'`` (e.g. ``"A+B"``) is then split into a chain of nested
        single-child nodes (``A`` over ``B``) whose innermost node receives
        the child's own children.  The result is processed recursively.

        Returns a new ``Tree`` carrying the label of ``tree``.
        """
        text_types = (type(u''), type(""))
        rebuilt = []
        for child in tree:
            if type(child) in text_types:
                rebuilt.append(child)
                continue

            # It also removes EMPTY nodes
            while child.label() == SeqTree.EMPTY_LABEL and len(child) != 0:
                child = child[-1]

            label = child.label()
            if '+' not in label:
                # Nothing collapsed here, just recurse into the child.
                rebuilt.append(self.uncollapse(child))
                continue

            parts = label.split('+')
            chain_top = Tree(parts[0], [])
            innermost = chain_top
            for unary in parts[1:]:
                nested = Tree(unary, [])
                innermost.append(nested)
                innermost = nested
            innermost.extend(child)
            rebuilt.append(self.uncollapse(chain_top))

        return Tree(tree.label(), rebuilt)
Exemplo n.º 8
0
def attach_tree(head,dep,attachment,chain,indexes,flag,coindex=None):
    #head,dep: trees; flag: 'right'/'left'
    """Attach dep's projection chain to head's projection chain.

    The attachment site is found by walking down ``head`` along
    ``indexes``; its depth is derived from the position of the attachment
    label in ``chain[0]`` (a sequence of ``(label, count)`` pairs) minus
    the attachment offset.  ``dep`` is then inserted to the right or to
    the left of the child at ``indexes[depth - 1]``.

    The node lookup used to be done with ``eval`` on a string-built index
    expression; it is now a plain loop with identical results.

    Args:
        head: Tree to attach into; modified in place.  When its label is
            ``'PRN'`` the walk starts at ``head[0]``.
        dep: Tree to attach.
        attachment: Dict with ``'label'``/``'offset'`` (or
            ``'label2'``/``'offset2'`` when ``coindex`` is an int) and an
            optional ``'f_tag'``.
        chain: Projection chain of the head; only ``chain[0]`` is read.
        indexes: Child indexes of the head path; modified in place when
            ``flag`` is ``'left'``.
        flag: ``'right'`` or ``'left'``.
        coindex: If an int, ``dep`` is replaced by a node with the same
            label and the single leaf ``'*-<coindex>'``.

    Returns:
        ``(head, indexes)``, or the string ``"Invalid flag!"`` when
        ``flag`` is neither ``'right'`` nor ``'left'``.
    """
    if isinstance(coindex, int):  # handle coindex tag
        label = attachment['label2']
        offset = attachment['offset2']
        dep = Tree(dep.label(), ['*-' + str(coindex)])
    else:
        label = attachment['label']
        offset = attachment['offset']

    l_index = [l[0] for l in chain[0]].index(label)
    count = sum([l[1] for l in chain[0]][:l_index + 1]) - offset
    if flag == 'right':
        a_index = indexes[count - 1] + 1
    elif flag == 'left':
        a_index = indexes[count - 1]
        indexes[count - 1] += 1
    else:
        return "Invalid flag!"

    # Walk down to the node that receives the dependent.
    target = head[0] if head.label() == 'PRN' else head
    for i in range(count - 1):
        target = target[indexes[i]]
    target.insert(a_index, dep)  # insert() vs pop()

    if 'f_tag' in attachment:
        if attachment['f_tag'] not in {'PRD', 'PRDs'}:
            labelled = target
        else:
            # PRD tags go one level lower, on the child at the (possibly
            # just shifted) head index.
            labelled = target[indexes[count - 1]]
        labelled.set_label(labelled.label() + "-" + attachment['f_tag'])
    return head, indexes
Exemplo n.º 9
0
    def _uncollapse(self, tree):
        """Rebuild ``tree`` with collapsed unary chains expanded again.

        String children are copied unchanged.  A child whose label contains
        ``self.join_char`` is split on that character into a chain of
        nested single-child nodes; the innermost node receives the child's
        own children.  Every non-string child is processed recursively.

        Returns a new ``Tree`` carrying the label of ``tree``.
        """
        text_types = (type(u''), type(""))
        rebuilt = []
        for child in tree:
            if type(child) in text_types:
                rebuilt.append(child)
                continue

            label = child.label()
            #NEWJOINT
            if self.join_char not in label:
                # Nothing collapsed here, just recurse into the child.
                rebuilt.append(self._uncollapse(child))
                continue

            parts = label.split(self.join_char)
            chain_top = Tree(parts[0], [])
            innermost = chain_top
            for unary in parts[1:]:
                nested = Tree(unary, [])
                innermost.append(nested)
                innermost = nested
            innermost.extend(child)
            rebuilt.append(self._uncollapse(chain_top))

        return Tree(tree.label(), rebuilt)
Exemplo n.º 10
0
    def _get_gold_spans(
            self,  # pylint: disable=arguments-differ
            tree: Tree,
            index: int,
            typed_spans: Dict[Tuple[int, int], str]) -> int:
        """
        Record the label of every span of an nltk ``Tree`` in ``typed_spans``.
        Spans are inclusive ``(start, end)`` pairs.

        Parameters
        ----------
        tree : ``Tree``, required.
            The (sub)tree to extract spans from.
        index : ``int``, required.
            Offset at which this subtree starts in the sentence.
        typed_spans : ``Dict[Tuple[int, int], str]``, required.
            Mapping from span to label; it is updated in place. Nodes whose
            first child is a string are stored as ``"<label>-POS"``; a later
            write to the same span overwrites the earlier one.

        Returns
        -------
        end : ``int``
            The offset just past the last leaf covered by ``tree``.
        """
        if isinstance(tree[0], str):
            # Word-level node: nltk defines len() as the number of
            # children, so this advances by the number of words. The POS
            # span is recorded with a "-POS" suffix so it can be told
            # apart from constituent spans.
            stop = index + len(tree)
            span_label = tree.label() + "-POS"
        else:
            stop = index
            for subtree in tree:
                # typed_spans is filled in by the recursive call.
                stop = self._get_gold_spans(subtree, stop, typed_spans)
            span_label = tree.label()

        # ``stop`` is exclusive, the stored span is inclusive.
        typed_spans[(index, stop - 1)] = span_label
        return stop
Exemplo n.º 11
0
 def rec(node: Tree):
     """Collect the distinct string leaves below ``node`` into ``res``.

     Sub-trees are visited depth first.  A string child is appended to the
     enclosing ``res`` list unless the label of its parent is listed in
     ``self.nonsense_attr`` or the string is already present.
     """
     for item in node:
         if isinstance(item, Tree):
             rec(item)
             continue
         if not isinstance(item, str):
             continue
         if node.label() in self.nonsense_attr or item in res:
             continue
         res.append(item)
Exemplo n.º 12
0
def _gen_pos_tags(tree: Tree) -> List[str]:
    """Yield the POS tags of an NLTK tree from left to right.

    A node with exactly one non-``Tree`` child is a POS node; its label is
    yielded unless the word starts with ``"*"``.  This is a generator, so
    wrap the call in ``list(...)`` to obtain a list.
    """
    is_pos_node = len(tree) == 1 and not isinstance(tree[0], Tree)
    if not is_pos_node:
        for subtree in tree:
            yield from _gen_pos_tags(subtree)
        return
    if not tree[0].startswith("*"):
        yield tree.label()
Exemplo n.º 13
0
 def munge(t):
     """Return a copy of ``t`` in which every node, leaves included, is a ``Tree``.

     A value that is not exactly of type ``Tree`` is wrapped as a childless
     ``Tree`` labelled with that value.  A ``Tree`` is rebuilt recursively
     and gets a ``tokens`` attribute holding the leaves of the original.
     """
     if type(t) != Tree:
         return Tree(t, [])

     leaf_tokens = t.leaves()
     converted = []
     for child in t:
         converted.append(munge(child))
     rebuilt = Tree(t.label(), converted)
     rebuilt.tokens = leaf_tokens
     return rebuilt
Exemplo n.º 14
0
    def get_actions(cls, tree: Tree) -> List[Action]:
        """Linearise ``tree`` into a sequence of actions.

        A node with exactly one non-``Tree`` child is delegated to
        ``cls.get_action_at_pos_node``.  Any other node contributes a
        ``NonTerminalAction`` carrying its label, then the actions of its
        children in order, and finally a ``ReduceAction``.
        """
        is_pos_node = len(tree) == 1 and not isinstance(tree[0], Tree)
        if is_pos_node:
            return [cls.get_action_at_pos_node(tree)]

        opening = NonTerminalAction(tree.label())
        inner = [
            action
            for subtree in tree
            for action in cls.get_actions(subtree)
        ]
        return [opening, *inner, ReduceAction()]
Exemplo n.º 15
0
 def build_tree(self, tree: Tree):
     """Expand ``tree`` in place using rules from ``self.sample_rule``.

     The symbols returned for the label of ``tree`` are appended as
     children: a ``Nonterminal`` becomes a new sub-tree that is expanded
     recursively, and a string is appended as a leaf.

     Raises:
         ValueError: if a symbol is neither a ``Nonterminal`` nor a string.
     """
     for symbol in self.sample_rule(tree.label()):
         if isinstance(symbol, Nonterminal):
             node = Tree(symbol, [])
             tree.append(node)
             self.build_tree(node)
             continue
         if not isinstance(symbol, str):
             raise ValueError(f"Unexpected type {type(symbol)}")
         tree.append(symbol)
Exemplo n.º 16
0
def to_numbers(tree: tree_mod.Tree, interner: interners.Interner):
    """
    Build a copy of ``tree`` whose labels are replaced by the values the interner returns.

    Each label is stripped of surrounding whitespace before being passed to
    ``interner``.  A node is interned before its children, and children are
    handled from left to right.  Every child must itself offer ``label()``
    (presumably the tree was normalised first, e.g. by ``all_nodes`` --
    verify against the caller).

    :param tree: the tree to convert
    :param interner: callable mapping a label string to its number
    :return: a new tree with the same shape and numeric labels
    """
    own_number = interner(tree.label().strip())

    numbered_children = []
    for subtree in tree:
        numbered_children.append(to_numbers(subtree, interner))
    return tree_mod.Tree(own_number, numbered_children)
Exemplo n.º 17
0
def to_tensor(tree: tree_mod.Tree):
    """
    Build a copy of ``tree`` whose labels are one-element ``torch.LongTensor`` objects.

    Each label is converted with ``int`` and wrapped in a tensor; children
    are converted recursively, so every child must itself offer ``label()``.

    :param tree: a tree whose labels can be converted with ``int``
    :return: a new tree with the same shape and tensor labels
    """
    label_tensor = torch.LongTensor([int(tree.label())])

    tensor_children = []
    for subtree in tree:
        tensor_children.append(to_tensor(subtree))
    return tree_mod.Tree(label_tensor, tensor_children)
Exemplo n.º 18
0
def all_nodes(tree: tree_mod.Tree):
    """
    Ensures that a tree contains only nodes that are also trees. This means that leaf nodes that are
    not trees are wrapped in a childless tree labelled with the leaf value.

    :param tree: a tree, or a bare leaf value
    :return: a new tree in which every node is a ``tree_mod.Tree``
    """
    if not isinstance(tree, tree_mod.Tree):
        return tree_mod.Tree(tree, [])

    wrapped_children = []
    for child in tree:
        wrapped_children.append(all_nodes(child))
    return tree_mod.Tree(tree.label(), wrapped_children)
Exemplo n.º 19
0
def strip_functional_tags(tree: Tree) -> None:
    """
    Strip functional tags from the constituency labels of an NLTK tree.

    Everything from the first ``=``, ``-`` or ``|`` onwards is removed from
    the label.  The tree is modified in place.  Children whose first
    element is a string are not descended into, so their labels stay
    untouched.
    """
    label = tree.label()
    for separator in ("=", "-", "|"):
        label = label.split(separator)[0]
    tree.set_label(label)

    for subtree in tree:
        if isinstance(subtree[0], str):
            continue
        strip_functional_tags(subtree)
Exemplo n.º 20
0
    def _strip_functional_tags(self, tree: Tree) -> None:
        """
        Strip functional tags from the constituency labels of an NLTK tree.

        The label is cut at the first ``=``, then at the first ``-``, then
        at the first ``|``; only the leading part is kept.

        The tree is modified in place. Children whose first element is a
        string are not descended into, so their labels stay untouched.
        """
        label = tree.label()
        for marker in "=-|":
            label = label.partition(marker)[0]
        tree.set_label(label)

        for subtree in tree:
            if isinstance(subtree[0], str):
                continue
            self._strip_functional_tags(subtree)
    def _get_gold_spans(
            self,  # pylint: disable=arguments-differ
            tree: Tree,
            index: int,
            typed_spans: Dict[Tuple[int, int], str]) -> int:
        """
        Record the label of every constituent span of ``tree`` in ``typed_spans``.

        Spans are inclusive ``(start, end)`` pairs. When nested
        constituents share a span, their labels are joined with ``-`` in
        parent-child order. Nodes whose first child is a string only
        advance the offset; no span is stored for them.

        Returns the offset just past the last leaf covered by ``tree``.
        """
        words_only = isinstance(tree[0], str)
        if words_only:
            return index + len(tree)

        next_start = index
        for subtree in tree:
            # typed_spans is filled in by the recursive call.
            next_start = self._get_gold_spans(subtree, next_start, typed_spans)

        key = (index, next_start - 1)
        nested = typed_spans.get(key)
        typed_spans[key] = (
            tree.label() if nested is None else tree.label() + "-" + nested
        )
        return next_start
Exemplo n.º 22
0
def find_step(tree: Tree, template: Tree) -> list:
    """Collect the leaves of ``tree`` captured by ``template``.

    * ``"@"`` captures: a string ``tree`` is returned as a one-element
      list, any other ``tree`` contributes its leaves.
    * ``"*"`` captures nothing.
    * A template node labelled ``"*"`` gathers the results of
      ``find_star_step`` for each of its children.
    * Leaf strings on either side, or differing labels, capture nothing.
    * Otherwise every template child is applied to the candidates that
      ``get_node_by_label`` returns for it, and the results are
      concatenated in order.
    """
    if template == "@":
        return [tree] if type(tree) == str else tree.leaves()
    if template == '*':
        return []
    if type(template) != str and template.label() == '*':
        collected = []
        for star_child in template:
            collected += find_star_step(tree, star_child)
        return collected
    if type(tree) == str or type(template) == str:
        # Plain leaves never capture anything, whether or not they are equal.
        return []
    if tree.label() != template.label():
        return []
    return [
        hit
        for t_node in template
        for node in get_node_by_label(tree, t_node)
        for hit in find_step(node, t_node)
    ]
Exemplo n.º 23
0
def get_constituencies_from_tree(tree: Tree, tags: List[str]):
    """
    Recursively search a constituency parse (a nltk.tree.Tree) for all nodes whose label is in tags.

    Returns:
        spans: list of strings, one per matching node, in pre-order. Each string is the node's
        leaves joined with single spaces.
    """
    found = [' '.join(tree.leaves())] if tree.label() in tags else []
    for child in tree:
        # Leaves (anything that is not a Tree) cannot match and are skipped.
        if isinstance(child, Tree):
            found.extend(get_constituencies_from_tree(child, tags))
    return found
Exemplo n.º 24
0
def build_tree(node,chain): # -> handle function tags
    """Build the phrase-structure tree of ``node``'s projection chain.

    The pre-terminal is labelled ``node['tag']``, with ``-WH`` appended when
    the node has a lemma found (lower-cased) in ``wh_lemmas`` and its tag is
    not ``'CONJ'``; its only leaf is ``node['word']``.  The entries of
    ``chain[0]`` are then stacked from last to first: each entry wraps the
    tree ``entry[1]`` times in a node labelled ``entry[0]``.

    A non-empty ``chain[1]`` is applied last: ``'PRN'`` wraps the result in
    one more node, any other value is appended to the top label after a
    ``-``.
    """
    tag = node['tag']
    # Nodes without a lemma are trace nodes and never get the WH feature.
    is_wh = (
        'lemma' in node
        and node['lemma'].lower() in wh_lemmas
        and tag != 'CONJ'
    )
    projection = Tree(tag + '-WH' if is_wh else tag, [node['word']])

    for level in reversed(chain[0]):
        for _ in range(level[1]):
            projection = Tree(level[0], [projection])

    function_tag = chain[1]
    if not function_tag:
        return projection
    if function_tag == 'PRN':
        return Tree(function_tag, [projection])
    projection.set_label(projection.label() + '-' + function_tag)
    return projection
Exemplo n.º 25
0
    def forward(self, tree: Tree):
        """
        Encode ``tree`` bottom-up into a single vector.

        The node's own embedding (looked up with ``tree.label()``) is placed
        before and after the recursively computed encodings of the children.
        That sequence is run through ``self.node_lstm``; the hidden state of
        the last layer is flattened and passed through
        ``self.final_linearity`` and ``self.non_linearity``.

        NOTE(review): the reshapes assume a two-direction recurrent layer, a
        batch of one, and child encodings as wide as the node embedding --
        confirm against the module's constructor.
        """
        # The lookup result is indexed with [0] to get a single vector that
        # can be concatenated with the child encodings.
        node_vec = self.node_look_up(tree.label())[0]

        pieces = node_vec
        for subtree in tree:
            pieces = torch.cat((pieces, self.forward(subtree)))
        # The node embedding is appended a second time (the original author
        # notes this gives the backward pass enough information).
        pieces = torch.cat((pieces, node_vec))

        # One step per child plus two for the duplicated node embedding,
        # with a batch dimension of 1.
        steps = pieces.view(len(tree) + 2, 1, -1)

        # No initial state is passed to the recurrent layer. Element [1][0]
        # of its result is treated as the final hidden state.
        recurrent_out = self.node_lstm(steps)
        hidden = recurrent_out[1][0]

        # Split into (layers, directions, rest), keep the last layer and
        # merge both directions into one flat vector.
        last_layer = hidden.view(self.num_layers, 2, -1)[-1]
        merged = last_layer.view(-1)

        # Squash through the feed-forward layer for the parent or the output.
        return self.non_linearity(self.final_linearity(merged))
Exemplo n.º 26
0
def parse_string_tree(s, start):
    """Parse one parenthesised expression of ``s`` that begins at ``start``.

    ``s[start]`` must be ``'('``.  Tokens are delimited by spaces and
    parentheses; each token becomes a ``Tree`` whose label is the token
    text.

    :param s: the string to parse
    :param start: index of the opening parenthesis
    :return: a tuple ``(tree, idx)`` where ``idx`` is the index just after
        the closing parenthesis that ends this expression
    """
    idx = start

    assert s[idx] == '('
    idx = idx + 1

    # skip blanks after the opening bracket
    while s[idx] == ' ':
        idx = idx + 1

    if s[idx] == '(':
        # The expression starts with a nested expression: parse it as the
        # left part.

        tl, idx = parse_string_tree(s, idx)
        while s[idx] == ' ':
            idx = idx + 1

        # A second nested expression, if present, becomes the result node.
        # NOTE(review): when no second '(' follows, ``t`` is never assigned
        # and the insert below raises UnboundLocalError; the "is a leaf"
        # comment suggests a branch is missing here -- confirm.
        if s[idx] == '(':
            t, idx = parse_string_tree(s, idx)

        # is a leaf

        # the left part becomes the first child of the result
        t.insert(0, tl)

        # match closing bracket
        while s[idx] != ')':
            idx = idx + 1
        idx = idx + 1

        return t, idx
    else:
        # there is an input element
        # read the token up to the next space or parenthesis
        aux = idx + 1
        while s[aux] != ' ' and s[aux] != ')' and s[aux] != '(':
            aux = aux + 1
        w = s[idx:aux]
        idx = aux
        t = Tree(w, [])

        while s[idx] == ' ':
            idx = idx + 1

        if s[idx] != '(':
            if s[idx] != ')':
                # another word
                # it is read the same way and appended as a child of the first
                aux = idx + 1
                while s[aux] != ' ' and s[aux] != ')' and s[aux] != '(':
                    aux = aux + 1
                wr = s[idx:aux]
                idx = aux
                tr = Tree(wr, [])

                t.append(tr)

            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1

            return t, idx

        else:
            # the token is followed by a nested expression
            new_t, idx = parse_string_tree(s, idx)

            if len(t.label()) == 1:
                # is a variable
                # a one-character token becomes the first child of the
                # nested expression, which is then the result
                new_t.insert(0, t)
                t = new_t
            else:
                # is an operator
                # the nested expression becomes a child of the token node
                t.append(new_t)

            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1

            return t, idx
Exemplo n.º 27
0
    def _get_gold_spans(
            self,  # pylint: disable=arguments-differ
            tree: Tree,
            index: int,
            typed_spans: Dict[Tuple[int, int], str]) -> int:
        """
        Record the label of every span of an nltk ``Tree`` in ``typed_spans``.
        Spans are inclusive ``(start, end)`` pairs. When nested constituents
        share a span, their labels are joined with ``-`` in parent-child
        order: ``(S (NP (D the) (N man)))`` yields ``S-NP`` for the outer
        span.

        TODO(Mark): If we encounter a gold nested labelling at test time
        which we haven't encountered, we won't be able to run the model
        at all.

        Parameters
        ----------
        tree : ``Tree``, required.
            The (sub)tree to extract spans from.
        index : ``int``, required.
            Offset at which this subtree starts in the sentence.
        typed_spans : ``Dict[Tuple[int, int], str]``, required.
            Mapping from span to label; it is updated in place. Nodes whose
            first child is a string are stored as ``"<label>-POS"``.

        Returns
        -------
        end : ``int``
            The offset just past the last leaf covered by ``tree``.
        """
        if isinstance(tree[0], str):
            # Word-level node: nltk defines len() as the number of
            # children. The POS span is stored with a "-POS" suffix and
            # overwrites any label already present for that span.
            stop = index + len(tree)
            typed_spans[(index, stop - 1)] = tree.label() + "-POS"
            return stop

        stop = index
        for subtree in tree:
            # typed_spans is filled in by the recursive call.
            stop = self._get_gold_spans(subtree, stop, typed_spans)

        # ``stop`` is exclusive, the stored span is inclusive.
        key = (index, stop - 1)
        nested = typed_spans.get(key)
        if nested is not None:
            # Children were handled first, so the label already stored
            # belongs to a descendant and the current one is prepended.
            typed_spans[key] = tree.label() + "-" + nested
        else:
            typed_spans[key] = tree.label()
        return stop
Exemplo n.º 28
0
		def traverse(node):
			"""Convert the XML DOM element ``node`` into a ``Tree``.

			Returns ``None`` for a node without child nodes.  A node whose
			first child is a ``w`` element becomes a one-leaf ``Tree`` whose
			leaf is a ``(word, tag)`` tuple.  Any other node becomes a
			``Tree`` of its recursively converted children, optionally
			post-processed by the clitic / verb-part joining steps below.
			"""
			def extract_tags(W):
				"""Collect the tag-related attributes of element ``W`` into a list."""
				# The first entry is the 'lc' attribute, or None when it is empty.
				pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
				if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
					pos.append(W.getAttribute('clitic'))
				if W.getAttribute('ne_sort'):
					pos.append(W.getAttribute('ne_sort'))
				if W.getAttribute('n_type'):
					pos.append(W.getAttribute('n_type'))
				if W.getAttribute('ya_type'):
					pos.append(W.getAttribute('ya_type'))
				if W.getAttribute('ke_type'):
					pos.append(W.getAttribute('ke_type'))
				if W.getAttribute('type'):
					pos.append(W.getAttribute('type'))
				if W.getAttribute('kind'):
					pos.append(W.getAttribute('kind'))
				return pos

			def clitic_join(tree, clitic):
				"""Merge the word of ``clitic`` into the rightmost leaf of ``tree`` (in place)."""
				# Follow the last child until it is a leaf rather than a Tree.
				if type(tree[-1]) == Tree:
					return clitic_join(tree[-1], clitic)
				else:
					# A clitic starting with this letter gets a one-character
					# string literal prepended before it is joined.
					if(clitic[0][0][0] == 'ا'):
						clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
					# The joined leaf keeps the host word text followed by the
					# clitic text, and takes the clitic's tag.
					tree[-1]=(tree[-1][0] + clitic[0][0], clitic[0][1])
					tree.set_label('CLITICS')
					return

			if not len(node.childNodes):
				return
			first = node.childNodes[0]
			if first.tagName == 'w':
				# Word-level node: a single (text, mapped tag) leaf.
				pos=extract_tags(first)
				return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))])
			# For 'S' nodes the first two child nodes are skipped.
			childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
			# NOTE(review): removing from ``childs`` while iterating over it can
			# skip the element that follows a removed one -- confirm intent.
			for child in childs:
				if not len(child.childNodes):
					childs.remove(child)
			tree = Tree(node.tagName, map(traverse, childs))
			# Clitic joining: the tree is rebuilt from the children of its first
			# subtree and its last subtree is merged into the rightmost leaf.
			if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
				clitic=tree[-1]
				tree = Tree(tree.label(), [subtree for subtree in tree[0]])
				clitic_join(tree, clitic)
			# An 'AUX' first subtree whose word is in ``before_verbs`` is
			# prepended (space-separated) to the first leaf of the second
			# subtree and then removed.
			if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
				tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
				tree.remove(tree[0])
			# When the last leaf is in ``after_verbs`` and the one before it is
			# in ``verbe``, that second-to-last word is prepended to the first
			# leaf of the second subtree and its own node is removed.
			if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				# Descend to the node two levels above the leaf.
				while len(path) > 2 :
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
			# NOTE(review): this block is identical to the previous one, so the
			# same joining step is applied a second time -- confirm whether the
			# repetition is intended.
			if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				while len(path) > 2 :
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
			return tree
Exemplo n.º 29
0
from nltk.tree import Tree

# Build a small sentence tree bottom-up: Tree(label, children), where the
# children are sub-trees or leaf strings.
vp = Tree('VP', [Tree('V', ['saw']), Tree('NP', ['him'])])

s = Tree('S', [Tree('NP', ['I']), vp])

print(s)

# Second example: two determiner phrases and a verb phrase.
# Note that ``vp`` is rebound here; ``s`` keeps the earlier verb phrase.
dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
vp = Tree('vp', [Tree('v', ['chased']), dp2])

# Draw each part on its own.
dp1.draw()

dp2.draw()

vp.draw()

# Combine the parts into a full sentence tree.
tree = Tree('s', [dp1, vp])

print(tree)
tree.draw()

# The bare expressions below discard their value when run as a script
# (presumably left over from an interactive session).
len(tree)

print(tree.leaves())

tree.label()

dp1.label()
Exemplo n.º 30
0
        def traverse(node):
            """Convert the XML DOM element ``node`` into a ``Tree``.

            Returns ``None`` for a node without child nodes.  A node whose
            first child is a ``w`` element becomes a one-leaf ``Tree`` whose
            leaf is a ``(word, tag)`` tuple.  Any other node becomes a
            ``Tree`` of its recursively converted children, optionally
            post-processed by the clitic / verb-part joining steps below.
            """
            def extract_tags(W):
                """Collect the tag-related attributes of element ``W`` into a list."""
                # The first entry is the 'lc' attribute, or None when it is empty.
                pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
                if W.getAttribute('clitic') in {
                        'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'
                }:
                    pos.append(W.getAttribute('clitic'))
                if W.getAttribute('ne_sort'):
                    pos.append(W.getAttribute('ne_sort'))
                if W.getAttribute('n_type'):
                    pos.append(W.getAttribute('n_type'))
                if W.getAttribute('ya_type'):
                    pos.append(W.getAttribute('ya_type'))
                if W.getAttribute('ke_type'):
                    pos.append(W.getAttribute('ke_type'))
                if W.getAttribute('type'):
                    pos.append(W.getAttribute('type'))
                if W.getAttribute('kind'):
                    pos.append(W.getAttribute('kind'))
                return pos

            def clitic_join(tree, clitic):
                """Merge the word of ``clitic`` into the rightmost leaf of ``tree`` (in place)."""
                # Follow the last child until it is a leaf rather than a Tree.
                if type(tree[-1]) == Tree:
                    return clitic_join(tree[-1], clitic)
                else:
                    # A clitic starting with this letter gets a one-character
                    # string literal prepended before it is joined.
                    if (clitic[0][0][0] == 'ا'):
                        clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
                    # The joined leaf keeps the host word text followed by
                    # the clitic text, and takes the clitic's tag.
                    tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
                    tree.set_label('CLITICS')
                    return

            if not len(node.childNodes):
                return
            first = node.childNodes[0]
            if first.tagName == 'w':
                # Word-level node: a single (text, mapped tag) leaf.
                pos = extract_tags(first)
                return Tree(node.tagName, [(first.childNodes[0].data.replace(
                    'می ', 'می‌'), self._pos_map(pos))])
            # For 'S' nodes the first two child nodes are skipped.
            childs = node.childNodes[
                2:] if node.tagName == 'S' else node.childNodes
            # NOTE(review): removing from ``childs`` while iterating over it
            # can skip the element that follows a removed one -- confirm intent.
            for child in childs:
                if not len(child.childNodes):
                    childs.remove(child)
            tree = Tree(node.tagName, map(traverse, childs))
            # Clitic joining: the tree is rebuilt from the children of its
            # first subtree and its last subtree is merged into the rightmost
            # leaf.
            if self._join_clitics and len(tree) > 1 and type(
                    tree[1]) == Tree and tree[1].label(
                    ) == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
                clitic = tree[-1]
                tree = Tree(tree.label(), [subtree for subtree in tree[0]])
                clitic_join(tree, clitic)
            # An 'AUX' first subtree whose word is in ``before_verbs`` is
            # prepended (space-separated) to the first leaf of the second
            # subtree and then removed.
            if self._join_verb_parts and len(tree) > 1 and type(
                    tree[1]) == Tree and type(
                        tree[0]) == Tree and tree[0].label() == 'AUX' and tree[
                            0][0][0] in self._tokenizer.before_verbs:
                tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                tree.remove(tree[0])
            # When the last leaf is in ``after_verbs`` and the one before it
            # is in ``verbe``, that second-to-last word is prepended to the
            # first leaf of the second subtree and its own node is removed.
            if self._join_verb_parts and len(
                    tree.leaves()) > 1 and tree.leaves(
                    )[-1][0] in self._tokenizer.after_verbs and tree.leaves(
                    )[-2][0] in self._tokenizer.verbe:
                tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                path = tree.leaf_treeposition(len(tree.leaves()) - 2)
                removingtree = tree
                # Descend to the node two levels above the leaf.
                while len(path) > 2:
                    removingtree = removingtree[path[0]]
                    path = path[1:]
                removingtree.remove(
                    Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
            # NOTE(review): this block is identical to the previous one, so
            # the same joining step is applied a second time -- confirm
            # whether the repetition is intended.
            if self._join_verb_parts and len(
                    tree.leaves()) > 1 and tree.leaves(
                    )[-1][0] in self._tokenizer.after_verbs and tree.leaves(
                    )[-2][0] in self._tokenizer.verbe:
                tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                path = tree.leaf_treeposition(len(tree.leaves()) - 2)
                removingtree = tree
                while len(path) > 2:
                    removingtree = removingtree[path[0]]
                    path = path[1:]
                removingtree.remove(
                    Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
            return tree
 def _strip_functional_tags(self, tree: Tree) -> None:
     """Strip functional tags from the labels of ``tree`` in place.

     Each visited label is cut at its first ``=``, ``-`` or ``|``.
     Children whose first element is a string are not descended into, so
     their labels stay untouched.
     """
     raw_label = tree.label()
     bare = raw_label.split("=", 1)[0].split("-", 1)[0].split("|", 1)[0]
     tree.set_label(bare)
     for subtree in tree:
         first_item = subtree[0]
         if isinstance(first_item, str):
             continue
         self._strip_functional_tags(subtree)
Exemplo n.º 32
0
    def _get_gold_spans(self, # pylint: disable=arguments-differ
                        tree: Tree,
                        index: int,
                        typed_spans: Dict[Tuple[int, int], str]) -> int:
        """
        Record the label of every constituent span of an nltk ``Tree``.
        Spans are inclusive ``(start, end)`` pairs. When nested constituents
        share a span, their labels are joined with ``-`` in parent-child
        order: ``(S (NP (D the) (N man)))`` yields ``S-NP`` for the outer
        span.

        TODO(Mark): If we encounter a gold nested labelling at test time
        which we haven't encountered, we won't be able to run the model
        at all.

        Parameters
        ----------
        tree : ``Tree``, required.
            The (sub)tree to extract spans from.
        index : ``int``, required.
            Offset at which this subtree starts in the sentence.
        typed_spans : ``Dict[Tuple[int, int], str]``, required.
            Mapping from span to label; it is updated in place. POS tags
            are ignored.

        Returns
        -------
        end : ``int``
            The offset just past the last leaf covered by ``tree``.
        """
        if not isinstance(tree[0], str):
            # Inner node: let every child record its spans and report
            # where the next one starts.
            boundary = index
            for subtree in tree:
                boundary = self._get_gold_spans(subtree, boundary, typed_spans)

            # ``boundary`` is exclusive, the stored span is inclusive.
            span = (index, boundary - 1)
            label_below = typed_spans.get(span)
            if label_below is None:
                # First constituent seen for this span.
                typed_spans[span] = tree.label()
            else:
                # A descendant already covers the same words; the tree is
                # walked bottom-up, so the current label goes in front.
                typed_spans[span] = tree.label() + "-" + label_below
            return boundary

        # Word-level node: its label is a POS tag, so nothing is stored.
        # nltk defines len() as the number of children, i.e. the number of
        # words to advance by.
        return index + len(tree)