def compare_step(tree: Tree, template: Tree) -> bool:
    """
    Decide whether `tree` matches `template` at the current level.

    The rules are applied in this order:

    * a bare `'*'` template matches anything;
    * a template node labelled `'*'` matches when `compare_star_step`
      accepts `tree` for every child of the template;
    * if either side is a plain string, they match only when they are equal;
    * otherwise the labels must agree, and one of the nodes selected by
      `get_node_by_label` must match the first template child.

    NOTE(review): only the FIRST child of `template` is ever examined; the
    loop returns before reaching the remaining children. Confirm that this
    is the intended matching rule.
    """
    if template == '*':
        return True

    template_is_leaf = type(template) == str
    if not template_is_leaf and template.label() == '*':
        all_matched = True
        for star_child in template:
            # `and` short-circuits, so the helper is not called after a miss.
            all_matched = all_matched and compare_star_step(tree, star_child)
        return all_matched

    if template_is_leaf or type(tree) == str:
        return True if tree == template else False

    if tree.label() != template.label():
        return False

    for first_child in template:
        candidates = get_node_by_label(tree, first_child)
        if candidates == []:
            return False
        return any(
            compare_step(candidate, first_child) for candidate in candidates
        )

    # Reached only when the template node has no children at all.
    return False
def _get_gold_spans(
    self, tree: Tree, index: int, typed_spans: Dict[Tuple[int, int], str]
) -> int:
    """
    Walk an nltk `Tree` and record a label for every constituent span.

    Constituents that cover exactly the same words share one entry whose
    label joins the individual labels in parent-child order. For example,
    `(S (NP (D the) (N man)))` yields `S-NP` for the outer span. Spans are
    inclusive `(start, end)` pairs. Pre-terminal (POS) nodes only advance
    the word position; they get no entry.

    TODO(Mark): If we encounter a gold nested labelling at test time
    which we haven't encountered, we won't be able to run the model
    at all.

    # Parameters

    tree : `Tree`, required.
        An NLTK parse tree to extract spans from.
    index : `int`, required.
        The position in the sentence at which `tree` starts.
    typed_spans : `Dict[Tuple[int, int], str]`, required.
        Mapping from spans to labels. It is filled in place.

    # Returns

    end : `int`.
        The exclusive end position of `tree`, i.e. the start position of
        whatever follows it.
    """
    if isinstance(tree[0], str):
        # Pre-terminal: NLTK leaves are strings and len() counts them.
        return index + len(tree)

    # Internal node: lay the children out one after another.
    end = index
    for subtree in tree:
        end = self._get_gold_spans(subtree, end, typed_spans)

    key = (index, end - 1)
    inner_label = typed_spans.get(key)
    if inner_label is not None:
        # A descendant already claimed this span; we are the outer node,
        # so our label goes in front.
        typed_spans[key] = tree.label() + "-" + inner_label
    else:
        typed_spans[key] = tree.label()
    return end
def remove_punctuation(tree: Tree) -> list:
    """
    Return `tree` without the pre-terminals tagged as punctuation.

    A pre-terminal (a node with exactly one child that is not a `Tree`) is
    returned unchanged unless its label is in `_PUNCTUATION_TAGS`, in which
    case `None` is returned. Any other node is rebuilt as a new `Tree` from
    the children that survive.
    """
    is_preterminal = len(tree) == 1 and not isinstance(tree[0], Tree)
    if is_preterminal:
        return None if tree.label() in _PUNCTUATION_TAGS else tree

    kept = []
    for child in tree:
        pruned = remove_punctuation(child)
        if pruned is not None:
            kept.append(pruned)
    return Tree(tree.label(), kept)
def get_node_by_label(tree: Tree, template: Tree):
    """
    Select the children of `tree` that a template node can be matched with.

    If `template` is a plain string or a node labelled "*", `tree` itself
    is returned (not a list). Otherwise the result is a list of the direct,
    non-string children of `tree` whose label equals the template's label.
    """
    if type(template) == str or template.label() == "*":
        return tree

    wanted = template.label()
    return [
        child
        for child in tree
        if type(child) != str and child.label() == wanted
    ]
def tree2dict(tree: Tree):
    """
    Convert a `Tree` into nested dictionaries.

    Every node becomes `{'type': <label>}`. A node labelled 'TOKEN' also
    gets an integer 'id' taken from its first child. Any other node with
    children gets a 'children' list in which sub-trees are converted
    recursively and non-tree children are kept as they are.
    """
    node_type = tree.label()

    converted = []
    for item in tree:
        converted.append(tree2dict(item) if isinstance(item, Tree) else item)

    node = {'type': node_type}
    if node_type == 'TOKEN':
        node['id'] = int(converted[0])
    elif converted:
        node['children'] = converted
    return node
def makeSyntacticTransfer(tree, rootPOS, firstChildPOS, secondChildPOS,
                          translate):
    """
    Reorder children of `rootPOS` nodes and optionally translate the leaves.

    Inside every node labelled `rootPOS`, a child labelled `secondChildPOS`
    is swapped with its left neighbour once a `firstChildPOS` child has been
    seen. The function then recurses into the children of any node whose
    height is above 2. For the remaining nodes, when `translate` is truthy,
    the node is replaced by a new `Tree` holding the Spanish word if the
    first leaf has an entry in `translationsToSpanish`.

    Parameters:
        tree: the tree to transform; it is modified in place.
        rootPOS: label of the nodes whose children may be reordered.
        firstChildPOS: label that arms the swap.
        secondChildPOS: label of the child that is moved one slot left.
        translate: boolean, True translates to Spanish.

    Returns:
        The transformed tree. This is `tree` itself, except for a translated
        pre-terminal, which is a new `Tree`.

    NOTE(review): the "first child seen" flag is not cleared by unrelated
    children and is set again right after a swap (the moved child now sits
    at the current index). That behaviour is kept as it was; confirm it is
    what the transfer rule wants.
    """
    numChildren = len(tree)

    if tree.label() == rootPOS:
        first_seen = False
        for index in range(numChildren):
            if first_seen and tree[index].label() == secondChildPOS:
                tree[index - 1], tree[index] = tree[index], tree[index - 1]
                first_seen = False
            if tree[index].label() == firstChildPOS:
                first_seen = True

    if tree.height() > 2:
        for index in range(numChildren):
            newTree = makeSyntacticTransfer(tree[index], rootPOS,
                                            firstChildPOS, secondChildPOS,
                                            translate)
            if newTree is not None:
                tree[index] = newTree
    elif translate:
        word = tree.leaves()[0]
        # dict.has_key() was removed in Python 3; `in` works everywhere.
        if word in translationsToSpanish:
            tree = Tree(tree.label(), [translationsToSpanish[word]])

    return tree
def uncollapse(self, tree):
    """
    Rebuild `tree` with every collapsed unary chain expanded again.

    A child whose label contains '+' (for example "A+B+C") is replaced by
    the nested chain A -> B -> C, and the child's own children are attached
    to the innermost node. Children labelled `SeqTree.EMPTY_LABEL` are
    skipped by descending into their last child while they are non-empty.
    String children are copied unchanged. A new `Tree` is returned.
    """
    string_types = (type(u''), type(""))
    rebuilt = []

    for child in tree:
        if type(child) in string_types:
            rebuilt.append(child)
            continue

        # EMPTY nodes are dropped: keep following their last child.
        while child.label() == SeqTree.EMPTY_LABEL and len(child) != 0:
            child = child[-1]

        parts = child.label().split('+')
        if len(parts) == 1:
            rebuilt.append(self.uncollapse(child))
            continue

        # Rebuild the unary chain from the outermost label inwards.
        chain_root = Tree(parts[0], [])
        innermost = chain_root
        for unary_label in parts[1:]:
            level = Tree(unary_label, [])
            innermost.append(level)
            innermost = level
        innermost.extend(child)
        rebuilt.append(self.uncollapse(chain_root))

    return Tree(tree.label(), rebuilt)
def attach_tree(head, dep, attachment, chain, indexes, flag, coindex=None):
    """
    Attach dep's projection chain to head's projection chain.

    Parameters:
        head, dep: trees. `head` is modified in place.
        attachment: dict read for 'label' and 'offset' (or 'label2' and
            'offset2' when `coindex` is an int), plus an optional 'f_tag'.
        chain: head's projection chain. `chain[0]` is a sequence of entries
            whose item 0 is a label and item 1 a count.
        indexes: list of child positions, one per level of head's chain.
            For flag 'left' the entry of the attachment level is
            incremented in place.
        flag: 'right' or 'left', the side on which `dep` is attached.
        coindex: when an int, `dep` is replaced by a node with the same
            label and the single leaf '*-<coindex>'.

    Returns:
        The tuple `(head, indexes)`, or the string "Invalid flag!" when
        `flag` is neither 'right' nor 'left'.

    Raises:
        ValueError: if the attachment label does not occur in `chain[0]`.
    """
    if isinstance(coindex, int):
        # Co-indexed attachment: use the secondary site and a trace leaf.
        label = attachment['label2']
        offset = attachment['offset2']
        dep = Tree(dep.label(), ['*-' + str(coindex)])
    else:
        label = attachment['label']
        offset = attachment['offset']

    # Depth of the attachment site, counted in levels of head's chain.
    l_index = [level[0] for level in chain[0]].index(label)
    count = sum([level[1] for level in chain[0]][:l_index + 1]) - offset
    slot = count - 1

    if flag == 'right':
        a_index = indexes[slot] + 1
    elif flag == 'left':
        a_index = indexes[slot]
        # The head's own child moves one position to the right.
        indexes[slot] += 1
    else:
        return "Invalid flag!"

    # Walk down to the attachment site directly. This replaces the earlier
    # eval() of a string-built index expression and reaches the same node.
    target = head[0] if head.label() == 'PRN' else head
    for depth in range(slot):
        target = target[indexes[depth]]
    target.insert(a_index, dep)

    if 'f_tag' in attachment:
        if attachment['f_tag'] in {'PRD', 'PRDs'}:
            # These tags go on the child one level below the site.
            target = target[indexes[slot]]
        target.set_label(target.label() + "-" + attachment["f_tag"])

    return head, indexes
def _uncollapse(self, tree):
    """
    Rebuild `tree` with every collapsed unary chain expanded again.

    A child whose label contains `self.join_char` is split on that
    separator and replaced by the corresponding chain of nested nodes. The
    child's own children are attached to the innermost node. String
    children are copied unchanged. A new `Tree` is returned.
    """
    string_types = (type(u''), type(""))
    rebuilt = []

    for child in tree:
        if type(child) in string_types:
            rebuilt.append(child)
            continue

        label = child.label()
        if self.join_char not in label:
            rebuilt.append(self._uncollapse(child))
            continue

        # Rebuild the unary chain from the outermost label inwards.
        parts = label.split(self.join_char)
        chain_root = Tree(parts[0], [])
        innermost = chain_root
        for unary_label in parts[1:]:
            level = Tree(unary_label, [])
            innermost.append(level)
            innermost = level
        innermost.extend(child)
        rebuilt.append(self._uncollapse(chain_root))

    return Tree(tree.label(), rebuilt)
def _get_gold_spans(self,  # pylint: disable=arguments-differ
                    tree: Tree,
                    index: int,
                    typed_spans: Dict[Tuple[int, int], str]) -> int:
    """
    Walk an nltk ``Tree`` and record a label for every span in it.
    Spans are inclusive ``(start, end)`` pairs.

    Parameters
    ----------
    tree : ``Tree``, required.
        An NLTK parse tree to extract spans from.
    index : ``int``, required.
        The position in the sentence at which ``tree`` starts.
    typed_spans : ``Dict[Tuple[int, int], str]``, required.
        Mapping from spans to labels. It is filled in place.
        Pre-terminal nodes are stored as "LABEL-POS"; every other node is
        stored under its plain label and overwrites whatever was recorded
        for the same span before.

    Returns
    -------
    end : ``int``.
        The exclusive end position of ``tree``, i.e. the start position of
        whatever follows it.
    """
    if isinstance(tree[0], str):
        # Pre-terminal: NLTK leaves are strings and len() counts them.
        # The POS entry is marked so that it can be told apart later.
        end = index + len(tree)
        typed_spans[(index, end - 1)] = tree.label() + "-POS"
        return end

    # Internal node: lay the children out one after another.
    end = index
    for subtree in tree:
        end = self._get_gold_spans(subtree, end, typed_spans)

    # The stored end is inclusive, hence the -1.
    typed_spans[(index, end - 1)] = tree.label()
    return end
def rec(node: Tree):
    """
    Collect the distinct string leaves below `node` into `res`.

    `res` and `self` come from the enclosing scope. A leaf is appended when
    its parent's label is not in `self.nonsense_attr` and the leaf is not
    in `res` yet. Children that are neither trees nor strings are ignored.
    """
    for sub_node in node:
        if isinstance(sub_node, Tree):
            rec(sub_node)
            continue
        if not isinstance(sub_node, str):
            continue
        if node.label() in self.nonsense_attr or sub_node in res:
            continue
        res.append(sub_node)
def _gen_pos_tags(tree: Tree) -> List[str]:
    """
    Yield the POS tags of an NLTK tree from left to right.

    This is a generator. A pre-terminal whose word starts with "*"
    contributes nothing.
    """
    is_preterminal = len(tree) == 1 and not isinstance(tree[0], Tree)
    if not is_preterminal:
        for subtree in tree:
            for tag in _gen_pos_tags(subtree):
                yield tag
        return

    word = tree[0]
    if not word.startswith("*"):
        yield tree.label()
def munge(t):
    """
    Return a copy of `t` in which every node is a `Tree` with a `tokens`
    attribute.

    A value that is not exactly a `Tree` becomes a childless `Tree` labelled
    with that value (and gets no `tokens` attribute). A `Tree` is rebuilt
    recursively, and the copy's `tokens` attribute holds the leaves of the
    original node.
    """
    if type(t) != Tree:
        return Tree(t, [])

    toks = t.leaves()
    rebuilt = Tree(t.label(), [munge(child) for child in t])
    rebuilt.tokens = toks
    return rebuilt
def get_actions(cls, tree: Tree) -> List[Action]:
    """
    Linearise `tree` into a sequence of actions.

    A pre-terminal (exactly one child that is not a `Tree`) yields the
    single action returned by `cls.get_action_at_pos_node`. Any other node
    yields a `NonTerminalAction` carrying its label, then the actions of
    its children in order, then a closing `ReduceAction`.
    """
    is_pos_node = len(tree) == 1 and not isinstance(tree[0], Tree)
    if is_pos_node:
        return [cls.get_action_at_pos_node(tree)]

    sequence: List[Action] = [NonTerminalAction(tree.label())]
    sequence += [
        action for child in tree for action in cls.get_actions(child)
    ]
    sequence += [ReduceAction()]
    return sequence
def build_tree(self, tree: Tree):
    """
    Expand `tree` in place using `self.sample_rule`.

    The symbols returned for the node's label are appended as children. A
    `Nonterminal` becomes a new sub-tree that is expanded recursively, and
    a string is appended as a leaf.

    Raises:
        ValueError: if a symbol is neither a `Nonterminal` nor a string.
    """
    for symbol in self.sample_rule(tree.label()):
        if isinstance(symbol, Nonterminal):
            child = Tree(symbol, [])
            tree.append(child)
            self.build_tree(child)
            continue
        if not isinstance(symbol, str):
            raise ValueError(f"Unexpected type {type(symbol)}")
        tree.append(symbol)
def to_numbers(tree: tree_mod.Tree, interner: interners.Interner):
    """
    Map a tree to a tree of the numbers that the interner assigns to the
    (whitespace-stripped) node labels.

    A node's label is interned before the labels of its children.

    :param tree: tree whose labels are strings
    :param interner: callable that maps a label to its number
    :return: a new tree of the same shape, labelled with the numbers
    """
    label_id = interner(tree.label().strip())

    numbered_children = []
    for subtree in tree:
        numbered_children.append(to_numbers(subtree, interner))

    return tree_mod.Tree(label_id, numbered_children)
def to_tensor(tree: tree_mod.Tree):
    """
    Map a tree of int(s) to a tree of one-element torch.LongTensor(s)
    holding the same values.

    :param tree: tree whose labels can be converted with int()
    :return: a new tree of the same shape, labelled with the tensors
    """
    label_tensor = torch.LongTensor([int(tree.label())])

    converted = []
    for subtree in tree:
        converted.append(to_tensor(subtree))

    return tree_mod.Tree(label_tensor, converted)
def all_nodes(tree: tree_mod.Tree):
    """
    Return a copy of a tree in which every node is itself a tree.

    A value that is not a tree (i.e. a leaf) is wrapped in a childless tree
    that uses the value as its label.

    :param tree: a tree, or a leaf value found inside one
    :return: a new tree made of tree nodes only
    """
    if not isinstance(tree, tree_mod.Tree):
        return tree_mod.Tree(tree, [])

    wrapped_children = []
    for child in tree:
        wrapped_children.append(all_nodes(child))
    return tree_mod.Tree(tree.label(), wrapped_children)
def strip_functional_tags(tree: Tree) -> None:
    """
    Strip the functional tags from the constituency labels of an NLTK tree.

    A label is cut at its first "=", then at its first "-", then at its
    first "|"; only the leading part is kept. The tree is changed in place.
    Children whose first element is a string (pre-terminals) are left
    untouched.
    """
    label = tree.label()
    for separator in ("=", "-", "|"):
        label = label.partition(separator)[0]
    tree.set_label(label)

    for child in tree:
        if isinstance(child[0], str):
            continue
        strip_functional_tags(child)
def _strip_functional_tags(self, tree: Tree) -> None:
    """
    Strip the functional tags from the constituency labels of an NLTK tree.

    A label is cut at its first "=", then at its first "-", then at its
    first "|"; only the leading part is kept. The tree is changed in place.
    Children whose first element is a string (pre-terminals) are left
    untouched.
    """
    label = tree.label()
    for separator in ("=", "-", "|"):
        label = label.partition(separator)[0]
    tree.set_label(label)

    for child in tree:
        if isinstance(child[0], str):
            continue
        self._strip_functional_tags(child)
def _get_gold_spans(self,  # pylint: disable=arguments-differ
                    tree: Tree,
                    index: int,
                    typed_spans: Dict[Tuple[int, int], str]) -> int:
    """
    Record a label for every constituent span of ``tree`` in
    ``typed_spans`` (in place) and return the exclusive end position of
    ``tree``.

    Spans are inclusive ``(start, end)`` pairs that start at ``index``.
    Nodes that cover the same span share one entry whose label joins the
    individual labels with "-", outermost first. Pre-terminal nodes only
    advance the position.
    """
    if isinstance(tree[0], str):
        # Pre-terminal: NLTK leaves are strings and len() counts them.
        return index + len(tree)

    # Internal node: lay the children out one after another.
    end = index
    for subtree in tree:
        end = self._get_gold_spans(subtree, end, typed_spans)

    key = (index, end - 1)
    inner_label = typed_spans.get(key)
    if inner_label is not None:
        typed_spans[key] = tree.label() + "-" + inner_label
    else:
        typed_spans[key] = tree.label()
    return end
def find_step(tree: Tree, template: Tree) -> list:
    """
    Collect the leaves of `tree` that sit where `template` has an "@".

    * an "@" template captures `tree`: the string itself when `tree` is a
      string, otherwise all of its leaves;
    * a bare '*' template captures nothing;
    * a template node labelled '*' delegates each of its children to
      `find_star_step` and concatenates the results;
    * a string on either side, or differing labels, captures nothing;
    * otherwise every template child is searched in the nodes that
      `get_node_by_label` selects for it, and the results are concatenated.
    """
    if template == "@":
        return [tree] if type(tree) == str else tree.leaves()

    if template == '*':
        return []

    template_is_leaf = type(template) == str
    if not template_is_leaf and template.label() == '*':
        captured = []
        for star_child in template:
            captured.extend(find_star_step(tree, star_child))
        return captured

    # A plain string never captures anything, whether it matches or not.
    if template_is_leaf or type(tree) == str:
        return []

    if tree.label() != template.label():
        return []

    return [
        leaf
        for t_node in template
        for node in get_node_by_label(tree, t_node)
        for leaf in find_step(node, t_node)
    ]
def get_constituencies_from_tree(tree: Tree, tags: List[str]):
    """
    Search a constituency parse (a nltk.tree.Tree) recursively for all
    nodes whose label is in `tags`.

    Returns:
        spans: list of strings, one per matching node, in pre-order. Each
            string is the node's leaves joined with single spaces.
    """
    found = [' '.join(tree.leaves())] if tree.label() in tags else []

    for child in tree:
        if isinstance(child, Tree):
            found.extend(get_constituencies_from_tree(child, tags))

    return found
def build_tree(node, chain):  # -> handle function tags
    """
    Build the phrase-structure tree of node's projection chain.

    The pre-terminal is labelled with node['tag'] and holds node['word'].
    '-WH' is appended to the label when the node has a 'lemma' whose
    lower-cased form is in `wh_lemmas` and the tag is not 'CONJ'. Each
    entry of chain[0], taken from last to first, then wraps the tree in
    entry[1] nested nodes labelled entry[0]. Finally a truthy chain[1] is
    applied: 'PRN' adds one more wrapping node, and anything else is
    appended to the top label after a '-'.
    """
    label = node['tag']
    has_wh_lemma = 'lemma' in node and node['lemma'].lower() in wh_lemmas
    if has_wh_lemma and node['tag'] != 'CONJ':
        label += '-WH'  # WH feature

    output = Tree(label, [node['word']])

    for level in reversed(chain[0]):
        for _ in range(level[1]):
            output = Tree(level[0], [output])

    top_tag = chain[1]
    if top_tag:
        if top_tag == 'PRN':
            output = Tree(top_tag, [output])
        else:
            output.set_label(output.label() + '-' + top_tag)

    return output
def forward(self, tree: Tree):
    """
    Encode `tree` bottom-up into a single vector.

    The node's embedding, the encodings of its children and the node's
    embedding once more form a sequence that is run through
    `self.node_lstm`. The resulting state is reduced to the last layer,
    passed through `self.final_linearity` and then `self.non_linearity`.
    """
    # The look-up gives us a matrix; take its first row so that it can be
    # concatenated with the vectors coming from the children.
    node = self.node_look_up(tree.label())[0]

    # Node embedding first, then the children, then the node embedding
    # again to make sure the backwards pass has enough info.
    pieces = [node]
    for child in tree:
        pieces.append(self.forward(child))
    pieces.append(node)

    # One sequence step per piece: the children plus the embedding twice.
    sequence = torch.cat(pieces).view(len(tree) + 2, 1, -1)

    # No initial state is passed. Item [1][0] of the result is taken as
    # the final hidden state (presumably (h_n, c_n)[0] of an LSTM; confirm
    # against the definition of node_lstm).
    lstm_result = self.node_lstm(sequence)
    hidden = lstm_result[1][0]

    # Split into (num_layers, 2 directions, rest), keep the last layer and
    # merge both directions into a single vector.
    last_layer = hidden.view(self.num_layers, 2, -1)[-1]
    merged = last_layer.view(-1)

    # Squash through the feed-forward layer for the next level or the output.
    return self.non_linearity(self.final_linearity(merged))
def parse_string_tree(s, start):
    """
    Parse one parenthesised expression of `s` into a `Tree`.

    Parameters:
        s: the string to parse.
        start: index of the opening '(' of the expression.

    Returns:
        A tuple `(tree, idx)`, where `idx` is the index right after the
        ')' that closes the expression.

    Raises:
        AssertionError: if `s[start]` is not '('.
        IndexError: if `s` ends before the expression is closed.
    """
    idx = start
    assert s[idx] == '('
    idx = idx + 1
    # skip blanks after the opening bracket
    while s[idx] == ' ':
        idx = idx + 1
    if s[idx] == '(':
        # the expression starts with a nested expression: parse it first
        tl, idx = parse_string_tree(s, idx)
        while s[idx] == ' ':
            idx = idx + 1
        if s[idx] == '(':
            t, idx = parse_string_tree(s, idx)
        # is a leaf
        # NOTE(review): `t` is only bound when a second '(' follows, so any
        # other input fails here with a NameError. Confirm the nesting.
        # The first sub-expression becomes the first child of the second.
        t.insert(0, tl)
        # match closing bracket
        while s[idx] != ')':
            idx = idx + 1
        idx = idx + 1
        return t, idx
    else:
        # there is an input element
        # read a word: everything up to the next blank or bracket
        aux = idx + 1
        while s[aux] != ' ' and s[aux] != ')' and s[aux] != '(':
            aux = aux + 1
        w = s[idx:aux]
        idx = aux
        t = Tree(w, [])
        while s[idx] == ' ':
            idx = idx + 1
        if s[idx] != '(':
            if s[idx] != ')':
                # another word
                aux = idx + 1
                while s[aux] != ' ' and s[aux] != ')' and s[aux] != '(':
                    aux = aux + 1
                wr = s[idx:aux]
                idx = aux
                tr = Tree(wr, [])
                # the second word becomes a childless child of the first
                t.append(tr)
            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1
            return t, idx
        else:
            # the word is followed by a nested expression
            new_t, idx = parse_string_tree(s, idx)
            if len(t.label()) == 1:
                # is a variable
                # a one-character word goes in front, as the first child of
                # the nested expression
                new_t.insert(0, t)
                t = new_t
            else:
                # is an operator
                # a longer word takes the nested expression as its child
                t.append(new_t)
            # match closing bracket
            while s[idx] != ')':
                idx = idx + 1
            idx = idx + 1
            return t, idx
def _get_gold_spans(self,  # pylint: disable=arguments-differ
                    tree: Tree,
                    index: int,
                    typed_spans: Dict[Tuple[int, int], str]) -> int:
    """
    Walk an nltk ``Tree`` and record a label for every span in it.

    Nodes that cover exactly the same words share one entry whose label
    joins the individual labels in parent-child order. For example,
    ``(S (NP (D the) (N man)))`` yields ``S-NP`` for the outer span.
    Spans are inclusive ``(start, end)`` pairs.

    TODO(Mark): If we encounter a gold nested labelling at test time
    which we haven't encountered, we won't be able to run the model
    at all.

    Parameters
    ----------
    tree : ``Tree``, required.
        An NLTK parse tree to extract spans from.
    index : ``int``, required.
        The position in the sentence at which ``tree`` starts.
    typed_spans : ``Dict[Tuple[int, int], str]``, required.
        Mapping from spans to labels. It is filled in place.
        Pre-terminal nodes are stored as "LABEL-POS".

    Returns
    -------
    end : ``int``.
        The exclusive end position of ``tree``, i.e. the start position of
        whatever follows it.
    """
    if isinstance(tree[0], str):
        # Pre-terminal: NLTK leaves are strings and len() counts them.
        # The POS entry is marked so that it can be told apart later.
        end = index + len(tree)
        typed_spans[(index, end - 1)] = tree.label() + "-POS"
        return end

    # Internal node: lay the children out one after another.
    end = index
    for subtree in tree:
        end = self._get_gold_spans(subtree, end, typed_spans)

    key = (index, end - 1)
    inner_label = typed_spans.get(key)
    if inner_label is not None:
        # A descendant already claimed this span; we are the outer node,
        # so our label goes in front.
        typed_spans[key] = tree.label() + "-" + inner_label
    else:
        typed_spans[key] = tree.label()
    return end
def traverse(node):
    """
    Convert a DOM element into a `Tree`, recursively.

    `node` is read through `childNodes`, `tagName` and `getAttribute`
    (presumably an xml.dom.minidom element; confirm against the caller).
    `self` is taken from the enclosing scope. Returns None for a node
    without child nodes.
    """
    def extract_tags(W):
        # Collect the tag attributes of a word element. The first entry is
        # the 'lc' attribute, or None when it is empty.
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        if W.getAttribute('ne_sort'):
            pos.append(W.getAttribute('ne_sort'))
        if W.getAttribute('n_type'):
            pos.append(W.getAttribute('n_type'))
        if W.getAttribute('ya_type'):
            pos.append(W.getAttribute('ya_type'))
        if W.getAttribute('ke_type'):
            pos.append(W.getAttribute('ke_type'))
        if W.getAttribute('type'):
            pos.append(W.getAttribute('type'))
        if W.getAttribute('kind'):
            pos.append(W.getAttribute('kind'))
        return pos

    def clitic_join(tree, clitic):
        # Follow the last child down to the right-most (word, tag) leaf and
        # merge the clitic into it.
        if type(tree[-1]) == Tree:
            return clitic_join(tree[-1], clitic)
        else:
            if(clitic[0][0][0] == 'ا'):
                clitic[0] = ('' + clitic[0][0], clitic[0][1])
            # The merged leaf takes the clitic's tag.
            tree[-1]=(tree[-1][0] + clitic[0][0], clitic[0][1])
            tree.set_label('CLITICS')
            return

    if not len(node.childNodes):
        return
    first = node.childNodes[0]
    if first.tagName == 'w':
        # Word element: a one-leaf tree holding a (word, tag) pair.
        pos=extract_tags(first)
        return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می'), self._pos_map(pos))])
    # For an 'S' element the first two child nodes are skipped.
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # NOTE(review): this removes items from `childs` while iterating over
    # it, so the element after a removed one is skipped. When `childs` is
    # node.childNodes itself, the removal also changes that list.
    for child in childs:
        if not len(child.childNodes):
            childs.remove(child)
    tree = Tree(node.tagName, map(traverse, childs))
    # Optional clitic joining: the last child is merged into the right-most
    # leaf of a tree rebuilt from the first child's children.
    if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
        clitic=tree[-1]
        tree = Tree(tree.label(), [subtree for subtree in tree[0]])
        clitic_join(tree, clitic)
    # Optional verb joining (1): an 'AUX' first child whose word is in
    # before_verbs is prepended to the second child's word and removed.
    if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])
    # Optional verb joining (2): when the last leaf is in after_verbs and
    # the one before it is in verbe, the last leaf of the first child is
    # prepended to the second child's word, and the second-to-last leaf is
    # removed from the sub-tree that holds it.
    if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves())-2)
        removingtree = tree
        # Walk down until only the last two path steps remain.
        while len(path) > 2 :
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
    # NOTE(review): this block repeats the previous one verbatim, so the
    # same join is applied a second time if the condition still holds.
    if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves())-2)
        removingtree = tree
        while len(path) > 2 :
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
    return tree
# Demo: build small nltk Trees by hand, then print and draw them.
from nltk.tree import Tree

# "I saw him": the verb phrase first, then the sentence around it.
vp = Tree('VP', [Tree('V', ['saw']), Tree('NP', ['him'])])
s = Tree('S', [Tree('NP', ['I']), vp])
print(s)

# "the dog chased the cat": two determiner phrases and a verb phrase.
# `vp` is rebound here; `s` keeps the earlier verb phrase.
dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
vp = Tree('vp', [Tree('v', ['chased']), dp2])

# draw() renders a tree graphically (NLTK opens a window, so this
# presumably needs a display).
dp1.draw()
dp2.draw()
vp.draw()

tree = Tree('s', [dp1, vp])
print(tree)
tree.draw()

# The results of the bare expressions below are discarded when this runs
# as a script; only the print() produces output.
len(tree)
print(tree.leaves())
tree.label()
dp1.label()
def traverse(node):
    """
    Convert a DOM element into a `Tree`, recursively.

    `node` is read through `childNodes`, `tagName` and `getAttribute`
    (presumably an xml.dom.minidom element; confirm against the caller).
    `self` is taken from the enclosing scope. Returns None for a node
    without child nodes.
    """
    def extract_tags(W):
        # Collect the tag attributes of a word element. The first entry is
        # the 'lc' attribute, or None when it is empty.
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {
                'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'
        }:
            pos.append(W.getAttribute('clitic'))
        if W.getAttribute('ne_sort'):
            pos.append(W.getAttribute('ne_sort'))
        if W.getAttribute('n_type'):
            pos.append(W.getAttribute('n_type'))
        if W.getAttribute('ya_type'):
            pos.append(W.getAttribute('ya_type'))
        if W.getAttribute('ke_type'):
            pos.append(W.getAttribute('ke_type'))
        if W.getAttribute('type'):
            pos.append(W.getAttribute('type'))
        if W.getAttribute('kind'):
            pos.append(W.getAttribute('kind'))
        return pos

    def clitic_join(tree, clitic):
        # Follow the last child down to the right-most (word, tag) leaf and
        # merge the clitic into it.
        if type(tree[-1]) == Tree:
            return clitic_join(tree[-1], clitic)
        else:
            if (clitic[0][0][0] == 'ا'):
                clitic[0] = ('' + clitic[0][0], clitic[0][1])
            # The merged leaf takes the clitic's tag.
            tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
            tree.set_label('CLITICS')
            return

    if not len(node.childNodes):
        return
    first = node.childNodes[0]
    if first.tagName == 'w':
        # Word element: a one-leaf tree holding a (word, tag) pair.
        pos = extract_tags(first)
        return Tree(node.tagName, [(first.childNodes[0].data.replace(
            'می ', 'می'), self._pos_map(pos))])
    # For an 'S' element the first two child nodes are skipped.
    childs = node.childNodes[
        2:] if node.tagName == 'S' else node.childNodes
    # NOTE(review): this removes items from `childs` while iterating over
    # it, so the element after a removed one is skipped. When `childs` is
    # node.childNodes itself, the removal also changes that list.
    for child in childs:
        if not len(child.childNodes):
            childs.remove(child)
    tree = Tree(node.tagName, map(traverse, childs))
    # Optional clitic joining: the last child is merged into the right-most
    # leaf of a tree rebuilt from the first child's children.
    if self._join_clitics and len(tree) > 1 and type(
            tree[1]) == Tree and tree[1].label(
            ) == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
        clitic = tree[-1]
        tree = Tree(tree.label(), [subtree for subtree in tree[0]])
        clitic_join(tree, clitic)
    # Optional verb joining (1): an 'AUX' first child whose word is in
    # before_verbs is prepended to the second child's word and removed.
    if self._join_verb_parts and len(tree) > 1 and type(
            tree[1]) == Tree and type(
                tree[0]) == Tree and tree[0].label() == 'AUX' and tree[
                    0][0][0] in self._tokenizer.before_verbs:
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])
    # Optional verb joining (2): when the last leaf is in after_verbs and
    # the one before it is in verbe, the last leaf of the first child is
    # prepended to the second child's word, and the second-to-last leaf is
    # removed from the sub-tree that holds it.
    if self._join_verb_parts and len(
            tree.leaves()) > 1 and tree.leaves(
            )[-1][0] in self._tokenizer.after_verbs and tree.leaves(
            )[-2][0] in self._tokenizer.verbe:
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves()) - 2)
        removingtree = tree
        # Walk down until only the last two path steps remain.
        while len(path) > 2:
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(
            Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    # NOTE(review): this block repeats the previous one verbatim, so the
    # same join is applied a second time if the condition still holds.
    if self._join_verb_parts and len(
            tree.leaves()) > 1 and tree.leaves(
            )[-1][0] in self._tokenizer.after_verbs and tree.leaves(
            )[-2][0] in self._tokenizer.verbe:
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves()) - 2)
        removingtree = tree
        while len(path) > 2:
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(
            Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    return tree
def _strip_functional_tags(self, tree: Tree) -> None:
    """
    Strip the functional tags from the constituency labels of a tree.

    A label is cut at its first "=", then at its first "-", then at its
    first "|"; only the leading part is kept. The tree is changed in place.
    Children whose first element is a string (pre-terminals) are left
    untouched.
    """
    label = tree.label()
    for separator in ("=", "-", "|"):
        label = label.partition(separator)[0]
    tree.set_label(label)

    for child in tree:
        if isinstance(child[0], str):
            continue
        self._strip_functional_tags(child)
def _get_gold_spans(self,  # pylint: disable=arguments-differ
                    tree: Tree,
                    index: int,
                    typed_spans: Dict[Tuple[int, int], str]) -> int:
    """
    Walk an nltk ``Tree`` and record a label for every constituent span.

    Constituents that cover exactly the same words share one entry whose
    label joins the individual labels in parent-child order. For example,
    ``(S (NP (D the) (N man)))`` yields ``S-NP`` for the outer span.
    Spans are inclusive ``(start, end)`` pairs. Pre-terminal (POS) nodes
    only advance the word position; they get no entry.

    TODO(Mark): If we encounter a gold nested labelling at test time
    which we haven't encountered, we won't be able to run the model
    at all.

    Parameters
    ----------
    tree : ``Tree``, required.
        An NLTK parse tree to extract spans from.
    index : ``int``, required.
        The position in the sentence at which ``tree`` starts.
    typed_spans : ``Dict[Tuple[int, int], str]``, required.
        Mapping from spans to labels. It is filled in place.

    Returns
    -------
    end : ``int``.
        The exclusive end position of ``tree``, i.e. the start position of
        whatever follows it.
    """
    if isinstance(tree[0], str):
        # Pre-terminal: NLTK leaves are strings and len() counts them.
        return index + len(tree)

    # Internal node: lay the children out one after another.
    end = index
    for subtree in tree:
        end = self._get_gold_spans(subtree, end, typed_spans)

    key = (index, end - 1)
    inner_label = typed_spans.get(key)
    if inner_label is not None:
        # A descendant already claimed this span; we are the outer node,
        # so our label goes in front.
        typed_spans[key] = tree.label() + "-" + inner_label
    else:
        typed_spans[key] = tree.label()
    return end