def traverse(node):
    """Convert a DOM node into a ``Tree``, recursing into its children.

    A node whose first child is a ``w`` element becomes a one-leaf tree
    holding a ``(word, tag)`` tuple.  Any other node becomes a tree of its
    converted children, after which the optional clitic-joining and
    verb-part-joining passes are applied.

    ``Tree``, ``self`` and ``self._tokenizer`` are resolved from the
    enclosing scope, which is not part of this block.

    Args:
        node: A DOM element exposing ``childNodes`` and ``tagName``.

    Returns:
        The ``Tree`` built for ``node``, or ``None`` when ``node`` has no
        child nodes.
    """

    def extract_tags(W):
        """Collect the tag attributes of word element ``W`` into a list.

        The first entry is the ``lc`` attribute (``None`` when it is empty);
        the other attributes are appended in a fixed order when set.
        """
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {
                'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        for name in ('ne_sort', 'n_type', 'ya_type', 'ke_type', 'type',
                     'kind'):
            value = W.getAttribute(name)
            if value:
                pos.append(value)
        return pos

    def clitic_join(tree, clitic):
        """Append the clitic's word to the right-most leaf under ``tree``.

        Follows the last child downwards until it reaches a leaf.  That leaf
        gets the clitic word appended and takes the clitic's tag, and the
        subtree directly holding the leaf is relabelled ``CLITICS``.
        """
        if isinstance(tree[-1], Tree):
            return clitic_join(tree[-1], clitic)
        if clitic[0][0][0] == 'ا':
            # NOTE(review): the prefix literal reads as an empty string here,
            # which makes this concatenation a no-op; it may originally have
            # held an invisible joiner character - confirm against upstream.
            clitic[0] = ('' + clitic[0][0], clitic[0][1])
        tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
        tree.set_label('CLITICS')

    if not len(node.childNodes):
        return None

    first = node.childNodes[0]
    if first.tagName == 'w':
        pos = extract_tags(first)
        word = first.childNodes[0].data.replace('می ', 'می')
        return Tree(node.tagName, [(word, self._pos_map(pos))])

    # For an 'S' node the first two children are not converted.
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # Filter into a new list instead of calling remove() while iterating:
    # removing during iteration skips the element that follows each removed
    # one (so an empty child could survive and become None in the tree), and
    # for non-'S' nodes it also modified node.childNodes itself.
    childs = [child for child in childs if len(child.childNodes)]
    tree = Tree(node.tagName, map(traverse, childs))

    # The check looks at the second child, but the clitic that gets merged is
    # the last child; only the children of the first child are kept.
    if (self._join_clitics and len(tree) > 1
            and isinstance(tree[1], Tree)
            and tree[1].label() == 'CLITIC'
            and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        tree = Tree(tree.label(), list(tree[0]))
        clitic_join(tree, clitic)

    # Leading AUX listed in before_verbs: prepend its word to the first leaf
    # of the second child, then drop the AUX subtree.
    if (self._join_verb_parts and len(tree) > 1
            and isinstance(tree[1], Tree)
            and isinstance(tree[0], Tree)
            and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])

    # The original code contains this pass twice in a row, so it is run twice
    # here as well; the condition is re-evaluated on the updated tree.
    for _ in range(2):
        if (self._join_verb_parts and len(tree.leaves()) > 1
                and tree.leaves()[-1][0] in self._tokenizer.after_verbs
                and tree.leaves()[-2][0] in self._tokenizer.verbe):
            tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                          tree[1][0][1])
            path = tree.leaf_treeposition(len(tree.leaves()) - 2)
            # Walk down until two path steps remain, i.e. stop at the parent
            # of the subtree that directly holds the second-to-last leaf.
            removingtree = tree
            while len(path) > 2:
                removingtree = removingtree[path[0]]
                path = path[1:]
            # Rebuild that one-leaf subtree from pos() and remove its equal.
            removingtree.remove(
                Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))

    return tree
def traverse(node):
    """Convert a DOM node into a ``Tree``, recursing into its children.

    A node whose first child is a ``w`` element becomes a one-leaf tree
    holding a ``(word, tag)`` tuple.  Any other node becomes a tree of its
    converted children, after which the optional clitic-joining and
    verb-part-joining passes are applied.

    ``Tree``, ``self`` and ``self._tokenizer`` are resolved from the
    enclosing scope, which is not part of this block.

    Args:
        node: A DOM element exposing ``childNodes`` and ``tagName``.

    Returns:
        The ``Tree`` built for ``node``, or ``None`` when ``node`` has no
        child nodes.
    """

    def extract_tags(W):
        """Collect the tag attributes of word element ``W`` into a list.

        The first entry is the ``lc`` attribute (``None`` when it is empty);
        the other attributes are appended in a fixed order when set.
        """
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {
                'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        for name in ('ne_sort', 'n_type', 'ya_type', 'ke_type', 'type',
                     'kind'):
            value = W.getAttribute(name)
            if value:
                pos.append(value)
        return pos

    def clitic_join(tree, clitic):
        """Append the clitic's word to the right-most leaf under ``tree``.

        Follows the last child downwards until it reaches a leaf.  That leaf
        gets the clitic word appended and takes the clitic's tag, and the
        subtree directly holding the leaf is relabelled ``CLITICS``.
        """
        if isinstance(tree[-1], Tree):
            return clitic_join(tree[-1], clitic)
        if clitic[0][0][0] == 'ا':
            # NOTE(review): the prefix literal reads as an empty string here,
            # which makes this concatenation a no-op; it may originally have
            # held an invisible joiner character - confirm against upstream.
            clitic[0] = ('' + clitic[0][0], clitic[0][1])
        tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
        tree.set_label('CLITICS')

    if not len(node.childNodes):
        return None

    first = node.childNodes[0]
    if first.tagName == 'w':
        pos = extract_tags(first)
        word = first.childNodes[0].data.replace('می ', 'می')
        return Tree(node.tagName, [(word, self._pos_map(pos))])

    # For an 'S' node the first two children are not converted.
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    # Filter into a new list instead of calling remove() while iterating:
    # removing during iteration skips the element that follows each removed
    # one (so an empty child could survive and become None in the tree), and
    # for non-'S' nodes it also modified node.childNodes itself.
    childs = [child for child in childs if len(child.childNodes)]
    tree = Tree(node.tagName, map(traverse, childs))

    # The check looks at the second child, but the clitic that gets merged is
    # the last child; only the children of the first child are kept.
    if (self._join_clitics and len(tree) > 1
            and isinstance(tree[1], Tree)
            and tree[1].label() == 'CLITIC'
            and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        tree = Tree(tree.label(), list(tree[0]))
        clitic_join(tree, clitic)

    # Leading AUX listed in before_verbs: prepend its word to the first leaf
    # of the second child, then drop the AUX subtree.
    if (self._join_verb_parts and len(tree) > 1
            and isinstance(tree[1], Tree)
            and isinstance(tree[0], Tree)
            and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])

    # The original code contains this pass twice in a row, so it is run twice
    # here as well; the condition is re-evaluated on the updated tree.
    for _ in range(2):
        if (self._join_verb_parts and len(tree.leaves()) > 1
                and tree.leaves()[-1][0] in self._tokenizer.after_verbs
                and tree.leaves()[-2][0] in self._tokenizer.verbe):
            tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                          tree[1][0][1])
            path = tree.leaf_treeposition(len(tree.leaves()) - 2)
            # Walk down until two path steps remain, i.e. stop at the parent
            # of the subtree that directly holds the second-to-last leaf.
            removingtree = tree
            while len(path) > 2:
                removingtree = removingtree[path[0]]
                path = path[1:]
            # Rebuild that one-leaf subtree from pos() and remove its equal.
            removingtree.remove(
                Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))

    return tree