Пример #1
0
        def traverse(node):
            """Recursively convert an XML DOM ``node`` into a ``Tree``.

            Returns ``None`` for a node without child nodes.  A node whose
            first child is a ``w`` element becomes a single-leaf tree labelled
            with the node's tag name; the leaf is a ``(word, tag)`` tuple whose
            tag comes from ``self._pos_map``.  Any other node becomes a tree
            over its non-empty children, after which clitics and verb parts
            are joined when ``self._join_clitics`` / ``self._join_verb_parts``
            are set.
            """

            def extract_tags(W):
                """Return the tag attributes of word element ``W`` as a list.

                The first item is the ``lc`` attribute (``None`` when empty),
                followed by each other non-empty tag attribute in fixed order.
                """
                pos = [W.getAttribute('lc') or None]
                clitic_kind = W.getAttribute('clitic')
                if clitic_kind in {
                        'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'
                }:
                    pos.append(clitic_kind)
                for attribute in ('ne_sort', 'n_type', 'ya_type', 'ke_type',
                                  'type', 'kind'):
                    value = W.getAttribute(attribute)
                    if value:
                        pos.append(value)
                return pos

            def clitic_join(tree, clitic):
                """Merge the word of ``clitic`` into the right-most leaf of ``tree``.

                Follows the last child downwards until it is no longer a
                ``Tree``, appends the clitic text to that leaf, gives the leaf
                the clitic's tag and relabels its parent ``CLITICS``.
                """
                if isinstance(tree[-1], Tree):
                    return clitic_join(tree[-1], clitic)
                if clitic[0][0][0] == 'ا':
                    # A clitic starting with alef gets the one-character
                    # literal below prepended (an invisible joiner character
                    # in the source; presumably U+200C -- confirm).
                    clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
                tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
                tree.set_label('CLITICS')
                return None

            def join_after_verb(tree):
                """Join a trailing ``after_verbs`` word with the verb before it.

                Does nothing unless the last leaf's word is in
                ``self._tokenizer.after_verbs`` and the one before it is in
                ``self._tokenizer.verbe``.
                """
                leaves = tree.leaves()
                if not (len(leaves) > 1
                        and leaves[-1][0] in self._tokenizer.after_verbs
                        and leaves[-2][0] in self._tokenizer.verbe):
                    return
                tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                # Walk down to the ancestor two levels above the
                # second-to-last leaf and drop that leaf's pre-terminal.
                path = tree.leaf_treeposition(len(tree.leaves()) - 2)
                removingtree = tree
                while len(path) > 2:
                    removingtree = removingtree[path[0]]
                    path = path[1:]
                removingtree.remove(
                    Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))

            if not len(node.childNodes):
                return None
            first = node.childNodes[0]
            if first.tagName == 'w':
                pos = extract_tags(first)
                return Tree(node.tagName, [(first.childNodes[0].data.replace(
                    'می ', 'می‌'), self._pos_map(pos))])

            # For an 'S' node the first two children are skipped
            # (NOTE(review): presumably non-constituent header nodes -- confirm).
            candidates = node.childNodes[
                2:] if node.tagName == 'S' else node.childNodes
            # Build a new list instead of calling remove() on the list being
            # iterated: removing in place skips the element that follows each
            # removal and, for non-'S' nodes, mutated node.childNodes itself.
            childs = [child for child in candidates if len(child.childNodes)]
            tree = Tree(node.tagName, [traverse(child) for child in childs])

            if (self._join_clitics and len(tree) > 1
                    and isinstance(tree[1], Tree)
                    and tree[1].label() == 'CLITIC'
                    and tree[1][0][1] not in {'P', 'V'}):
                clitic = tree[-1]
                # Rebuild the tree from the first child's children, then fold
                # the clitic into its right-most leaf.
                tree = Tree(tree.label(), [subtree for subtree in tree[0]])
                clitic_join(tree, clitic)

            if (self._join_verb_parts and len(tree) > 1
                    and isinstance(tree[1], Tree)
                    and isinstance(tree[0], Tree)
                    and tree[0].label() == 'AUX'
                    and tree[0][0][0] in self._tokenizer.before_verbs):
                # Prefix the auxiliary word onto the following verb leaf and
                # drop the AUX subtree.
                tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                tree.remove(tree[0])

            if self._join_verb_parts:
                # The original code ran this join twice back to back; keep
                # both passes (the second sees the result of the first).
                for _ in range(2):
                    join_after_verb(tree)
            return tree
Пример #2
0
		def traverse(node):
			"""Recursively convert an XML DOM ``node`` into a ``Tree``.

			Returns ``None`` for a node without child nodes.  A node whose
			first child is a ``w`` element becomes a single-leaf tree labelled
			with the node's tag name; the leaf is a ``(word, tag)`` tuple whose
			tag comes from ``self._pos_map``.  Any other node becomes a tree
			over its non-empty children, after which clitics and verb parts
			are joined when ``self._join_clitics`` / ``self._join_verb_parts``
			are set.
			"""

			def extract_tags(W):
				"""Return the tag attributes of word element ``W`` as a list.

				The first item is the ``lc`` attribute (``None`` when empty),
				followed by each other non-empty tag attribute in fixed order.
				"""
				pos = [W.getAttribute('lc') or None]
				clitic_kind = W.getAttribute('clitic')
				if clitic_kind in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
					pos.append(clitic_kind)
				for attribute in ('ne_sort', 'n_type', 'ya_type', 'ke_type', 'type', 'kind'):
					value = W.getAttribute(attribute)
					if value:
						pos.append(value)
				return pos

			def clitic_join(tree, clitic):
				"""Merge the word of ``clitic`` into the right-most leaf of ``tree``.

				Follows the last child downwards until it is no longer a
				``Tree``, appends the clitic text to that leaf, gives the leaf
				the clitic's tag and relabels its parent ``CLITICS``.
				"""
				if isinstance(tree[-1], Tree):
					return clitic_join(tree[-1], clitic)
				if clitic[0][0][0] == 'ا':
					# A clitic starting with alef gets the one-character
					# literal below prepended (an invisible joiner character
					# in the source; presumably U+200C -- confirm).
					clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
				tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
				tree.set_label('CLITICS')
				return None

			def join_after_verb(tree):
				"""Join a trailing ``after_verbs`` word with the verb before it.

				Does nothing unless the last leaf's word is in
				``self._tokenizer.after_verbs`` and the one before it is in
				``self._tokenizer.verbe``.
				"""
				leaves = tree.leaves()
				if not (len(leaves) > 1 and leaves[-1][0] in self._tokenizer.after_verbs and leaves[-2][0] in self._tokenizer.verbe):
					return
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				# Walk down to the ancestor two levels above the
				# second-to-last leaf and drop that leaf's pre-terminal.
				path = tree.leaf_treeposition(len(tree.leaves()) - 2)
				removingtree = tree
				while len(path) > 2:
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))

			if not len(node.childNodes):
				return None
			first = node.childNodes[0]
			if first.tagName == 'w':
				pos = extract_tags(first)
				return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))])

			# For an 'S' node the first two children are skipped
			# (NOTE(review): presumably non-constituent header nodes -- confirm).
			candidates = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
			# Build a new list instead of calling remove() on the list being
			# iterated: removing in place skips the element that follows each
			# removal and, for non-'S' nodes, mutated node.childNodes itself.
			childs = [child for child in candidates if len(child.childNodes)]
			tree = Tree(node.tagName, [traverse(child) for child in childs])

			if self._join_clitics and len(tree) > 1 and isinstance(tree[1], Tree) and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
				clitic = tree[-1]
				# Rebuild the tree from the first child's children, then fold
				# the clitic into its right-most leaf.
				tree = Tree(tree.label(), [subtree for subtree in tree[0]])
				clitic_join(tree, clitic)

			if self._join_verb_parts and len(tree) > 1 and isinstance(tree[1], Tree) and isinstance(tree[0], Tree) and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
				# Prefix the auxiliary word onto the following verb leaf and
				# drop the AUX subtree.
				tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
				tree.remove(tree[0])

			if self._join_verb_parts:
				# The original code ran this join twice back to back; keep
				# both passes (the second sees the result of the first).
				for _ in range(2):
					join_after_verb(tree)
			return tree