Exemplo n.º 1
0
def wsjtree2pos(wsj_corpus_path):
    """Read WSJ ``.mrg`` parse files and return one POS-tagged sentence per tree.

    Every sub-directory of *wsj_corpus_path* whose name is an integer below 8
    is scanned for files ending in ``.mrg``. Each file is split into
    bracketed trees: a new tree starts on every line that begins with
    ``( (`` or ``((``, and the following non-blank lines are appended to it
    (whitespace-stripped). Directories with non-numeric names (``CVS`` and
    anything else) are skipped.

    Returns a list holding the ``pos()`` result of every tree, in the order
    ``os.listdir`` yields the directories and files.
    """

    def tree_to_pos(tree_text):
        # NOTE(review): assumes Tree is NLTK's. NLTK 3 parses bracketed
        # strings with Tree.fromstring; older releases parsed through the
        # constructor, which is what this code originally called. Prefer
        # fromstring when it exists and fall back otherwise -- confirm
        # against the NLTK version in use.
        parse = getattr(Tree, "fromstring", Tree)
        return parse(tree_text).pos()

    # Plain write() works under both Python 2 and Python 3.
    sys.stderr.write("Reading in corpus...\n")
    sentences = []
    for d in os.listdir(wsj_corpus_path):
        section_dir = os.path.join(wsj_corpus_path, d)
        if not os.path.isdir(section_dir):
            continue
        try:
            section = int(d)
        except ValueError:
            # Non-numeric directory name (e.g. "CVS"): not a corpus section.
            continue
        if section >= 8:
            continue
        for f in os.listdir(section_dir):
            if not f.endswith(".mrg"):
                continue
            parts = []
            # The context manager closes the file even if parsing raises.
            with open(os.path.join(section_dir, f), "r") as tree_f:
                for line in tree_f:
                    stripped = line.strip()
                    if not stripped:
                        continue
                    # The prefix test is deliberately made on the raw line:
                    # only an unindented opening bracket starts a new tree.
                    if line.startswith(("( (", "((")):
                        if parts:
                            sentences.append(tree_to_pos("".join(parts)))
                        parts = [stripped]
                    else:
                        parts.append(stripped)
            if parts:
                sentences.append(tree_to_pos("".join(parts)))

    return sentences
Exemplo n.º 2
0
    def parse_coord(t: Tree, is_coord: bool):
        """Flatten *t* into a list of ``(leaf, flag)`` pairs.

        Recursion stops at the first subtree that has exactly one leaf. That
        leaf is emitted with the flag accumulated so far, unless the first
        tag reported by ``t.pos()`` is ``-NONE-``, in which case nothing is
        emitted. The flag passed to a child is ``is_coord`` if that is
        already truthy, otherwise whether "COORD" occurs in the child's
        label.
        """
        leaves = t.leaves()
        if len(leaves) != 1:
            pairs = []
            for child in t:
                child_flag = is_coord or "COORD" in child.label()
                pairs.extend(parse_coord(child, child_flag))
            return pairs

        # Exactly one leaf below this node.
        if t.pos()[0][1] == '-NONE-':
            return []
        return [(leaves[0], is_coord)]
Exemplo n.º 3
0
def getPOS(message):
    """Send *message* to the linguistics analysis service and return its POS pairs.

    The text is POSTed as JSON to ``/linguistics/v1.0/analyze`` on
    ``api.projectoxford.ai`` using the module-level ``headers``. The first
    entry of the first analyzer's ``result`` is parsed as a bracketed tree
    and its ``pos()`` list is returned.

    Returns ``False`` (after printing the exception) when anything in the
    request, decoding or parsing fails.
    """
    body = {
        "language": "en",
        "analyzerIds": ["22a6b758-420f-4745-8a3c-46835a67c0d2"],
        "text": message,
    }

    conn = None
    try:
        conn = HTTPSConnection('api.projectoxford.ai')
        conn.connect()
        conn.request("POST", "/linguistics/v1.0/analyze", json.dumps(body), headers)
        response = conn.getresponse()
        data = json.loads(response.read().decode())
        ct = Tree.fromstring(data[0]['result'][0])
        return ct.pos()
    except Exception as e:
        # Deliberate best-effort: callers get False instead of an exception.
        print(e)
        return False
    finally:
        # Close on every path; previously a failure after connect() left the
        # connection open.
        if conn is not None:
            conn.close()
Exemplo n.º 4
0
        Tree('Item', ['f1']),
        Tree('List',
             [Tree('Item', ['f2']),
              Tree('List', [Tree('Item', ['f3'])])])
    ]), 'to',
    Tree('Item', ['folder'])
])

# Print the tree, the result of flatten(), and the type of that result.
print(t2)
print(t2.flatten())
print(type(t2.flatten()))
# NOTE(review): if Tree is NLTK's, collapse_unary() appears to rewrite the
# tree in place and return None, so this would print "None" and leave t2
# modified for the loop below -- confirm.
print(t2.collapse_unary())

# Pick the 'List' subtree with the most pos() entries. The comparison is
# strict, so on a tie the subtree visited first is kept. The empty Tree is
# the starting value to beat.
max_subtree = Tree('', [])
for subtree in t2.subtrees(filter=lambda x: x.label() == 'List'):
    if len(subtree.flatten().pos()) > len(max_subtree.pos()):
        max_subtree = subtree
print(max_subtree)
'''
tmp = t2
i = 
while tmp.label() != 'List':
	i += 1
	tmp = tmp.pos()
	print("the " + str(i) + " time:")
	print(tmp)
	print(type(tmp))
tmp = tmp.flatten()

print(t2)
'''
Exemplo n.º 5
0
 def from_parsed_sentence(cls, parsed_sent: Tree) -> 'GenerativeOracle':
     """Build an oracle from a parsed sentence.

     The actions come from ``cls.get_actions(parsed_sent)``. The tag list is
     the second element of every pair returned by ``parsed_sent.pos()``; the
     first elements are discarded.
     """
     oracle_actions = cls.get_actions(parsed_sent)
     # Transpose the pairs into two columns; only the second one is used.
     columns = tuple(zip(*parsed_sent.pos()))
     _, tag_column = columns
     return cls(oracle_actions, list(tag_column))
Exemplo n.º 6
0
 def from_parsed_sentence(cls, parsed_sent: Tree) -> 'DiscriminativeOracle':
     """Build an oracle from a parsed sentence.

     The actions come from ``cls.get_actions(parsed_sent)``. The pairs
     returned by ``parsed_sent.pos()`` are split into their first elements
     (bound as words) and second elements (bound as tags); the constructor
     receives the actions, then the tags, then the words.
     """
     oracle_actions = cls.get_actions(parsed_sent)
     # Transpose the pairs into a first-element and a second-element column.
     columns = tuple(zip(*parsed_sent.pos()))
     word_column, tag_column = columns
     return cls(oracle_actions, list(tag_column), list(word_column))
Exemplo n.º 7
0
	Tree('List', [
		Tree('Item', ['f1']), 
		Tree('List', [
			Tree('Item', ['f2']),
			Tree('List', [Tree('Item', ['f3'])])])]),
	'to', 
	Tree('Item', ['folder'])])

# Print the tree, the result of flatten(), and the type of that result.
print(t2)
print(t2.flatten())
print(type(t2.flatten()))
# NOTE(review): if Tree is NLTK's, collapse_unary() appears to rewrite the
# tree in place and return None, so this would print "None" and leave t2
# modified for the loop below -- confirm.
print(t2.collapse_unary())

# Pick the 'List' subtree with the most pos() entries. The comparison is
# strict, so on a tie the subtree visited first is kept. The empty Tree is
# the starting value to beat.
max_subtree = Tree('', [])
for subtree in t2.subtrees(filter = lambda x: x.label() == 'List'):
	if len(subtree.flatten().pos()) > len(max_subtree.pos()):
		max_subtree = subtree
print(max_subtree)


'''
tmp = t2
i = 
while tmp.label() != 'List':
	i += 1
	tmp = tmp.pos()
	print("the " + str(i) + " time:")
	print(tmp)
	print(type(tmp))
tmp = tmp.flatten()
Exemplo n.º 8
0
 def from_tree(cls, tree: Tree) -> 'DiscOracle':
     """Build an oracle from a tree.

     The actions come from ``cls.get_actions(tree)``. The pairs returned by
     ``tree.pos()`` are split into their first elements (bound as words) and
     second elements (bound as tags); the constructor receives the actions,
     then the tags, then the words.
     """
     oracle_actions = cls.get_actions(tree)
     # Transpose the pairs into a first-element and a second-element column.
     columns = tuple(zip(*tree.pos()))
     word_column, tag_column = columns
     return cls(oracle_actions, list(tag_column), list(word_column))
Exemplo n.º 9
0
 def from_tree(cls, tree: Tree) -> 'GenOracle':
     """Build an oracle from a tree.

     The actions come from ``cls.get_actions(tree)``. The tag list is the
     second element of every pair returned by ``tree.pos()``; the first
     elements are discarded.
     """
     oracle_actions = cls.get_actions(tree)
     # Transpose the pairs into two columns; only the second one is used.
     columns = tuple(zip(*tree.pos()))
     _, tag_column = columns
     return cls(oracle_actions, list(tag_column))
Exemplo n.º 10
0
        def traverse(node):
            """Convert DOM *node* into a Tree, recursing through its children.

            Returns None when *node* has no child nodes. A node whose first
            child is a ``w`` element becomes a one-leaf Tree labelled with
            the node's tag name; the leaf is a ``(text, tag)`` tuple where
            the tag is ``self._pos_map`` applied to the element's
            attributes. Any other node becomes a Tree of its converted
            children (the first two children are skipped for an ``S``
            node), after which the clitic join (``self._join_clitics``) and
            the verb-part joins (``self._join_verb_parts``) are applied.
            """

            def extract_tags(W):
                """Return the tag attributes of element *W* as a list.

                The first entry is the ``lc`` attribute, or None when it is
                empty. The ``clitic`` attribute follows if it is one of the
                recognised values, then every non-empty optional attribute
                in the fixed order below.
                """
                pos = [W.getAttribute('lc') or None]
                clitic_kind = W.getAttribute('clitic')
                if clitic_kind in {
                        'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'
                }:
                    pos.append(clitic_kind)
                for name in ('ne_sort', 'n_type', 'ya_type', 'ke_type',
                             'type', 'kind'):
                    value = W.getAttribute(name)
                    if value:
                        pos.append(value)
                return pos

            def clitic_join(tree, clitic):
                """Glue the clitic's text onto the last leaf under *tree*.

                Descends through last children until the last child is a
                leaf, replaces that leaf with (leaf text + clitic text,
                clitic tag) and relabels the leaf's parent ``CLITICS``.
                """
                if isinstance(tree[-1], Tree):
                    return clitic_join(tree[-1], clitic)
                if clitic[0][0][0] == 'ا':
                    # A clitic starting with alef gets the joiner character
                    # from the literal below put in front of it first.
                    clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
                tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
                tree.set_label('CLITICS')

            def join_after_verb(tree):
                """Merge a trailing verb part into the tree, in place.

                Applies only when verb-part joining is enabled, the last
                leaf's text is in ``self._tokenizer.after_verbs`` and the
                one before it is in ``self._tokenizer.verbe``. The last leaf
                text of ``tree[0]`` is then prefixed to the first leaf of
                ``tree[1]``, and the one-leaf subtree of the second-to-last
                leaf is removed.
                """
                if not self._join_verb_parts:
                    return
                leaves = tree.leaves()
                if len(leaves) < 2:
                    return
                if (leaves[-1][0] not in self._tokenizer.after_verbs
                        or leaves[-2][0] not in self._tokenizer.verbe):
                    return
                tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                path = tree.leaf_treeposition(len(tree.leaves()) - 2)
                removingtree = tree
                # Walk down to the grandparent of the second-to-last leaf.
                while len(path) > 2:
                    removingtree = removingtree[path[0]]
                    path = path[1:]
                removingtree.remove(
                    Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))

            if not len(node.childNodes):
                return
            first = node.childNodes[0]
            if first.tagName == 'w':
                pos = extract_tags(first)
                return Tree(node.tagName, [(first.childNodes[0].data.replace(
                    'می ', 'می‌'), self._pos_map(pos))])
            childs = node.childNodes[
                2:] if node.tagName == 'S' else node.childNodes
            # Drop childless nodes by building a new list. The previous
            # in-place remove() while iterating skipped the node following
            # each removed one (so two childless nodes in a row left the
            # second behind) and also edited the DOM's own childNodes list.
            childs = [child for child in childs if len(child.childNodes)]
            tree = Tree(node.tagName, [traverse(child) for child in childs])
            if (self._join_clitics and len(tree) > 1
                    and isinstance(tree[1], Tree)
                    and tree[1].label() == 'CLITIC'
                    and tree[1][0][1] not in {'P', 'V'}):
                clitic = tree[-1]
                tree = Tree(tree.label(), list(tree[0]))
                clitic_join(tree, clitic)
            if (self._join_verb_parts and len(tree) > 1
                    and isinstance(tree[1], Tree)
                    and isinstance(tree[0], Tree)
                    and tree[0].label() == 'AUX'
                    and tree[0][0][0] in self._tokenizer.before_verbs):
                # Fold a leading AUX into the first leaf of the next subtree.
                tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                tree.remove(tree[0])
            # The original code carried this step twice, verbatim; both
            # applications are kept (presumably to absorb two stacked
            # trailing verb parts -- TODO confirm).
            join_after_verb(tree)
            join_after_verb(tree)
            return tree
Exemplo n.º 11
0
		def traverse(node):
			"""Convert DOM *node* into a Tree, recursing through its children.

			Returns None when *node* has no child nodes. A node whose first
			child is a ``w`` element becomes a one-leaf Tree labelled with
			the node's tag name; the leaf is a ``(text, tag)`` tuple where
			the tag is ``self._pos_map`` applied to the element's
			attributes. Any other node becomes a Tree of its converted
			children (the first two children are skipped for an ``S``
			node), after which the clitic join (``self._join_clitics``) and
			the verb-part joins (``self._join_verb_parts``) are applied.
			"""

			def extract_tags(W):
				"""Return the tag attributes of element *W* as a list.

				The first entry is the ``lc`` attribute, or None when it is
				empty. The ``clitic`` attribute follows if it is one of the
				recognised values, then every non-empty optional attribute
				in the fixed order below.
				"""
				pos = [W.getAttribute('lc') or None]
				clitic_kind = W.getAttribute('clitic')
				if clitic_kind in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
					pos.append(clitic_kind)
				for name in ('ne_sort', 'n_type', 'ya_type', 'ke_type', 'type', 'kind'):
					value = W.getAttribute(name)
					if value:
						pos.append(value)
				return pos

			def clitic_join(tree, clitic):
				"""Glue the clitic's text onto the last leaf under *tree*.

				Descends through last children until the last child is a
				leaf, replaces that leaf with (leaf text + clitic text,
				clitic tag) and relabels the leaf's parent ``CLITICS``.
				"""
				if isinstance(tree[-1], Tree):
					return clitic_join(tree[-1], clitic)
				if clitic[0][0][0] == 'ا':
					# A clitic starting with alef gets the joiner character
					# from the literal below put in front of it first.
					clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
				tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
				tree.set_label('CLITICS')

			def join_after_verb(tree):
				"""Merge a trailing verb part into the tree, in place.

				Applies only when verb-part joining is enabled, the last
				leaf's text is in ``self._tokenizer.after_verbs`` and the
				one before it is in ``self._tokenizer.verbe``. The last leaf
				text of ``tree[0]`` is then prefixed to the first leaf of
				``tree[1]``, and the one-leaf subtree of the second-to-last
				leaf is removed.
				"""
				if not self._join_verb_parts:
					return
				leaves = tree.leaves()
				if len(leaves) < 2:
					return
				if leaves[-1][0] not in self._tokenizer.after_verbs or leaves[-2][0] not in self._tokenizer.verbe:
					return
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				# Walk down to the grandparent of the second-to-last leaf.
				while len(path) > 2:
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))

			if not len(node.childNodes):
				return
			first = node.childNodes[0]
			if first.tagName == 'w':
				pos = extract_tags(first)
				return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))])
			childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
			# Drop childless nodes by building a new list. The previous
			# in-place remove() while iterating skipped the node following
			# each removed one (so two childless nodes in a row left the
			# second behind) and also edited the DOM's own childNodes list.
			childs = [child for child in childs if len(child.childNodes)]
			tree = Tree(node.tagName, [traverse(child) for child in childs])
			if self._join_clitics and len(tree) > 1 and isinstance(tree[1], Tree) and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
				clitic = tree[-1]
				tree = Tree(tree.label(), list(tree[0]))
				clitic_join(tree, clitic)
			if self._join_verb_parts and len(tree) > 1 and isinstance(tree[1], Tree) and isinstance(tree[0], Tree) and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
				# Fold a leading AUX into the first leaf of the next subtree.
				tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
				tree.remove(tree[0])
			# The original code carried this step twice, verbatim; both
			# applications are kept (presumably to absorb two stacked
			# trailing verb parts -- TODO confirm).
			join_after_verb(tree)
			join_after_verb(tree)
			return tree