import os
import sys

from nltk.tree import Tree


def wsjtree2pos(wsj_corpus_path):
    # Read the bracketed (*.mrg) parse files from WSJ sections 00-07 and
    # return each sentence as a list of (word, POS) tuples via Tree.pos().
    print("Reading in corpus...", file=sys.stderr)
    sentences = []
    for d in os.listdir(wsj_corpus_path):
        if os.path.isdir(wsj_corpus_path + "/" + d) and d != "CVS" and int(d) < 8:
            for f in os.listdir(wsj_corpus_path + "/" + d):
                if f.endswith(".mrg"):
                    fname = wsj_corpus_path + "/" + d + "/" + f
                    # print(fname)
                    with open(fname, "r") as tree_f:
                        tree_string = ""
                        for line in tree_f:
                            if line.strip():
                                if line.startswith("( (") or line.startswith("(("):
                                    # A new tree begins: flush the one collected so far.
                                    if tree_string:
                                        tr = Tree.fromstring(tree_string)
                                        sentences.append(tr.pos())
                                    tree_string = line.strip()
                                else:
                                    tree_string += line.strip()
                        # Flush the last tree of the file.
                        if tree_string:
                            tr = Tree.fromstring(tree_string)
                            sentences.append(tr.pos())
    return sentences
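# Usage sketch, assuming a local copy of the parsed WSJ portion of the Penn
# Treebank laid out as numbered section directories of *.mrg files; the path
# below is only a placeholder, not part of the original code.
if __name__ == "__main__":
    tagged_sentences = wsjtree2pos("/path/to/treebank/parsed/mrg/wsj")  # placeholder path
    print(len(tagged_sentences), "sentences read", file=sys.stderr)
    if tagged_sentences:
        # Each sentence is a list of (word, POS) tuples produced by Tree.pos().
        print(tagged_sentences[0])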
from nltk.tree import Tree


def parse_coord(t: Tree, is_coord: bool):
    # Leaf-level node: skip empty elements (-NONE-), otherwise record the word
    # together with a flag saying whether it sits under a COORD constituent.
    if len(t.leaves()) == 1:
        if t.pos()[0][1] == '-NONE-':
            return []
        else:
            return [(t.leaves()[0], is_coord)]
    res = []
    for subtree in t:
        res += parse_coord(subtree, is_coord or "COORD" in subtree.label())
    return res
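# Minimal illustration with invented data: a toy parse containing a node whose
# label includes "COORD", so the words under it come back flagged as
# coordinated, while the -NONE- empty element is dropped.
example = Tree.fromstring(
    "(S (NP (NN cats)) (VP-COORD (VB run) (CC and) (VB jump)) (-NONE- *))"
)
print(parse_coord(example, False))
# [('cats', False), ('run', True), ('and', True), ('jump', True)]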
import json
from http.client import HTTPSConnection

from nltk.tree import Tree


def getPOS(message):
    # POS-tag `message` by requesting a constituency parse from the
    # Project Oxford linguistics API and reading tags off the returned tree.
    body = {
        "language": "en",
        "analyzerIds": ["22a6b758-420f-4745-8a3c-46835a67c0d2"],
        "text": message,
    }
    try:
        conn = HTTPSConnection('api.projectoxford.ai')
        conn.connect()
        # `headers` is expected to be defined elsewhere in the module (request
        # headers such as the content type and the API subscription key).
        conn.request("POST", "/linguistics/v1.0/analyze", json.dumps(body), headers)
        response = conn.getresponse()
        data = json.loads(response.read().decode())
        conn.close()
        # The analyzer returns a bracketed parse string; turn it into (word, POS) pairs.
        ct = Tree.fromstring(data[0]['result'][0])
        pos = ct.pos()
        return pos
    except Exception as e:
        # An error occurred (network, parsing, or unexpected response shape).
        print(e)
        return False
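# The call above depends on a remote service and an externally defined
# `headers` dict, but the local post-processing can be illustrated offline.
# The bracketed parse string below is a made-up stand-in for data[0]['result'][0].
sample_parse = "(TOP (S (NP (PRP I)) (VP (VBP like) (NP (NNS trees)))))"
ct = Tree.fromstring(sample_parse)
print(ct.pos())
# [('I', 'PRP'), ('like', 'VBP'), ('trees', 'NNS')]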
# NOTE: the beginning of this snippet (the assignment that constructs `t2`,
# including its enclosing Tree(...) constructors) was truncated in the source;
# the fragment below resumes inside that nested constructor.
        Tree('Item', ['f1']),
        Tree('List', [
            Tree('Item', ['f2']),
            Tree('List', [Tree('Item', ['f3'])])])]),
    'to',
    Tree('Item', ['folder'])])

print(t2)
print(t2.flatten())
print(type(t2.flatten()))
print(t2.collapse_unary())

# Keep the 'List' subtree that spans the most leaves.
max_subtree = Tree('', [])
for subtree in t2.subtrees(filter=lambda x: x.label() == 'List'):
    if len(subtree.flatten().pos()) > len(max_subtree.pos()):
        max_subtree = subtree
print(max_subtree)

'''
tmp = t2
i = 0
while tmp.label() != 'List':
    i += 1
    tmp = tmp.pos()
    print("the " + str(i) + " time:")
    print(tmp)
    print(type(tmp))
tmp = tmp.flatten()
print(t2)
'''
def from_parsed_sentence(cls, parsed_sent: Tree) -> 'GenerativeOracle':
    actions = cls.get_actions(parsed_sent)
    _, pos_tags = zip(*parsed_sent.pos())
    return cls(actions, list(pos_tags))
def from_parsed_sentence(cls, parsed_sent: Tree) -> 'DiscriminativeOracle':
    actions = cls.get_actions(parsed_sent)
    words, pos_tags = zip(*parsed_sent.pos())
    return cls(actions, list(pos_tags), list(words))
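# Both oracle constructors above rely on the same idiom: Tree.pos() yields
# (word, tag) pairs and zip(*...) transposes them into parallel sequences.
# Small self-contained illustration with an invented parse:
from nltk.tree import Tree

demo = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD slept)))")
words, pos_tags = zip(*demo.pos())
print(list(words))     # ['the', 'cat', 'slept']
print(list(pos_tags))  # ['DT', 'NN', 'VBD']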
# NOTE: the beginning of this snippet (the assignment that constructs `t2` and
# its outermost Tree(...) wrapper) was truncated in the source; the fragment
# below resumes inside that constructor.
    Tree('List', [
        Tree('Item', ['f1']),
        Tree('List', [
            Tree('Item', ['f2']),
            Tree('List', [Tree('Item', ['f3'])])])]),
    'to',
    Tree('Item', ['folder'])])

print(t2)
print(t2.flatten())
print(type(t2.flatten()))
print(t2.collapse_unary())

# Keep the 'List' subtree that spans the most leaves.
max_subtree = Tree('', [])
for subtree in t2.subtrees(filter=lambda x: x.label() == 'List'):
    if len(subtree.flatten().pos()) > len(max_subtree.pos()):
        max_subtree = subtree
print(max_subtree)

'''
tmp = t2
i = 0
while tmp.label() != 'List':
    i += 1
    tmp = tmp.pos()
    print("the " + str(i) + " time:")
    print(tmp)
    print(type(tmp))
tmp = tmp.flatten()
'''
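# Because the two fragments above are truncated, here is a complete,
# self-contained sketch of the same exploration. The root label 'Command' and
# the leading 'move' leaf are assumptions, not the original code.
from nltk.tree import Tree

t = Tree('Command', [
    'move',
    Tree('List', [
        Tree('Item', ['f1']),
        Tree('List', [
            Tree('Item', ['f2']),
            Tree('List', [Tree('Item', ['f3'])])])]),
    'to',
    Tree('Item', ['folder'])])

print(t)
print(t.flatten())  # same leaves, all attached directly under the root

# Pick the 'List' subtree covering the most leaves.
largest = Tree('', [])
for subtree in t.subtrees(filter=lambda x: x.label() == 'List'):
    if len(subtree.leaves()) > len(largest.leaves()):
        largest = subtree
print(largest)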
def from_tree(cls, tree: Tree) -> 'DiscOracle':
    actions = cls.get_actions(tree)
    words, pos_tags = zip(*tree.pos())
    return cls(actions, list(pos_tags), list(words))
def from_tree(cls, tree: Tree) -> 'GenOracle':
    actions = cls.get_actions(tree)
    _, pos_tags = zip(*tree.pos())
    return cls(actions, list(pos_tags))
def traverse(node):
    def extract_tags(W):
        # Collect the POS-relevant attributes of a <w> element into a list.
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        if W.getAttribute('ne_sort'):
            pos.append(W.getAttribute('ne_sort'))
        if W.getAttribute('n_type'):
            pos.append(W.getAttribute('n_type'))
        if W.getAttribute('ya_type'):
            pos.append(W.getAttribute('ya_type'))
        if W.getAttribute('ke_type'):
            pos.append(W.getAttribute('ke_type'))
        if W.getAttribute('type'):
            pos.append(W.getAttribute('type'))
        if W.getAttribute('kind'):
            pos.append(W.getAttribute('kind'))
        return pos

    def clitic_join(tree, clitic):
        # Attach the clitic's surface form to the right-most leaf of the tree.
        if type(tree[-1]) == Tree:
            return clitic_join(tree[-1], clitic)
        else:
            if clitic[0][0][0] == 'ا':
                # Join with a zero-width non-joiner; the invisible U+200C
                # character appears to have been lost in this copy (assumed).
                clitic[0] = ('\u200c' + clitic[0][0], clitic[0][1])
            tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
            tree.set_label('CLITICS')
            return

    if not len(node.childNodes):
        return
    first = node.childNodes[0]
    if first.tagName == 'w':
        # Terminal node: return a one-leaf tree of (word, mapped POS).
        pos = extract_tags(first)
        return Tree(node.tagName,
                    [(first.childNodes[0].data.replace('می ', 'می\u200c'),
                      self._pos_map(pos))])
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    for child in childs:
        if not len(child.childNodes):
            childs.remove(child)
    tree = Tree(node.tagName, map(traverse, childs))
    if (self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree
            and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        tree = Tree(tree.label(), [subtree for subtree in tree[0]])
        clitic_join(tree, clitic)
    if (self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree
            and type(tree[0]) == Tree and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])
    if (self._join_verb_parts and len(tree.leaves()) > 1
            and tree.leaves()[-1][0] in self._tokenizer.after_verbs
            and tree.leaves()[-2][0] in self._tokenizer.verbe):
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves()) - 2)
        removingtree = tree
        while len(path) > 2:
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    # The same join is applied a second time to pick up a further after-verb part.
    if (self._join_verb_parts and len(tree.leaves()) > 1
            and tree.leaves()[-1][0] in self._tokenizer.after_verbs
            and tree.leaves()[-2][0] in self._tokenizer.verbe):
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves()) - 2)
        removingtree = tree
        while len(path) > 2:
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    return tree
def traverse(node):
    def extract_tags(W):
        # Collect the POS-relevant attributes of a <w> element into a list.
        pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
        if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
            pos.append(W.getAttribute('clitic'))
        if W.getAttribute('ne_sort'):
            pos.append(W.getAttribute('ne_sort'))
        if W.getAttribute('n_type'):
            pos.append(W.getAttribute('n_type'))
        if W.getAttribute('ya_type'):
            pos.append(W.getAttribute('ya_type'))
        if W.getAttribute('ke_type'):
            pos.append(W.getAttribute('ke_type'))
        if W.getAttribute('type'):
            pos.append(W.getAttribute('type'))
        if W.getAttribute('kind'):
            pos.append(W.getAttribute('kind'))
        return pos

    def clitic_join(tree, clitic):
        # Attach the clitic's surface form to the right-most leaf of the tree.
        if type(tree[-1]) == Tree:
            return clitic_join(tree[-1], clitic)
        else:
            if clitic[0][0][0] == 'ا':
                # Join with a zero-width non-joiner; the invisible U+200C
                # character appears to have been lost in this copy (assumed).
                clitic[0] = ('\u200c' + clitic[0][0], clitic[0][1])
            tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
            tree.set_label('CLITICS')
            return

    if not len(node.childNodes):
        return
    first = node.childNodes[0]
    if first.tagName == 'w':
        # Terminal node: return a one-leaf tree of (word, mapped POS).
        pos = extract_tags(first)
        return Tree(node.tagName,
                    [(first.childNodes[0].data.replace('می ', 'می\u200c'),
                      self._pos_map(pos))])
    childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
    for child in childs:
        if not len(child.childNodes):
            childs.remove(child)
    tree = Tree(node.tagName, map(traverse, childs))
    if (self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree
            and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}):
        clitic = tree[-1]
        tree = Tree(tree.label(), [subtree for subtree in tree[0]])
        clitic_join(tree, clitic)
    if (self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree
            and type(tree[0]) == Tree and tree[0].label() == 'AUX'
            and tree[0][0][0] in self._tokenizer.before_verbs):
        tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
        tree.remove(tree[0])
    if (self._join_verb_parts and len(tree.leaves()) > 1
            and tree.leaves()[-1][0] in self._tokenizer.after_verbs
            and tree.leaves()[-2][0] in self._tokenizer.verbe):
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves()) - 2)
        removingtree = tree
        while len(path) > 2:
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    # The same join is applied a second time to pick up a further after-verb part.
    if (self._join_verb_parts and len(tree.leaves()) > 1
            and tree.leaves()[-1][0] in self._tokenizer.after_verbs
            and tree.leaves()[-2][0] in self._tokenizer.verbe):
        tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
        path = tree.leaf_treeposition(len(tree.leaves()) - 2)
        removingtree = tree
        while len(path) > 2:
            removingtree = removingtree[path[0]]
            path = path[1:]
        removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
    return tree
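# The verb-joining branches above locate the second-to-last leaf with
# Tree.leaf_treeposition() and then walk down the returned index path to reach
# the subtree to delete. A standalone illustration of that navigation
# (tree shape and labels invented for the example):
from nltk.tree import Tree

demo = Tree.fromstring("(S (NP (N book)) (VP (PREV taken) (V was)))")
path = demo.leaf_treeposition(len(demo.leaves()) - 2)  # path to 'taken'
print(path)  # (1, 0, 0)
node = demo
while len(path) > 2:
    node = node[path[0]]
    path = path[1:]
print(node)  # the VP subtree, whose child holds the second-to-last leaf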