def post_order(s):
    """Recursively build graph arrays from the S-expression subtree *s*,
    returning the index assigned to this node.

    NOTE(review): this is a closure — it mutates enclosing-scope state:
    ``index`` (next node id), ``leaf_cnt`` (leaf counter), and the lists
    ``labels`` (per-node label), ``prts`` (per-node list of parent
    indices) and ``childs`` (per-node child indices incl. a self-loop).
    Confirm these bindings against the enclosing function.
    """
    nonlocal index
    nonlocal leaf_cnt
    # Split "(LABEL rest)" into the node label and the remaining phrase.
    label, phrase = s[1:-1].split(None, 1)
    leafs = sexpr.sexpr_tokenize(phrase)
    if len(leafs) == 2:
        # Binary internal node: recurse on both children first
        # (post-order), then record this node as each child's parent.
        # ``index`` still holds this node's id here since it is only
        # advanced after both recursive calls return.
        lstr, rstr = leafs
        lrst = post_order(lstr)
        rrst = post_order(rstr)
        prts[lrst].append(index)
        prts[rrst].append(index)
    else:
        # Leaf node: just count it; its label is appended below.
        leaf_cnt += 1
    labels.append(label)
    prts.append([])  # parents of this node get appended by its parent later
    # childs.append([lrst, rrst] if len(leafs)==2 else [])
    cur = index
    childs_list = [lrst, rrst] if len(leafs) == 2 else []
    childs_list.append(cur)  # self-loop
    childs.append(childs_list)
    # print(index)
    index += 1
    return cur
def parse_subtree(self, s):
    """Split one S-expression into (root label, tokenized children).

    The outer parentheses of *s* are dropped. If the interior has no
    space to split on, the root label is taken to be empty and the whole
    interior is treated as the children. Punctuation and spaces are
    stripped from the root label.
    """
    inner = s[1:-1].lstrip()
    pieces = inner.split(' ', 1)
    if len(pieces) == 2:
        root, rest = pieces
    else:
        root, rest = '', inner
    return (root.strip(punctuation + ' '), sexpr_tokenize(rest.strip()))
def parse(root_sexpr):
    """Flatten an S-expression tree to ``(root label, joined leaf tokens)``.

    Performs an iterative pre-order walk with an explicit stack; leaf
    words are collected left-to-right and joined with spaces.

    NOTE(review): ``Stack`` and ``sexpr`` come from the enclosing module.
    Assumes ``stack.empty`` is an attribute/property that is truthy when
    the stack is empty — confirm against the Stack implementation.
    """
    label, sub_sexpr = root_sexpr[1:-1].split(None, 1)
    tokens = []
    stack = Stack()
    # Push top-level children in reverse so they are popped left-to-right.
    for child in reversed(sexpr.sexpr_tokenize(sub_sexpr)):
        stack.push(child)
    while not stack.empty:
        _, next_sexpr = stack.pop()[1:-1].split(None, 1)
        next_sexprs = sexpr.sexpr_tokenize(next_sexpr)
        # Leaf: exactly one token and that token is not itself bracketed.
        # BUG FIX: the second membership test previously checked the LIST
        # (`')' not in next_sexprs`) instead of the token string, so a
        # token containing ')' could slip through as a "leaf".
        if len(next_sexprs) == 1 and ('(' not in next_sexprs[0] and ')' not in next_sexprs[0]):
            tokens.append(next_sexprs[0])
        else:
            # Internal node: push children in reverse order as above.
            for child in reversed(next_sexprs):
                stack.push(child)
    return label, ' '.join(tokens)
def post_order(s):
    """Post-order walk of a binary S-expression tree.

    Leaves push their word into ``words`` and their tag into ``labels``
    (both presumably lists from the enclosing scope — confirm against the
    surrounding code); the label of the visited node is returned.
    """
    label, rest = s[1:-1].split(None, 1)
    parts = sexpr.sexpr_tokenize(rest)
    if len(parts) == 2:
        # Internal binary node: visit left subtree, then right.
        left, right = parts
        post_order(left)
        post_order(right)
    else:
        # Leaf: record the word together with its label.
        words.append(parts[0])
        labels.append(label)
    return label
def tokenize(x):
    """Tokenizes S-expression dependency parse trees that come with NLI data.

    This one has been tested here:
    https://github.com/timniven/hsnli/blob/master/hsnli/tests/tree_sexpr_tests.py

    Args:
      x: String, the tree (or subtree) S-expression.

    Returns:
      String, List(String), Boolean: tag, [S-expression for the node],
        is_leaf flag indicating whether this node is a leaf.
    """
    inner = x[1:-1]
    if '(' in inner:
        # Internal node: first token is the tag, the rest are the
        # child S-expressions.
        pieces = sexpr.sexpr_tokenize(inner)
        tag, data = pieces[0], pieces[1:]
    else:
        # Leaf of the form "tag word"; any tokens after the word are
        # discarded, matching the original behavior.
        fields = inner.split(' ')
        tag, data = fields[0], [fields[1]]
    is_leaf = len(data) == 1 and not (data[0][0] == '(' and data[0][-1] == ')')
    return tag, data, is_leaf
def tokenize(s):
    """Split "(label phrase)" into its label and tokenized phrase.

    The leading ``str.split`` handles the leaf case ('label word'):
    sexpr_tokenize alone can only parse '(foo) (bar)'-style input.
    """
    inner = s[1:-1]
    label, phrase = inner.split(None, 1)
    return label, sexpr.sexpr_tokenize(phrase)
def tokenize(s):
    """Parse "(label/context/pos1/pos2 phrase)" for classification.

    Returns the label plus a tuple of (tokenized phrase, outer context,
    entity-1 position, entity-2 position).
    """
    header, phrase = s[1:-1].split(None, 1)
    # The header packs four '/'-separated fields: the classification
    # label, the outer context, and the two entity positions.
    label, outer_context, ent1_posit, ent2_posit = header.split("/")
    return label, (sexpr.sexpr_tokenize(phrase), outer_context, ent1_posit, ent2_posit)
def tokenize(s):
    """Return (label, child S-expressions) for the bracketed tree *s*."""
    inner = s[1:-1]
    label, remainder = inner.split(None, 1)
    return label, sexpr.sexpr_tokenize(remainder)
def tokenize(self, s):
    """Tokenize *s* with its outer brackets removed.

    A whitespace-only interior yields [''] rather than an empty list.
    """
    inner = s[1:-1].strip()
    return sexpr_tokenize(inner) if inner else ['']