def get_features(ptree: nltk.ParentedTree, conn_idxs):
    """Assemble the feature dictionary for a connective inside a parse tree.

    Args:
        ptree: parented constituency tree of the sentence.
        conn_idxs: leaf indices of the connective tokens; only the first and
            the last entry are read here, the full list is forwarded to
            ``get_pos_features``.

    Returns:
        dict mapping feature names to values: the lower-cased connective
        string, labels around the anchor node (the node itself, its parent
        and its siblings), neighbouring-word features, the label path from
        below the root down to the anchor (plain and compressed), and two
        booleans telling whether any subtree of height > 2 is labelled
        ``'VP'`` / ``'T'``.
    """
    tokens = ptree.leaves()
    first_idx, last_idx = conn_idxs[0], conn_idxs[-1]

    # Position spanning the connective leaves, with its final step removed.
    anchor_pos = ptree.treeposition_spanning_leaves(first_idx, last_idx + 1)[:-1]
    anchor = ptree[anchor_pos]
    self_category = anchor.label()
    if anchor_pos:
        parent_category = anchor.parent().label()
    else:
        # An empty position is the root, which has no parent: reuse its label.
        parent_category = self_category

    left_sibling = get_sibling_label(anchor, 'left')
    right_sibling = get_sibling_label(anchor, 'right')

    # Labels of every subtree taller than 2 (nodes directly above a single
    # leaf are excluded).
    phrase_labels = set()
    for node in ptree.subtrees(lambda t: t.height() > 2):
        phrase_labels.add(node.label())

    connective = ' '.join(tokens[first_idx:last_idx + 1]).lower()

    # Each call yields a 4-tuple; the names below mirror how the original
    # code unpacked it (get_pos_features itself is defined elsewhere).
    prev_word, prev_conn, _, prev_pos_conn_pos = get_pos_features(ptree, conn_idxs, connective, -1)
    next_word, _, next_pos, next_pos_conn_pos = get_pos_features(ptree, conn_idxs, connective, 1)
    prev_word = lemmatizer.lemmatize(prev_word)
    next_word = lemmatizer.lemmatize(next_word)

    # Labels along the path to the anchor, starting at the root's child
    # (the root label itself is not part of the chain).
    root_to_anchor = [ptree[anchor_pos[:depth]].label()
                      for depth in range(1, len(anchor_pos) + 1)]
    root_to_anchor_compressed = get_compressed_chain(root_to_anchor)

    return {
        'connective': connective,
        'connectivePOS': self_category,
        'prevWord': prev_word,
        # NOTE(review): 'prevPOSTag' receives the 2nd value of the tuple while
        # 'nextPOSTag' below receives the 3rd — kept exactly as before; confirm
        # against get_pos_features whether this asymmetry is intended.
        'prevPOSTag': prev_conn,
        'prevPOS+cPOS': prev_pos_conn_pos,
        'nextWord': next_word,
        'nextPOSTag': next_pos,
        'cPOS+nextPOS': next_pos_conn_pos,
        'root2LeafCompressed': ','.join(root_to_anchor_compressed),
        'root2Leaf': ','.join(root_to_anchor),
        'left_sibling': left_sibling,
        'right_sibling': right_sibling,
        'parentCategory': parent_category,
        'boolVP': 'VP' in phrase_labels,
        'boolTrace': 'T' in phrase_labels,
    }
def get_features(relation: Relation, ptree: nltk.ParentedTree):
    """Build the feature dictionary for the connective of ``relation``.

    Args:
        relation: object whose ``conn.tokens`` each expose ``surface`` and
            ``local_idx``.
        ptree: parented constituency tree of the sentence holding the
            connective.

    Returns:
        dict with the keys ``'Connective'``, ``'ConnectivePOS'``,
        ``'ConnectivePrev'`` and ``'connectivePosition'``.
    """
    conn_tokens = relation.conn.tokens
    surface_form = ' '.join(tok.surface for tok in conn_tokens)
    conn_idxs = [tok.local_idx for tok in conn_tokens]

    # Label of the node that `lca` reports for the connective leaves.
    anchor_label = ptree[lca(ptree, conn_idxs)].label()

    first_idx = conn_idxs[0]
    if first_idx == 0:
        # Connective opens the sentence: nothing precedes it.
        previous = "NONE"
    else:
        # NOTE(review): the trailing [0] takes the first element of the
        # preceding leaf (its first character if leaves are plain strings) —
        # confirm the leaf type this was written for.
        previous = lemmatizer.lemmatize(ptree.leaves()[first_idx - 1][0])

    return {
        'Connective': surface_form,
        'ConnectivePOS': anchor_label,
        'ConnectivePrev': previous,
        'connectivePosition': get_connective_sentence_position(conn_idxs, ptree),
    }
# NOTE(review): this fragment lost its original line breaks and indentation.
# The statements up to the Corpus3.append call presumably form the body of a
# loop whose header is outside the visible chunk (`index` and `counter` are not
# defined here) — confirm the nesting against the original script.

# Render the current tree three ways: unchanged, after remove_all_subscript,
# and after remove_crl_subscript.
# NOTE(review): this relies on pprint() returning the rendered string; in
# NLTK 3 the string-returning method is pformat() — confirm the NLTK version.
tree=NewForest[index]
tree_str1=tree.pprint(margin=10000)
tree_str2=remove_all_subscript(tree).pprint(margin=10000)
tree_str3=remove_crl_subscript(tree).pprint(margin=10000)
#
# blocking substituting '_' between tag and subtag, into white spaces, !!! ### diff from refined_word_structure_gen
#
#new_tree_str1=' '.join([i[:-2]+' '+c2l(i[-1]) if len(i)>1 and i[-2]=='_' else i for i in tree_str1.split()]) # remove '_' to merge subscript to merge it to the non-terminal
#new_tree_str2=tree_str2
#new_tree_str3=' '.join([i[:-2]+' '+c2l(i[-1]) if len(i)>1 and i[-2]=='_' else i for i in tree_str3.split()]) # remove '_' to merge subscript to merge
# NOTE(review): new_tree_str1 is only assigned in the commented-out lines
# above; unless it is defined elsewhere this raises NameError — confirm whether
# this statement was meant to be disabled as well.
Annotation1.append((new_tree_str1, counter))
# Each corpus entry is the concatenated leaves, one space, then the rendering.
Corpus1.append(''.join(tree.leaves())+' '+tree_str1)
Corpus2.append(''.join(tree.leaves())+' '+tree_str2)
Corpus3.append(''.join(tree.leaves())+' '+tree_str3)
print('\ndone!')
# Default paths; replaced only when more than four argv entries are present
# (sys.argv[1] is not read in this fragment).
p2='../working_data/word_str_annotation1.zpar'
p3='../working_data/word_str_annotation2.zpar'
p4='../working_data/word_str_annotation3.zpar'
if len(sys.argv)>4:
    p2=sys.argv[2]
    p3=sys.argv[3]
    p4=sys.argv[4]
def getHead(syntac_sen):
    """Return the head word of a parsed sentence.

    The bracketed parse in ``syntac_sen.text`` is loaded into a
    ``ParentedTree``. Starting from its first child, the current node is
    replaced step by step until a node of height 2 (a node sitting directly
    above a leaf) is reached; the first leaf under that node is returned.

    Each step applies, in order:
      1. For an ``SBARQ`` node, jump to its first WH-phrase child that has
         more than one child.
      2. Otherwise consult ``head_trace_rule`` for the node label and take
         the first non-empty result of the matching ``LookBy*`` helper,
         falling back to the node's first child.
      3. After a WH-phrase jump, if the leaf/POS string from ``getLeafPOS``
         contains a noun followed by a possessive marker, that noun becomes
         the head.
      4. If the head is one of a few generic nouns ("name", "kind", ...),
         continue the search inside a ``PP`` child of the previous node or
         in that node's right sibling when it is a ``PP``.

    Args:
        syntac_sen: object whose ``text`` attribute holds the bracketed parse.

    Returns:
        The head word (first leaf of the final node).

    Raises:
        KeyError: if a visited node label has no entry in ``head_trace_rule``.
    """
    t = ParentedTree(syntac_sen.text)
    target = t[0]
    while target.height() != 2:
        ### non-trivial rules: no.1
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"] and len(ts) > 1:
                    target = ts
                    flag = 1
                    break

        if not flag:
            rules = head_trace_rule[target.node]
            # Start from "no match" so that an empty rule list or an unknown
            # direction tag cannot leave newTarget unbound or carry over the
            # value chosen in a previous iteration.
            newTarget = ""
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            if newTarget == "":
                # No rule produced a candidate: descend into the first child.
                target = target[0]
            else:
                target = newTarget

        ### non-trivial rules: no.2
        if flag:
            leafPos = getLeafPOS(target)
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m is not None:
                lvs = target.leaves()
                # Debug output kept from the original implementation.
                print(m.groups())
                target = ParentedTree("(" + m.group(1) + " " + lvs[int(m.group(2))] + ")")

        ### non-trivial rules: no.3
        if target.height() == 2 and target.leaves()[0] in ["name", "kind", "type", "genre", "group", "part"]:
            # Debug output kept from the original implementation.
            print(parent)
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            pr = parent.right_sibling()
            # right_sibling() returns None when there is no sibling; the
            # truthiness test also skips an empty sibling node, which the
            # original loop never acted on either.
            if pr and pr.node == "PP":
                target = pr
    return target.leaves()[0]
def getHead(syntac_sen):
    """Return the head word of a parsed sentence.

    The bracketed parse in ``syntac_sen.text`` is loaded into a
    ``ParentedTree``. Starting from its first child, the current node is
    replaced step by step until a node of height 2 (a node sitting directly
    above a leaf) is reached; the first leaf under that node is returned.

    Each step applies, in order:
      1. For an ``SBARQ`` node, jump to its first WH-phrase child that has
         more than one child.
      2. Otherwise consult ``head_trace_rule`` for the node label and take
         the first non-empty result of the matching ``LookBy*`` helper,
         falling back to the node's first child.
      3. After a WH-phrase jump, if the leaf/POS string from ``getLeafPOS``
         contains a noun followed by a possessive marker, that noun becomes
         the head.
      4. If the head is one of a few generic nouns ("name", "kind", ...),
         continue the search inside a ``PP`` child of the previous node or
         in that node's right sibling when it is a ``PP``.

    Args:
        syntac_sen: object whose ``text`` attribute holds the bracketed parse.

    Returns:
        The head word (first leaf of the final node).

    Raises:
        KeyError: if a visited node label has no entry in ``head_trace_rule``.
    """
    t = ParentedTree(syntac_sen.text)
    target = t[0]
    while target.height() != 2:
        ### non-trivial rules: no.1
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"] and len(ts) > 1:
                    target = ts
                    flag = 1
                    break

        if not flag:
            rules = head_trace_rule[target.node]
            # Start from "no match" so that an empty rule list or an unknown
            # direction tag cannot leave newTarget unbound or carry over the
            # value chosen in a previous iteration.
            newTarget = ""
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            if newTarget == "":
                # No rule produced a candidate: descend into the first child.
                target = target[0]
            else:
                target = newTarget

        ### non-trivial rules: no.2
        if flag:
            leafPos = getLeafPOS(target)
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m is not None:
                lvs = target.leaves()
                # Debug output kept from the original implementation.
                print(m.groups())
                target = ParentedTree("(" + m.group(1) + " " + lvs[int(m.group(2))] + ")")

        ### non-trivial rules: no.3
        if target.height() == 2 and target.leaves()[0] in [
                "name", "kind", "type", "genre", "group", "part"
        ]:
            # Debug output kept from the original implementation.
            print(parent)
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            pr = parent.right_sibling()
            # right_sibling() returns None when there is no sibling; the
            # truthiness test also skips an empty sibling node, which the
            # original loop never acted on either.
            if pr and pr.node == "PP":
                target = pr
    return target.leaves()[0]