Exemplo n.º 1
0
def get_features(ptree: nltk.ParentedTree, conn_idxs):
    """Build the feature dictionary for one connective candidate.

    Args:
        ptree: Parse tree of the sentence that contains the connective.
        conn_idxs: Leaf indices of the connective tokens; only the first and
            last entries are used, as the bounds of the leaf span.

    Returns:
        A dict of lexical, part-of-speech and syntactic-path features
        describing the connective and its immediate context.
    """
    leave_list = ptree.leaves()
    # Tree position of the lowest node spanning the connective leaves, with
    # the last step of that position dropped.
    lca_loc = ptree.treeposition_spanning_leaves(conn_idxs[0], conn_idxs[-1] + 1)[:-1]
    conn_node = ptree[lca_loc]

    self_category = conn_node.label()
    # An empty position means the node is the root, which has no parent; its
    # own label is used instead.
    parent_category = conn_node.parent().label() if lca_loc else self_category

    left_sibling = get_sibling_label(conn_node, 'left')
    right_sibling = get_sibling_label(conn_node, 'right')

    # Labels of every subtree of height > 2 anywhere in the sentence.
    labels = {n.label() for n in ptree.subtrees(lambda t: t.height() > 2)}
    bool_vp = 'VP' in labels
    bool_trace = 'T' in labels

    c = ' '.join(leave_list[conn_idxs[0]:conn_idxs[-1] + 1]).lower()
    prev, prev_conn, prev_pos, prev_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, -1)
    next_word, next_conn, next_pos, next_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, 1)
    prev = lemmatizer.lemmatize(prev)
    next_word = lemmatizer.lemmatize(next_word)

    # Labels along the path from the first child below the root down to the
    # connective node (the root's own label is not included).
    r2l = [ptree[lca_loc[:depth]].label() for depth in range(1, len(lca_loc) + 1)]
    r2lcomp = get_compressed_chain(r2l)

    # 'prevPOSTag' previously received prev_conn, unlike 'nextPOSTag' which
    # receives next_pos; it now mirrors the "next" side.
    # NOTE(review): assumes get_pos_features returns its values in the order
    # the unpacking names suggest - confirm against its definition.
    feat = {'connective': c, 'connectivePOS': self_category,
            'prevWord': prev, 'prevPOSTag': prev_pos, 'prevPOS+cPOS': prev_pos_conn_pos,
            'nextWord': next_word, 'nextPOSTag': next_pos, 'cPOS+nextPOS': next_pos_conn_pos,
            'root2LeafCompressed': ','.join(r2lcomp), 'root2Leaf': ','.join(r2l),
            'left_sibling': left_sibling, 'right_sibling': right_sibling,
            'parentCategory': parent_category, 'boolVP': bool_vp, 'boolTrace': bool_trace}

    return feat
Exemplo n.º 2
0
def get_features(relation: Relation, ptree: nltk.ParentedTree):
    """Collect connective features for a single relation.

    Args:
        relation: Relation whose ``conn.tokens`` carry ``surface`` and
            ``local_idx`` attributes.
        ptree: Parse tree of the sentence that holds the connective.

    Returns:
        A dict with the connective surface string, the label of the node
        returned by ``lca`` for the connective, the lemmatised item taken
        from the preceding leaf (``"NONE"`` when the connective starts at
        leaf 0) and the value of ``get_connective_sentence_position``.
    """
    connective = relation.conn
    surface = ' '.join(tok.surface for tok in connective.tokens)
    positions = [tok.local_idx for tok in connective.tokens]

    pos_label = ptree[lca(ptree, positions)].label()

    first = positions[0]
    if first != 0:
        # NOTE(review): the trailing [0] takes the first element of the
        # preceding leaf - its first character if leaves are plain strings.
        # Confirm what the leaves hold here.
        previous = lemmatizer.lemmatize(ptree.leaves()[first - 1][0])
    else:
        previous = "NONE"

    relative_position = get_connective_sentence_position(positions, ptree)

    return {
        'Connective': surface,
        'ConnectivePOS': pos_label,
        'ConnectivePrev': previous,
        'connectivePosition': relative_position,
    }
  # Body of a loop whose header is outside this excerpt; `index` selects the
  # current tree from NewForest.
  tree=NewForest[index]
  
  # Three renderings of the same tree: unchanged, after remove_all_subscript,
  # and after remove_crl_subscript.
  # NOTE(review): the very wide margin presumably keeps each tree on a single
  # line, and the code relies on pprint() returning a string - confirm for the
  # NLTK version in use.
  tree_str1=tree.pprint(margin=10000)
  tree_str2=remove_all_subscript(tree).pprint(margin=10000)
  tree_str3=remove_crl_subscript(tree).pprint(margin=10000)

#
# The substitution of the '_' between tag and subtag with white space is
# disabled here (this differs from refined_word_structure_gen).
#

  #new_tree_str1=' '.join([i[:-2]+' '+c2l(i[-1]) if len(i)>1 and i[-2]=='_' else i for i in tree_str1.split()]) # remove '_' to merge the subscript into the non-terminal
  #new_tree_str2=tree_str2
  #new_tree_str3=' '.join([i[:-2]+' '+c2l(i[-1]) if len(i)>1 and i[-2]=='_' else i for i in tree_str3.split()]) # remove '_' to merge the subscript into the non-terminal
  #Annotation1.append((new_tree_str1, counter))

  
  # Each corpus entry is the tree's leaves joined without a separator, two
  # spaces, then the matching tree string.
  Corpus1.append(''.join(tree.leaves())+'  '+tree_str1)
  Corpus2.append(''.join(tree.leaves())+'  '+tree_str2)
  Corpus3.append(''.join(tree.leaves())+'  '+tree_str3)

print('\ndone!')

# Default paths of the three annotation files.
# NOTE(review): presumably where Corpus1..3 are written later - confirm.
p2, p3, p4 = (
    '../working_data/word_str_annotation1.zpar',
    '../working_data/word_str_annotation2.zpar',
    '../working_data/word_str_annotation3.zpar',
)

# With at least four command-line arguments, arguments 2-4 replace the
# defaults (argument 1 is not read here).
if len(sys.argv) > 4:
    p2, p3, p4 = sys.argv[2:5]
Exemplo n.º 4
0
def getHead(syntac_sen):
    """Return the head word of a parsed sentence.

    ``syntac_sen.text`` is parsed into a ``ParentedTree`` and the tree is
    walked downwards from the first child of the root, choosing one child per
    step, until a node of height 2 is reached. The first leaf of that node is
    returned.

    Three special cases are layered on top of the ``head_trace_rule`` table:

    1. Under ``SBARQ`` the first WH-phrase with more than one child is taken
       directly.
    2. After rule 1 fires, a noun followed by a ``POS`` tag in the output of
       ``getLeafPOS`` makes that noun the target.
    3. If the chosen word is one of "name", "kind", "type", "genre", "group"
       or "part", the search moves on to a ``PP`` among the parent's children
       or to the parent's right sibling when that is a ``PP``.

    Raises:
        KeyError: if a node label has no entry in ``head_trace_rule``.
    """
    t = ParentedTree(syntac_sen.text)
    target = t[0]

    while target.height() != 2:
        # --- non-trivial rule no. 1 ---
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"] and len(ts) > 1:
                    target = ts
                    flag = 1
                    break

        if not flag:
            rules = head_trace_rule[target.node]
            # Reset on every pass: without this an empty or unmatched rule
            # list would raise NameError or reuse the previous pass's result.
            newTarget = ""
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            # No rule produced a child: fall back to the leftmost child.
            if newTarget == "":
                target = target[0]
            else:
                target = newTarget

        # --- non-trivial rule no. 2 ---
        if flag:
            leafPos = getLeafPOS(target)
            # NOTE(review): assumes getLeafPOS yields "TAG_index" tokens
            # separated by spaces - confirm against its definition.
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m is not None:
                lvs = target.leaves()
                print(m.groups())
                target = ParentedTree("(" + m.group(1) + " " + lvs[int(m.group(2))] + ")")

        # --- non-trivial rule no. 3 ---
        if target.height() == 2 and target.leaves()[0] in ["name", "kind", "type", "genre", "group", "part"]:
            print(parent)
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            # right_sibling() returns None when parent is the last child (or
            # has no parent); the old loop over it raised TypeError then.
            pr = parent.right_sibling()
            if pr is not None and len(pr) > 0 and pr.node == "PP":
                target = pr

    return target.leaves()[0]
Exemplo n.º 5
0
def getHead(syntac_sen):
    """Find the head word of the sentence parsed from ``syntac_sen.text``.

    Starting at the first child of the root, one child is selected per
    iteration until a height-2 node is reached; that node's first leaf is the
    result. Child selection normally follows the ``head_trace_rule`` entry
    for the current label, with three overrides:

    * rule 1 - below ``SBARQ``, the first WH-phrase (``WHNP``, ``WHPP``,
      ``WHADJP``, ``WHADVP``) having more than one child is chosen;
    * rule 2 - once rule 1 has fired, an ``NN``/``NNS`` immediately followed
      by ``POS`` in the ``getLeafPOS`` string becomes the target;
    * rule 3 - when the selected word is "name", "kind", "type", "genre",
      "group" or "part", a ``PP`` child of the parent, or the parent's right
      sibling if it is a ``PP``, is searched instead.

    Raises:
        KeyError: if ``head_trace_rule`` has no entry for a node label.
    """
    t = ParentedTree(syntac_sen.text)

    target = t[0]

    while target.height() != 2:
        ### non-trivial rules: no.1
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"
                               ] and len(ts) > 1:
                    target = ts
                    flag = 1
                    break

        if not flag:
            rules = head_trace_rule[target.node]
            # Start each iteration from "no match" so that an empty rule list
            # or an unknown rule kind cannot leave newTarget unbound or stale.
            newTarget = ""
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            if newTarget == "":
                # Nothing matched: descend into the leftmost child.
                target = target[0]
            else:
                target = newTarget

        ### non-trivial rules: no.2
        if flag:
            leafPos = getLeafPOS(target)
            # NOTE(review): the pattern presumes "TAG_index" tokens in the
            # getLeafPOS output - verify against that helper.
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m is not None:
                lvs = target.leaves()
                print(m.groups())
                target = ParentedTree("(" + m.group(1) + " " +
                                      lvs[int(m.group(2))] + ")")

        ### non-trivial rules: no.3
        if target.height() == 2 and target.leaves()[0] in [
                "name", "kind", "type", "genre", "group", "part"
        ]:
            print(parent)
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            pr = parent.right_sibling()
            # The former loop over pr tested pr.node on every pass and failed
            # with TypeError when there was no right sibling (None).
            if pr is not None and len(pr) > 0 and pr.node == "PP":
                target = pr

    return target.leaves()[0]