Exemplo n.º 1
0
def get_features(ptree: nltk.ParentedTree, conn_idxs):
    """Build the feature dictionary for one connective candidate.

    Args:
        ptree: Parented constituency tree of the sentence.
        conn_idxs: Leaf indices of the connective tokens. Only the first and
            last entries are used; they delimit the connective span
            (presumably sorted ascending -- verify against callers).

    Returns:
        A dict with the keys ``connective``, ``connectivePOS``, ``prevWord``,
        ``prevPOSTag``, ``prevPOS+cPOS``, ``nextWord``, ``nextPOSTag``,
        ``cPOS+nextPOS``, ``root2LeafCompressed``, ``root2Leaf``,
        ``left_sibling``, ``right_sibling``, ``parentCategory``, ``boolVP``
        and ``boolTrace``.
    """
    leaves = ptree.leaves()
    first_idx, last_idx = conn_idxs[0], conn_idxs[-1]

    # Tree position of the node spanning the connective leaves, with the last
    # path step dropped, i.e. one level above the spanning position.
    lca_loc = ptree.treeposition_spanning_leaves(first_idx, last_idx + 1)[:-1]
    lca_node = ptree[lca_loc]

    self_category = lca_node.label()
    # An empty position means lca_node is the root, which has no parent;
    # fall back to its own label in that case.
    parent_category = lca_node.parent().label() if lca_loc else self_category

    left_sibling = get_sibling_label(lca_node, 'left')
    right_sibling = get_sibling_label(lca_node, 'right')

    # Labels of all subtrees with height > 2 anywhere in the sentence tree.
    labels = {n.label() for n in ptree.subtrees(lambda t: t.height() > 2)}
    bool_vp = 'VP' in labels
    bool_trace = 'T' in labels

    # Lower-cased surface string of the contiguous leaf span first..last.
    c = ' '.join(leaves[first_idx:last_idx + 1]).lower()

    # The second element of each result tuple is not used as a feature.
    prev_word, _prev_conn, prev_pos, prev_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, -1)
    next_word, _next_conn, next_pos, next_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, 1)
    prev_word = lemmatizer.lemmatize(prev_word)
    next_word = lemmatizer.lemmatize(next_word)

    # Labels along the path from just below the root down to lca_node
    # (the root's own label is not included).
    r2l = [ptree[lca_loc[:depth]].label() for depth in range(1, len(lca_loc) + 1)]
    r2lcomp = get_compressed_chain(r2l)

    return {
        'connective': c,
        'connectivePOS': self_category,
        'prevWord': prev_word,
        # Uses the POS value (third tuple element), mirroring 'nextPOSTag'.
        'prevPOSTag': prev_pos,
        'prevPOS+cPOS': prev_pos_conn_pos,
        'nextWord': next_word,
        'nextPOSTag': next_pos,
        'cPOS+nextPOS': next_pos_conn_pos,
        'root2LeafCompressed': ','.join(r2lcomp),
        'root2Leaf': ','.join(r2l),
        'left_sibling': left_sibling,
        'right_sibling': right_sibling,
        'parentCategory': parent_category,
        'boolVP': bool_vp,
        'boolTrace': bool_trace,
    }
Exemplo n.º 2
0
def get_production_rules(ptree: nltk.ParentedTree):
    """Return one production-rule string per subtree of height > 2.

    Each rule has the form ``"PARENT <- CHILD1 CHILD2 ..."``, listing the
    labels of the node's direct children in order. Rules are returned in the
    order ``ptree.subtrees`` yields the nodes.

    Args:
        ptree: Constituency tree to read the rules from.

    Returns:
        List of rule strings; empty if no subtree has height > 2.
    """
    return [
        "{} <- {}".format(
            node.label(),
            # A bare leaf (plain string) sitting directly under such a node
            # has no label(); use the token itself instead of raising.
            ' '.join(
                child if isinstance(child, str) else child.label()
                for child in node
            ),
        )
        for node in ptree.subtrees(lambda t: t.height() > 2)
    ]
Exemplo n.º 3
0
# First pass: copy every tree in Forest and relabel each subtree whose joined
# leaves appear in Vec, then collect the copies in NewForest.
count = 0
print('\n>>1st pass of process the trees')

# Report progress about every 10% of the forest. The step is clamped to 1 so
# that a forest with fewer than 10 trees does not trigger a modulo by zero.
progress_step = max(1, len(Forest) // 10)

for count, tree in enumerate(Forest, 1):
    if count % progress_step == 0:
        # Integer percentage, so 100% is printed as 100 rather than truncated.
        print('progress------->', count * 100 // len(Forest), '% finished')

    # Work on a fresh ParentedTree built from the tree's printed form, so the
    # tree stored in Forest is left untouched.
    # NOTE(review): ParentedTree(<str>), pprint() returning a string and the
    # `.node` attribute look like the NLTK 2 API -- confirm the NLTK version
    # before porting (NLTK 3 uses fromstring/pformat/label/set_label).
    new_tree = ParentedTree(tree.pprint())

    for subtree in new_tree.subtrees():  # update current tree
        string = ''.join(subtree.leaves())

        if string in Vec:  # leaves/string in the record
            # Only the subscript of the old tag is carried over.
            tag, subscript = decompose_tag(subtree.node)

            # Tag set recorded for these leaves, converted to a string.
            tag_vec_str = set2str(Vec[string])

            # Update the node with the new tag.
            subtree.node = tag_vec_str + '_' + subscript

    NewForest.append(new_tree)