def remove_all_subscript(tree):
    """Return a deep copy of *tree* with every node label reduced to its tag.

    Each subtree label of the copy is passed through ``decompose_tag`` and
    replaced by the first element (the tag) of the returned pair.  The tree
    passed in is not modified, because all relabelling happens on the copy.

    Args:
        tree: a tree object providing ``copy(deep=True)`` and ``subtrees()``,
            whose subtrees expose a writable ``node`` label.

    Returns:
        The relabelled deep copy of *tree*.
    """
    stripped = tree.copy(deep=True)
    for current in stripped.subtrees():
        # decompose_tag yields a (tag, subscript) pair; only the tag is kept.
        bare_tag, _ = decompose_tag(current.node)
        current.node = bare_tag
    return stripped
def remove_crl_subscript(tree):
    """Return a deep copy of *tree* with 'l', 'r' and 'c' subscripts removed.

    Each subtree label of the copy is split with ``decompose_tag``.  When the
    subscript part is one of 'l', 'r' or 'c', the label is replaced by the
    bare tag; any other label is written back unchanged.  The tree passed in
    is not modified.

    Args:
        tree: a tree object providing ``copy(deep=True)`` and ``subtrees()``,
            whose subtrees expose a writable ``node`` label.

    Returns:
        The relabelled deep copy of *tree*.
    """
    relabelled = tree.copy(deep=True)
    for current in relabelled.subtrees():
        bare_tag, suffix = decompose_tag(current.node)
        # Only the c/r/l subscripts are dropped; other labels stay as they are.
        current.node = bare_tag if suffix in {'l', 'r', 'c'} else current.node
    return relabelled
def remove_crl_subscript(tree):
    """Return a deep copy of *tree* with 'l', 'r' and 'c' subscripts removed.

    Every subtree label of the copy is split with ``decompose_tag`` into a
    (tag, subscript) pair.  Labels whose subscript is 'l', 'r' or 'c' are
    replaced by the bare tag; all other labels are written back unchanged.
    The tree passed in is not modified.

    Args:
        tree: a tree object providing ``copy(deep=True)`` and ``subtrees()``,
            whose subtrees expose a writable ``node`` label.

    Returns:
        The relabelled deep copy of *tree*.
    """
    # History: 'u' was added to this set on Oct 4 and reverted on Oct 5.
    strippable = {'l', 'r', 'c'}
    result = tree.copy(deep=True)
    for current in result.subtrees():
        label = current.node
        bare_tag, suffix = decompose_tag(label)
        current.node = bare_tag if suffix in strippable else label
    return result
print('\n\nprocessing annotation from ', path_annotation, '... \nprograss:')
# Read all annotation lines up front.  The context manager closes the file
# even if reading fails.  Plain 'r' replaces 'rU': the 'U' flag is rejected
# by Python 3.11+, and codecs.open opens the underlying file in binary mode
# when an encoding is given, so the flag had no effect on the decoded lines.
with codecs.open(path_annotation, 'r', 'utf-8') as f:
    lines = f.readlines()

Production = []
count = 0
# Progress is reported every `total_nth` lines (one tenth of the input).
# With fewer than 10 lines the step is 0; the report is then skipped
# instead of raising ZeroDivisionError in the modulo below.
total_nth = len(lines) // 10
for line in lines:
    if total_nth and count % total_nth == 0:
        print(count / total_nth * 10, '% finished')
    count += 1
    tree_str = line.strip()
    tree = Tree(tree_str)
    tag, subscript = decompose_tag(tree.node)
    word = ''.join(tree.leaves())
    # Index the stripped tree string by (concatenated leaves, root tag).
    word_pos2tree_str[(word, tag)] = tree_str
print('done!')

#
# gen single-char annotation from the corpus
#
print('\n\ngenerating rules for single-char words from corpus')
#---> one needs to run 2a_gen_tag_set_for_word_type.py to gen word2newtag.pickle before using it
path_word2newtag = '../working_data/word2newtag.pickle'
# Command line: [annotation_file [rule_file]].
if len(sys.argv) > 1:
    # Fixed: this used to assign to the misspelled name `path_to_annotaiton`,
    # so the first command-line argument was silently ignored.
    path_to_annotation = sys.argv[1]
if len(sys.argv) > 2:
    path_to_rule = sys.argv[2]
else:
    # Default: 'rules.zpar' in the directory of the (resolved) annotation
    # file.  os.path is used instead of manual '/' splitting, which produced
    # a leading '//' and only worked with '/' as the path separator.
    path_to_rule = os.path.join(
        os.path.dirname(os.path.realpath(path_to_annotation)), 'rules.zpar')


def remove_p(d_str):
    """Return a single space for '(' or ')'; return *d_str* unchanged otherwise."""
    return ' ' if d_str in {'(', ')'} else d_str


l_set = {'l', 'c'}
r_set = {'r'}
rules = {}  # dictionary to keep rules

print('\ncollecting non-termianls...')
# The context manager closes the file after reading.  Plain 'r' replaces
# 'rU': the 'U' flag is rejected by Python 3.11+ and had no effect here,
# since codecs.open reads the underlying file in binary mode.
with codecs.open(path_to_annotation, 'r', 'utf-8') as annotation_file:
    lines = annotation_file.readlines()

# Parentheses are turned into spaces so each line splits into bare tokens.
# A token is kept when it is longer than two characters and ends in
# '_' followed by one ASCII letter.
non_terminals = {
    d_string
    for line in lines
    for d_string in ''.join(remove_p(char) for char in line).split()
    if len(d_string) > 2
    and d_string[-2] == '_'
    and d_string[-1] in string.ascii_letters
}

print('\nconstructing rules...')
for full_tag in non_terminals:
    main_tag, subscript = decompose_tag(full_tag)
    # only dealing with l/r/c tag, not b/i tag
    if subscript in {'l', 'c', 'r'}:
        # Subscripts in l_set ('l', 'c') map to 'l'; the remaining 'r' maps to 'r'.
        rules[full_tag] = 'l' if subscript in l_set else 'r'

print('\nwriting rules to file', path_to_rule)
with codecs.open(path_to_rule, 'w', 'utf-8') as f:
    # Sorted so the rule file is identical from run to run; the previous
    # order depended on set/dict iteration order.
    for tag in sorted(rules):
        f.write(tag + ' :' + rules[tag] + '\n')
if count%int(len(Forest)/10)==0: print('progress------->',str(count/len(Forest)*100)[:2], '% finished') new_tree=ParentedTree(tree.pprint()) for subtree in new_tree.subtrees(): #update current tree string=''.join(subtree.leaves()) if string in Vec: #leaves/string in the record tag, subscript= decompose_tag(subtree.node) tag_vec_str=set2str(Vec[string]) #get the tag-set of the node according to the leaves and convert it to str subtree.node=tag_vec_str+'_'+subscript #update the node with the new_tag NewForest.append(new_tree) for subtree in new_tree.subtrees(lambda x: len(x)>1 and ''.join(x.leaves()) in Vec ): # extraction known production rules string=''.join(subtree.leaves()) left_child=subtree[0]