def _merge_subscript(tree_str):
    """Return *tree_str* with the subscript separator removed from each token.

    The string is split on whitespace. Any token longer than one character
    whose second-to-last character is '_' has that underscore dropped
    (e.g. ``(N_b`` becomes ``(Nb``); every other token is kept unchanged.
    Tokens are re-joined with single spaces.
    """
    merged = []
    for token in tree_str.split():
        if len(token) > 1 and token[-2] == '_':
            token = token[:-2] + token[-1]
        merged.append(token)
    return ' '.join(merged)


# Three parallel corpora, one bracketed tree string per word in each.
Corpus1, Corpus2, Corpus3 = [], [], []

print('\nGenerating three vesions of word-structure annotation...')

# NOTE(review): `count` is initialised here but never advanced in this loop.
count = 0

for word in Word2treeID:
    index = Word2treeID[word]
    tree = NewForest[index]

    # Render the tree three ways on a single (very wide) line:
    #   1. as stored,
    #   2. after remove_all_subscript(),
    #   3. after remove_crl_subscript().
    # The calls are kept in this order in case the helpers touch `tree`.
    tree_str1 = tree.pprint(margin=10000)
    tree_str2 = remove_all_subscript(tree).pprint(margin=10000)
    tree_str3 = remove_crl_subscript(tree).pprint(margin=10000)

    # Remove the '_' between tag and subtag, as the Stanford parser will
    # remove the subtag when seeing '_'; merging attaches the subscript
    # directly to the non-terminal.
    new_tree_str1 = _merge_subscript(tree_str1)
    # tree_str2 has already had all of its subscripts removed.
    new_tree_str2 = tree_str2
    new_tree_str3 = _merge_subscript(tree_str3)

    # NOTE(review): `Annotation1` and `counter` are not bound anywhere in
    # this section; presumably defined earlier in the file - confirm
    # (`counter` may have been meant to be `count`).
    Annotation1.append((new_tree_str1, counter))

    Corpus1.append(new_tree_str1)
    Corpus2.append(new_tree_str2)
    Corpus3.append(new_tree_str3)

print('\ndone!')
# Per-sentence processing: report progress, then build three parallel lists
# of bracketed tree strings (one entry per word of `sent`) and append each
# list to the matching corpus.
#
# NOTE(review): `sent`, `lines` and `count` are read here but not bound in
# this section; presumably these statements form the body of a loop over
# `lines` whose header is not visible - confirm the nesting when applying.

# Report progress about every tenth of the input. The step is clamped to at
# least 1 so that inputs with fewer than 10 lines do not cause a modulo by
# zero.
if count % max(1, len(lines) // 10) == 0:
    # Integer arithmetic avoids float rounding (29 of 100 would otherwise
    # be reported as 28).
    print(count * 100 // len(lines), '% finished...')
count += 1

mini_tree_seq1 = []
mini_tree_seq2 = []
mini_tree_seq3 = []

for word in sent:
    index = Word2treeID[word]
    tree = NewForest[index]

    # Same three renderings per word: the tree as stored, after
    # remove_all_subscript(), and after remove_crl_subscript().
    mini_tree_seq1.append(tree.pprint(margin=10000))
    mini_tree_seq2.append(remove_all_subscript(tree).pprint(margin=10000))
    mini_tree_seq3.append(remove_crl_subscript(tree).pprint(margin=10000))

    # Disabled encoder-based variants, kept for reference:
    #mini_tree_seq1.append(keep_all_subscript(tree, Encoder1).pprint(margin=10000))
    #mini_tree_seq2.append(remove_all_subscript(tree,Encoder2).pprint(margin=10000))
    #mini_tree_seq3.append(remove_crl_subscript(tree, Encoder3).pprint(margin=10000))

Corpus1.append(mini_tree_seq1)
Corpus2.append(mini_tree_seq2)
Corpus3.append(mini_tree_seq3)

# NOTE(review): presumably meant to run once, after the enclosing loop has
# finished - confirm placement.
print('\ndone!')