def get_data(file_name):
    """Write the oracle transition sequence of every sentence to a case file.

    Every sentence returned by ``getTree(file_name)`` is replayed through a
    ``Transition`` state.  Before each move the current buffer/stack
    configuration is recorded with ``save_case`` together with the label of
    the move about to be taken: ``'R_'`` or ``'L_'`` followed by the arc
    label, or the shift label.  The cases are written, UTF-8 encoded, to
    ``file_name + '_case'``.

    Args:
        file_name: path of the input file; also the prefix of the output file.
    """
    # The vocabularies returned by read_data() are not used here.  The call is
    # kept in case it has side effects -- TODO confirm, then drop it.
    read_data(file_name)
    sentences = getTree(file_name)  # skip the empty categories

    # Input data recorded per case:
    #   top 2 words at buffer & stack; leftmost and rightmost child of the
    #   top 2 words at stack
    #   wordIndex ==> wordVec, posIndex ==> posVec, arc_labels ==> arcLabels

    # Presumably the index of the artificial root node -- verify against
    # Transition.  It must never be used to subscript ``heads``: a negative
    # index would silently wrap around to the last token of the sentence.
    root = -1

    # ``with`` guarantees the case file is closed even if a helper raises.
    with codecs.open(file_name + '_case', 'w', 'utf-8') as f_case:
        for sentence in sentences:
            words = sentence['words']
            poses = sentence['poses']
            heads = sentence['indexs']
            arc_labels = sentence['arc_labels']
            trans = Transition(sentence)

            while not trans.finish():
                top = trans.stack[-1]
                front = trans.buffer[0]

                if top == root:
                    # With the root on top of the stack the only moves are a
                    # final right arc from the root, or a shift.
                    if heads[front] == root and legal_right_arc(
                            front, heads, trans.buffer):
                        save_case(f_case, words, poses,
                                  'R_' + arc_labels[front],
                                  trans.buffer, trans.stack)
                        trans.right_arc(arc_labels[front])
                        # Diagnostic: report an unexpected buffer size.
                        if len(trans.buffer) != 1:
                            print(len(trans.buffer))
                        break  # finish parsing
                    save_case(f_case, words, poses, 'SHIFT\n',
                              trans.buffer, trans.stack)
                    trans.shift()
                    continue

                # We must confirm that no remaining arc belongs to the buffer
                # head before attaching it with a right arc.
                if heads[front] == top:
                    if legal_right_arc(front, heads, trans.buffer):
                        save_case(f_case, words, poses,
                                  'R_' + arc_labels[front],
                                  trans.buffer, trans.stack)
                        trans.right_arc(arc_labels[front])
                    else:
                        save_case(f_case, words, poses, 'SHIFT\n',
                                  trans.buffer, trans.stack)
                        trans.shift()
                    continue

                # Every left arc is acceptable as soon as it is available.
                if heads[top] == front:
                    save_case(f_case, words, poses,
                              'L_' + arc_labels[top],
                              trans.buffer, trans.stack)
                    trans.left_arc(arc_labels[top])
                    continue

                save_case(f_case, words, poses, 'SHIFT\n',
                          trans.buffer, trans.stack)
                trans.shift()
sentences = getTree(dev_file_name) word_vec = get_word_vec() pos_vec, label_index = get_pos_label_vec() f_result = codecs.open(dev_result_name, 'w', 'utf-8') for i in range(0, len(sentences)): sentence = sentences[i] words = sentence['words'] poses = sentence['poses'] indexs = sentence['indexs'] arc_labels = sentence['arc_labels'] for i in range(len(indexs)): indexs[i] = -2 arc_labels[i] = 'None\n' trans = Transition(sentence) while not trans.finish(): arc_index = 0 # everytime we should find an arc index # get test_vec input_array = [] if len(trans.stack) == 0: trans.shift() continue ''' if len(trans.stack) == 0: wrd_v = word_vec['unk'] pos_v = pos_vec['unk'] together = np.hstack((wrd_v, pos_v)) input_array = np.hstack((input_array, together)) wrd_v = word_vec['unk'] pos_v = pos_vec['unk'] together = np.hstack((wrd_v, pos_v))
def get_data(file_name):
    """Write the oracle transition sequence of every sentence to a case file.

    Every sentence returned by ``getTree(file_name)`` is replayed through a
    ``Transition`` state.  Before each move the current buffer/stack
    configuration is recorded with ``save_case`` together with the label of
    the move about to be taken: ``'R_'`` or ``'L_'`` followed by the arc
    label, or the shift label.  The cases are written, UTF-8 encoded, to
    ``file_name + '_case'``.

    Args:
        file_name: path of the input file; also the prefix of the output file.
    """
    # The vocabularies returned by read_data() are not used here.  The call is
    # kept in case it has side effects -- TODO confirm, then drop it.
    read_data(file_name)
    sentences = getTree(file_name)  # skip the empty categories

    # Input data recorded per case:
    #   top 2 words at buffer & stack; leftmost and rightmost child of the
    #   top 2 words at stack
    #   wordIndex ==> wordVec, posIndex ==> posVec, arc_labels ==> arcLabels

    # Presumably the index of the artificial root node -- verify against
    # Transition.  It must never be used to subscript ``heads``: a negative
    # index would silently wrap around to the last token of the sentence.
    root = -1

    # ``with`` guarantees the case file is closed even if a helper raises.
    with codecs.open(file_name + '_case', 'w', 'utf-8') as f_case:
        for sentence in sentences:
            words = sentence['words']
            poses = sentence['poses']
            heads = sentence['indexs']
            arc_labels = sentence['arc_labels']
            trans = Transition(sentence)

            while not trans.finish():
                top = trans.stack[-1]
                front = trans.buffer[0]

                if top == root:
                    # With the root on top of the stack the only moves are a
                    # final right arc from the root, or a shift.
                    if heads[front] == root and legal_right_arc(
                            front, heads, trans.buffer):
                        save_case(f_case, words, poses,
                                  'R_' + arc_labels[front],
                                  trans.buffer, trans.stack)
                        trans.right_arc(arc_labels[front])
                        # Diagnostic: report an unexpected buffer size.
                        if len(trans.buffer) != 1:
                            print(len(trans.buffer))
                        break  # finish parsing
                    save_case(f_case, words, poses, 'SHIFT\n',
                              trans.buffer, trans.stack)
                    trans.shift()
                    continue

                # We must confirm that no remaining arc belongs to the buffer
                # head before attaching it with a right arc.
                if heads[front] == top:
                    if legal_right_arc(front, heads, trans.buffer):
                        save_case(f_case, words, poses,
                                  'R_' + arc_labels[front],
                                  trans.buffer, trans.stack)
                        trans.right_arc(arc_labels[front])
                    else:
                        save_case(f_case, words, poses, 'SHIFT\n',
                                  trans.buffer, trans.stack)
                        trans.shift()
                    continue

                # Every left arc is acceptable as soon as it is available.
                if heads[top] == front:
                    save_case(f_case, words, poses,
                              'L_' + arc_labels[top],
                              trans.buffer, trans.stack)
                    trans.left_arc(arc_labels[top])
                    continue

                save_case(f_case, words, poses, 'SHIFT\n',
                          trans.buffer, trans.stack)
                trans.shift()