Exemplo n.º 1
0
def get_data(file_name):
	"""Write the oracle transition sequence for *file_name* to ``file_name + '_case'``.

	Every sentence returned by ``getTree(file_name)`` is replayed through a
	``Transition`` state. At each step the action is chosen from the
	sentence's head indices (``sentence['indexs']``) and arc labels
	(``sentence['arc_labels']``), recorded with ``save_case`` together with
	the current buffer and stack, and then applied to the state.

	Args:
		file_name: Path handed to ``read_data`` and ``getTree``. The output
			file name is this path with the suffix ``_case`` appended.

	Returns:
		None. The result is the UTF-8 file written as a side effect.
	"""
	# The values returned by read_data are not used here; the call is kept in
	# case it has side effects -- TODO confirm and drop it if it has none.
	read_data(file_name)
	sentences = getTree(file_name)

	trn_case_name = file_name + '_case'
	# `with` guarantees the case file is closed even if a sentence raises.
	with codecs.open(trn_case_name, 'w', 'utf-8') as f_case:
		for sentence in sentences:
			words = sentence['words']
			poses = sentence['poses']
			indexs = sentence['indexs']
			arc_labels = sentence['arc_labels']
			trans = Transition(sentence)
			while not trans.finish():
				index_i = trans.stack[-1]
				index_j = trans.buffer[0]

				# The head of the buffer front is the stack top: right-arc
				# candidate.
				if indexs[index_j] == index_i:
					# The arc may only be taken once legal_right_arc accepts
					# it; otherwise shift and retry later.
					if legal_right_arc(index_j, indexs, trans.buffer):
						save_case(f_case, words, poses, 'R_' + arc_labels[index_j], trans.buffer, trans.stack)
						trans.right_arc(arc_labels[index_j])
						if index_i == -1:
							# A right arc from the -1 stack entry (presumably
							# the root marker) finishes the sentence.
							# Diagnostic: report an unexpected buffer size.
							if len(trans.buffer) != 1:
								print(len(trans.buffer))
							break
					else:
						save_case(f_case, words, poses, 'SHIFT\n', trans.buffer, trans.stack)
						trans.shift()
					continue

				# Left arc: the head of the stack top is the buffer front.
				# The -1 stack entry is never looked up as a dependent.
				if index_i != -1 and indexs[index_i] == index_j:
					save_case(f_case, words, poses, 'L_' + arc_labels[index_i], trans.buffer, trans.stack)
					trans.left_arc(arc_labels[index_i])
					continue

				# No arc applies between stack top and buffer front.
				save_case(f_case, words, poses, 'SHIFT\n', trans.buffer, trans.stack)
				trans.shift()
Exemplo n.º 2
0
# Parse every sentence of dev_file_name with the Transition system.
# word_vec / pos_vec are lookup tables keyed by token (the disabled code
# below reads word_vec['unk'] and pos_vec['unk']).
# NOTE(review): this excerpt is truncated; the rest of the loop body, the use
# of label_index and any f_result.close() are not visible here.
sentences = getTree(dev_file_name)
word_vec = get_word_vec()
pos_vec, label_index = get_pos_label_vec()
f_result = codecs.open(dev_result_name, 'w', 'utf-8')

for i in range(0, len(sentences)):
	sentence = sentences[i]
	words = sentence['words']
	poses = sentence['poses']
	indexs = sentence['indexs']
	arc_labels = sentence['arc_labels']
	# Overwrite every head index and arc label in place with placeholders
	# (-2 and 'None\n') before the Transition state is built.
	# NOTE(review): this inner loop reuses the outer loop variable `i`; it is
	# harmless because the outer `for` rebinds `i` on each iteration, but it
	# is easy to misread.
	for i in range(len(indexs)):
		indexs[i] = -2
		arc_labels[i] = 'None\n'
	trans = Transition(sentence)
	while not trans.finish():
		arc_index = 0	# reset at the start of every parsing step
		# Feature vector for the current configuration, rebuilt on each step.
		input_array = []
		# With an empty stack the only action taken is a shift.
		if len(trans.stack) == 0:
			trans.shift()
			continue
		# Disabled alternative for the empty-stack case, kept as a bare string
		# literal: it padded input_array with the 'unk' word/POS vectors.
		# NOTE(review): the literal is not closed within this excerpt.
		'''
		if len(trans.stack) == 0:
			wrd_v = word_vec['unk']
			pos_v = pos_vec['unk']
			together = np.hstack((wrd_v, pos_v))
			input_array = np.hstack((input_array, together))
			wrd_v = word_vec['unk']
			pos_v = pos_vec['unk']
			together = np.hstack((wrd_v, pos_v))
Exemplo n.º 3
0
def get_data(file_name):
    """Write the oracle transition sequence for *file_name* to a case file.

    Every sentence returned by ``getTree(file_name)`` is replayed through a
    ``Transition`` state. At each step the action is chosen from the
    sentence's head indices (``sentence['indexs']``) and arc labels
    (``sentence['arc_labels']``), recorded with ``save_case`` together with
    the current buffer and stack, and then applied to the state.

    Args:
        file_name: Path handed to ``read_data`` and ``getTree``. The output
            file name is this path with the suffix ``_case`` appended.

    Returns:
        None. The result is the UTF-8 file written as a side effect.
    """
    # The values returned by read_data are not used here; the call is kept in
    # case it has side effects -- TODO confirm and drop it if it has none.
    read_data(file_name)
    sentences = getTree(file_name)

    trn_case_name = file_name + '_case'
    # `with` guarantees the case file is closed even if a sentence raises.
    with codecs.open(trn_case_name, 'w', 'utf-8') as f_case:
        for sentence in sentences:
            words = sentence['words']
            poses = sentence['poses']
            indexs = sentence['indexs']
            arc_labels = sentence['arc_labels']
            trans = Transition(sentence)
            while not trans.finish():
                index_i = trans.stack[-1]
                index_j = trans.buffer[0]

                # The head of the buffer front is the stack top: right-arc
                # candidate.
                if indexs[index_j] == index_i:
                    # The arc may only be taken once legal_right_arc accepts
                    # it; otherwise shift and retry later.
                    if legal_right_arc(index_j, indexs, trans.buffer):
                        save_case(f_case, words, poses,
                                  'R_' + arc_labels[index_j], trans.buffer,
                                  trans.stack)
                        trans.right_arc(arc_labels[index_j])
                        if index_i == -1:
                            # A right arc from the -1 stack entry (presumably
                            # the root marker) finishes the sentence.
                            # Diagnostic: report an unexpected buffer size.
                            if len(trans.buffer) != 1:
                                print(len(trans.buffer))
                            break
                    else:
                        save_case(f_case, words, poses, 'SHIFT\n',
                                  trans.buffer, trans.stack)
                        trans.shift()
                    continue

                # Left arc: the head of the stack top is the buffer front.
                # The -1 stack entry is never looked up as a dependent.
                if index_i != -1 and indexs[index_i] == index_j:
                    save_case(f_case, words, poses,
                              'L_' + arc_labels[index_i], trans.buffer,
                              trans.stack)
                    trans.left_arc(arc_labels[index_i])
                    continue

                # No arc applies between stack top and buffer front.
                save_case(f_case, words, poses, 'SHIFT\n', trans.buffer,
                          trans.stack)
                trans.shift()