def load_data(dataset):
	train_data = [[],[]]
	test_data = [[],[]]
	valid_data = [[],[]]
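	# fixed split: first 1000 sentences -> train, next 500 -> test, remainder -> validation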
	train = 1000
	test = 500
	vectors = readVectors('bc3_vectors.txt')
	corpus = 'bc3/bc3corpus.1.0/corpus.xml'
	annotation = 'bc3/bc3corpus.1.0/annotation.xml'
	mails = parse_bc3(corpus,annotation)
	assignVectors(mails,vectors)
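	# parse_bc3/assignVectors are repo helpers; they build mail objects whose
	# .text sentences carry a .score annotation and a .vector embedding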
	count = 0
	for i in mails:
		for j in i.text:
			# binarize the human annotation score
			if j.score > 0:
				score = 1
			else:
				score = 0
			# feature = sentence vector concatenated with its thread vector
			if count < train:
				train_data[0].append(j.vector + i.vector)
				train_data[1].append(score)
			elif count < train + test:
				test_data[0].append(j.vector + i.vector)
				test_data[1].append(score)
			else:
				valid_data[0].append(j.vector + i.vector)
				valid_data[1].append(score)
			count += 1
	return (process_dataset(train_data,test_data,valid_data),len(train_data[0][0]))
# Example 2: a second load_data variant (four score buckets, divide_data split)
def load_data(datasets):
	train_data = [[],[]]
	test_data = [[],[]]
	valid_data = [[],[]]
	vectors = readVectors('bc3_vector_with_subject')
	corpus = 'bc3/bc3corpus.1.0/corpus.xml'
	annotation = 'bc3/bc3corpus.1.0/annotation.xml'
	mails = parse_bc3(corpus,annotation)
	assignVectors(mails,vectors)
	# split the mails 60/20/20 into train/valid/test (a hedged divide_data sketch follows this function)
	
	trainSet,validSet,testSet = divide_data(len(mails),0.6,0.2,mails)
	def assignBinaryScore(dataset, output):
		# despite the name, this maps annotation scores into four buckets:
		# 0 for score == 0, 1 for [0.32, 0.65), 2 for [0.65, 0.99),
		# and 3 for everything else (scores in (0, 0.32) and >= 0.99)
		for i in dataset:
			for j in i.text:
				if j.score >= 0.32 and j.score < 0.65:
					score = 1
				elif j.score >= 0.65 and j.score < 0.99:
					score = 2
				elif j.score == 0:
					score = 0
				else:
					score = 3
				# j.index is 'message.sentence'; pick the message-level features
				msg = int(j.index.split('.')[0]) - 1
				subject = i.subject_feature[msg]
				thread = i.thread_feature[msg]
				output[0].append(j.vector + subject + thread)
				output[1].append(score)
	assignBinaryScore(trainSet,train_data)
	assignBinaryScore(validSet,valid_data)
	assignBinaryScore(testSet,test_data)
	return (process_dataset(train_data,test_data,valid_data),len(train_data[0][0]),testSet)
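
# divide_data is a repo helper not shown here; the sketch below is only an
# assumption of its behavior (shuffle the mails, then split by the given
# fractions with the remainder as test). _divide_data_sketch is a
# hypothetical name, not part of the original code.
import random

def _divide_data_sketch(n, train_frac, valid_frac, mails):
	shuffled = list(mails)
	random.shuffle(shuffled)
	n_train = int(n * train_frac)
	n_valid = int(n * valid_frac)
	# e.g. divide_data(len(mails), 0.6, 0.2, mails) -> 60/20/20 train/valid/test
	return (shuffled[:n_train],
		shuffled[n_train:n_train + n_valid],
		shuffled[n_train + n_valid:])
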
def calculate_sentence_features(mails):
	token_dict = {}
	stemmer = PorterStemmer()
	tfidf = tfidf_generize(mails,token_dict)
	# reset per-sentence feature lists
	for i in mails:
		for j in i.thread:
			for k in j.sentences:
				k.sentence_features = []
	# min/max trackers for the (currently disabled) min-max normalization below
	max_sum = 0
	min_sum = 100
	max_avg = 0
	min_avg = 100
	max_sub_sim = 0
	min_sub_sim = 100
	max_clue_score = 0
	min_clue_score = 100
	#clue_score_calculation(mails,repeat_words)
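	# speech-act features (presumably setting the per-sentence req/dlv/cmt/prop/meet/ddata attributes used below)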
	speech_act_score(mails)
	for i in mails:
		relative_position(i)
		tmp_max,tmp_min = subject_sim(i)
		if tmp_max > max_sub_sim:
			max_sub_sim = tmp_max
		if tmp_min < min_sub_sim:
			min_sub_sim = tmp_min
		message_number(i)
		t_index = 0
		for j in i.thread:
			tmp_subject = i.subject[t_index]
			subject_tfidf = numpy.array(tfidf.transform([tmp_subject]).todense())[0]
			t_index += 1
			m_rel_pos(j)
			fol_quote(j)
			for k in j.sentences:
				length(k)
				is_question(k)
				k.tfidf = numpy.array(tfidf.transform([k.sentence]).todense())[0]
				tfidf_sum(k)
				'''
				if k.tfidf_sum_score > max_sum:
					max_sum = k.tfidf_sum_score
				elif k.tfidf_sum_score <min_sum:
					min_sum = k.tfidf_sum_score
				'''
				tfidf_avg(k)
				#print tmp_subject
				k.tfidf_subject_similarity_score = cosine_similarity(k.tfidf,subject_tfidf)
				'''
				if k.tfidf_avg_score > max_avg:
					max_avg = k.tfidf_avg_score
				elif k.tfidf_avg_score < min_avg:
					min_avg = k.tfidf_avg_score
				if k.clue_score > max_clue_score:
					max_clue_score = k.clue_score
				elif k.clue_score < min_clue_score:
					min_clue_score = k.clue_score
				'''
	
	vectors = readVectors('bc3_vector_with_subject')
	assignVectors(mails, vectors)
	for i in mails:
		subjectivity_score(i)
		t_index = 0
		for j in i.thread:
			question_similarity(j, paragraph_sim)
			j.vector = i.thread_feature[t_index]
			j.subject_vector = i.subject_feature[t_index]
			t_index += 1
			pre_sim(j, paragraph_sim)
			for k in j.sentences:
				# match each parsed sentence to its precomputed vector by index
				for l in i.text:
					if k.index == l.index:
						k.vector = l.vector
						k.subject_similarity_score = cosine_similarity(k.vector, j.subject_vector)
						k.topic_similarity_score = cosine_similarity(k.vector, j.vector)
						break
	
	clue_score_calculation(mails,repeat_words,paragraph_sim)
	sentiment_score(mails,'bc3_sentiment_vectors.txt')
	for i in mails:
		for j in i.thread:
			for k in j.sentences:
				# min-max normalization of these scores was disabled here;
				# preprocessing.scale is applied to the stacked matrices below instead
				k.unnor_sentence_features = [k.req, k.dlv, k.cmt, k.prop, k.meet, k.ddata,
					k.subjectivity_score, k.tfidf_subject_similarity_score,
					k.clue_score, k.subject_sim_score, k.message_number_score,
					k.fol_quote_score, k.m_rel_pos_score, k.is_question_score,
					k.tfidf_sum_score, k.tfidf_avg_score, k.length_score,
					k.relative_position_score]
				k.para_features = [k.subjectivity_score, k.qa_score, k.question_similarity_score,
					k.seq_sim_score, k.para_clue_score, k.fol_quote_score,
					k.is_question_score, k.tfidf_sum_score, k.tfidf_avg_score,
					k.subject_similarity_score, k.topic_similarity_score]
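	# attribute-name -> type maps, presumably for the commented-out python_weka path in eval_model below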
	unnor_label = {'subj':'real','tfidf_sim':'real','clue':'real','sub_sim':'real','m_num':'real','fol':'real','rel':'real','is_q':'real','tfidf_sum':'real','tfidf_avg':'real','leng':'real','rel_pos':'real','score':'real'}
	para_label = {'sen':'real','subj':'real','qa':'real','qs':'real','seq':'real','clue':'real','fol':'real','is_q':'real','tfidf_sum':'real','tfidf_avg':'real','subj_sim':'real','topic_sim':'real','score':'real'}
	
	# basic feature extraction end

	# accumulators: weightRecall and ROUGE sums for the original features (ori, o_ro),
	# the paragraph features (para, p_ro), and the RNN (rnn_score, rnn_rouge)
	ori = 0.0
	para = 0.0
	o_ro = 0.0
	p_ro = 0.0
	rnn_rouge = 0.0
	rnn_score = 0.0
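	# repeat the experiment over five random 80/20 train/test splits, accumulating scores across runs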
	for p in range(5):
		train,valid,test = divide_data(len(mails),0.8,0,mails)
		if p == 0:
			tmp_rnn = rnn_summ(train,test,"rnn_hidden")
		else:
			tmp_rnn = rnn_summ(train,test)
		tmp_rnn.init_rnn(0.01,0)
		rate = 0.33
		for j in range(6000):
			# decay the learning rate by 10% every 100 iterations
			if j % 100 == 0:
				rate = rate * 0.9
			error = tmp_rnn.rnn_train(rate)
			if j % 100 == 0:
				print error
		if p==0:
			tmp_rnn.close_file()
		unnor_input_x = []
		unnor_test_x = []
		para_input_x = []
		para_test_x = []
		input_x = []
		input_y = []
		for i in train:
			for j in i.thread:
				for k in j.sentences:
					input_x.append(k.sentence_features)  # note: sentence_features is left empty above; input_x/test_x are not used by eval_model
					input_y.append(k.score)
					unnor_input_x.append(k.unnor_sentence_features)
					para_input_x.append(k.para_features)
		test_x = []
		test_y = []
		for i in test:
			for j in i.thread:
				for k in j.sentences:
					test_x.append(k.sentence_features)
					unnor_test_x.append(k.unnor_sentence_features)
					para_test_x.append(k.para_features)
					test_y.append(k.score)
		# standardize features; note the scaler statistics are computed over train+test jointly
		tmp_input = unnor_input_x + unnor_test_x
		tmp_input = preprocessing.scale(tmp_input)
		unnor_input_x = tmp_input[0:len(unnor_input_x)]
		unnor_test_x = tmp_input[len(unnor_input_x):len(tmp_input)]

		tmp_input = para_input_x + para_test_x
		tmp_input = preprocessing.scale(tmp_input)
		para_input_x = tmp_input[0:len(para_input_x)]
		para_test_x = tmp_input[len(para_input_x):len(tmp_input)]
		def eval_model(input_x, input_y, test_x, test, label, write_folder=None):
			# grid-search an SVR over rbf and linear kernels
			tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
						'C': [1, 10, 100, 1000]},
					{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
			grid_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters)
			grid_clf.fit(input_x, input_y)
			print "best params :\t"
			print grid_clf.best_params_
			result = grid_clf.predict(test_x)
			# alternative classifier path, currently disabled:
			#py_weka = python_weka(input_x,input_y,label)
			#py_weka.train()
			#result = py_weka.predict(test_x)
			#py_weka.close()
			score_index = 0
			produce_set = []
			for i in test:
				produce_set.append([])
				score_list = []
				index_list = []
				for j in i.thread:
					for k in j.sentences:
						k.predict_score = result[score_index]
						score_index += 1
						score_list.append(k.predict_score)
						index_list.append(k.index)
				sorted_index_array = sorted_index(score_list)
				sen_length = 0
				# keep the top 30% of sentences (by predicted score) as the produced summary
				for j in range(len(index_list)):
					if sen_length < float(len(index_list)) * 0.3:
						produce_set[-1].append(index_list[sorted_index_array[len(index_list) - j - 1]])
						sen_length += 1
					else:
						break
			score =  weightRecall(test,produce_set,write_folder)
			print score
			rouge_eval = rouge(test,produce_set)
			rouge_score =  rouge_eval.eval()['rouge_l_f_score']
			print rouge_score
			return score,rouge_score
		print "rnn:"
		rnn_tmp_score,rnn_tmp_rouge = tmp_rnn.rnn_test()
		rnn_score += rnn_tmp_score
		rnn_rouge += rnn_tmp_rouge
		print "avg rnn:"
		print rnn_score
		print "avg rnn rouge:"
		print rnn_rouge
		print "\n"
		print "ori:"
		if p == 0:
			ori_score, ori_rouge = eval_model(unnor_input_x, input_y, unnor_test_x, test, unnor_label, "ori_folder")
		else:
			ori_score, ori_rouge = eval_model(unnor_input_x, input_y, unnor_test_x, test, unnor_label)
		ori += ori_score
		o_ro += ori_rouge
		print "avg_ori:"
		print ori
		print "avg_ori_rouge:"
		print o_ro
		print "\n" 

		print "para"
		if p != 4:
			para_score,para_rouge = eval(para_input_x,input_y,para_test_x,test,para_label)
		else:
			para_score,para_rouge = eval(para_input_x,input_y,para_test_x,test,para_label,"para_folder")
		para += para_score
		p_ro += para_rouge
		print "avg_para:"
		print para
		print "avg_para_rouge:"
		print p_ro
		print "\n"