예제 #1
0
def main():
    """Train the summarization regressor on the BC3 corpus, then print the
    single highest-scoring sentence of one target document.

    Relies on module-level helpers defined elsewhere in this project:
    bc3_eval, process_document, lda_process, phrase_extraction,
    word_score_calculate, important_word, cue_word.
    """
    # Fit the F-measure SVR regressor on the BC3 training mails.
    f_clf = bc3_eval()
    print('over')

    title = 'david_chalmers_how_do_you_explain_consciousness'
    #process_document(title,sen2=1,exist=0)
    # Per-sentence feature extraction for the target document.
    vec, important, title_sim = process_document(title, sen2=1)
    topic_similarity = lda_process(vec)
    phrase_list, vec_phrase_index = phrase_extraction(vec)
    word_list = word_score_calculate(phrase_list)
    vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
    cue_word_score = cue_word(vec)

    # Score every sentence with the trained regressor and keep the argmax.
    best_score = 0
    best_index = 0
    for j in range(len(vec)):
        features = [important[j], title_sim[j], topic_similarity[j],
                    vec_phrase_score[j], cue_word_score[j]]
        # NOTE(review): modern scikit-learn requires a 2-D array here
        # ([features]); the 1-D form matches the rest of this codebase —
        # confirm against the installed sklearn version.
        score = f_clf.predict(features)
        if score >= best_score:
            best_score = score
            best_index = j
    print(vec[best_index].sentence)
예제 #2
0
    # Fragment of a phrase-extraction driver (the enclosing definition lies
    # outside this excerpt): expects fe_phrases / ef_phrases — parsed GIZA++
    # output in both translation directions — already in scope.
    print('phrase table loaded')
    senid = 0
    phrase_result = []  # segment result
    for fe_phrase, ef_phrase in zip(fe_phrases, ef_phrases):
        print(senid)
        # stopline = 1899
        # if senid != stopline:
        #     continue
            # senid = stopline
        # Symmetrize the two directional word alignments for this sentence.
        fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
        alignment = do_alignment(fe_alignment, ef_alignment,
                                 len(ef_phrase[0]), len(fe_phrase[0]))
        # fe_phrase = fe_phrases[id]
        # ef_phrase = ef_phrases[id]
        BP, BP_pos = phrase_extraction(fe_phrase[0], ef_phrase[0], alignment)  # fe_phrase[0] is the e sentence

        # # find each e_start and its count
        # phrase_start_pos_counter = Counter()
        # for idx, ((e_start, e_end), (f_start, f_end)) in enumerate(BP_pos):
        #     phrase_start_pos_counter[e_start] += 1
        # e_start_list = phrase_start_pos_counter.items()
        # accumulated = 0
        # # in order to speed search, for a pos in sentence, build a dict to index the start-index and end-index in BP_pos
        e_start_dict = {}
        # for e_start, count in e_start_list:
        #     e_start_dict[e_start] = (accumulated, accumulated + count)
        #     accumulated += count
        for idx, ((e_start, e_end), (f_start, f_end)) in enumerate(BP_pos):
            # NOTE(review): the default list returned by .get() is never
            # stored back into e_start_dict, so appends to a fresh list are
            # discarded — setdefault(e_start, []) looks intended; confirm.
            e_pos_list = e_start_dict.get(e_start, [])
            e_pos_list.append(idx)
예제 #3
0
    # Fragment of a reordering-statistics pass (enclosing definition and the
    # tail of this statement lie outside this excerpt). Expects f_en, f_de,
    # f_align file handles, `start` timestamp and max_phrase_len in scope.
    # NOTE(review): mode 'wb' combined with codecs.open(..., encoding=...)
    # is a Python-2 idiom; under Python 3 use text mode 'w'.
    f_word = codecs.open('../word_results.txt', 'wb', encoding='utf-8')

    print('getting the phrases and counts')
    for i, line_en in enumerate(f_en):
        # Progress report every 100 lines; the `break` stops after the
        # first report, capping the run at 100 lines — looks like a
        # leftover debugging limit, confirm before a full-corpus run.
        if (i + 1) % 100 == 0:
            print('line no: ', i + 1)
            print('time:', time.time() - start)
            break

        # Read the input data
        line_de = f_de.readline()
        line_align = f_align.readline()

        # Extract the phrases like we did for Assignment 1, but now with additional information
        phrases_str, phrases, data_aligns, de_align_dict, en_align_dict, phrases_begin, phrases_end =\
            phrase_extraction(line_en, line_de, line_align, max_phrase_len)

        # For every phrase pair...
        for pos_de, pos_en in phrases:

            # (excerpt ends mid-statement: the unpacking of left-to-right
            # reordering counters continues past this fragment)
            N_LR_phrase_monotone, \
            N_LR_word_monotone, \
            N_LR_phrase_swap, \
            N_LR_word_swap, \
            N_LR_phrase_discontinuous_l, \
            N_LR_phrase_discontinuous_r, \
            phrase_discont_dist_LR_l, \
            phrase_discont_dist_LR_r, \
            N_LR_word_discontinuous_r, \
            N_LR_word_discontinuous_l, \
            word_discont_dist_LR_r, \
예제 #4
0
def bc3_eval():
    """Train SVR regressors on per-sentence features of the BC3 corpus and
    evaluate one-sentence extractive summaries on the held-out mails.

    The first `train` mails provide training data: each sentence yields a
    5-dim feature vector (importance, title similarity, topic similarity,
    phrase score, cue-word score) and ROUGE recall/precision/F targets from
    PythonROUGE.  Three GridSearchCV(SVR) models are fit; the remaining
    mails are summarized by the top F-measure-predicted sentence and scored
    against the annotator summaries (plus an importance-only baseline).

    Returns:
        The fitted GridSearchCV-over-SVR model for ROUGE F-measure (f_clf).
    """
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_file(corpus)
    mails = parse_anno(annotation, mails)

    sample_vector = []
    target_vector = []
    precision_vector = []
    recall_vector = []
    F_measure_vector = []
    # Running totals of the first three feature columns, used below to
    # normalize each column by its sum over the training set.
    imp = 0
    ti_s = 0
    to_s = 0
    train = 20  # number of mails used for training
    index = 0

    # ---- training pass: per-sentence features and ROUGE targets ----
    for i in mails:
        if index >= train:
            break
        index += 1
        vec, important, title_sim = process_mail(i)
        topic_similarity = lda_process(vec)
        phrase_list, vec_phrase_index = phrase_extraction(vec)
        word_list = word_score_calculate(phrase_list)
        vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
        cue_word_score = cue_word(vec)
        standard_summary = [anno(i)]
        standard_name = [[]]
        # One reference file name per annotator summary of this mail.
        for j in range(len(standard_summary[0])):
            standard_name[0].append('PythonROUGE/'+i.name+'/'+str(j)+'_standard.txt')
        newpath = 'PythonROUGE/'+i.name
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        # Score every sentence individually as a one-sentence summary.
        for j in range(len(vec)):
            produce_name = ['PythonROUGE/'+i.name+'/'+str(j)+'.txt']
            produce_summary = [[vec[j].sentence]]
            sample_vector.append([important[j], title_sim[j], topic_similarity[j],
                                  vec_phrase_score[j], cue_word_score[j]])
            imp += important[j]
            ti_s += title_sim[j]
            to_s += topic_similarity[j]
            recall, precision, F_measure = summary_eval(standard_summary, standard_name,
                                                        produce_summary, produce_name)
            target_vector.append(recall[0]*precision[0]*F_measure[0])
            recall_vector.append(recall[0])
            precision_vector.append(precision[0])
            F_measure_vector.append(F_measure[0])
        shutil.rmtree(newpath)

    # Normalize the first three feature columns by their training totals.
    for i in sample_vector:
        i[0] = i[0]/imp
        i[1] = i[1]/ti_s
        i[2] = i[2]/to_s

    # NOTE(review): these splits are computed but never used — the models
    # below are fit on the full training data.  Kept for parity with the
    # original; `cross_validation` is the pre-0.18 sklearn module name
    # (model_selection today).
    x_recall_train, x_recall_test, y_recall_train, y_recall_test = cross_validation.train_test_split(sample_vector, recall_vector, test_size=0.2, random_state=0)
    x_precision_train, x_precision_test, y_precision_train, y_precision_test = cross_validation.train_test_split(sample_vector, precision_vector, test_size=0.2, random_state=0)
    x_fmeasure_train, x_fmeasure_test, y_fmeasure_train, y_fmeasure_test = cross_validation.train_test_split(sample_vector, F_measure_vector, test_size=0.2, random_state=0)

    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
                         'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    r_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
    p_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
    f_clf = GridSearchCV(SVR(C=1, epsilon=0.2), tuned_parameters, cv=5)
    r_clf.fit(sample_vector, recall_vector)
    p_clf.fit(sample_vector, precision_vector)
    f_clf.fit(sample_vector, F_measure_vector)

    # ---- evaluation pass over the held-out mails ----
    index = 0
    produce_summary = []
    produce_name = []
    standard_summary = []
    standard_name = []
    lex_summary = []
    lex_name = []
    for i in mails:
        if index < train:
            index += 1
            continue
        # Two mails are skipped (presumably problematic for ROUGE/paths).
        if (i.name == 'Re:_StarOffice' or i.name == 'Try_Unsubscribing&ndash;&ndash;You_Can\'t'):
            continue
        vec, important, title_sim = process_mail(i)
        topic_similarity = lda_process(vec)
        phrase_list, vec_phrase_index = phrase_extraction(vec)
        word_list = word_score_calculate(phrase_list)
        vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
        cue_word_score = cue_word(vec)
        standard_summary.append(anno(i))
        tmp_name = []
        for j in range(len(standard_summary[-1])):
            tmp_name.append('PythonROUGE/'+i.name+'/'+str(j)+'_standard.txt')
        standard_name.append(tmp_name)
        newpath = 'PythonROUGE/'+i.name
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        tmp_summary = []
        predict_rouge = []
        for j in range(len(vec)):
            tmp = f_clf.predict([important[j], title_sim[j], topic_similarity[j],
                                 vec_phrase_score[j], cue_word_score[j]])
            predict_rouge.append(tmp)
        # Model pick vs. importance-only (lexrank-style) baseline pick.
        sort_index = numpy.argsort(predict_rouge)[::-1]
        sort_index2 = numpy.argsort(important)[::-1]
        lex_summary.append(vec[sort_index2[0]].sentence)
        tmp_summary.append(vec[sort_index[0]].sentence)
        # NOTE(review): j here is the leftover loop index (len(vec)-1), so
        # the output file name depends on the mail's sentence count — this
        # matches the original but looks accidental; confirm intent.
        produce_name.append('PythonROUGE/'+i.name+'/'+str(j)+'.txt')
        lex_name.append('PythonROUGE/'+i.name+'/'+str(j)+'_lex'+'.txt')
        produce_summary.append(tmp_summary)
        # NOTE(review): lex_summary collects bare strings while
        # produce_summary collects lists — confirm summary_eval accepts both.

    recall, precision, F_measure = summary_eval(standard_summary, standard_name,
                                                produce_summary, produce_name)
    print('recall:')
    print(recall)
    print('precision:')
    print(precision)
    print('F_measure:')
    print(F_measure)
    recall, precision, F_measure = summary_eval(standard_summary, standard_name,
                                                lex_summary, lex_name)
    print('lex_recall:')
    print(recall)
    print('lex_precision:')
    print(precision)
    print('lex_F_measure:')
    print(F_measure)
    return f_clf
예제 #5
0
#!/usr/bin/env python
"""Command-line driver: extract phrase pairs from an e-file / f-file pair
plus a word-alignment file, and print each pair as ``e - f``."""
import argparse
from phrase_extraction import phrase_extraction

if __name__ == '__main__':

    parser = argparse.ArgumentParser(prog="Phrase extraction")
    parser.add_argument('f_file', type=str)
    parser.add_argument('e_file', type=str)
    parser.add_argument('alignment', type=str)
    args = parser.parse_args()

    phrases = phrase_extraction(args.e_file, args.f_file, args.alignment)

    # Fixed: the original used the Python-2 print *statement*
    # (`print a, b, c`), a syntax error under Python 3; this produces the
    # same space-separated output via the print function.
    for phrase in phrases:
        print(phrase[0], ' - ', phrase[1])
예제 #6
0
        # Tail of an alignment-parsing function (its `def` lies outside this
        # excerpt): split each "e-f" pair token of the current sentence into
        # a 2-tuple.  NOTE(review): the indices stay strings here — confirm
        # downstream code expects strings rather than ints.
        alignment.append([ (pairs.split("-")[0], pairs.split("-")[1]) \
                            for pairs in single_align ])

    return alignment


if __name__ == '__main__':
    # Command-line driver: read two directional GIZA++ output files,
    # symmetrize the alignments sentence by sentence, and print the
    # extracted phrase pairs.
    cli = argparse.ArgumentParser()
    cli.add_argument("fe_file", type=str)
    cli.add_argument("ef_file", type=str)
    opts = cli.parse_args()

    fe_sentences = get_giza_file_content(opts.fe_file)
    ef_sentences = get_giza_file_content(opts.ef_file)

    for fe_sen, ef_sen in zip(fe_sentences, ef_sentences):
        # Directional alignments for this sentence pair.
        fe_align, ef_align = get_alignments(fe_sen, ef_sen)

        # Symmetrized alignment over the two sentence lengths.
        sym_alignment = do_alignment(fe_align, ef_align, len(fe_sen[0]),
                                     len(ef_sen[0]))
        print(sym_alignment)

        phrase_pairs = phrase_extraction(fe_sen[0], ef_sen[0], sym_alignment)

        for src_phrase, tgt_phrase in phrase_pairs:
            print(src_phrase, "<=>", tgt_phrase)
        print("\n")
예제 #7
0
    # Fragment of a command-line driver (the enclosing `if __name__ ...`
    # header lies outside this excerpt): symmetrize two GIZA++ alignment
    # files and print the extracted phrase pairs per sentence.
    parser = argparse.ArgumentParser()
    # parser.add_argument("f_file", type=str)
    # parser.add_argument("e_file", type=str)
    # parser.add_argument("align", type=str)
    parser.add_argument("fe_file", type=str)
    parser.add_argument("ef_file", type=str)
    args = parser.parse_args()

    fe_phrases = get_giza_file_content(args.fe_file)
    ef_phrases = get_giza_file_content(args.ef_file)

    for fe_phrase, ef_phrase in zip(fe_phrases, ef_phrases):
        fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
        # e.g. fe_alignment = (1, 2): here 2 is the f-side index, 1 the e-side
        # The original call had source and target confused; the pre-fix form was:
        # alignment = do_alignment(fe_alignment, ef_alignment,
        # len(fe_phrase[0]), len(ef_phrase[0]))
        alignment = do_alignment(fe_alignment, ef_alignment, len(ef_phrase[0]),
                                 len(fe_phrase[0]))
        print(alignment)
        # import pdb; pdb.set_trace()

        BP = phrase_extraction(ef_phrase[0], fe_phrase[0],
                               alignment)  # fe_phrase[0] is the e sentence
        # bug fix (the commented variant below is the earlier form)
        #BP = phrase_extraction(ef_phrase[0],fe_phrase[0], alignment)

        for (pl_phrase, pt_phrase) in BP:
            print(pl_phrase, "<=>", pt_phrase)
        print("\n")