def main():
    """Train the summary-scoring SVR via bc3_eval(), then score every
    sentence of one document and print the highest-scoring sentence.
    """
    f_clf = bc3_eval()
    print('over')
    title = 'david_chalmers_how_do_you_explain_consciousness'
    # Per-sentence features: centrality, title similarity, LDA topic
    # similarity, key-phrase score, cue-word score.
    vec, important, title_sim = process_document(title, sen2=1)
    topic_similarity = lda_process(vec)
    phrase_list, vec_phrase_index = phrase_extraction(vec)
    word_list = word_score_calculate(phrase_list)
    vec_phrase_score = important_word(vec, word_list, phrase_list, vec_phrase_index)
    cue_word_score = cue_word(vec)
    best_score = None  # None (not 0) so all-negative predictions still select a real argmax
    best_index = 0
    for j in range(len(vec)):
        features = [important[j], title_sim[j], topic_similarity[j],
                    vec_phrase_score[j], cue_word_score[j]]
        # predict() requires a 2-D (n_samples, n_features) input; the
        # original passed a bare 1-D list, which modern scikit-learn rejects.
        score = f_clf.predict([features])[0]
        if best_score is None or score >= best_score:
            best_score = score
            best_index = j
    print(vec[best_index].sentence)
# Extract phrase pairs for every aligned sentence pair from the two
# directional GIZA++ outputs.
print('phrase table loaded')
senid = 0
phrase_result = []  # segment result
for fe_phrase, ef_phrase in zip(fe_phrases, ef_phrases):
    # NOTE(review): senid is printed but not visibly incremented in this
    # chunk — confirm it is updated later in the loop body.
    print(senid)
    # Symmetrise the two directional alignments into one alignment set.
    fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase)
    alignment = do_alignment(fe_alignment, ef_alignment,
                             len(ef_phrase[0]), len(fe_phrase[0]))
    # fe_phrase[0] is the e (target-side) sentence.
    BP, BP_pos = phrase_extraction(fe_phrase[0], ef_phrase[0], alignment)
    # Index phrase positions by their e_start so later lookups are fast.
    e_start_dict = {}
    for idx, ((e_start, e_end), (f_start, f_end)) in enumerate(BP_pos):
        # NOTE(review): dict.get() returns a NEW list when e_start is
        # missing, and that list is never stored back into e_start_dict,
        # so these appends are lost.  This almost certainly should be
        # e_start_dict.setdefault(e_start, []).append(idx) — confirm.
        e_pos_list = e_start_dict.get(e_start, [])
        e_pos_list.append(idx)
f_word = codecs.open('../word_results.txt', 'wb', encoding='utf-8') print('getting the phrases and counts') for i, line_en in enumerate(f_en): if (i + 1) % 100 == 0: print('line no: ', i + 1) print('time:', time.time() - start) break # Read the input data line_de = f_de.readline() line_align = f_align.readline() # Extract the phrases like we did for Assignment 1, but now with additional information phrases_str, phrases, data_aligns, de_align_dict, en_align_dict, phrases_begin, phrases_end =\ phrase_extraction(line_en, line_de, line_align, max_phrase_len) # For every phrase pair... for pos_de, pos_en in phrases: N_LR_phrase_monotone, \ N_LR_word_monotone, \ N_LR_phrase_swap, \ N_LR_word_swap, \ N_LR_phrase_discontinuous_l, \ N_LR_phrase_discontinuous_r, \ phrase_discont_dist_LR_l, \ phrase_discont_dist_LR_r, \ N_LR_word_discontinuous_r, \ N_LR_word_discontinuous_l, \ word_discont_dist_LR_r, \
def bc3_eval():
    """Train SVR models that predict per-sentence ROUGE scores on the BC3
    e-mail corpus and evaluate the resulting one-sentence summaries.

    The first `train` mails build the training set: one 5-feature vector
    per sentence (centrality, title similarity, topic similarity,
    key-phrase score, cue-word score) with ROUGE recall / precision /
    F-measure as regression targets.  Three grid-searched SVRs are fitted,
    then the remaining mails are summarised by picking the sentence with
    the highest predicted F-measure; a centrality-only baseline ("lex")
    is evaluated alongside.

    Returns:
        The fitted GridSearchCV(SVR) model for F-measure (used by main()).
    """
    corpus = 'bc3/bc3corpus.1.0/corpus.xml'
    annotation = 'bc3/bc3corpus.1.0/annotation.xml'
    mails = parse_file(corpus)
    mails = parse_anno(annotation,mails)
    sample_vector = []      # per-sentence feature vectors (training set)
    target_vector = []      # recall*precision*F product (not used downstream)
    precision_vector = []
    recall_vector = []
    F_measure_vector = []
    imp = 0                 # running sums used below to normalise features
    ti_s = 0
    to_s = 0
    train = 20              # number of mails used for training
    index =0
    # --- build training samples from the first `train` mails ---
    for i in mails:
        if(index>=train):
            break
        index += 1
        vec,important,title_sim = process_mail(i)
        topic_similarity = lda_process(vec)
        phrase_list,vec_phrase_index = phrase_extraction(vec)
        word_list = word_score_calculate(phrase_list)
        vec_phrase_score = important_word(vec,word_list,phrase_list,vec_phrase_index)
        tmp_produce = []
        standard_summary = [anno(i)]
        standard_name = [[]]
        cue_word_score = cue_word(vec)
        # One reference-summary file name per annotated summary sentence.
        for j in range(len(standard_summary[0])):
            standard_name[0].append('PythonROUGE/'+i.name+'/'+str(j)+'_standard.txt')
        newpath = 'PythonROUGE/'+i.name
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        # Score every sentence individually against the reference summary.
        for j in range(len(vec)):
            produce_name = ['PythonROUGE/'+i.name+'/'+str(j)+'.txt']
            produce_summary = [[vec[j].sentence]]
            sample_vector.append([important[j],title_sim[j],topic_similarity[j],vec_phrase_score[j],cue_word_score[j]])
            imp += important[j]
            ti_s += title_sim[j]
            to_s += topic_similarity[j]
            recall,precision,F_measure = summary_eval(standard_summary,standard_name,produce_summary,produce_name)
            target_vector.append(recall[0]*precision[0]*F_measure[0])
            recall_vector.append(recall[0])
            precision_vector.append(precision[0])
            F_measure_vector.append(F_measure[0])
        shutil.rmtree(newpath)  # clean up the per-mail ROUGE scratch dir
    # Normalise the first three features by their global sums.
    for i in sample_vector:
        i[0] = i[0]/imp
        i[1] = i[1]/ti_s
        i[2] = i[2]/to_s
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn
    # 0.20 — modern versions need sklearn.model_selection.  Also, these
    # splits are computed but never used: the models below are fitted on
    # the full sample set.
    x_recall_train, x_recall_test, y_recall_train, y_recall_test = cross_validation.train_test_split(sample_vector,recall_vector, test_size=0.2, random_state=0)
    x_precision_train, x_precision_test, y_precision_train, y_precision_test = cross_validation.train_test_split(sample_vector,precision_vector, test_size=0.2, random_state=0)
    x_fmeasure_train, x_fmeasure_test, y_fmeasure_train, y_fmeasure_test = cross_validation.train_test_split(sample_vector,F_measure_vector, test_size=0.2, random_state=0)
    # Grid-search SVR hyper-parameters for each ROUGE target.
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-1,1e-2,1e-3, 1e-4,1e-5], 'C': [1, 10, 100, 1000]},
                        {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    r_clf = GridSearchCV(SVR(C=1,epsilon=0.2), tuned_parameters, cv=5)
    p_clf = GridSearchCV(SVR(C=1,epsilon=0.2), tuned_parameters, cv=5)
    f_clf = GridSearchCV(SVR(C=1,epsilon=0.2), tuned_parameters, cv=5)
    r_clf.fit(sample_vector,recall_vector)
    p_clf.fit(sample_vector,precision_vector)
    f_clf.fit(sample_vector,F_measure_vector)
    # --- evaluate on the remaining mails ---
    index = 0
    produce_summary = []
    produce_name = []
    standard_summary = []
    standard_name = []
    lex_summary = []
    lex_name = []
    for i in mails:
        # Skip the mails already used for training.
        if(index<train):
            index += 1
            continue
        # Skip mails whose names break the ROUGE file handling.
        if(i.name == 'Re:_StarOffice' or i.name == 'Try_Unsubscribing––You_Can\'t'):
            continue
        vec,important,title_sim = process_mail(i)
        topic_similarity = lda_process(vec)
        phrase_list,vec_phrase_index = phrase_extraction(vec)
        word_list = word_score_calculate(phrase_list)
        vec_phrase_score = important_word(vec,word_list,phrase_list,vec_phrase_index)
        cue_word_score = cue_word(vec)
        tmp_produce = []
        standard_summary.append(anno(i))
        tmp_name = []
        for j in range(len(standard_summary[-1])):
            tmp_name.append('PythonROUGE/'+i.name+'/'+str(j)+'_standard.txt')
        standard_name.append(tmp_name)
        newpath = 'PythonROUGE/'+i.name
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        maxs = 0
        maxi = 0
        tmp_summary = []
        predict_rouge = []
        # Predict an F-measure score for every sentence.
        for j in range(len(vec)):
            # NOTE(review): predict() is given a 1-D feature list here;
            # modern scikit-learn requires a 2-D (1, n_features) input.
            tmp = f_clf.predict([important[j],title_sim[j],topic_similarity[j],vec_phrase_score[j],cue_word_score[j]])
            predict_rouge.append(tmp)
        sort_index = numpy.argsort(predict_rouge)[::-1]    # best predicted first
        sort_index2 = numpy.argsort(important)[::-1]       # centrality baseline
        lex_summary.append(vec[sort_index2[0]].sentence)
        tmp_summary.append(vec[sort_index[0]].sentence)
        # NOTE(review): j here is the last index of the loop above
        # (len(vec)-1), so the output file name depends on the sentence
        # count — confirm this is intended.
        produce_name.append('PythonROUGE/'+i.name+'/'+str(j)+'.txt')
        lex_name.append('PythonROUGE/'+i.name+'/'+str(j)+'_lex'+'.txt')
        produce_summary.append(tmp_summary)
    recall,precision,F_measure = summary_eval(standard_summary,standard_name,produce_summary,produce_name)
    print('recall:')
    print(recall)
    print('precision:')
    print(precision)
    print('F_measure:')
    print(F_measure)
    # NOTE(review): lex_summary is a list of plain strings while
    # produce_summary is a list of lists — confirm summary_eval accepts both.
    recall,precision,F_measure = summary_eval(standard_summary,standard_name,lex_summary,lex_name)
    print('lex_recall:')
    print(recall)
    print('lex_precision:')
    print(precision)
    print('lex_F_measure:')
    print(F_measure)
    return f_clf
#!/usr/bin/env python
"""Command-line driver: extract phrase pairs from an e-file, f-file and
word-alignment file, and print them one pair per line."""
import argparse

from phrase_extraction import phrase_extraction

if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog="Phrase extraction")
    parser.add_argument('f_file', type=str)
    parser.add_argument('e_file', type=str)
    parser.add_argument('alignment', type=str)
    args = parser.parse_args()
    phrases = phrase_extraction(args.e_file, args.f_file, args.alignment)
    for phrase in phrases:
        # Python 3 print function; the original used the Python 2 print
        # statement, which is a SyntaxError everywhere else this project
        # runs (the rest of the code uses print()).  The default sep=' '
        # reproduces the old comma-separated output exactly.
        print(phrase[0], ' - ', phrase[1])
alignment.append([ (pairs.split("-")[0], pairs.split("-")[1]) \ for pairs in single_align ]) return alignment if __name__ == '__main__': parser = argparse.ArgumentParser() # parser.add_argument("f_file", type=str) # parser.add_argument("e_file", type=str) # parser.add_argument("align", type=str) parser.add_argument("fe_file", type=str) parser.add_argument("ef_file", type=str) args = parser.parse_args() fe_phrases = get_giza_file_content(args.fe_file) ef_phrases = get_giza_file_content(args.ef_file) for fe_phrase, ef_phrase in zip(fe_phrases, ef_phrases): fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase) alignment = do_alignment(fe_alignment, ef_alignment, len(fe_phrase[0]), len(ef_phrase[0])) print(alignment) BP = phrase_extraction(fe_phrase[0], ef_phrase[0], alignment) for (pl_phrase, pt_phrase) in BP: print(pl_phrase, "<=>", pt_phrase) print("\n")
parser = argparse.ArgumentParser() # parser.add_argument("f_file", type=str) # parser.add_argument("e_file", type=str) # parser.add_argument("align", type=str) parser.add_argument("fe_file", type=str) parser.add_argument("ef_file", type=str) args = parser.parse_args() fe_phrases = get_giza_file_content(args.fe_file) ef_phrases = get_giza_file_content(args.ef_file) for fe_phrase, ef_phrase in zip(fe_phrases, ef_phrases): fe_alignment, ef_alignment = get_alignments(fe_phrase, ef_phrase) # 例如fe_alignment=(1, 2) 这里 2是f 1是e # 原来的由问题,source 和traget混淆了 # alignment = do_alignment(fe_alignment, ef_alignment, # len(fe_phrase[0]), len(ef_phrase[0])) alignment = do_alignment(fe_alignment, ef_alignment, len(ef_phrase[0]), len(fe_phrase[0])) print(alignment) # import pdb; pdb.set_trace() BP = phrase_extraction(ef_phrase[0], fe_phrase[0], alignment) # fe_phrase[0] 是 e 句子 # 修改错误 #BP = phrase_extraction(ef_phrase[0],fe_phrase[0], alignment) for (pl_phrase, pt_phrase) in BP: print(pl_phrase, "<=>", pt_phrase) print("\n")