def get_lda_input_format():
    """Convert the NLPCC2016 Task A stance data (tab-separated: id, target, text, stance)
    into one text per line for LDA input, keeping at most 500 texts per target."""
    file_content = read_file(
        "..\\data\\sample_datas\\evasampledata4-TaskAA.txt")
    lda_format = list()
    cur_target = ""
    count = 0
    for cur_line in file_content:
        if cur_line == "":
            continue
        (id_, target, text, stance) = cur_line.split("\t")
        if cur_target == "":
            cur_target = target
            count = 0
        if cur_target != target:
            # a new target starts; reset the per-target counter
            cur_target = target
            count = 0
        if count < 500:
            lda_format.append(text)
            count += 1
        # if target != "IphoneSE":
        #     continue
        # lda_format.append(" ".join(jieba.cut(text)))
    write_file(lda_format, "../nlpcc2016.txt")
def __processing_using_ros(original_path="../data/origin", data_backup_path="../data/ros_result"):
    """Parse each XML file, tokenize body and titles with api.tokenize and write the temp files."""
    px = ParseXML()
    for cur_file in os.listdir(original_path):
        dir_name = cur_file.split(".")[0]
        if dir_name == "test":
            continue
        out_path = os.path.join("%s/%s/" % (data_backup_path, dir_name))
        # 'and False' keeps this skip check disabled, so every file is reprocessed
        if os.path.exists(out_path + "/tokenized_body.temp") and \
                os.path.exists(out_path + "/tokenized_title.temp") and False:
            continue
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        px.parse(os.path.join("%s/%s" % (original_path, cur_file)))
        # cur_content = read_file(os.path.join("%s/%s" % (lang_path, cur_file)))
        contents = px.all_content
        print contents
        titles = px.title
        word_segmented = api.tokenize(u" ".join(contents))
        write_file(contents, os.path.join("%s/%s.txt" % (out_path, dir_name)), False)
        write_file(word_segmented, out_path + "/tokenized_body.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_body.temp", False)
        word_segmented = api.tokenize(u" ".join(titles))
        write_file(word_segmented, out_path + "/tokenized_title.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_title.temp", False)
def get_rouge_ans():
    """For each ROUGE-n, rank the team-priority runs per language by score (best first)
    and write both the ranked lists and a lang-by-run rank table."""
    memog_res = read_file("../data/2017_results/MSS2017_ROUGE_1.5.7_CI.csv")
    ori_res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    ori_team = np.array(
        [line.replace(" ", "").split(",")[0] for line in memog_res])
    ori_priority = np.array(
        [line.replace(" ", "").split(",")[1] for line in memog_res])
    ori_lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    rouge_n = np.array(
        [line.replace(" ", "").split(",")[3] for line in memog_res])
    for cur_rouge in ["ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-4"]:
        ans = []
        tmp_set = set()
        rouge_idx = np.where(rouge_n == cur_rouge)[0]
        lang = ori_lang[rouge_idx]
        priority = ori_priority[rouge_idx]
        team = ori_team[rouge_idx]
        res_value = ori_res_value[rouge_idx]
        for cur_lang in set(lang):
            print cur_lang
            idx = np.where(lang == cur_lang)[0]
            cur_value = res_value[idx]
            cur_priority = priority[idx]
            cur_team = team[idx]
            tmp_ans = cur_lang + '\t'
            tmp_len = 0
            # repeatedly take the highest remaining score, so runs are listed best first
            while tmp_len < len(cur_value):
                idx_max = np.where(cur_value == max(cur_value))[0]
                tmp_len += len(idx_max)
                for idx_1 in idx_max:
                    tmp_ans += cur_team[idx_1] + "-" + cur_priority[idx_1] + '\t'
                cur_value[idx_max] = -1
            ans.append(tmp_ans)
            tmp_set = set([team[i] + "-" + priority[i] for i in range(len(team))])
        write_file(ans, "../" + cur_rouge + "_ans.txt")
        final_ans = []
        final_value = []
        tmp_set = sorted(list(tmp_set))
        final_ans.append("lang," + ",".join(tmp_set))
        for cur_ans in ans:
            ans_list = cur_ans.split('\t')
            print ans_list
            final_tmp_ans = ["" for i in range(len(tmp_set) + 1)]
            final_tmp_ans[0] = ans_list[0]
            for i in range(len(tmp_set)):
                if tmp_set[i] in ans_list:
                    final_tmp_ans[i + 1] = str(
                        np.where(np.array(ans_list) == tmp_set[i])[0][0])
            final_ans.append(",".join(final_tmp_ans))
        write_file(final_ans, "../final_" + cur_rouge + "_ans.csv")
    print "end"
def get_mss_paper_summary(self, file_name, if_write_file=True):
    """
    generate summary for one paper, single document summarization
    :param file_name: current file name, used for write summary answer
    :param if_write_file: whether write generated summary to answer file named file_name
    :return:
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentence
    feature_subset, eig = self.__cal_candidate_set()
    # print len(feature_subset)
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    log.info(feature_subset)
    log.debug(eig)
    # use feature list to extract summary
    summary = self.__construct_summary(feature_subset, eig)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        # print self.__rouge_path
        answer_path = self.__child_path
        write_file(
            summary,
            os.path.join('%s/%s.txt' % (answer_path, file_name + '_result')),
            False)
    return "".join(summary)
def model_temp(content, out_path):
    """Write `content` in (h)LDA bag-of-words format: one line per sentence,
    "<num unique words> <vocab_index>:<count> ...", plus the vocabulary file."""
    hlda_model = []
    word_list = set(" ".join(content).split(" "))
    if "" in word_list:
        word_list.remove("")
    # fixed vocabulary order; indices refer to line positions in words.temp
    vocab = np.array(list(word_list))
    for sentence in content:
        sen_word = sentence.split(" ")
        union_word = set(sen_word)
        if "" in union_word:
            union_word.remove("")
        sen_model = str(len(union_word))
        for word in union_word:
            idx = np.where(vocab == word)
            sen_model += " " + str(idx[0][0]) + ":" + str(sen_word.count(word))
        hlda_model.append(sen_model)
    write_file(hlda_model, out_path + "/model.temp", False)
    write_file(word_list, out_path + "/words.temp", False)
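# A minimal sketch (not part of the original pipeline) of how the files written by
# model_temp above could be read back; load_hlda_model is a hypothetical helper and
# assumes read_file returns one stripped line per list entry, as the rest of this
# module appears to expect.
def load_hlda_model(out_path):
    vocab = read_file(out_path + "/words.temp")
    sentences = []
    for line in read_file(out_path + "/model.temp"):
        parts = line.split(" ")
        # parts[0] holds the number of distinct words; the rest are "vocab_index:count" pairs
        counts = {}
        for pair in parts[1:]:
            idx, cnt = pair.split(":")
            counts[vocab[int(idx)]] = int(cnt)
        sentences.append(counts)
    return sentences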
def get_memog_ans():
    memog_res = read_file("../data/2017_results/MSS2017_MeMoG_CI_March30.csv")
    res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    team = np.array(
        [line.replace(" ", "").split(",")[0] for line in memog_res])
    priority = np.array(
        [line.replace(" ", "").split(",")[1] for line in memog_res])
    lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    ans = []
    memog_answer_value = []
    tmp_set = set()
    for cur_lang in set(lang):
        print cur_lang
        idx = np.where(lang == cur_lang)[0]
        cur_value = res_value[idx]
        cur_priority = priority[idx]
        cur_team = team[idx]
        tmp_ans = cur_lang + '\t'
        tmp_len = 0
        while tmp_len < len(cur_value):
            idx_max = np.where(cur_value == max(cur_value))[0]
            tmp_len += len(idx_max)
            for idx_1 in idx_max:
                tmp_ans += cur_team[idx_1] + "-" + cur_priority[idx_1] + '\t'
            cur_value[idx_max] = -1
        ans.append(tmp_ans)
        tmp_set = set([team[i] + "-" + priority[i] for i in range(len(team))])
    write_file(ans, "../memog_ans_march30.txt")
    final_ans = []
    tmp_set = sorted(list(tmp_set))
    final_ans.append("lang," + ",".join(tmp_set))
    for cur_ans in ans:
        ans_list = cur_ans.split('\t')
        print ans_list
        final_tmp_ans = ["" for i in range(len(tmp_set) + 1)]
        final_tmp_ans[0] = ans_list[0]
        for i in range(len(tmp_set)):
            if tmp_set[i] in ans_list:
                final_tmp_ans[i + 1] = str(
                    np.where(np.array(ans_list) == tmp_set[i])[0][0])
        final_ans.append(",".join(final_tmp_ans))
    write_file(final_ans, "../final_memog_ans_march30.csv")
    print "end"
def get_rouge_value():
    """For each ROUGE-n, write a per-language table of scores (x100) with one column
    per team-priority run; '-' marks runs missing for that language."""
    memog_res = read_file("../data/2017_results/MSS2017_ROUGE_1.5.7_CI.csv")
    ori_res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    ori_team_priority = np.array([
        line.replace(" ", "").split(",")[0] + "-" +
        line.replace(" ", "").split(",")[1] for line in memog_res
    ])
    ori_lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    rouge_n = np.array(
        [line.replace(" ", "").split(",")[3] for line in memog_res])
    for cur_rouge in ["ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-4"]:
        ans = []
        # get team priority
        tmp_team_priority = []
        rouge_idx = np.where(rouge_n == cur_rouge)[0]
        lang = ori_lang[rouge_idx]
        team_priority = ori_team_priority[rouge_idx]
        res_value = ori_res_value[rouge_idx]
        for cur_lang in set(lang):
            idx = np.where(lang == cur_lang)[0]
            if len(tmp_team_priority) < len(team_priority[idx]):
                tmp_team_priority = team_priority[idx]
        tmp_team_priority = sorted(tmp_team_priority)
        print tmp_team_priority
        print len(tmp_team_priority)
        ans.append("lang\t" + "\t".join(tmp_team_priority))
        for cur_lang in set(lang):
            print cur_lang
            tmp_ans = cur_lang
            idx = np.where(lang == cur_lang)[0]
            cur_lang_value = res_value[idx]
            print team_priority[idx]
            for cur_team_priority in tmp_team_priority:
                idx_1 = np.where(team_priority[idx] == cur_team_priority)[0]
                if len(idx_1) == 0:
                    tmp_ans += "\t-"
                    continue
                tmp_ans += "\t" + str(float(cur_lang_value[idx_1[0]]) * 100)
            ans.append(tmp_ans)
        write_file(ans, "../final_" + cur_rouge + "_value.csv")
    print "end"
def analyse_rouge_value(file_path, rouge_n):
    """Summarise each ROUGE .log file into one tab-separated line: the trailing fields of
    the file name plus the 4th token of the three lines after the 'configure_all.out' marker."""
    log.info("analysing rouge result ...")
    analysed_log = []
    for cur_log_file in os.listdir(file_path):
        if not cur_log_file.endswith(".log"):
            continue
        log_content = read_file(
            os.path.join("%s/%s" % (file_path, cur_log_file)))
        file_name = os.path.basename(cur_log_file)
        tmp_log = ""
        for i in range(len(log_content)):
            if log_content[i].endswith("configure_all.out"):
                tmp_log = "all\t"
                tmp_log += "\t".join(
                    file_name.replace(".log", "").split(".")[3:])
                tmp_log += "\t" + log_content[i + 2].split(" ")[3]
                tmp_log += "\t" + log_content[i + 3].split(" ")[3]
                tmp_log += "\t" + log_content[i + 4].split(" ")[3]
                break
        analysed_log.append(tmp_log)
    write_file(analysed_log, "./data/log_analysis/ana.log", False)
def get_memog_value():
    memog_res = read_file("../data/2017_results/MSS2017_MeMoG_CI_March30.csv")
    res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    team_priority = np.array([
        line.replace(" ", "").split(",")[0] + "-" +
        line.replace(" ", "").split(",")[1] for line in memog_res
    ])
    lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    ans = []
    # get team priority
    tmp_team_priority = []
    for cur_lang in set(lang):
        idx = np.where(lang == cur_lang)[0]
        if len(tmp_team_priority) < len(team_priority[idx]):
            tmp_team_priority = team_priority[idx]
    tmp_team_priority = sorted(tmp_team_priority)
    print tmp_team_priority
    print len(tmp_team_priority)
    ans.append("lang\t" + "\t".join(tmp_team_priority))
    for cur_lang in set(lang):
        print cur_lang
        tmp_ans = cur_lang
        idx = np.where(lang == cur_lang)[0]
        cur_lang_value = res_value[idx]
        print team_priority[idx]
        for cur_team_priority in tmp_team_priority:
            idx_1 = np.where(team_priority[idx] == cur_team_priority)[0]
            if len(idx_1) == 0:
                tmp_ans += "\t-"
                continue
            tmp_ans += "\t" + str(float(cur_lang_value[idx_1[0]]) * 100)
        ans.append(tmp_ans)
    write_file(ans, "../final_memog_value_march30.csv")
    print "end"
def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
    """
    generate summary for one paper, single document summarization
    :param lang:
    :param file_name: current file name, used for write summary answer
    :param if_write_file: whether write generated summary to answer file named file_name
    :return:
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    '''
    if DATA == "mms2015":
        self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
    elif DATA == "mss2017":
        if lang in ["vi", "ka"]:
            self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
        else:
            self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
    self.__paper_original = self.__all_file.get_merged_paper()
    if self.stop_word_method == "remove_stop":
        self.__paper = self.__all_file.get_filtered_paper()
    elif self.stop_word_method == "with_stop":
        self.__paper = self.__all_file.get_merged_paper()
    self.__titles = self.__all_file.get_titles()
    # used for generate hLDA input file and calculate level method.
    if (not os.path.exists(self.__child_path + "model.temp")) or False:
        write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
        write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
        model_temp(self.__paper, self.__child_path)
        return ""
    '''
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentence
    feature_subset, eig = self.__cal_candidate_set()
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    log.error("results are: ")
    log.info(feature_subset)
    log.debug(eig)
    # use feature list to extract summary
    summary = self.__construct_summary(feature_subset, eig, lang)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        answer_path = self.__rouge_path + lang + "/systems/"
        write_file(summary,
                   os.path.join('%s%s.txt' % (answer_path, file_name)), False)
        '''
        # generate gold summary split by CHAR
        gold_path = self.__rouge_path + lang + "/models/"
        if not os.path.exists(gold_path):
            os.makedirs(gold_path)
        tmp_name = lang + "/" + file_name + "_summary.txt"
        abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
        if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
            write_file([" ".join(api.tokenize("\n".join(abs_human)))],
                       gold_path + file_name + "_summary.txt", False)
        if lang == "vi":
            write_file(abs_human, gold_path + file_name + "_summary.txt", False)
        # generate configure file of each document for ROUGE
        conf_path = self.__rouge_path + lang + "/configure/"
        if not os.path.exists(conf_path):
            os.makedirs(conf_path)
        tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
        self.__all_conf.append(tmp_conf_)
        write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
        '''
    return "".join(summary)
def ini_mss2015_data(root_path, out_path):
    """Generate the per-document temp files for the MSS 2015 data (RemoveStop.temp,
    word_segment.temp, word_remove_bracket.temp, titles.temp, sec_idx.temp and the
    hLDA model files), splitting sentences on sentence-final punctuation and '#' markers."""
    # __processing_using_ros()
    # __processing_using_nltk()
    os.makedirs(out_path)
    # __stop_word = read_file("./third_part/dict/stop_list.txt")
    for cur_file in os.listdir(root_path):
        out_dir_name = cur_file
        out_dir_path = os.path.join("%s/%s" % (out_path, out_dir_name))
        os.mkdir(out_dir_path)
        content = read_file(root_path + "/" + cur_file + "/" + cur_file + ".txt")
        write_file(content, out_dir_path + "/" + out_dir_name + ".txt", False)
        # start generate temp file
        tokenized_paper = read_file(root_path + "/" + cur_file + "/lemmatized_body.temp")
        remove_stop = []
        segmented_paper = []
        no_bracket_str = []
        section_set = []
        tmp_str = ""
        tmp_removed_str = ""
        tmp_no_bracket_str = ""
        __brackets = False
        tmp_int = 0
        for word in tokenized_paper:
            if word == "(" or word == u"(":
                __brackets = True
            elif word == ")" or word == u")":
                __brackets = False
            # if word not in __stop_word:
            tmp_removed_str += word + " "
            if __brackets:
                tmp_str += word + " "
                continue
            if word != "#":
                tmp_no_bracket_str += word + " "
                tmp_str += word + " "
            if word.endswith(".") or word in [u"。", u"!", u"?", u";", u"#"]:
                if tmp_removed_str != "":
                    segmented_paper.append(tmp_str)
                    remove_stop.append(tmp_removed_str)
                    no_bracket_str.append(tmp_no_bracket_str)
                    tmp_int += 1
                if word == "#":
                    section_set.append(str(tmp_int - 1))
                tmp_str = ""
                tmp_removed_str = ""
                tmp_no_bracket_str = ""
        section_set.append(str(len(segmented_paper)))
        write_file(remove_stop, out_dir_path + "/RemoveStop.temp", False)
        write_file(segmented_paper, out_dir_path + "/word_segment.temp", False)
        write_file(no_bracket_str, out_dir_path + "/word_remove_bracket.temp", False)
        titles = read_file(root_path + "/" + cur_file + "/tokenized_title.temp")
        write_file([" ".join(titles)], out_dir_path + "/titles.temp", False)
        write_file(section_set, out_dir_path + "/sec_idx.temp", False)
        model_temp(segmented_paper, out_dir_path)
    return ""
def __processing_using_nltk(original_path, data_backup_path):
    px = ParseXML()
    for cur_file in os.listdir(original_path):
        dir_name = cur_file.split(".")[0]
        if dir_name == "test":
            continue
        out_path = os.path.join("%s/%s/" % (data_backup_path, dir_name))
        if os.path.exists(out_path + "/tokenized_body.temp") and \
                os.path.exists(out_path + "/tokenized_title.temp") and False:
            continue
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        # px.parse(os.path.join("%s/%s" % (original_path, cur_file)))
        cur_content = read_file(os.path.join("%s/%s" % (original_path, cur_file)))
        contents = cur_content
        # print contents
        titles = cur_content[0]
        print titles
        word_segmented = word_tokenize(" ".join(contents))
        write_file(contents, os.path.join("%s/%s.txt" % (out_path, dir_name)), False)
        write_file(word_segmented, out_path + "/tokenized_paper.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_paper.temp", False)
        write_file(word_tokenize(" ".join(titles)), out_path + "/tokenized_title.temp", False)
        write_file(word_tokenize(" ".join(titles)), out_path + "/lemmatized_title.temp", False)
        write_file(word_tokenize(" ".join(cur_content)), out_path + "/tokenized_body.temp", False)
        write_file(word_tokenize(" ".join(cur_content)), out_path + "/lemmatized_body.temp", False)