def ini_mss2015_data(root_path, out_path):
    """Prepare MSS2015 working directories from raw per-paper folders.

    For each paper directory under ``root_path`` this creates a matching
    directory under ``out_path`` and derives several ``.temp`` files from
    the lemmatized body: sentence-segmented text, a "stop words removed"
    variant, a bracket-free variant, and section start indices.

    :param root_path: directory containing one sub-directory per paper;
        each must hold ``<name>.txt``, ``lemmatized_body.temp`` and
        ``tokenized_title.temp``.
    :param out_path: output root; must not already exist
        (``os.makedirs`` raises if it does).
    :return: always the empty string.
    """
    #__processing_using_ros()
    #__processing_using_nltk()
    os.makedirs(out_path)
    #__stop_word = read_file("./third_part/dict/stop_list.txt")
    for cur_file in os.listdir(root_path):
        out_dir_name = cur_file
        out_dir_path = os.path.join("%s/%s" % (out_path, out_dir_name))
        os.mkdir(out_dir_path)
        # copy the original paper text into the output directory
        content = read_file(root_path + "/" + cur_file + "/" + cur_file + ".txt")
        write_file(content, out_dir_path + "/" + out_dir_name + ".txt", False)
        # start generate temp file
        tokenized_paper = read_file(root_path + "/" + cur_file + "/lemmatized_body.temp")
        remove_stop = []        # sentences (stop-word filtering currently disabled)
        segmented_paper = []    # sentences with every token kept
        no_bracket_str = []     # sentences without parenthesized spans / '#' markers
        section_set = []        # sentence indices at which a new section starts
        tmp_str = ""
        tmp_removed_str = ""
        tmp_no_bracket_str = ""
        __brackets = False      # True while inside a "(...)" span
        tmp_int = 0             # number of sentences emitted so far
        for word in tokenized_paper:
            if word == "(" or word == u"(":
                __brackets = True
            elif word == ")" or word == u")":
                __brackets = False
            #if word not in __stop_word:
            tmp_removed_str += word + " "
            if __brackets:
                # inside brackets: token goes to the full sentence only,
                # and never terminates a sentence
                tmp_str += word + " "
                continue
            if word != "#":
                # '#' appears to be a section marker — excluded from the
                # bracket-free text (TODO confirm against the data format)
                tmp_no_bracket_str += word + " "
            tmp_str += word + " "
            # sentence boundary: token ending in '.' or an explicit
            # CJK/latin terminator, or the '#' section marker
            if word.endswith(".") or word in [u"。", u"!", u"?", u";", u"#"]:
                if tmp_removed_str != "":
                    segmented_paper.append(tmp_str)
                    remove_stop.append(tmp_removed_str)
                    no_bracket_str.append(tmp_no_bracket_str)
                    tmp_int += 1
                if word == "#":
                    # record the index of the last sentence before the break
                    section_set.append(str(tmp_int - 1))
                tmp_str = ""
                tmp_removed_str = ""
                tmp_no_bracket_str = ""
        # final sentinel: total number of sentences
        section_set.append(str(len(segmented_paper)))
        write_file(remove_stop, out_dir_path + "/RemoveStop.temp", False)
        write_file(segmented_paper, out_dir_path + "/word_segment.temp", False)
        write_file(no_bracket_str, out_dir_path + "/word_remove_bracket.temp", False)
        titles = read_file(root_path + "/" + cur_file + "/tokenized_title.temp")
        write_file([" ".join(titles)], out_dir_path + "/titles.temp", False)
        write_file(section_set, out_dir_path + "/sec_idx.temp", False)
        model_temp(segmented_paper, out_dir_path)
    return ""
def test(self): """ :param root_path, file path of hLDA results, must contains runXXX :return: """ run_path = self.__root_path + "/run000" # path assign mode_assign = read_file(run_path + "/mode.assign") # word level assign mode_levels = read_file(run_path + "/mode.levels") # word list word_list = read_file(self.__root_path + "/words.temp") # mode.temp model_temp = read_file(self.__root_path + "model.temp") # sentences paths self.path_list = dict() self.nodes = dict() self.node_word_freq = dict() for idx in range(len(mode_assign)): line = mode_assign[idx] new_path = " ".join(line.split(" ")[2:]) if new_path not in self.path_list: self.path_list[new_path] = [] if idx in self.__candidata: self.path_list[new_path].append(idx) self.__ori_allocation = np.array( [float(len(self.path_list[i])) for i in self.path_list]) self.__ori_allocation /= (np.sum(self.__ori_allocation)) self.__cur_allocation = np.zeros(self.__ori_allocation.shape).tolist() # print path # for path in sorted(self.path_list.items(), key=lambda x: len(x[1])): # print path[0], "\t: ", str(len(path[1])), path[1] for level in range(3): for path in mode_assign: cur_node = path.split(" ")[2:level + 3] if " ".join(cur_node) not in self.nodes: self.nodes[" ".join(cur_node)] = [] self.node_word_freq[" ".join(cur_node)] = [] for i in range( len(mode_levels[int(path.split(" ")[0])].split(" "))): word = mode_levels[int(path.split(" ")[0])].split(" ")[i] if int(word.split(":")[1]) == level: self.nodes[" ".join(cur_node)].append(word_list[int( word.split(":")[0])]) # self.node_word_freq[" ".join(cur_node)].append(model_temp[int(path.split(" ")[0])].split(" ")[i + 1]) for node in self.nodes: print node, ": ", "\t".join( self.nodes[node]) # , "\t".join(self.node_word_freq[node])
def __processing_using_nltk(original_path, data_backup_path):
    """Tokenize every raw document with NLTK and write the .temp files.

    For each file in ``original_path`` (skipping ``test``), creates
    ``<data_backup_path>/<name>/`` and writes the raw text plus
    tokenized/"lemmatized" variants of the body and the title.
    NOTE(review): no actual lemmatization happens here — the lemmatized
    files are byte-identical copies of the tokenized output.

    :param original_path: directory of raw input documents.
    :param data_backup_path: output root, one sub-directory per document.
    """
    px = ParseXML()
    for cur_file in os.listdir(original_path):
        dir_name = cur_file.split(".")[0]
        if dir_name == "test":
            continue
        out_path = os.path.join("%s/%s/" % (data_backup_path, dir_name))
        # 'and False' deliberately disables the skip-if-already-done check
        # (debug toggle) — every document is always reprocessed
        if os.path.exists(out_path + "/tokenized_body.temp") and \
                os.path.exists(out_path + "/tokenized_title.temp") and False:
            continue
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        #px.parse(os.path.join("%s/%s" % (original_path, cur_file)))
        cur_content = read_file(os.path.join("%s/%s" % (original_path, cur_file)))
        contents = cur_content
        #print contents
        # first line is treated as the title — TODO confirm input format
        titles = cur_content[0]
        print titles
        word_segmented = word_tokenize(" ".join(contents))
        write_file(contents, os.path.join("%s/%s.txt" % (out_path, dir_name)), False)
        write_file(word_segmented, out_path + "/tokenized_paper.temp", False)
        write_file(word_segmented, out_path + "/lemmatized_paper.temp", False)
        write_file(word_tokenize(" ".join(titles)), out_path + "/tokenized_title.temp", False)
        write_file(word_tokenize(" ".join(titles)), out_path + "/lemmatized_title.temp", False)
        write_file(word_tokenize(" ".join(cur_content)), out_path + "/tokenized_body.temp", False)
        write_file(word_tokenize(" ".join(cur_content)), out_path + "/lemmatized_body.temp", False)
def launch_multiling_single_summary(self, dic_path):
    """Run single-document summarization over the MultiLing data set.

    Iterates the per-language directories under ``dic_path`` (only
    ``zh`` and ``en`` are processed), loads the target summary length
    for every document, then summarizes each document via
    ``get_mss_paper_summary``.

    :param dic_path: root directory containing one folder per language.
    :return: the ROUGE working path created by ``ini_rouge_data``.
    """
    # NOTE(review): "-" + "-" produces a double dash in the suffix —
    # looks like a third component was dropped here; confirm intent.
    self.__rouge_path = ini_rouge_data(name_suffix=self.feature_merge + "-" +
                                       "-" + self.summary_method)
    path_dir = os.listdir(dic_path)
    for cur_lang in path_dir:
        if cur_lang not in ["zh", "en"]:
            continue
        lang_dir = os.path.join("%s/%s" % (dic_path, cur_lang))
        self.__all_conf = []
        # get target length of current language
        # each line looks like "<docid>_...,<length>" — key by docid,
        # value is the integer target length (TODO confirm file format)
        self.__target_len = dict()
        for line in read_file(self.__target_len_dir + cur_lang + ".txt"):
            self.__target_len[line.split("_")[0]] = int(line.split(",")[1])
        # get summary of current file(cur_file)
        for cur_file in os.listdir(lang_dir):
            self.max_sum_len__ = self.__target_len[cur_file]
            child_path = os.path.join('%s/%s/%s/' % (dic_path, cur_lang, cur_file))
            self.__child_path = child_path
            log.info(child_path)
            self.get_mss_paper_summary(cur_lang, cur_file)
            # write_file(self.__all_conf, self.__rouge_path + cur_lang + "/configure/.configure_all_" + cur_lang + ".txt", False)
            # if not os.path.exists(self.__rouge_path + cur_lang + "/output"):
            #     os.makedirs(self.__rouge_path + cur_lang + "/output")
    return self.__rouge_path
def get_lda_input_format():
    """Build an LDA input file from the NLPCC-2016 stance data set.

    Reads the tab-separated task file (id, target, text, stance), keeps
    at most 500 texts per target (in file order), and writes the kept
    texts to ``../nlpcc2016.txt``, one per line.
    """
    file_content = read_file(
        "..\\data\\sample_datas\\evasampledata4-TaskAA.txt")
    lda_format = list()
    cur_target = ""
    count = 0
    for cur_line in file_content:
        if cur_line == "":
            continue
        (id_, target, text, stance) = cur_line.split("\t")
        # Reset the per-target counter when the target changes (the two
        # original branches — first line and target switch — performed
        # the identical reset, so they are merged here).
        if cur_target == "" or cur_target != target:
            cur_target = target
            count = 0
        # cap at 500 texts per target
        if count < 500:
            lda_format.append(text)
            count += 1
    write_file(lda_format, "../nlpcc2016.txt")
def __extract_rouge_result(result_path):
    """Dump every ROUGE output file under ``<result_path>output/`` to the log.

    :param result_path: ROUGE working directory (expected to end with a
        path separator — it is concatenated with "output/" directly).
    """
    log.info("extracting rouge result to log.")
    if not os.path.isdir(result_path + "output/"):
        # BUG FIX: previously fell through after logging and crashed in
        # os.listdir below; also fixed "dictionary" -> "directory".
        log.error("result path is not a directory")
        return
    for cur_file in os.listdir(result_path + "output/"):
        cur_rouge_value = read_file(
            os.path.join("%s/%s/%s" % (result_path, 'output', cur_file)))
        log.debug(cur_file + "\n" + "\n".join(cur_rouge_value))
def get_mss_paper_summary(self, file_name, if_write_file=True): """ generate summary for one paper, single document summarization :param lang: :param file_name: current file name, used for write summary answer :param if_write_file: whether write generated summary to answer file named file_name :return: """ # initial self.__quality, self.__paper_name = None, file_name self.quality_method__ = "" if self.stop_word_method == "remove_stop": self.__paper = read_file(self.__child_path + "RemoveStop.temp") elif self.stop_word_method == "with_stop": self.__paper = read_file(self.__child_path + "word_segment.temp") self.__titles = read_file(self.__child_path + "titles.temp") self.__paper_original = read_file(self.__child_path + "word_segment.temp") self.__sub_paper_len = [ int(i) for i in read_file(self.__child_path + "sec_idx.temp") ] # extract sentence feature_subset, eig = self.__cal_candidate_set() #print len(feature_subset) # feature_subset = range(len(self.__paper_original)) # eig = [] log.info(feature_subset) log.debug(eig) # use feature list to extract summary summary = self.__construct_summary(feature_subset, eig) if if_write_file: if file_name == '': log.error("file name is empty") return "" # write answer to file for ROUGE #print self.__rouge_path answer_path = self.__child_path write_file( summary, os.path.join('%s/%s.txt' % (answer_path, file_name + '_result')), False) return "".join(summary)
def __init(self, root_path):
    """Load an hLDA run and build per-path / per-level sentence structures.

    NOTE(review): this is named ``__init`` (a private helper), not the
    constructor ``__init__`` — confirm that is intentional.

    :param root_path: file path of hLDA results, must contain runXXX
    :return: None; populates self.path_list, self.__ori_allocation,
        self.__cur_allocation and self.sen_levels.
    """
    run_path = root_path + "/run000"
    # path assign
    mode_assign = read_file(run_path + "/mode.assign")
    # word level assign
    mode_levels = read_file(run_path + "/mode.levels")
    # word list
    word_list = read_file(root_path + "/words.temp")
    # sentences paths: map "node-id path" -> candidate sentence indices
    self.path_list = dict()
    for idx in range(len(mode_assign)):
        line = mode_assign[idx]
        # tokens [2:] of an assign line form the path through the tree
        new_path = " ".join(line.split(" ")[2:])
        if new_path not in self.path_list:
            self.path_list[new_path] = []
        # only sentences in the candidate set are allocated
        if idx in self.__candidata:
            self.path_list[new_path].append(idx)
    # normalized fraction of candidate sentences per path
    self.__ori_allocation = np.array(
        [float(len(self.path_list[i])) for i in self.path_list])
    self.__ori_allocation /= (np.sum(self.__ori_allocation))
    self.__cur_allocation = np.zeros(self.__ori_allocation.shape).tolist()
    # print path
    # for path in sorted(self.path_list.items(), key=lambda x: len(x[1])):
    #     print path[0], "\t: ", str(len(path[1])), path[1]
    # for path in self.path_list:
    #     if self.path_list[path]
    #     print path, len(self.path_list[path]), self.path_list[path]
    # sentence levels: for each sentence, a [level0, level1, level2] list
    # of word ids assigned to that hLDA depth ("word_id:level" entries)
    self.sen_levels = []
    for i in range(len(mode_levels)):
        self.sen_levels.append([])
        word_level = mode_levels[i].split(" ")
        for j in range(3):
            tmp = []
            for word in word_level:
                w2l = word.split(":")
                if w2l[1] == str(j):
                    tmp.append(w2l[0])
            self.sen_levels[i].append(tmp)
def get_rouge_ans():
    """Rank MSS2017 teams by ROUGE score and write ranking CSVs.

    For each of ROUGE-1..4: per language, orders team-priority entries
    from best to worst score, writes the tab-separated ranking to
    ``../<rouge>_ans.txt``, then pivots the rankings into a CSV with one
    column per team-priority (cell = rank position) written to
    ``../final_<rouge>_ans.csv``.
    """
    memog_res = read_file("../data/2017_results/MSS2017_ROUGE_1.5.7_CI.csv")
    # CSV columns: 0=team, 1=priority, 2=language, 3=rouge-n, 4=value
    ori_res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    ori_team = np.array(
        [line.replace(" ", "").split(",")[0] for line in memog_res])
    ori_priority = np.array(
        [line.replace(" ", "").split(",")[1] for line in memog_res])
    ori_lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    rouge_n = np.array(
        [line.replace(" ", "").split(",")[3] for line in memog_res])
    for cur_rouge in ["ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-4"]:
        ans = []
        tmp_set = set()
        # restrict all columns to the rows of the current ROUGE metric
        rouge_idx = np.where(rouge_n == cur_rouge)[0]
        lang = ori_lang[rouge_idx]
        priority = ori_priority[rouge_idx]
        team = ori_team[rouge_idx]
        res_value = ori_res_value[rouge_idx]
        for cur_lang in set(lang):
            print cur_lang
            idx = np.where(lang == cur_lang)[0]
            cur_value = res_value[idx]
            cur_priority = priority[idx]
            cur_team = team[idx]
            tmp_ans = cur_lang + '\t'
            tmp_len = 0
            # repeatedly take the current maximum (ties together) and
            # mark it consumed by overwriting with -1.
            # NOTE(review): cur_value holds strings, so max() is a
            # lexicographic comparison and -1 becomes "-1" — works for
            # same-width decimal values, but verify the CSV format.
            while tmp_len < len(cur_value):
                idx_min = np.where(cur_value == max(cur_value))[0]
                tmp_len += len(idx_min)
                for idx_1 in idx_min:
                    tmp_ans += cur_team[idx_1] + "-" + cur_priority[
                        idx_1] + '\t'
                cur_value[idx_min] = -1
            ans.append(tmp_ans)
        tmp_set = set([team[i] + "-" + priority[i] for i in range(len(team))])
        write_file(ans, "../" + cur_rouge + "_ans.txt")
        final_ans = []
        final_value = []
        tmp_set = sorted(list(tmp_set))
        final_ans.append("lang," + ",".join(tmp_set))
        # pivot: one row per language, one column per team-priority,
        # cell = position of that entry in the ranking line
        for cur_ans in ans:
            ans_list = cur_ans.split('\t')
            print ans_list
            final_tmp_ans = ["" for i in range(len(tmp_set) + 1)]
            final_tmp_ans[0] = ans_list[0]
            for i in range(len(tmp_set)):
                if tmp_set[i] in ans_list:
                    final_tmp_ans[i + 1] = str(
                        np.where(np.array(ans_list) == tmp_set[i])[0][0])
            final_ans.append(",".join(final_tmp_ans))
        write_file(final_ans, "../final_" + cur_rouge + "_ans.csv")
    print "end"
def __get_doc2vec_matrix(self, path):
    """Build a scaled similarity matrix from a word2vec output file.

    :param path: text file whose lines (after two header lines, dropping
        the last line) are "<key> <v1> <v2> ...".
    :return: ``V . V^T * 1000`` where V is the matrix of parsed vectors;
        side effect: stores the per-line keys in ``self.__key_word``.
    """
    log.info('use word2vec')
    self.quality_method__ = "word2vec"
    self.distance_method__ = "100"
    raw_lines = read_file(path)
    # skip the two header lines and the trailing line
    vec_lines = raw_lines[2:-1]
    # first token of each line is the key word
    self.__key_word = [entry.split(u" ")[0] for entry in vec_lines]
    log.debug("word2vec key words: \n" + "\t".join(self.__key_word))
    rows = []
    for entry in vec_lines:
        # remaining tokens are the vector components
        rows.append((entry.encode("utf-8")).split(" ")[1:])
    vectors = np.array(rows).astype(np.float64)
    return np.dot(vectors, vectors.T) * 1000
def sentence_cluster(mode_path, run_name): assign_path = mode_path + "/" + run_name + "/mode.assign" mode_assign = read_file(assign_path) cluster_set = [ " ".join(cur_line.split(" ")[2:]) for cur_line in mode_assign ] ans = [] for cluster_id in set(cluster_set): idx = np.where(np.array(cluster_set) == cluster_id)[0] sen_cluster = [ int(line.split(" ")[0]) for line in np.array(mode_assign)[idx] ] ans.append(sen_cluster) ans.sort(key=len, reverse=True) print ans return ans
def get_memog_ans():
    """Rank MSS2017 teams by MeMoG score and write ranking files.

    Per language, orders team-priority entries best-to-worst and writes
    the tab-separated ranking to ``../memog_ans_march30.txt``; then
    pivots into a CSV (one column per team-priority, cell = rank
    position) at ``../final_memog_ans_march30.csv``.
    """
    memog_res = read_file("../data/2017_results/MSS2017_MeMoG_CI_March30.csv")
    # CSV columns: 0=team, 1=priority, 2=language, 4=value
    res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    team = np.array(
        [line.replace(" ", "").split(",")[0] for line in memog_res])
    priority = np.array(
        [line.replace(" ", "").split(",")[1] for line in memog_res])
    lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    ans = []
    memog_answer_value = []
    tmp_set = set()
    for cur_lang in set(lang):
        print cur_lang
        idx = np.where(lang == cur_lang)[0]
        cur_value = res_value[idx]
        cur_priority = priority[idx]
        cur_team = team[idx]
        tmp_ans = cur_lang + '\t'
        tmp_len = 0
        # repeatedly take the current maximum (ties together) and mark
        # it consumed by overwriting with -1.
        # NOTE(review): cur_value holds strings, so max() compares
        # lexicographically — verify the CSV value format supports this.
        while tmp_len < len(cur_value):
            idx_max = np.where(cur_value == max(cur_value))[0]
            tmp_len += len(idx_max)
            for idx_1 in idx_max:
                tmp_ans += cur_team[idx_1] + "-" + cur_priority[idx_1] + '\t'
            cur_value[idx_max] = -1
        ans.append(tmp_ans)
    tmp_set = set([team[i] + "-" + priority[i] for i in range(len(team))])
    write_file(ans, "../memog_ans_march30.txt")
    final_ans = []
    tmp_set = sorted(list(tmp_set))
    final_ans.append("lang," + ",".join(tmp_set))
    # pivot: one row per language, cell = rank position of each entry
    for cur_ans in ans:
        ans_list = cur_ans.split('\t')
        print ans_list
        final_tmp_ans = ["" for i in range(len(tmp_set) + 1)]
        final_tmp_ans[0] = ans_list[0]
        for i in range(len(tmp_set)):
            if tmp_set[i] in ans_list:
                final_tmp_ans[i + 1] = str(
                    np.where(np.array(ans_list) == tmp_set[i])[0][0])
        final_ans.append(",".join(final_tmp_ans))
    write_file(final_ans, "../final_memog_ans_march30.csv")
    print "end"
def get_rouge_value():
    """Pivot MSS2017 ROUGE scores into per-metric CSV tables.

    For each of ROUGE-1..4, writes ``../final_<rouge>_value.csv`` with
    one row per language and one column per team-priority; cells are
    score * 100, or "-" when the team has no entry for that language.
    """
    memog_res = read_file("../data/2017_results/MSS2017_ROUGE_1.5.7_CI.csv")
    # CSV columns: 0=team, 1=priority, 2=language, 3=rouge-n, 4=value
    ori_res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    ori_team_priority = np.array([
        line.replace(" ", "").split(",")[0] + "-" +
        line.replace(" ", "").split(",")[1] for line in memog_res
    ])
    ori_lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    rouge_n = np.array(
        [line.replace(" ", "").split(",")[3] for line in memog_res])
    for cur_rouge in ["ROUGE-1", "ROUGE-2", "ROUGE-3", "ROUGE-4"]:
        ans = []
        # get team priority
        tmp_team_priority = []
        rouge_idx = np.where(rouge_n == cur_rouge)[0]
        lang = ori_lang[rouge_idx]
        team_priority = ori_team_priority[rouge_idx]
        res_value = ori_res_value[rouge_idx]
        # use the language with the most entries as the column header set
        for cur_lang in set(lang):
            idx = np.where(lang == cur_lang)[0]
            if len(tmp_team_priority) < len(team_priority[idx]):
                tmp_team_priority = team_priority[idx]
        tmp_team_priority = sorted(tmp_team_priority)
        print tmp_team_priority
        print len(tmp_team_priority)
        ans.append("lang\t" + "\t".join(tmp_team_priority))
        for cur_lang in set(lang):
            print cur_lang
            tmp_ans = cur_lang
            idx = np.where(lang == cur_lang)[0]
            cur_lang_value = res_value[idx]
            print team_priority[idx]
            for cur_team_priority in tmp_team_priority:
                idx_1 = np.where(team_priority[idx] == cur_team_priority)[0]
                if len(idx_1) == 0:
                    # this team did not submit for this language
                    tmp_ans += "\t-"
                    continue
                tmp_ans += "\t" + str(float(cur_lang_value[idx_1[0]]) * 100)
            ans.append(tmp_ans)
        # NOTE(review): header/rows are tab-separated but the file is
        # named .csv — confirm downstream consumers expect tabs.
        write_file(ans, "../final_" + cur_rouge + "_value.csv")
    print "end"
def analyse_rouge_value(file_path, rouge_n):
    """Condense ROUGE evaluation logs into one summary line per log file.

    Scans every ``*.log`` under ``file_path`` for the line ending in
    "configure_all.out" and extracts token 3 of the three lines that
    follow it, together with fields parsed from the log file name.
    Results go to ``./data/log_analysis/ana.log``.

    :param file_path: directory of ROUGE .log files.
    :param rouge_n: currently unused — NOTE(review): confirm whether it
        was meant to filter which scores are extracted.
    """
    log.info("analysing rouge result ...")
    analysed_log = []
    for cur_log_file in os.listdir(file_path):
        if not cur_log_file.endswith(".log"):
            continue
        log_content = read_file(
            os.path.join("%s/%s" % (file_path, cur_log_file)))
        file_name = os.path.basename(cur_log_file)
        # empty tmp_log (no marker line found) still gets appended,
        # producing a blank output line for that file
        tmp_log = ""
        for i in range(len(log_content)):
            if log_content[i].endswith("configure_all.out"):
                tmp_log = "all\t"
                # dot-separated name components from index 3 onwards
                tmp_log += "\t".join(
                    file_name.replace(".log", "").split(".")[3:])
                # token 3 of the next three lines holds the score values
                tmp_log += "\t" + log_content[i + 2].split(" ")[3]
                tmp_log += "\t" + log_content[i + 3].split(" ")[3]
                tmp_log += "\t" + log_content[i + 4].split(" ")[3]
                break
        analysed_log.append(tmp_log)
    write_file(analysed_log, "./data/log_analysis/ana.log", False)
def get_memog_value():
    """Pivot MSS2017 MeMoG scores into a per-language CSV table.

    Writes ``../final_memog_value_march30.csv`` with one row per
    language and one column per team-priority; cells are score * 100,
    or "-" when the team has no entry for that language.
    """
    memog_res = read_file("../data/2017_results/MSS2017_MeMoG_CI_March30.csv")
    # CSV columns: 0=team, 1=priority, 2=language, 4=value
    res_value = np.array(
        [line.replace(" ", "").split(",")[4] for line in memog_res])
    team_priority = np.array([
        line.replace(" ", "").split(",")[0] + "-" +
        line.replace(" ", "").split(",")[1] for line in memog_res
    ])
    lang = np.array(
        [line.replace(" ", "").split(",")[2] for line in memog_res])
    ans = []
    # get team priority
    # use the language with the most entries as the column header set
    tmp_team_priority = []
    for cur_lang in set(lang):
        idx = np.where(lang == cur_lang)[0]
        if len(tmp_team_priority) < len(team_priority[idx]):
            tmp_team_priority = team_priority[idx]
    tmp_team_priority = sorted(tmp_team_priority)
    print tmp_team_priority
    print len(tmp_team_priority)
    ans.append("lang\t" + "\t".join(tmp_team_priority))
    for cur_lang in set(lang):
        print cur_lang
        tmp_ans = cur_lang
        idx = np.where(lang == cur_lang)[0]
        cur_lang_value = res_value[idx]
        print team_priority[idx]
        for cur_team_priority in tmp_team_priority:
            idx_1 = np.where(team_priority[idx] == cur_team_priority)[0]
            if len(idx_1) == 0:
                # this team did not submit for this language
                tmp_ans += "\t-"
                continue
            tmp_ans += "\t" + str(float(cur_lang_value[idx_1[0]]) * 100)
        ans.append(tmp_ans)
    # NOTE(review): rows are tab-separated but the file is named .csv —
    # confirm downstream consumers expect tabs.
    write_file(ans, "../final_memog_value_march30.csv")
    print "end"
def get_hlda_message(path):
    """Load the hLDA mode files from ``path``.

    NOTE(review): both results are read into locals and never used, and
    nothing is returned — this function looks unfinished (TODO: complete
    or remove).

    :param path: hLDA run directory containing ``mode`` and ``mode.assign``.
    """
    mode = read_file(path + "/mode")
    mode_assign = read_file(path + "/mode.assign")
def get_mss_paper_summary(self, lang, file_name, if_write_file=True):
    """ generate summary for one paper, single document summarization
    :param lang: language code of the document (also selects the ROUGE
        output sub-directory)
    :param file_name: current file name, used for write summary answer
    :param if_write_file: whether write generated summary to answer file named file_name
    :return: the generated summary joined into a single string
             ("" if if_write_file is set and file_name is empty)
    """
    # initial
    self.__quality, self.__paper_name = None, file_name
    self.quality_method__ = ""
    # The triple-quoted blocks below are disabled code paths (data
    # merging / temp-file generation and gold-summary generation), kept
    # for reference.
    '''
    if DATA == "mms2015":
        self.__all_file.merge_mms_2015(os.path.dirname(self.__child_path), "chinese")
    elif DATA == "mss2017":
        if lang in ["vi", "ka"]:
            self.__all_file.merge_mss_2017(os.path.dirname(self.__child_path))
        else:
            self.__all_file.merge_mss_2017_ros(os.path.dirname(self.__child_path))
    self.__paper_original = self.__all_file.get_merged_paper()
    if self.stop_word_method == "remove_stop":
        self.__paper = self.__all_file.get_filtered_paper()
    elif self.stop_word_method == "with_stop":
        self.__paper = self.__all_file.get_merged_paper()
    self.__titles = self.__all_file.get_titles()
    # used for generate hLDA input file and calculate level method.
    if (not os.path.exists(self.__child_path + "model.temp")) or False:
        write_file(self.__paper, self.__child_path + "RemoveStop.temp", False)
        write_file(self.__paper_original, self.__child_path + "word_segment.temp", False)
        model_temp(self.__paper, self.__child_path)
        return ""
    '''
    # choose the sentence representation according to the configured
    # stop-word strategy (files produced during data preparation)
    if self.stop_word_method == "remove_stop":
        self.__paper = read_file(self.__child_path + "RemoveStop.temp")
    elif self.stop_word_method == "with_stop":
        self.__paper = read_file(self.__child_path + "word_segment.temp")
    self.__titles = read_file(self.__child_path + "titles.temp")
    # original (unfiltered) sentences are always needed for output
    self.__paper_original = read_file(self.__child_path + "word_segment.temp")
    # sentence indices where each section starts/ends
    self.__sub_paper_len = [
        int(i) for i in read_file(self.__child_path + "sec_idx.temp")
    ]
    # extract sentence
    feature_subset, eig = self.__cal_candidate_set()
    # feature_subset = range(len(self.__paper_original))
    # eig = []
    # NOTE(review): log.error on the success path looks like leftover
    # debugging — consider log.info
    log.error("results is: ")
    log.info(feature_subset)
    log.debug(eig)
    # use feature list to extract summary
    summary = self.__construct_summary(feature_subset, eig, lang)
    if if_write_file:
        if file_name == '':
            log.error("file name is empty")
            return ""
        # write answer to file for ROUGE
        answer_path = self.__rouge_path + lang + "/systems/"
        write_file(summary,
                   os.path.join('%s%s.txt' % (answer_path, file_name)),
                   False)
        '''
        # generate gold summary split by CHAR
        gold_path = self.__rouge_path + lang + "/models/"
        if not os.path.exists(gold_path):
            os.makedirs(gold_path)
        tmp_name = lang + "/" + file_name + "_summary.txt"
        abs_human = read_file('./data/MultiLing2015-MSS/multilingMss2015Eval/summary/' + tmp_name)
        if not os.path.exists(gold_path + file_name + "_summary.txt") and lang != "vi" and lang != 'ka':
            write_file([" ".join(api.tokenize("\n".join(abs_human)))], gold_path + file_name + "_summary.txt", False)
        if lang == "vi":
            write_file(abs_human, gold_path + file_name + "_summary.txt", False)
        # generate configure file of each document for ROUGE
        conf_path = self.__rouge_path + lang + "/configure/"
        if not os.path.exists(conf_path):
            os.makedirs(conf_path)
        tmp_conf_ = answer_path + file_name + ".txt " + gold_path + file_name + "_summary.txt"
        self.__all_conf.append(tmp_conf_)
        write_file([tmp_conf_], os.path.join('%s/%s.txt' % (conf_path, file_name)), False)
        '''
    return "".join(summary)