def wiki_preprocess(self, save_path=Dir.res + "/WikiCorpus/wiki.jian.seperate.txt"):
    """Split the raw wiki corpus into sentences, segment each sentence with
    jieba, and append the space-joined tokens to *save_path*.

    Args:
        save_path: output file; segmented sentences are appended (mode "a")
            in batches of 5000 lines to bound memory use.
    """
    # Sentence delimiters: CJK ellipsis run, ?, 。, !, ;, and ASCII "......".
    # Compiled once as a raw string (the original non-raw "\.\.\." escapes
    # trigger invalid-escape warnings) and hoisted out of the line loop.
    sentence_re = re.compile(r"。。。。。。|?|。|!|;|\.\.\.\.\.\.")
    batch = []
    with open(self.train_file, "r") as train_corpus:
        for line in train_corpus:
            for sen in sentence_re.split(line):
                words = jieba.cut(sen.strip())
                batch.append(' '.join(words))
                if len(batch) == 5000:  # len(), not tmp_result.__len__()
                    tools.write_list(save_path, batch, mode="a")
                    batch = []
    # Flush the final partial batch.
    tools.write_list(save_path, batch, mode="a")
def build_w2v_train_data():
    """Merge every sentence file under data/news.sentences/ into the single
    word2vec training file data/all.txt."""
    source_dir = Dir.res + "data/news.sentences/"
    target_path = Dir.res + "data/all.txt"
    paths = []
    tools.get_filelist(source_dir, paths)
    merged = []
    for path in paths:
        merged.extend(tools.read_lines(path))
    tools.write_list(target_path, merged)
def rouge_detail(self, abstract_processed, save_dir):
    """Write per-file ROUGE-1/ROUGE-2 scores to <save_dir>/detials.txt.

    Each output row is "<filename>,<rouge1>,<rouge2>", comparing the
    processed system abstract against the processed reference with the
    same file name.
    """
    rows = []
    for fname in tools.get_files(abstract_processed):
        hyp = [ln.split(" ") for ln in tools.read_lines(abstract_processed + fname)]
        ref = [ln.split(" ") for ln in tools.read_lines(self.ref_processed + fname)]
        score1 = self.rouge_1_simple(ref, hyp)
        score2 = self.rouge_2_simple(ref, hyp)
        rows.append(fname + "," + str(score1) + "," + str(score2))
    tools.write_list(save_dir + "/detials.txt", rows)
def transfer(cleandata_root=Dir.res + "/cleandata_1189/news/", save_path=Dir.res + "/sen_data/1189_corpus.txt"):
    """Tokenize every news file under *cleandata_root*, replace purely
    numeric tokens with the placeholder "num", and write all resulting
    lines to *save_path*."""
    corpus = []
    for fname in os.listdir(cleandata_root):
        filepath = cleandata_root + fname
        for line in ftools.read_lines(filepath):
            tokens = ["num" if tok.isdigit() else tok for tok in tools.seperate(line)]
            corpus.append(' '.join(tokens))
    ftools.write_list(save_path, corpus)
def generate_new_data():
    """Regenerate 3-sentence abstracts for the high-quality 3500 data set
    with Uper, copying each news file into the new tree and writing its
    freshly generated abstract alongside."""
    src_news = Dir.res + "/cleandata_highquality_3500/news/"
    dst_news = Dir.res + "/cleandata_highquality_3500_new/news/"
    dst_abstract = Dir.res + "/cleandata_highquality_3500_new/abstract/"
    summarizer = Uper()
    for fname in ftools.get_files(src_news):
        src_path = src_news + fname
        sentences = ftools.read_lines(src_path)
        # fname[:-4] strips the ".txt" extension for the summarizer's fname arg
        abstract = summarizer.summarize(sentences, num=3, fname=fname[:-4])
        ftools.copy(src_path, dst_news + fname)
        ftools.write_list(dst_abstract + fname, abstract)
def generate_data(file=Dir.res + "/extract_data_process/data_processed_9.9.txt", savePath=Dir.res + "/extract_data_process/data"):
    """Split the extraction dump into per-document news/abstract files.

    Each input row looks like "['id', 'abstract', 'body']" (spaces are
    stripped first); rows are de-duplicated on the abstract, then written
    as <savePath>/news/training_<i>.txt and
    <savePath>/abstract/training_<i>.txt in sorted-abstract order.
    """
    content = tools.read_lines(file)[1:-1]  # drop first and last rows
    data = {}
    # `row` was originally named `file`, shadowing the parameter above.
    for row in content:
        row = row.replace(" ", "")
        parts = row[1:-1].split("', '")
        # keep the first body seen for each distinct abstract
        if parts[1] not in data:
            data[parts[1]] = parts[2]
    for index, key in enumerate(sorted(data.keys())):
        save_content = savePath + "/news/training_" + str(index) + ".txt"
        save_abstract = savePath + "/abstract/training_" + str(index) + ".txt"
        tools.write_list(save_content, seperate_sentences(data[key]))
        tools.write_list(save_abstract, seperate_sentences(key))
def result_process(file_dir, save_dir):
    """Re-segment every result file under *file_dir* with jieba and write
    the space-joined lines to <save_dir>/<name>.txt.

    The whole *save_dir* tree is removed first so output starts clean.
    """
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)
    filenames = []
    tools.get_filelist(file_dir, filenames)
    for file in filenames:
        content = tools.read_lines(file)
        name = tools.get_name(file)
        # ' '.join replaces the original quadratic `string += word + " "`
        # loop (plus trailing-space slice); output is identical.
        result = [' '.join(jieba.cut(line)) for line in content]
        save_path = save_dir + "/" + name + ".txt"
        tools.write_list(save_path, result)
def check_extract(file_dir, save_path):
    """Scan extraction dumps under *file_dir*, collect lines whose abstract
    is judged extractive, and write them to *save_path*; the subset whose
    sentence positions are not front-loaded also goes to *save_path*.txt.
    """
    files = []
    # NOTE(review): the builtin `filter` is passed as the filter argument —
    # presumably a file-name predicate was intended; confirm against
    # tools.get_filelist's contract.
    tools.get_filelist(file_dir, files, filter)
    extract_result = set()
    un_first_result = set()
    analysis_result = {}
    for file in files:
        content = tools.read(file)
        # Strip literal brackets. The original pattern "\[|\]|" had a
        # trailing "|" that also matched the empty string everywhere;
        # removing it leaves the substitution result unchanged.
        content = re.sub(r"\[|\]", "", content)
        for line in content.split("\n"):
            tmp = line.split("', '")
            if len(tmp) != 3:
                continue  # malformed row, skip
            extract = check_if_extract(tmp[1], tmp[2])
            if not extract[0]:
                continue  # not an extractive abstract
            extract_result.add(line)
            if tmp[0] not in analysis_result:
                analysis_result[tmp[0]] = []
            analysis_result[tmp[0]] = extract[1]
            all_value = sum(extract[1][:-2])
            low_ = get_sum(extract[1][-2])
            hight_ = get_sum(extract[1][-1])
            # Sentences drawn from beyond the "low" positions by a margin
            # of 2 count as not-first-sentence extractions.
            if all_value > low_ + 2:
                un_first_result.add(line)
                print(len(extract_result), len(un_first_result))
    tools.write_list(save_path, extract_result)
    tools.write_list(save_path + ".txt", un_first_result)
def workers(args):
    """Pool worker: summarize every document in args["d"] with model
    args["m"], writing each 3-sentence abstract to args["a"]<key>.txt
    and returning the results keyed by document name.

    args keys: "n" worker name, "d" data shard, "m" model, "a" output dir.
    """
    wname = args["n"]
    data = args["d"]
    model = args["m"]
    abstract = args["a"]
    print(wname, "start")
    results = {}
    done = 0
    for key in data:
        started = time.time()
        results[key] = model.summarize(data[key], num=3, fname=key)
        done += 1
        elapsed = time.time() - started  # timing kept; original logged it when debugging
        tools.write_list(abstract + key + ".txt", results[key])
    print(wname, " done")
    return results
def preprocess_file(file, savepath):
    """Run `preprocess` over the raw text of *file* and write the
    resulting lines to *savepath*."""
    raw = tools.read(file)
    tools.write_list(savepath, preprocess(raw))
def evaluator_rouge(self, model, result_dir, num):
    """Run *model* over every document in self.data, save per-document
    abstracts under result_dir, post-process them into numeric form, and
    append the ROUGE evaluation to result_dir/eval_res.txt.

    When self.parall is true the documents are sharded across self.cpu
    pool workers (see `workers`); otherwise they are summarized serially.

    :param model: summarizer exposing .summarize(text, num, fname) and .info
    :param result_dir: directory receiving abstract/ and evaluation output
    :param num: summary sentence count / ROUGE eval parameter
    """
    summarize_result={}
    astart = time.time()
    ### Save the model's summarization results
    abstract = result_dir+"/abstract/"
    keys = sorted(self.data.keys())
    if self.parall:
        p = multiprocessing.Pool(self.cpu)
        # shard size: split the sorted keys roughly evenly across workers
        inter = int(len(keys) / self.cpu) + 1
        args = []
        for i in range(self.cpu):
            tmp = {}
            if i == 0:
                key = keys[:inter]
                for k in key:
                    tmp[k] = self.data[k]
            elif i == self.cpu-1:
                # last worker takes whatever remains of the key list
                key = keys[i*inter:]
                for k in key:
                    tmp[k] = self.data[k]
            else:
                key = keys[i*inter:(i+1)*inter]
                for k in key:
                    tmp[k] = self.data[k]
            # each worker receives: name, its data shard, the model, and
            # the abstract output directory (see `workers`)
            args.append({"n":"work"+str(i), "d":tmp, "m":model, "a" :abstract } )
        rslt = p.map(workers,args)
        ### Post-process the abstract results (numericalize)
        abstract_processed = result_dir + "/abstract_processed/"
        abstract_seperate = result_dir + "/abstract_seperate/"
        RP.result_process(abstract, abstract_seperate)
        print("abstract separate done")
        RP.replace_words_by_num(self.word_index, abstract_seperate, abstract_processed)
        print("abstract replace done")
        self.rouge_detail(abstract_processed, result_dir)
        ### Compute ROUGE
        result = self.rouge.eval(abstract_processed, self.ref_processed,num)
        eval_result = result_dir + "/eval_res.txt"
        print(result)
        tools.write(eval_result, model.info + "\n" + result, mode="a")
        aend = time.time()
        print(aend - astart)
    else:
        count = 0
        for text in keys:
            if text not in summarize_result.keys():
                start = time.time()
                count+=1
                summarize_result[text] = model.summarize(self.data[text], num,fname = text)
                end = time.time()
                tools.print_proccess(count, len(self.data.keys()))
                print(text,count,"/",len(keys),end-start)
                tools.write_list(abstract + text + ".txt", summarize_result[text])
        ### Post-process the abstract results (numericalize)
        abstract_processed = result_dir+"/abstract_processed/"
        abstract_seperate = result_dir + "/abstract_seperate/"
        RP.result_process(abstract,abstract_seperate)
        RP.replace_words_by_num(self.word_index,abstract_seperate,abstract_processed)
        self.rouge_detail(abstract_processed,result_dir)
        ### Compute ROUGE
        # NOTE(review): this serial branch calls rouge.eval WITHOUT `num`,
        # unlike the parallel branch above — confirm the asymmetry is intended.
        result = self.rouge.eval(abstract_processed, self.ref_processed)
        eval_result = result_dir+"/eval_res.txt"
        print(result)
        tools.write(eval_result,model.info+"\n"+result,mode="a")
        aend = time.time()
        print(aend-astart)