class File_processing5():
    """
    Splice new and old articles together.
    Each generated article keeps roughly a 2:1 ratio of new paragraphs to old paragraphs.
    """

    def __init__(self, path):
        self.util = Util()
        self.document = docx
        # Input path and domain name (parsed from the folder name).
        self.read_path = path
        self.domain_name = "_".join(path.split("/")[-1].split("_")[:3])
        self.start_keyword = 0  # index of the first keyword to use
        self.end_keyword = int(path.split("/")[-1].split("_")[-1])  # index of the last keyword, taken from the folder name
        # self.special_keyword = '苏州'
        # Keywords that have already been used.
        self.used_keyword = []
        # Output paths.
        self.save_article_path = r'./data/save_path/{}_articles'.format(self.domain_name)
        self.save_img_path = r'./data/save_path/{}_imgs'.format(self.domain_name)
        #################### packaged build ##########################
        # self.save_article_path = r'../data/save_path/{}_articles'.format(self.domain_name)
        # self.save_img_path = r'../data/save_path/{}_imgs'.format(self.domain_name)
        ##############################################################
        # self.domain_name = 'uni_technology'
        # self.keywords_num = 180  # number of keywords
        # Unused paragraphs / used pictures / all paragraphs (currently unused).
        # self.unused_paragraphs = []
        # self.used_pictures = []
        # self.paragraphs = []
        # self.keywords = self.read_xlsx(self.read_path + '\keyword.xlsx')
        # Articles that have already been generated, used to avoid duplicates.
        self.used_articles = list()

    def get_keyword(self):
        """
        Pick a keyword that has not been used yet.
        :return: the keyword, or None once every keyword has been used
        """
        try:
            unused_keyword = list(set(self.keywords) ^ set(self.used_keyword))
            if len(unused_keyword) == 0:
                return None
            keyword = random.choice(unused_keyword)
            self.used_keyword.append(keyword)
            return keyword
        except Exception:
            traceback.print_exc()

    def operate_picture(self, filepath):
        """
        Collect images.
        :param filepath: directory that holds the images
        :return: file names of all images in the directory
        """
        try:
            imgs = list()
            # for file in os.listdir(self.util.to_gbk(filepath)):
            #     img = file.decode('gbk').encode('utf-8')
            #     # imgs.append(os.path.join(filepath, img))
            #     imgs.append(img)
            for file in os.listdir(filepath):
                imgs.append(file)
            return imgs
        except Exception:
            traceback.print_exc()
            print('operate picture error')

    def get_article_len(self, article):
        """Total character count of an article (a list of paragraphs)."""
        article_len = 0
        for i in article:
            # article_len += len(i.decode('utf8'))
            article_len += len(i)
        return article_len

    def split_article(self, file_path, file):
        """
        Split one article into first paragraph / middle paragraphs / last paragraph.
        """
        doc = None
        try:
            paragraphs = list()
            final_paragraphs = list()
            if file_path.endswith('txt'):
                # with open(file_path, 'r') as f:
                with open(file_path, 'r', encoding="UTF-8") as f:
                    paragraphs = f.readlines()
            elif file_path.endswith("docx"):
                # document = Document(u'{}'.format(file))
                document = Document(file_path)
                paragraphs = [
                    p.text for p in document.paragraphs
                    if p.text and p.text != '\n' and len(p.text) > 1
                ]
            else:
                # Read .doc files through the Word COM interface.
                word = Dispatch('Word.Application')  # open the Word application
                # word = DispatchEx('Word.Application')  # start a separate process
                word.Visible = 0  # run in the background, no window
                word.DisplayAlerts = 0  # suppress alerts
                dir_path = os.path.dirname(os.path.abspath(file)) + "\\" + file_path
                doc = word.Documents.Open(dir_path)
                # doc.SaveAs(os.path.splitext(dir_path)[0] + '.docx', 12, False, "", True, "", False, False, False, False)
                for para in doc.paragraphs:
                    paragraphs.append(para.Range.Text)
                doc.Close()
            # Normalize irregular line breaks inside the document.
            for p in paragraphs:
                ps = p.split("\n\t")
                for pp in ps:
                    final_paragraphs.append(pp)
            start = final_paragraphs[0]
            middle = final_paragraphs[1:-1]
            end = final_paragraphs[-1]
            return start, middle, end
        except Exception:
            traceback.print_exc()
            print("split_article error")
            if doc is not None:
                doc.Close()
            return None, None, None

    def get_all_paragraphs(self, floder):
        """
        Collect the paragraphs of every article inside one sub-folder.
        """
        try:
            start_paragraph_list = list()   # all first paragraphs
            middle_paragraph_list = list()  # all middle paragraphs
            end_paragraph_list = list()     # all last paragraphs
            filepath = self.read_path + '/' + floder
            file_list = [file for file in self.util.get_file_list(filepath) if file]
            for file in file_list:
                t_filepath = filepath + '/' + file
                start_paragraph, middle_paragraph, end_paragraph = self.split_article(t_filepath, file)
                if not start_paragraph or not middle_paragraph or not end_paragraph:
                    continue
                start_paragraph_list.append(start_paragraph)
                middle_paragraph_list.extend(middle_paragraph)
                end_paragraph_list.append(end_paragraph)
            return start_paragraph_list, middle_paragraph_list, end_paragraph_list
        except Exception:
            traceback.print_exc()
            print("get_all_paragraphs error")

    # def get_keywords(self):
    #     """
    #     Get the keywords.
    #     Example: 150 keywords are needed, and keywords containing "苏州" take priority.
    #     If there are more than 150 such keywords, take the first 150;
    #     otherwise take all of them and top up with ordinary keywords until there are 150.
    #     """
    #     all_keywords = self.read_xlsx()  # all keywords
    #     special_keywords = [kw for kw in all_keywords if self.special_keyword in kw]  # special keywords, e.g. those containing "苏州"
    #     all_keywords = [kw for kw in all_keywords if self.special_keyword not in kw]  # keywords that do not contain "苏州"
    #
    #     keywords_num = self.end_keyword - self.start_keyword  # number of keywords needed
    #     if len(special_keywords) >= keywords_num:
    #         needed_keywords = special_keywords[0:keywords_num]
    #         remaining_keywords = special_keywords[keywords_num:]
    #         remaining_keywords.extend(all_keywords)
    #     else:
    #         needed_keywords = copy.deepcopy(special_keywords)
    #         needed_keywords.extend(all_keywords[0: keywords_num - len(special_keywords)])
    #         remaining_keywords = all_keywords[(keywords_num - len(special_keywords)):]
    #     return needed_keywords, remaining_keywords

    # def write_unused_keywords_csv(self, keywords):
    #     """
    #     Write the unused keywords to a CSV file.
    #     """
    #     with open(self.save_article_path + '/unuserd_keywords.csv', 'w') as f:
    #         for kw in keywords:
    #             f.write(kw + '\n')

    def article_4_1(self, old_start_ps, new_middle_ps, old_end_ps):
        """1. old_start + 4 * new_middle + old_end"""
        start_paragraph = random.sample(old_start_ps, 1)
        # Fall back to fewer middle paragraphs if the pool is too small.
        try:
            middle_paragraphs = random.sample(new_middle_ps, 4)
        except ValueError:
            try:
                middle_paragraphs = random.sample(new_middle_ps, 3)
            except ValueError:
                middle_paragraphs = random.sample(new_middle_ps, 2)
        end_paragraph = random.sample(old_end_ps, 1)
        article_ps = copy.deepcopy(start_paragraph)  # the article
        article_ps.extend(middle_paragraphs)
        article_ps.extend(end_paragraph)
        return article_ps

    def article_4_2(self, old_start_ps, new_middle_ps, old_middle_ps, new_end_ps):
        """2. old_start + 3 * new_middle + 1 * old_middle + new_end"""
        start_paragraph = random.sample(old_start_ps, 1)
        middle_paragraphs = random.sample(new_middle_ps, 3)
        middle_paragraphs.extend(random.sample(old_middle_ps, 1))
        end_paragraph = random.sample(new_end_ps, 1)
        article_ps = copy.deepcopy(start_paragraph)  # the article
        article_ps.extend(middle_paragraphs)
        article_ps.extend(end_paragraph)
        return article_ps

    def article_4_3(self, new_start_ps, new_middle_ps, old_middle_ps, old_end_ps):
        """3. new_start + 3 * new_middle + 1 * old_middle + old_end"""
        start_paragraph = random.sample(new_start_ps, 1)
        middle_paragraphs = random.sample(new_middle_ps, 3)
        middle_paragraphs.extend(random.sample(old_middle_ps, 1))
        end_paragraph = random.sample(old_end_ps, 1)
        article_ps = copy.deepcopy(start_paragraph)  # the article
        article_ps.extend(middle_paragraphs)
        article_ps.extend(end_paragraph)
        return article_ps

    def article_4_4(self, new_start_ps, new_middle_ps, old_middle_ps, new_end_ps):
        """4. new_start + 2 * new_middle + 2 * old_middle + new_end"""
        start_paragraph = random.sample(new_start_ps, 1)
        middle_paragraphs = random.sample(new_middle_ps, 2)
        middle_paragraphs.extend(random.sample(old_middle_ps, 2))
        end_paragraph = random.sample(new_end_ps, 1)
        article_ps = copy.deepcopy(start_paragraph)  # the article
        article_ps.extend(middle_paragraphs)
        article_ps.extend(end_paragraph)
        return article_ps

    def random_article(self, old_start, old_middle, old_end,
                       new_start, new_middle, new_end):
        """
        Splice an article with one of the four methods above, picked at random.
        """
        try:
            methods = [
                self.article_4_1(old_start, new_middle, old_end),
                self.article_4_2(old_start, new_middle, old_middle, new_end),
                self.article_4_3(new_start, new_middle, old_middle, old_end),
                self.article_4_4(new_start, new_middle, old_middle, new_end)
            ]
            return random.choice(methods)
        except Exception:
            traceback.print_exc()

    def run(self):
        """
        Build and save one article per keyword.
        """
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        if not os.path.exists(self.save_img_path):
            os.mkdir(self.save_img_path)
        # self.keywords, remaining_keywords = self.get_keywords()  # get the keywords
        # Write the remaining keywords to a spreadsheet.
        # self.write_unused_keywords_csv(remaining_keywords)
        img_list = list()
        old_start_paragraph_list = list()
        old_all_mid_list = list()
        old_end_paragraph_list = list()
        new_start_paragraph_list = list()
        new_all_mid_list = list()
        new_end_paragraph_list = list()
        file_dir_list = self.util.get_file_dir(self.read_path)  # list all sub-folders
        for floder in file_dir_list:
            # Collect all paragraphs and all images.
            floder_path = self.read_path + '/' + floder
            # Paragraphs from old articles.
            if floder == 'old' and os.path.isdir(floder_path):
                old_start_paragraph_list, old_all_mid_list, old_end_paragraph_list = self.get_all_paragraphs(floder)
                old_all_mid_list = old_all_mid_list if len(old_all_mid_list) < 50 else random.sample(old_all_mid_list, 30)
            # Paragraphs from new articles.
            elif floder == 'new' and os.path.isdir(floder_path):
                new_start_paragraph_list, new_all_mid_list, new_end_paragraph_list = self.get_all_paragraphs(floder)
                new_all_mid_list = new_all_mid_list if len(new_all_mid_list) < 50 else random.sample(new_all_mid_list, 30)
            # Images.
            elif floder == 'img':
                t_filepath = self.read_path + '/' + floder
                img_list = self.operate_picture(t_filepath)  # collect all images
                for img in img_list:
                    shutil.copy(u"{}".format(self.read_path + '/' + floder + '/' + img), self.save_img_path)
            # Keyword spreadsheet.
            elif "xlsx" in floder:
                self.keywords = self.util.read_xlsx(floder_path)[self.start_keyword:self.end_keyword]  # load the keywords
        # Splice the articles.
        index = 1
        while True:
            keyword = self.util.get_keyword(self.keywords, self.used_keyword)  # one keyword per article
            if keyword is None:  # stop once every keyword has been used
                break
            print(index, keyword)
            # Optionally pick only articles whose length is between 730 and 870 characters:
            # while True:
            #     article = random.choice(article_list)  # pick an article at random
            #     article_len = self.get_article_len(article)
            #     if 730 < article_len < 870:
            #         break
            try:
                while True:
                    # Make sure the same article is not generated twice.
                    article = self.random_article(old_start_paragraph_list, old_all_mid_list,
                                                  old_end_paragraph_list, new_start_paragraph_list,
                                                  new_all_mid_list, new_end_paragraph_list)
                    if article not in self.used_articles:
                        self.used_articles.append(article)
                        break
                temp_article = copy.deepcopy(article)  # deep copy so the original paragraphs are not modified
                img = random.sample(img_list, 2)  # pick two images at random
                article_str = ''
                # Process every paragraph.
                for num in range(len(temp_article)):
                    if num == 0 or num == len(temp_article) - 1:
                        # First / last paragraph: insert the keyword.
                        temp_article[num] = self.util.insert_keyword(keyword, temp_article[num])
                        article_str += '<p>%s</p>\n' % temp_article[num]
                    elif num == 1:
                        # Second paragraph, followed by the first image.
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (self.domain_name, img[0])  # remember to adjust the site name
                    elif num == 3:
                        # Fourth paragraph, followed by the second image.
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (self.domain_name, img[1])  # remember to adjust the site name
                    else:
                        # Remaining paragraphs.
                        article_str += '<p>%s</p>\n' % temp_article[num]
                save_path = self.save_article_path + '/' + '{}.txt'.format(keyword)
                try:
                    self.util.write_article(save_path, article_str)
                except Exception:
                    self.util.write_article(
                        save_path,
                        article_str.replace(u'\u200b', u'').replace(u'\xa0', u'').replace(u'\u2022', u''))
                index += 1
            except Exception as e:
                # On error, remove the keyword from "used_keyword" so it can be picked again and retried.
                self.used_keyword.remove(keyword)
                print(e)
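# Illustrative sketch (not part of the production flow): each article_4_* method
# above samples one start paragraph, a few middle paragraphs from the "new" and
# "old" pools, and one end paragraph, then concatenates them in order. A minimal
# standalone version of that pattern, shaped like method 2 (old start, three new
# middles, one old middle, new end -- four new to two old paragraphs, i.e. 2:1),
# assuming the pools are plain lists of strings and reusing the module-level
# `random` import the class relies on:
def _splice_example(old_start_pool, new_middle_pool, old_middle_pool, new_end_pool):
    """Return one spliced article as a list of paragraph strings (4 new : 2 old)."""
    article = random.sample(old_start_pool, 1)
    article.extend(random.sample(new_middle_pool, 3))  # three paragraphs from the "new" pool
    article.extend(random.sample(old_middle_pool, 1))  # one paragraph from the "old" pool
    article.extend(random.sample(new_end_pool, 1))
    return article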
def run_mix(path):
    """
    Entry point for splicing a mix of new and old articles.
    """
    file_splice = File_processing5(path)
    file_splice.run()


if __name__ == '__main__':
    while True:
        util = Util()
        read_path = r'./data/read_path'  # run from the IDE
        # read_path = r'../data/read_path'  # run as a packaged build
        folders = util.get_file_dir(read_path)
        for folder in folders:
            print("-" * 50)
            print(folder)
            print("-" * 50)
            folder_path = read_path + "/" + folder
            if "_a_" in folder:
                run_original(folder_path)
            else:
                run_mix(folder_path)
        while True:
            input_str = input('Press "Y(or y) + Enter" to continue, press "N(or n) + Enter" to exit:')
            if input_str.strip().lower() == 'y':
                break
            if input_str.strip().lower() == 'n':
                raise SystemExit
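# Both run() loops in this module rely on Util.get_keyword(keywords, used_keyword)
# handing back an unused keyword on each call and a falsy value once the list is
# exhausted; the real implementation lives in the Util helper defined elsewhere.
# A minimal sketch of that assumed contract, mirroring File_processing5.get_keyword above:
def _pick_unused_keyword_example(keywords, used_keywords):
    """Pick a random keyword that has not been used yet; return None when none are left."""
    remaining = list(set(keywords) - set(used_keywords))
    if not remaining:
        return None
    keyword = random.choice(remaining)
    used_keywords.append(keyword)
    return keyword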
class FileSplitAndSplicing(object):
    """
    Split txt/docx/doc articles into first, middle and last paragraphs,
    then splice them into new articles with the same logic as before.
    """

    def __init__(self, path):
        self.util = Util()
        self.document = docx
        # Input path and domain name (parsed from the folder name).
        self.read_path = path
        self.domain_name = "_".join(path.split("/")[-1].split("_")[:3])
        self.start_keyword = 0  # index of the first keyword to use
        self.end_keyword = int(path.split("/")[-1].split("_")[-1])  # index of the last keyword, taken from the folder name
        # self.special_keyword = '苏州'
        self.used_keyword = []
        # Output paths.
        self.save_article_path = r'./data/save_path/{}_articles'.format(self.domain_name)
        self.save_img_path = r'./data/save_path/{}_imgs'.format(self.domain_name)
        #################### packaged build ##########################
        # self.save_article_path = r'../data/save_path/{}_articles'.format(self.domain_name)
        # self.save_img_path = r'../data/save_path/{}_imgs'.format(self.domain_name)
        ##############################################################

    # def get_keywords(self):
    #     """
    #     Get the keywords.
    #     Example: 150 keywords are needed, and keywords containing "苏州" take priority.
    #     If there are more than 150 such keywords, take the first 150;
    #     otherwise take all of them and top up with ordinary keywords until there are 150.
    #     """
    #     all_keywords = self.read_xlsx()  # all keywords
    #     special_keywords = [kw for kw in all_keywords if self.special_keyword in kw]  # special keywords, e.g. those containing "苏州"
    #     all_keywords = [kw for kw in all_keywords if self.special_keyword not in kw]  # keywords that do not contain "苏州"
    #
    #     keywords_num = self.end_keyword - self.start_keyword  # number of keywords needed
    #     if len(special_keywords) >= keywords_num:
    #         needed_keywords = special_keywords[0:keywords_num]
    #         remaining_keywords = special_keywords[keywords_num:]
    #         remaining_keywords.extend(all_keywords)
    #     else:
    #         needed_keywords = copy.deepcopy(special_keywords)
    #         needed_keywords.extend(all_keywords[0: keywords_num - len(special_keywords)])
    #         remaining_keywords = all_keywords[(keywords_num - len(special_keywords)):]
    #     return needed_keywords, remaining_keywords

    # def get_file_list(self, filepath):
    #     """List all files and folders under a directory (e.g. "反射型光电传感器")."""
    #     # return os.listdir(self.util.to_gbk(filepath))
    #     return os.listdir(filepath)

    def split_article(self, file_path, file):
        """
        Split one article into first paragraph / middle paragraphs / last paragraph.
        """
        paragraphs = list()
        final_paragraphs = list()
        try:
            if file_path.endswith('txt'):
                # Fall back to UTF-8 if the platform default encoding fails.
                try:
                    with open(file_path, 'r') as f:
                        paragraphs = f.readlines()
                except Exception:
                    with open(file_path, 'r', encoding='UTF-8') as f:
                        paragraphs = f.readlines()
            elif file_path.endswith("docx"):
                document = Document(u'{}'.format(file_path))
                paragraphs = [
                    p.text for p in document.paragraphs
                    if p.text and p.text != '\n'
                ]
            else:
                # Read .doc files through the Word COM interface.
                word = Dispatch('Word.Application')  # open the Word application
                # word = DispatchEx('Word.Application')  # start a separate process
                word.Visible = 0  # run in the background, no window
                word.DisplayAlerts = 0  # suppress alerts
                dir_path = os.path.dirname(os.path.abspath(file)) + "\\" + file_path
                doc = word.Documents.Open(dir_path)
                # doc.SaveAs(os.path.splitext(dir_path)[0] + '.docx', 12, False, "", True, "", False, False, False, False)
                for para in doc.paragraphs:
                    paragraphs.append(para.Range.Text)
                doc.Close()
            # Normalize irregular line breaks inside the document.
            for p in paragraphs:
                ps = p.split("\n\t")
                for pp in ps:
                    final_paragraphs.append(pp)
            start = final_paragraphs[0]
            middle = final_paragraphs[1:-1]
            end = final_paragraphs[-1]
            return start, middle, end
        except Exception:
            traceback.print_exc()
            return None, None, None

    def operate_picture(self, filepath):
        """
        Collect images.
        :param filepath: directory that holds the images
        :return: file names of all images in the directory
        """
        imgs = []
        try:
            # for file in os.listdir(self.util.to_gbk(filepath)):
            #     img = file.decode('gbk').encode('utf-8')
            #     # imgs.append(os.path.join(filepath, img))
            #     imgs.append(img)
            for file in os.listdir(filepath):
                imgs.append(file)
            return imgs
        except Exception:
            traceback.print_exc()
            print('operate picture error')

    def run(self):
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        if not os.path.exists(self.save_img_path):
            os.mkdir(self.save_img_path)
        img_list = list()
        article_list = list()
        start_paragraph_list = list()   # all first paragraphs
        middle_paragraph_list = list()  # all middle paragraphs
        end_paragraph_list = list()     # all last paragraphs
        file_dir_list = self.util.get_file_dir(self.read_path)  # list all files
        for file in file_dir_list:
            file_path = self.read_path + "/" + file
            if file != 'img' and 'xlsx' not in file:
                start_paragraph, middle_paragraph, end_paragraph = self.split_article(file_path, file)
                if not start_paragraph or not middle_paragraph or not end_paragraph:
                    continue
                start_paragraph_list.append(start_paragraph)
                middle_paragraph_list.extend(middle_paragraph)
                end_paragraph_list.append(end_paragraph)
            elif file == "img":
                img_list = self.operate_picture(file_path)  # collect all images
                for img in img_list:
                    shutil.copy(u"{}".format(self.read_path + '/img/' + img), self.save_img_path)
            elif "xlsx" in file:
                # Plain keyword selection.
                self.keywords = self.util.read_xlsx(file_path)[self.start_keyword:self.end_keyword]
        # middle_paragraph_list = middle_paragraph_list if len(middle_paragraph_list) < 100 else random.sample(middle_paragraph_list, 100)
        all_mid_list = self.util.mid_permutation_and_combination(middle_paragraph_list)  # every combination of middle paragraphs
        all_mid_list = all_mid_list if len(all_mid_list) < 2000 else random.sample(all_mid_list, 2000)
        articles = self.util.article_permutation_and_combination(start_paragraph_list, all_mid_list, end_paragraph_list)
        # articles = self.util.article_permutation_and_combination(random.sample(start_paragraph_list, 10), random.sample(all_mid_list, 10), random.sample(end_paragraph_list, 10))
        article_list = self.util.get_article_list(articles, article_list)  # every candidate article for this folder
        # Each loop below generates one article; each folder used to generate "every_article_num" articles.
        # for _ in range(every_article_num):
        index = 1
        while True:
            keyword = self.util.get_keyword(self.keywords, self.used_keyword)
            if not keyword:  # stop once every keyword has been used
                break
            print(index, keyword)
            # Optionally pick only articles whose length is between 730 and 900 characters:
            # while True:
            #     article = random.choice(article_list)  # pick an article at random
            #     article_len = self.file_paocessing1.get_article_len(article)
            #     if 730 < article_len < 900:
            #         break
            try:
                article = random.choice(article_list)  # pick an article at random
                temp_article = copy.deepcopy(article)  # deep copy so the original paragraphs are not modified
                img = random.sample(img_list, 2)  # pick two images at random
                article_str = ''
                # Process every paragraph.
                for num in range(len(temp_article)):
                    if num == 0 or num == len(temp_article) - 1:
                        # First / last paragraph: insert the keyword.
                        temp_article[num] = self.util.insert_keyword(keyword, temp_article[num])
                        article_str += '<p>%s</p>\n' % temp_article[num]
                    elif num == 1:
                        # Second paragraph, followed by the first image.
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (self.domain_name, img[0])  # remember to adjust the site name
                    elif num == 3:
                        # Fourth paragraph, followed by the second image.
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (self.domain_name, img[1])  # remember to adjust the site name
                    else:
                        # Remaining paragraphs.
                        article_str += '<p>%s</p>\n' % temp_article[num]
                save_path = self.save_article_path + '/' + '{}.txt'.format(keyword)
                try:
                    self.util.write_article(save_path, article_str)
                except Exception:
                    self.util.write_article(
                        save_path,
                        article_str.replace(u'\u200b', u'').replace(u'\xa0', u'').replace(u'\u2022', u''))
                index += 1
            except Exception as e:
                # On error, remove the keyword from "used_keyword" so it can be picked again and retried.
                self.used_keyword.remove(keyword)
                print(e)
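# Usage sketch for FileSplitAndSplicing (the path below is hypothetical; an input
# folder must follow the "<domain parts>_<keyword count>" naming convention that
# __init__ parses, e.g. "example_site_com_50"):
def _example_split_and_splice(path='./data/read_path/example_site_com_50'):
    """Split the articles found in `path` and splice them, mirroring run_mix() above."""
    splitter = FileSplitAndSplicing(path)
    splitter.run()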