class File_processing2():
    """Splice keyword-seeded articles WITHOUT pictures.

    Reuses the per-run configuration (domain name, read/save paths) from
    File_processing1, builds every start/middle/end paragraph combination
    found under ``read_path``, and writes one article per keyword.
    NOTE: this class still contains Python-2-only byte/unicode handling
    (``unicode()``, ``str.decode('gbk')``) in the file-system helpers.
    """

    def __init__(self):
        self.util = Util()
        self.file_process = File_processing1()
        # Per-run settings are inherited from File_processing1 so only one
        # place needs editing between runs.
        self.domain_name = self.file_process.domain_name
        self.kw_excel_name = self.file_process.kw_excel_name
        # Read paths
        self.read_path = self.file_process.read_path
        self.keyword_path = self.file_process.keyword_path
        # Save path (articles only -- this variant embeds no images)
        self.save_article_path = r'./data/save_path/{}_articles_no_picture'.format(
            self.domain_name)
        # Keywords already consumed by previously generated articles.
        self.used_keyword = []

    def get_keyword(self):
        """Pick a random, not-yet-used keyword and mark it used.

        :return: a keyword string, or None when all keywords are exhausted
                 (also returns None if an unexpected error is logged).
        """
        try:
            # used_keyword is a subset of keywords, so the symmetric
            # difference is exactly the set of unused keywords.
            unused_keyword = list(set(self.keywords) ^ set(self.used_keyword))
            if len(unused_keyword) == 0:
                return None
            keyword = random.choice(unused_keyword)
            self.used_keyword.append(keyword)
            return keyword
        except Exception:
            traceback.print_exc()

    def get_file_dir(self, filepath):
        """List the entries of ``filepath`` re-encoded gbk -> utf-8 (Py2).

        Returns None after logging if listing fails.
        """
        try:
            file_dir_list = os.listdir(self.util.to_gbk(filepath))
            file_dir_list = [
                file.decode('gbk').encode('utf-8') for file in file_dir_list
            ]
            return file_dir_list
        except Exception:
            # print_exc() returns None, so it must not be passed to print().
            traceback.print_exc()
            print('get file dir error')

    def get_file_list(self, filepath):
        """Return all files/sub-directories of one article directory."""
        return os.listdir(self.util.to_gbk(filepath))

    def operate_picture(self, filepath):
        """Collect picture file names under ``filepath`` (gbk -> utf-8, Py2).

        :return: list of image file names, or None after logging on error.
        """
        try:
            imgs = []
            for file in os.listdir(self.util.to_gbk(filepath)):
                imgs.append(file.decode('gbk').encode('utf-8'))
            return imgs
        except Exception:
            traceback.print_exc()
            print('operate picture error')

    def write_article(self, path, article):
        """Save one article as a txt file (path is a utf-8 byte string)."""
        try:
            with open(path.decode('utf-8'), 'w') as f:
                f.write(article)
        except Exception:
            traceback.print_exc()
            print('write article error')

    def read_xlsx(self):
        """Read the keyword list from Sheet1.

        Column A holds the keywords; if A1 contains the sentinel value 1
        the keywords are read from column B instead.
        """
        workbook = load_workbook(self.keyword_path)
        # Subscript access replaces the deprecated get_sheet_by_name().
        sheet = workbook["Sheet1"]
        keyword_list = [i.value for i in sheet['A']]
        if keyword_list[0] == 1:
            keyword_list = [i.value for i in sheet['B']]
        return keyword_list

    def get_all_article(self, dir_list):
        """Build every article combination from each article directory.

        Each directory is expected to contain files ending in 首段.txt /
        中段.txt / 尾段.txt (first / middle / last paragraphs).
        """
        all_articles = []
        for folder in dir_list:
            filepath = self.read_path + '/' + folder
            if os.path.isdir(unicode(
                    filepath, "utf-8")) and folder != 'image' and folder != 'video':
                file_list = [
                    file.decode('gbk').encode('utf-8')
                    for file in self.get_file_list(filepath) if file
                ]
                for file in file_list:
                    t_filepath = filepath + '/' + file
                    filename = t_filepath.split('/')[-1]
                    if filename.endswith(u'首段.txt'):
                        # all first paragraphs
                        start_paragraph_list = self.util.start_end_paragraph(
                            t_filepath)
                    elif filename.endswith(u'中段.txt'):
                        middle_paragraph_list = self.util.middle_paragraph(
                            t_filepath)
                        # every permutation/combination of middle paragraphs
                        all_mid_list = self.util.mid_permutation_and_combination(
                            middle_paragraph_list)
                    elif filename.endswith(u'尾段.txt'):
                        # all last paragraphs
                        end_paragraph_list = self.util.start_end_paragraph(
                            t_filepath)
                articles = self.util.article_permutation_and_combination(
                    start_paragraph_list, all_mid_list, end_paragraph_list)
                article_list = []
                # Final picture-less articles for this single directory.
                article_list = self.util.get_article_list(
                    articles, article_list)
                all_articles.extend(article_list)
        return all_articles

    def get_article_str(self, paragraph_list, keyword):
        """Join paragraphs into HTML, inserting the keyword into the
        first and last paragraph. Returns the article as one string."""
        article_str = ''
        for i in range(len(paragraph_list)):
            if i == 0 or i == len(paragraph_list) - 1:
                # Seed the keyword into the first/last paragraph only.
                paragraph_list[i] = self.util.insert_keyword(
                    keyword, paragraph_list[i])
            article_str += '<p>%s</p>\n' % paragraph_list[i]
        return article_str

    def get_article_len(self, article):
        """Total character count of an article (paragraphs are utf-8
        byte strings under Python 2, hence the decode)."""
        article_len = 0
        for i in article:
            article_len += len(i.decode('utf8'))
        return article_len

    def splice_article(self, all_articles):
        """Generate one article per keyword and write it to disk."""
        for _ in range(len(self.keywords)):
            print(_)
            keyword = self.get_keyword()  # one keyword per article
            if keyword is None:  # stop once every keyword has been used
                break
            print(keyword)
            # Draw articles until one lands in the 730~870 character band.
            # NOTE(review): loops forever if no candidate is in range --
            # confirm the corpus always contains such an article.
            while True:
                article = random.choice(all_articles)
                article_len = self.get_article_len(article)
                if 730 < article_len < 870:
                    break
            # Deep copy so keyword insertion never mutates the corpus.
            temp_article = copy.deepcopy(article)
            article_str = self.get_article_str(temp_article, keyword)
            save_path = os.path.join(self.save_article_path, keyword + '.txt')
            self.write_article(save_path,
                               article_str.decode('utf-8').encode('gbk'))

    def main(self):
        """Entry point: read keywords, collect articles, splice and save."""
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        self.keywords = self.read_xlsx()[0:30]  # take the first 30 keywords
        file_dir_list = self.get_file_dir(self.read_path)
        all_articles = self.get_all_article(file_dir_list)
        self.splice_article(all_articles)
class File_processing1():
    """Splice keyword-seeded articles WITH pictures.

    Reads pre-split paragraph files (首段/中段/尾段) per article folder,
    copies the folder's images to the save path, and writes one article
    per keyword with two random images embedded after paragraphs 2 and 4.
    NOTE: mixes Python-2-only constructs (``unicode()``, ``str.decode``).
    """

    def __init__(self):
        self.util = Util()
        self.document = docx
        # --- per-run settings (edit these between runs) ---
        self.domain_name = 'shiyantai_a_1'
        self.kw_excel_name = '{}_keywords.xlsx'.format(self.domain_name)
        # Read paths
        self.read_path = r'./data/read_path/{}'.format(self.domain_name)
        self.keyword_path = r'./data/read_path/{}/{}'.format(self.domain_name, self.kw_excel_name)
        # Save paths
        self.save_article_path = r'./data/save_path/{}_articles'.format(self.domain_name)
        self.save_img_path = r'./data/save_path/{}_imgs'.format(self.domain_name)
        self.start_keyword = 0    # index of the first keyword to use
        self.end_keyword = 150    # index one past the last keyword to use
        self.special_keyword = '苏州'
        # (A packaged/exe variant used raw_input() prompts and '../data'
        # paths instead of the constants above.)
        # Keywords already consumed by previously generated articles.
        self.used_keyword = []

    def get_keywords(self):
        """Split keywords into (needed, remaining), preferring special ones.

        Example: 150 keywords are needed and those containing
        ``self.special_keyword`` (e.g. "苏州") take priority. If there are
        more than 150 special keywords, take the first 150; otherwise take
        them all and top up with ordinary keywords to reach 150.
        """
        all_keywords = self.read_xlsx()
        special_keywords = [kw for kw in all_keywords if self.special_keyword in kw]
        all_keywords = [kw for kw in all_keywords if self.special_keyword not in kw]
        keywords_num = self.end_keyword - self.start_keyword
        if len(special_keywords) >= keywords_num:
            needed_keywords = special_keywords[0:keywords_num]
            remaining_keywords = special_keywords[keywords_num:]
            remaining_keywords.extend(all_keywords)
        else:
            # Shallow copy is enough for a list of strings (was deepcopy).
            needed_keywords = list(special_keywords)
            needed_keywords.extend(all_keywords[0:keywords_num - len(special_keywords)])
            remaining_keywords = all_keywords[(keywords_num - len(special_keywords)):]
        return needed_keywords, remaining_keywords

    def get_keyword(self):
        """Pick a random, not-yet-used keyword and mark it used.

        :return: a keyword, or None when all keywords are exhausted.
        """
        try:
            # used_keyword is a subset of keywords, so the symmetric
            # difference is exactly the unused set.
            unused_keyword = list(set(self.keywords) ^ set(self.used_keyword))
            if len(unused_keyword) == 0:
                return None
            keyword = random.choice(unused_keyword)
            self.used_keyword.append(keyword)
            return keyword
        except Exception:
            traceback.print_exc()

    def get_file_dir(self, filepath):
        """List all entries under ``filepath`` (gbk re-encoding disabled).

        Returns None after logging if listing fails.
        """
        try:
            return os.listdir(self.util.to_gbk(filepath))
        except Exception:
            # print_exc() returns None, so it must not be passed to print().
            traceback.print_exc()
            print('get file dir error')

    def get_file_list(self, filepath):
        """Return all files/sub-directories of one article directory."""
        return os.listdir(filepath)

    def operate_picture(self, filepath):
        """Collect picture file names under ``filepath``.

        :return: list of image file names, or None after logging on error.
        """
        try:
            imgs = []
            for file in os.listdir(filepath):
                imgs.append(file)
            return imgs
        except Exception:
            traceback.print_exc()
            print('operate picture error')

    def write_article(self, path, article):
        """Save one article as a txt file (path is a utf-8 byte string)."""
        try:
            with open(path.decode('utf-8'), 'w') as f:
                f.write(article)
        except Exception:
            traceback.print_exc()
            print('write article error')

    def read_xlsx(self):
        """Read the keyword list from Sheet1 of the keyword workbook."""
        workbook = load_workbook(self.keyword_path)
        # Subscript access replaces the deprecated get_sheet_by_name().
        sheet = workbook["Sheet1"]
        keyword_list = [i.value for i in sheet['A'] if i.value is not None]
        # if keyword_list[0] == 1:
        # NOTE(review): the A1==1 guard above is commented out, so column B
        # unconditionally overrides column A -- confirm this is intended for
        # this workbook layout.
        keyword_list = [i.value for i in sheet['B'] if i.value is not None]
        return keyword_list

    def get_article_len(self, article):
        """Total character count of an article (sum of paragraph lengths)."""
        article_len = 0
        for i in article:
            article_len += len(i)
        return article_len

    def splice_article(self):
        """Generate articles with embedded images, one per keyword."""
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        if not os.path.exists(self.save_img_path):
            os.mkdir(self.save_img_path)
        # Plain slice of the keyword list (the special-keyword priority
        # variant would call self.get_keywords() instead).
        self.keywords = self.read_xlsx()[self.start_keyword:self.end_keyword]
        file_dir_list = self.get_file_dir(self.read_path)
        for folder in file_dir_list:
            if os.path.isdir(unicode(self.read_path + '/' + folder, "utf-8")) and folder != 'image' and folder != 'video':
                filepath = self.read_path + '/' + folder
                file_list = [file for file in self.get_file_list(filepath) if file]
                for file in file_list:
                    if file != 'img':
                        t_filepath = filepath + '/' + file
                        filename = t_filepath.split('/')[-1]
                        if "首段" in filename:
                            # all first paragraphs
                            start_paragraph_list = self.util.start_end_paragraph(t_filepath)
                        elif "中段" in filename:
                            middle_paragraph_list = self.util.middle_paragraph(t_filepath)
                            # every permutation/combination of middles
                            all_mid_list = self.util.mid_permutation_and_combination(middle_paragraph_list)
                        elif "尾段" in filename:
                            # all last paragraphs
                            end_paragraph_list = self.util.start_end_paragraph(t_filepath)
                    else:
                        # 'img' sub-folder: collect and copy all images.
                        t_filepath = filepath + '/' + file
                        img_list = self.operate_picture(t_filepath)
                        for img in img_list:
                            shutil.copy(u"{}".format(self.read_path + '/' + folder + '/img/' + img),
                                        self.save_img_path)
                # Sample to keep the combination explosion manageable.
                # NOTE(review): random.sample raises ValueError if a list
                # has fewer than the requested 10/20/10 items -- confirm.
                articles = self.util.article_permutation_and_combination(
                    random.sample(start_paragraph_list, 10),
                    random.sample(all_mid_list, 20),
                    random.sample(end_paragraph_list, 10))
                article_list = []
                # Final articles (text only) for this single folder.
                article_list = self.util.get_article_list(articles, article_list)
                # One article per remaining keyword.
                i = 1
                while True:
                    keyword = self.get_keyword()
                    if keyword is None:  # stop once keywords are exhausted
                        break
                    print(i)
                    print(keyword)
                    # Draw until the article is in the 700~950 char band.
                    # NOTE(review): loops forever if no candidate fits.
                    while True:
                        article = random.choice(article_list)
                        article_len = self.get_article_len(article)
                        if 700 < article_len < 950:
                            break
                    # Deep copy so keyword insertion never mutates the corpus.
                    temp_article = copy.deepcopy(article)
                    img = random.sample(img_list, 2)  # two random images
                    article_str = ''
                    # Per-paragraph processing: keyword into first/last,
                    # one image after paragraph 2 and one after paragraph 4.
                    for paragraph_num in range(len(temp_article)):
                        if paragraph_num == 0 or paragraph_num == len(temp_article) - 1:
                            temp_article[paragraph_num] = self.util.insert_keyword(keyword, temp_article[paragraph_num])
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                        elif paragraph_num == 1:
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                            article_str += '<p><img src={imgpath}/%s_imgs/%s></p>\n' % (self.domain_name, img[0])
                        elif paragraph_num == 3:
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                            article_str += '<p><img src={imgpath}/%s_imgs/%s></p>\n' % (self.domain_name, img[1])
                        else:
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                    save_path = self.save_article_path + '/' + '{}.txt'.format(keyword)
                    self.write_article(save_path, article_str.decode('utf-8').encode('gbk'))
                    i += 1
class FileSplitAndSplicing(object):
    """Split whole txt/docx/doc articles into first/middle/last paragraphs,
    then splice new keyword-seeded articles using the earlier logic.

    The input directory name encodes the run configuration:
    ``<a>_<b>_<c>_..._<N>`` where the first three underscore-separated
    parts form the domain name and the last part is the keyword count.
    """

    def __init__(self, path):
        self.util = Util()
        self.document = docx
        # Read path and settings derived from the directory name.
        self.read_path = path
        self.domain_name = "_".join(path.split("/")[-1].split("_")[:3])
        self.start_keyword = 0  # index of the first keyword to use
        # Last path segment after the final '_' is the keyword count.
        self.end_keyword = int(path.split("/")[-1].split("_")[-1])
        self.used_keyword = []
        # Save paths
        self.save_article_path = r'./data/save_path/{}_articles'.format(
            self.domain_name)
        self.save_img_path = r'./data/save_path/{}_imgs'.format(
            self.domain_name)

    def split_article(self, file_path, file):
        """Split one article file into (first, middle-list, last) paragraphs.

        txt is read with readlines (retried as UTF-8 on failure), docx via
        python-docx, anything else through the Word COM API (Windows only).
        Returns None after logging if anything fails.
        """
        paragraphs = list()
        final_paragraphs = list()
        try:
            if file_path.endswith('txt'):
                try:
                    with open(file_path, 'r') as f:
                        paragraphs = f.readlines()
                except Exception:
                    # Retry assuming the file is UTF-8 encoded.
                    with open(file_path, 'r', encoding='UTF-8') as f:
                        paragraphs = f.readlines()
            elif file_path.endswith("docx"):
                document = Document(u'{}'.format(file_path))
                # Keep non-blank paragraph texts. (The original also
                # tested `p != ''` / `p is not None`, which are always
                # true for Paragraph objects and were dropped.)
                paragraphs = [
                    p.text for p in document.paragraphs if p.text != '\n'
                ]
            else:
                # Legacy .doc: drive Word itself in the background.
                word = Dispatch('Word.Application')
                word.Visible = 0        # run hidden
                word.DisplayAlerts = 0  # suppress dialogs
                dir_path = os.path.dirname(
                    os.path.abspath(file)) + "\\" + file_path
                doc = word.Documents.Open(dir_path)
                # NOTE(review): Word's COM model names this `Paragraphs`;
                # confirm the lowercase attribute resolves via pywin32.
                for para in doc.paragraphs:
                    paragraphs.append(para.Range.Text)
                doc.Close()
            # Split out abnormal embedded line breaks ("\n\t").
            for p in paragraphs:
                for pp in p.split("\n\t"):
                    final_paragraphs.append(pp)
            start = final_paragraphs[0]
            middle = final_paragraphs[1:-1]
            end = final_paragraphs[-1]
            return start, middle, end
        except Exception:
            traceback.print_exc()

    def operate_picture(self, filepath):
        """Collect picture file names under ``filepath``.

        :return: list of image file names, or None after logging on error.
        """
        imgs = []
        try:
            for file in os.listdir(filepath):
                imgs.append(file)
            return imgs
        except Exception:
            # print_exc() returns None, so it must not be passed to print().
            traceback.print_exc()
            print('operate picture error')

    def run(self):
        """Entry point: split every source file, copy images, read the
        keyword workbook, then generate one article per keyword."""
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        if not os.path.exists(self.save_img_path):
            os.mkdir(self.save_img_path)
        img_list = list()
        article_list = list()
        start_paragraph_list = list()   # every first paragraph
        middle_paragraph_list = list()  # every middle paragraph
        end_paragraph_list = list()     # every last paragraph
        file_dir_list = self.util.get_file_dir(self.read_path)
        for file in file_dir_list:
            file_path = self.read_path + "/" + file
            if file != 'img' and 'xlsx' not in file:
                start_paragraph, middle_paragraph, end_paragraph = self.split_article(
                    file_path, file)
                start_paragraph_list.append(start_paragraph)
                middle_paragraph_list.extend(middle_paragraph)
                end_paragraph_list.append(end_paragraph)
            elif file == "img":
                img_list = self.operate_picture(file_path)  # all images
                for img in img_list:
                    # NOTE(review): '/' + '/img/' yields a double slash;
                    # harmless on common filesystems but worth confirming.
                    shutil.copy(
                        u"{}".format(self.read_path + '/' + '/img/' + img),
                        self.save_img_path)
            elif "xlsx" in file:
                # Plain slice of the keyword list.
                self.keywords = self.util.read_xlsx(file_path)[
                    self.start_keyword:self.end_keyword]
        # Every permutation/combination of middle paragraphs, capped at
        # 2000 to keep the explosion manageable.
        all_mid_list = self.util.mid_permutation_and_combination(
            middle_paragraph_list)
        all_mid_list = all_mid_list if len(
            all_mid_list) < 2000 else random.sample(all_mid_list, 2000)
        articles = self.util.article_permutation_and_combination(
            start_paragraph_list, all_mid_list, end_paragraph_list)
        # Final list of candidate articles.
        article_list = self.util.get_article_list(articles, article_list)
        # One article per remaining keyword.
        index = 1
        while True:
            keyword = self.util.get_keyword(self.keywords, self.used_keyword)
            if not keyword:  # stop once keywords are exhausted
                break
            print(index, keyword)
            try:
                article = random.choice(article_list)
                # Deep copy so keyword insertion never mutates the corpus.
                temp_article = copy.deepcopy(article)
                img = random.sample(img_list, 2)  # two random images
                article_str = ''
                # Per-paragraph processing: keyword into first/last, one
                # image after paragraph 2 and one after paragraph 4.
                for num in range(len(temp_article)):
                    if num == 0 or num == len(temp_article) - 1:
                        temp_article[num] = self.util.insert_keyword(
                            keyword, temp_article[num])
                        article_str += '<p>%s</p>\n' % temp_article[num]
                    elif num == 1:
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (
                            self.domain_name, img[0])
                    elif num == 3:
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (
                            self.domain_name, img[1])
                    else:
                        article_str += '<p>%s</p>\n' % temp_article[num]
                save_path = self.save_article_path + '/' + '{}.txt'.format(
                    keyword)
                try:
                    self.util.write_article(save_path, article_str)
                except Exception:
                    # Retry with invisible unicode characters stripped.
                    self.util.write_article(
                        save_path,
                        article_str.replace(u'\u200b', u'').replace(
                            u'\xa0', u'').replace(u'\u2022', u''))
                index += 1
            except Exception as e:
                # On failure return the keyword to the pool and retry it.
                # NOTE(review): a persistent failure (e.g. fewer than two
                # images for random.sample) makes this loop spin forever.
                self.used_keyword.remove(keyword)
                print(e)