class File_processing2():
    """Splices start/middle/end paragraph fragments into articles (text
    only, no inline pictures), inserts one keyword per article, and saves
    each article as "<keyword>.txt".

    NOTE(review): this class mixes Python 2 byte-string handling
    (str.decode('gbk').encode('utf-8'), unicode()) with Python 3 style
    print() calls -- confirm the target interpreter version before
    modifying.
    """

    def __init__(self):
        self.util = Util()
        self.file_process = File_processing1()
        # Settings that need adjusting for every run
        self.domain_name = self.file_process.domain_name
        self.kw_excel_name = self.file_process.kw_excel_name
        # Read paths
        self.read_path = self.file_process.read_path
        self.keyword_path = self.file_process.keyword_path
        # Save path
        self.save_article_path = r'./data/save_path/{}_articles_no_picture'.format(
            self.domain_name)
        # self.save_img_path = r'./data/save_path/{}_imgs_with_no_picture'.format(self.domain_name)
        # self.domain_name = 'uni_technology'
        # self.keywords_num = 180  # number of keywords
        # Keywords already consumed by get_keyword()
        self.used_keyword = []
        # Paragraphs not yet used
        # self.unused_paragraphs = []
        # Pictures already used
        # self.used_pictures = []
        # All paragraphs
        # self.paragraphs = []
        # self.keywords = self.read_xlsx(self.read_path + '\keyword.xlsx')

    def get_keyword(self):
        """
        Pick a random keyword that has not been used yet (no repeats).
        :return: keyword string, or None once self.keywords is exhausted
        """
        try:
            # Symmetric difference equals "keywords not yet used" because
            # used_keyword is always a subset of keywords.
            unused_keyword = list(set(self.keywords) ^ set(self.used_keyword))
            if len(unused_keyword) == 0:
                return None
            keyword = random.choice(unused_keyword)
            self.used_keyword.append(keyword)
            return keyword
        except:
            traceback.print_exc()

    def get_file_dir(self, filepath):
        '''List all entries under *filepath* (the article source root),
        re-encoding each name from gbk to utf-8 (Python 2 bytes).'''
        try:
            file_dir_list = os.listdir(self.util.to_gbk(filepath))
            file_dir_list = [
                file.decode('gbk').encode('utf-8') for file in file_dir_list
            ]
            return file_dir_list
        except:
            # NOTE(review): traceback.print_exc() returns None, so this
            # prints "... None"; the traceback itself goes to stderr.
            print('get file dir error', traceback.print_exc())

    def get_file_list(self, filepath):
        '''List all files and sub-directories of one topic directory
        (e.g. a single article-category folder).'''
        return os.listdir(self.util.to_gbk(filepath))

    def operate_picture(self, filepath):
        """
        Collect picture file names under *filepath*.
        :param filepath: directory containing the images
        :return: list of image file names (gbk re-encoded to utf-8)
        """
        try:
            imgs = []
            for file in os.listdir(self.util.to_gbk(filepath)):
                img = file.decode('gbk').encode('utf-8')
                # imgs.append(os.path.join(filepath, img))
                imgs.append(img)
            return imgs
        except:
            print('operate picture error', traceback.print_exc())

    def write_article(self, path, article):
        """Save one article to *path* as a txt file."""
        try:
            with open(path.decode('utf-8'), 'w') as f:
                f.write(article)
        except:
            print('write article error', traceback.print_exc())

    def read_xlsx(self):
        '''Read the keyword spreadsheet and return the keyword column.'''
        workbook = load_workbook(self.keyword_path)  # the workbook
        # NOTE(review): get_sheet_by_name() is deprecated in openpyxl;
        # newer versions use workbook["Sheet1"].
        sheet = workbook.get_sheet_by_name("Sheet1")  # sheet1
        keyword_list = [i.value for i in sheet['A']]  # values of column A
        if keyword_list[0] == 1:
            # First cell of A is a numeric index -- presumably the real
            # keywords then live in column B (TODO confirm sheet layout).
            keyword_list = [i.value for i in sheet['B']]
        return keyword_list

    def get_all_article(self, dir_list):
        '''Build the list of every candidate article from each topic folder
        in *dir_list*, skipping the "image" and "video" folders.'''
        all_articles = []
        for floder in dir_list:
            filepath = self.read_path + '/' + floder
            if os.path.isdir(unicode(
                    filepath,
                    "utf-8")) and floder != 'image' and floder != 'video':
                file_list = [
                    file.decode('gbk').encode('utf-8')
                    for file in self.get_file_list(filepath) if file
                ]

                for file in file_list:
                    t_filepath = filepath + '/' + file
                    filename = t_filepath.split('/')[-1]
                    if filename.endswith(u'首段.txt'):
                        start_paragraph_list = self.util.start_end_paragraph(
                            t_filepath)  # all opening paragraphs
                    elif filename.endswith(u'中段.txt'):
                        middle_paragraph_list = self.util.middle_paragraph(
                            t_filepath)
                        all_mid_list = self.util.mid_permutation_and_combination(
                            middle_paragraph_list)  # every permutation of the middle paragraphs
                    elif filename.endswith(u'尾段.txt'):
                        end_paragraph_list = self.util.start_end_paragraph(
                            t_filepath)  # all closing paragraphs
                # NOTE(review): raises NameError if a folder lacks one of
                # the expected first/middle/last paragraph files.
                articles = self.util.article_permutation_and_combination(
                    start_paragraph_list, all_mid_list, end_paragraph_list)
                article_list = []
                article_list = self.util.get_article_list(
                    articles, article_list)  # final (pictureless) article list for this folder

                for _ in article_list:
                    all_articles.append(_)
        return all_articles

    def get_article_str(self, paragraph_list, keyword):
        '''Join the paragraphs into one HTML string, inserting the keyword
        into the first and last paragraphs.'''
        article_str = ''
        for i in range(len(paragraph_list)):
            # first_keyword = keyword
            if i == 0 or i == len(paragraph_list) - 1:  # first / last paragraph
                paragraph_list[i] = self.util.insert_keyword(
                    keyword, paragraph_list[i])  # insert the keyword
                article_str += '<p>%s</p>\n' % paragraph_list[i]
            else:  # every other paragraph
                article_str += '<p>%s</p>\n' % paragraph_list[i]
        return article_str

    def get_article_len(self, article):
        '''Total character length of an article (paragraphs decoded as utf-8).'''
        article_len = 0
        for i in article:
            article_len += len(i.decode('utf8'))
        return article_len

    def splice_article(self, all_articles):
        """Generate one article per keyword from *all_articles* and save it."""
        # img_list = self.operate_picture(self.read_path + '/imgs')  # gather all pictures
        ## Each loop iteration produces one article
        for _ in range(len(self.keywords)):
            print(_)
            keyword = self.get_keyword()  # one keyword per article
            if keyword == None:  # stop once keywords are exhausted
                break
            print(keyword)
            # Randomly draw an article whose length is within 730~870 chars.
            # NOTE(review): loops forever if no candidate falls in range.
            while True:
                article = random.choice(all_articles)  # draw a random article
                article_len = self.get_article_len(article)
                if 730 < article_len < 870:
                    break

            temp_article = copy.deepcopy(article)  # deep copy so the source data stays untouched
            # try:
            #     img = random.sample(img_list, 2)  # pick two random pictures
            # except:
            #     break
            article_str = self.get_article_str(temp_article, keyword)
            # Remove the pictures from the pool so they are never reused
            # img_list.remove(img[0])
            # img_list.remove(img[1])

            save_path = os.path.join(self.save_article_path, keyword + '.txt')
            self.write_article(save_path,
                               article_str.decode('utf-8').encode('gbk'))
        # Reset the used keywords
        # self.used_keyword = []

    def main(self):
        '''Entry point: prepare the output directory, load keywords, build
        the article pool, then splice one article per keyword.'''
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        # Copy the image folder
        # if not os.path.exists(self.save_img_path):
        #     shutil.copytree(self.read_path + '/imgs', self.save_img_path)

        self.keywords = self.read_xlsx()[0:30]  # take the first 30 keywords
        file_dir_list = self.get_file_dir(self.read_path)  # all folders and files
        # every_article_num = len(self.keywords) // len(file_dir_list) + 1  # average articles needed per folder
        all_articles = self.get_all_article(file_dir_list)
        self.splice_article(all_articles)
# Exemplo n.º 2  -- scrape artifact; commented out so the module can parse
# 0
class File_processing1():
    """Splices paragraph fragments into articles with two inline pictures,
    one article per keyword, and copies the used images into the save
    directory.

    NOTE(review): mixes Python 2 constructs (unicode(), str.decode on the
    assembled article) with Python 3 print() calls -- confirm the target
    interpreter version before modifying.
    """

    def __init__(self):
        self.util = Util()
        self.document = docx
        # Domain name
        self.domain_name = 'shiyantai_a_1'
        self.kw_excel_name = '{}_keywords.xlsx'.format(self.domain_name)
        # Read paths
        self.read_path = r'./data/read_path/{}'.format(self.domain_name)
        self.keyword_path = r'./data/read_path/{}/{}'.format(self.domain_name, self.kw_excel_name)
        # Save paths
        self.save_article_path = r'./data/save_path/{}_articles'.format(self.domain_name)
        self.save_img_path = r'./data/save_path/{}_imgs'.format(self.domain_name)
        self.start_keyword = 0  # index of the first keyword to use
        self.end_keyword = 150  # index one past the last keyword to use
        self.special_keyword = '苏州'
        ####################   packaged build   ##########################
        # self.domain_name = raw_input('please input domain name:')
        # self.kw_excel_name = '{}_keywords.xlsx'.format(self.domain_name)
        # self.read_path = r'../data/read_path/{}'.format(self.domain_name)
        # self.keyword_path = r'../data/read_path/{}/{}'.format(self.domain_name, self.kw_excel_name)
        # # Save paths
        # self.save_article_path = r'../data/save_path/{}_articles'.format(self.domain_name)
        # self.save_img_path = r'../data/save_path/{}_imgs'.format(self.domain_name)
        # self.start_keyword = int(raw_input('start keyword index:'))  # index of the first keyword
        # self.end_keyword = int(raw_input('end keyword index:'))  # index one past the last keyword
        # self.special_keyword = raw_input('please input special keyword(example:"苏州"):')
        ########################################################
        # self.domain_name = 'uni_technology'
        # self.keywords_num = 180  # number of keywords
        # Keywords already consumed by get_keyword()
        self.used_keyword = []
        # Paragraphs not yet used
        # self.unused_paragraphs = []
        # Pictures already used
        # self.used_pictures = []
        # All paragraphs
        # self.paragraphs = []
        # self.keywords = self.read_xlsx(self.read_path + '\keyword.xlsx')


    def get_keywords(self):
        """
        Select keywords, giving priority to those containing
        self.special_keyword.
        Example: 150 keywords are needed and keywords containing the
        special word come first; if there are more than 150 of them, take
        the first 150, otherwise take them all and top up with ordinary
        keywords until reaching 150.
        :return: (needed_keywords, remaining_keywords)
        """
        all_keywords = self.read_xlsx()  # every keyword in the sheet
        special_keywords = [kw for kw in all_keywords if self.special_keyword in kw]  # keywords containing the special word
        all_keywords = [kw for kw in all_keywords if self.special_keyword not in kw]  # keywords without the special word

        keywords_num = self.end_keyword - self.start_keyword  # number of keywords required
        if len(special_keywords) >= keywords_num:
            needed_keywords = special_keywords[0:keywords_num]
            remaining_keywords = special_keywords[keywords_num:]
            remaining_keywords.extend(all_keywords)
        else:
            needed_keywords = copy.deepcopy(special_keywords)
            needed_keywords.extend(all_keywords[0: keywords_num-len(special_keywords)])
            remaining_keywords = all_keywords[(keywords_num-len(special_keywords)):]
        return needed_keywords, remaining_keywords

    def get_keyword(self):
        """
        Pick a random keyword that has not been used yet (no repeats).
        :return: keyword string, or None once self.keywords is exhausted
        """
        try:
            unused_keyword = list(set(self.keywords) ^ set(self.used_keyword))
            if len(unused_keyword) == 0:
                return None
            keyword = random.choice(unused_keyword)
            self.used_keyword.append(keyword)
            return keyword
        except:
            traceback.print_exc()

    def get_file_dir(self, filepath):
        '''List all entries under *filepath* (the article source root).'''
        try:
            file_dir_list = os.listdir(self.util.to_gbk(filepath))
            # file_dir_list = [file.decode('gbk').encode('utf-8') for file in file_dir_list]
            file_dir_list = [file for file in file_dir_list]
            return file_dir_list
        except:
            # NOTE(review): traceback.print_exc() returns None, so this
            # prints "... None"; the traceback itself goes to stderr.
            print('get file dir error', traceback.print_exc())

    def get_file_list(self, filepath):
        '''List all files and sub-directories of one topic directory.'''
        # return os.listdir(self.util.to_gbk(filepath))
        return os.listdir(filepath)


    def operate_picture(self, filepath):
        """
        Collect picture file names under *filepath*.
        :param filepath: directory containing the images
        :return: list of image file names
        """
        try:
            imgs = []
            # for file in os.listdir(self.util.to_gbk(filepath)):
            #     img = file.decode('gbk').encode('utf-8')
            #     # imgs.append(os.path.join(filepath, img))
            #     imgs.append(img)
            for file in os.listdir(filepath):
                imgs.append(file)
            return imgs
        except:
            print('operate picture error', traceback.print_exc())


    def write_article(self, path, article):
        """Save one article to *path* as a txt file."""
        try:
            with open(path.decode('utf-8'), 'w') as f:
                f.write(article)
        except:
            print('write article error', traceback.print_exc())

    def read_xlsx(self):
        '''Read the keyword spreadsheet and return the keyword column.'''
        # filepath = r'E:/workSpace/article_splicing/data/keyword.xlsx'
        workbook = load_workbook(self.keyword_path)  # the workbook
        # NOTE(review): get_sheet_by_name() is deprecated in openpyxl;
        # newer versions use workbook["Sheet1"].
        sheet = workbook.get_sheet_by_name("Sheet1")  # sheet1
        keyword_list = [i.value for i in sheet['A'] if i.value is not None]  # non-empty cells of column A
        if keyword_list[0] == 1:
            # Column A holds a numeric index -- presumably the real
            # keywords then live in column B (TODO confirm sheet layout).
            keyword_list = [i.value for i in sheet['B'] if i.value is not None]
        return keyword_list

    def get_article_len(self, article):
        '''Total character length of an article.'''
        article_len = 0
        for i in article:
            article_len += len(i)
        return article_len

    def splice_article(self):
        '''Splice and generate one article per keyword (two pictures each).'''
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        if not os.path.exists(self.save_img_path):
            os.mkdir(self.save_img_path)


        # self.keywords, self.remaining_keywords = self.get_keywords()  # keywords containing the special word first
        self.keywords = self.read_xlsx()[self.start_keyword:self.end_keyword]  # plain keyword slice


        file_dir_list = self.get_file_dir(self.read_path)  # every folder
        # every_article_num = len(self.keywords) // (len(file_dir_list) - 1) + 1 # average articles needed per folder
        for folder in file_dir_list:
            if os.path.isdir(unicode(self.read_path + '/' + folder, "utf-8")) and folder != 'image' and folder != 'video':
                filepath = self.read_path + '/' + folder
                # file_list = [file.decode('gbk').encode('utf-8') for file in self.get_file_list(filepath) if file]
                file_list = [file for file in self.get_file_list(filepath) if file]

                for file in file_list:
                    if file != 'img':
                        t_filepath = filepath + '/' + file
                        filename = t_filepath.split('/')[-1]
                        if "首段" in filename:
                            start_paragraph_list = self.util.start_end_paragraph(t_filepath)  # all opening paragraphs
                        elif "中段" in filename:
                            middle_paragraph_list = self.util.middle_paragraph(t_filepath)
                            all_mid_list = self.util.mid_permutation_and_combination(middle_paragraph_list)  # every permutation of the middle paragraphs
                        elif "尾段" in filename:
                            end_paragraph_list = self.util.start_end_paragraph(t_filepath)  # all closing paragraphs
                    else:
                        t_filepath = filepath + '/' + file
                        img_list = self.operate_picture(t_filepath)  # gather all pictures
                        for img in img_list:
                            shutil.copy(u"{}".format(self.read_path + '/' + folder + '/img/' + img), self.save_img_path)
                # articles = self.util.article_permutation_and_combination(start_paragraph_list, all_mid_list, end_paragraph_list)
                # NOTE(review): random.sample raises ValueError when a list
                # holds fewer elements than requested -- confirm pool sizes.
                articles = self.util.article_permutation_and_combination(random.sample(start_paragraph_list, 10), \
                                                                         random.sample(all_mid_list, 20), \
                                                                         random.sample(end_paragraph_list, 10))
                article_list = []
                article_list = self.util.get_article_list(articles, article_list)  # final article list for this folder

                ## Each loop iteration below produces one article; each folder should yield "every_article_num" articles
                # for _ in range(every_article_num):
                i = 1
                while True:
                    keyword = self.get_keyword()  # one keyword per article
                    if keyword == None:  # stop once keywords are exhausted
                        break
                    print(i)
                    print(keyword)
                    # Randomly draw an article whose length is inside the limits.
                    # NOTE(review): loops forever if no candidate is in range.
                    while True:
                        article = random.choice(article_list)  # draw a random article
                        article_len = self.get_article_len(article)
                        if 700 < article_len < 950:
                            break
                    temp_article = copy.deepcopy(article)  # deep copy so the source data stays untouched
                    img = random.sample(img_list, 2)  # pick two random pictures
                    article_str = ''
                    # article_str += '<h1>{}</h1>'.format(keyword)+ '\n'  # give the article a title
                    #### Paragraphs -- process each paragraph in turn
                    for paragraph_num in range(len(temp_article)):
                        first_keyword = keyword
                        if paragraph_num == 0 or paragraph_num == len(temp_article) - 1:  # first / last paragraph
                            temp_article[paragraph_num] = self.util.insert_keyword(keyword, temp_article[paragraph_num])  # insert the keyword
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                        elif paragraph_num == 1:  # second paragraph, followed by a picture
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                            article_str += '<p><img src={imgpath}/%s_imgs/%s></p>\n' % (self.domain_name, img[0])  # remember to adjust the site name
                        elif paragraph_num == 3:  # fourth paragraph, followed by a picture
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                            article_str += '<p><img src={imgpath}/%s_imgs/%s></p>\n' % (self.domain_name, img[1])  # remember to adjust the site name
                        else:  # third paragraph
                            article_str += '<p>%s</p>\n' % temp_article[paragraph_num]
                    save_path = self.save_article_path + '/' + '{}.txt'.format(first_keyword)
                    self.write_article(save_path, article_str.decode('utf-8').encode('gbk'))
                    i += 1
class FileSplitAndSplicing(object):
    """
    Split txt/docx articles into first / middle / last paragraphs, then
    splice new articles from them following the same logic as before.
    """
    def __init__(self, path):
        self.util = Util()
        self.document = docx
        # Read path; the directory name encodes "<domain parts>_<count>"
        self.read_path = path
        self.domain_name = "_".join(path.split("/")[-1].split("_")[:3])

        self.start_keyword = 0  # index of the first keyword to use
        self.end_keyword = int(path.split("/")[-1].split("_")[-1])  # index one past the last keyword
        # self.special_keyword = '苏州'
        self.used_keyword = []

        # Save paths
        self.save_article_path = r'./data/save_path/{}_articles'.format(
            self.domain_name)
        self.save_img_path = r'./data/save_path/{}_imgs'.format(
            self.domain_name)
        ####################   packaged build   ##########################
        # self.save_article_path = r'../data/save_path/{}_articles'.format(self.domain_name)
        # self.save_img_path = r'../data/save_path/{}_imgs'.format(self.domain_name)
        ########################################################

    # def get_keywords(self):
    #     """
    #     Select keywords, giving priority to the special ones.
    #     Example: 150 keywords are needed and keywords containing the
    #     special word come first; if there are more than 150 of them,
    #     take the first 150, otherwise top up with ordinary keywords
    #     until reaching 150.
    #     """
    #     all_keywords = self.read_xlsx()
    #     special_keywords = [kw for kw in all_keywords if self.special_keyword in kw]
    #     all_keywords = [kw for kw in all_keywords if self.special_keyword not in kw]
    #
    #     keywords_num = self.end_keyword - self.start_keyword
    #     if len(special_keywords) >= keywords_num:
    #         needed_keywords = special_keywords[0:keywords_num]
    #         remaining_keywords = special_keywords[keywords_num:]
    #         remaining_keywords.extend(all_keywords)
    #     else:
    #         needed_keywords = copy.deepcopy(special_keywords)
    #         needed_keywords.extend(all_keywords[0: keywords_num-len(special_keywords)])
    #         remaining_keywords = all_keywords[(keywords_num-len(special_keywords)):]
    #     return needed_keywords, remaining_keywords

    # def get_file_list(self, filepath):
    #     '''List all files and sub-directories of one topic directory.'''
    #     # return os.listdir(self.util.to_gbk(filepath))
    #     return os.listdir(filepath)

    def split_article(self, file_path, file):
        """
        Split one article file into (first paragraph, middle paragraphs,
        last paragraph).
        :param file_path: path of the article file (.txt / .docx / other)
        :param file: bare file name (used to rebuild an absolute path for
                     the Word COM fallback)
        :return: (start, middle, end) tuple, or None when parsing fails
        """
        paragraphs = list()
        final_paragraphs = list()
        try:
            if file_path.endswith('txt'):
                try:
                    with open(file_path, 'r') as f:
                        paragraphs = f.readlines()
                except:
                    # Retry as UTF-8 when the platform default codec fails
                    with open(file_path, 'r', encoding='UTF-8') as f:
                        paragraphs = f.readlines()
            elif file_path.endswith("docx"):
                document = Document(u'{}'.format(file_path))
                paragraphs = [
                    p.text for p in document.paragraphs
                    if p.text != '\n' and p != '' and p is not None
                ]
            else:
                # Fall back to Word automation for legacy formats
                word = Dispatch('Word.Application')  # open the Word application
                # word = DispatchEx('Word.Application') # start a dedicated process
                word.Visible = 0  # run in the background, no window
                word.DisplayAlerts = 0  # suppress warning dialogs
                dir_path = os.path.dirname(
                    os.path.abspath(file)) + "\\" + file_path
                doc = word.Documents.Open(dir_path)
                # doc.SaveAs(os.path.splitext(dir_path)[0] + '.docx', 12, False, "", True, "", False, False, False, False)
                # NOTE(review): the Word COM object exposes `Paragraphs`
                # (capital P); lowercase `.paragraphs` may fail -- confirm
                # against pywin32 / the Word object model.
                for para in doc.paragraphs:
                    paragraphs.append(para.Range.Text)
                doc.Close()

            # Split out stray line breaks left inside a paragraph
            for p in paragraphs:
                ps = p.split("\n\t")
                for pp in ps:
                    final_paragraphs.append(pp)

            start = final_paragraphs[0]
            middle = final_paragraphs[1:-1]
            end = final_paragraphs[-1]
            return start, middle, end
        except:
            traceback.print_exc()

    def operate_picture(self, filepath):
        """
        Collect picture file names under *filepath*.
        :param filepath: directory containing the images
        :return: list of image file names
        """
        imgs = []
        try:
            # for file in os.listdir(self.util.to_gbk(filepath)):
            #     img = file.decode('gbk').encode('utf-8')
            #     # imgs.append(os.path.join(filepath, img))
            #     imgs.append(img)
            for file in os.listdir(filepath):
                imgs.append(file)
            return imgs
        except:
            print('operate picture error', traceback.print_exc())

    def run(self):
        """Split every source article, then splice and save one new article
        per keyword, copying the used images into the save directory."""
        if not os.path.exists(self.save_article_path):
            os.mkdir(self.save_article_path)
        if not os.path.exists(self.save_img_path):
            os.mkdir(self.save_img_path)

        img_list = list()
        article_list = list()
        start_paragraph_list = list()  # every opening paragraph
        middle_paragraph_list = list()  # every middle paragraph
        end_paragraph_list = list()  # every closing paragraph

        file_dir_list = self.util.get_file_dir(self.read_path)  # every file in the read dir

        for file in file_dir_list:
            file_path = self.read_path + "/" + file
            if file != 'img' and 'xlsx' not in file:
                start_paragraph, middle_paragraph, end_paragraph = self.split_article(
                    file_path, file)
                start_paragraph_list.append(start_paragraph)
                middle_paragraph_list.extend(middle_paragraph)
                end_paragraph_list.append(end_paragraph)
            elif file == "img":
                img_list = self.operate_picture(file_path)  # gather all pictures
                for img in img_list:
                    # NOTE(review): the doubled '/' ('/' + '/img/') is
                    # tolerated by most filesystems but looks unintended.
                    shutil.copy(
                        u"{}".format(self.read_path + '/' + '/img/' + img),
                        self.save_img_path)
            elif "xlsx" in file:
                self.keywords = self.util.read_xlsx(file_path)[
                    self.start_keyword:self.end_keyword]  # plain keyword slice

        # middle_paragraph_list = middle_paragraph_list if len(middle_paragraph_list) < 100 else random.sample(middle_paragraph_list, 100)
        all_mid_list = self.util.mid_permutation_and_combination(
            middle_paragraph_list)  # every permutation of the middle paragraphs
        # Cap the combination count to keep memory bounded
        all_mid_list = all_mid_list if len(
            all_mid_list) < 2000 else random.sample(all_mid_list, 2000)
        articles = self.util.article_permutation_and_combination(
            start_paragraph_list, all_mid_list, end_paragraph_list)
        # articles = self.util.article_permutation_and_combination(random.sample(start_paragraph_list, 10), random.sample(all_mid_list, 10) , random.sample(end_paragraph_list, 10))
        article_list = self.util.get_article_list(
            articles, article_list)  # final list of every spliced article

        # Each loop iteration below produces one article
        # for _ in range(every_article_num):
        index = 1
        while True:
            keyword = self.util.get_keyword(self.keywords, self.used_keyword)
            if not keyword:  # stop once keywords are exhausted
                break
            print(index, keyword)

            # Randomly draw an article whose length is inside the limits
            # while True:
            #     article = random.choice(article_list)  # draw a random article
            #     article_len = self.file_paocessing1.get_article_len(article)
            #     if 730 < article_len < 900:
            #         break

            try:
                article = random.choice(article_list)  # draw a random article
                temp_article = copy.deepcopy(article)  # deep copy so the source data stays untouched
                img = random.sample(img_list, 2)  # pick two random pictures
                article_str = ''
                #  Paragraphs -- process each paragraph in turn
                for num in range(len(temp_article)):
                    if num == 0 or num == len(temp_article) - 1:  # first / last paragraph
                        temp_article[num] = self.util.insert_keyword(
                            keyword, temp_article[num])  # insert the keyword
                        article_str += '<p>%s</p>\n' % temp_article[num]
                    elif num == 1:  # second paragraph, followed by a picture
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (
                            self.domain_name, img[0])  # remember to adjust the site name
                    elif num == 3:  # fourth paragraph, followed by a picture
                        article_str += '<p>%s</p>\n' % temp_article[num]
                        article_str += '<p><img src="{imgpath}/%s_imgs/%s"></p>\n' % (
                            self.domain_name, img[1])  # remember to adjust the site name
                    else:  # third paragraph
                        article_str += '<p>%s</p>\n' % temp_article[num]
                save_path = self.save_article_path + '/' + '{}.txt'.format(
                    keyword)
                try:
                    self.util.write_article(save_path, article_str)
                except:
                    # Retry after stripping zero-width space, nbsp and
                    # bullet chars -- presumably ones the target encoding
                    # cannot represent (TODO confirm write_article codec)
                    self.util.write_article(
                        save_path,
                        article_str.replace(u'\u200b', u'').replace(
                            u'\xa0', u'').replace(u'\u2022', u''))
                index += 1
            except Exception as e:
                # On any failure, give the keyword back so it is retried
                # later with a different article / pictures
                self.used_keyword.remove(keyword)
                print(e)