def parse_content(self, html_file_path):
    """
    Extract paragraph text from an HTML file.

    Only innermost ``<div type="paragraph">`` elements (those that do not
    contain another paragraph div) are used. For each of them, the text of
    every descendant ``<div type="content">`` is passed through
    ``TextUtils.clean_text`` and the cleaned pieces are concatenated, in
    document order, into one string. Paragraphs that contain no content
    div are skipped.

    :param html_file_path: path of the HTML file; it is read as UTF-8
    :return: list of paragraph strings, in document order
    """
    # Read the whole file first so the handle is closed before parsing.
    with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
        html = fp.read()
    soup = BeautifulSoup(html, "html.parser")

    paragraph_attrs = {'type': 'paragraph'}
    content_attrs = {'type': 'content'}

    paragraphs = []
    for paragraph_div in soup.find_all('div', attrs=paragraph_attrs):
        # A paragraph that wraps other paragraphs is skipped: its nested
        # paragraphs are visited by this same loop, so handling the wrapper
        # as well would emit their content twice.
        if paragraph_div.find('div', attrs=paragraph_attrs) is not None:
            continue
        content_list = [
            TextUtils.clean_text(content_div.text)
            for content_div in paragraph_div.find_all('div', attrs=content_attrs)
        ]
        # Drop paragraphs without any content div; a content div whose
        # cleaned text is empty still counts as content.
        if content_list:
            paragraphs.append(''.join(content_list))
    return paragraphs
Exemplo n.º 2
0
 def parse_content(self, html_file_path):
     """
     Extract paragraph text from an HTML file.

     Only innermost ``<div type="paragraph">`` elements (those that do not
     contain another paragraph div) are used. For each of them, the text of
     every descendant ``<div type="content">`` is passed through
     ``TextUtils.clean_text`` and the cleaned pieces are concatenated, in
     document order, into one string. Paragraphs that contain no content
     div are skipped.

     :param html_file_path: path of the HTML file; it is read as UTF-8
     :return: list of paragraph strings, in document order
     """
     # Read the whole file first so the handle is closed before parsing.
     with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
         html = fp.read()
     soup = BeautifulSoup(html, "html.parser")

     paragraph_attrs = {'type': 'paragraph'}
     content_attrs = {'type': 'content'}

     paragraphs = []
     # Walk every div block whose type attribute is 'paragraph'.
     for paragraph_div in soup.find_all('div', attrs=paragraph_attrs):
         # If this paragraph has a nested paragraph, skip it: the nested
         # paragraph is reached later by this same loop.
         if paragraph_div.find('div', attrs=paragraph_attrs) is not None:
             continue
         # Collect the cleaned text of each content div in this paragraph.
         content_list = [
             TextUtils.clean_text(content_div.text)
             for content_div in paragraph_div.find_all('div', attrs=content_attrs)
         ]
         # Join each non-empty content list into a single string; paragraphs
         # without any content div are dropped.
         if content_list:
             paragraphs.append(''.join(content_list))
     return paragraphs