Пример #1
0
    def parse_content(self, html_file_path):
        """
        Parse the paragraph text out of an HTML file.

        A paragraph is a ``<div type="paragraph">`` that contains no nested
        ``<div type="paragraph">``. For each such paragraph, the cleaned text
        of every descendant ``<div type="content">`` is concatenated in
        document order. A content div that holds a table contributes the
        cleaned text of each ``<td>`` followed by a comma instead of its raw
        text. Paragraphs with no content div are dropped.

        :param html_file_path: path of the UTF-8 encoded HTML file to read
        :return: list of paragraph strings, in document order
        """
        with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
            soup = BeautifulSoup(fp.read(), "html.parser")

        def _clean(raw):
            return text_utils.clean_text(text_utils.normalize(raw))

        paragraphs = []
        for paragraph_div in soup.find_all('div'):
            if paragraph_div.get('type') != 'paragraph':
                continue
            inner_divs = paragraph_div.find_all('div')
            # Only leaf paragraphs are extracted; a paragraph wrapping other
            # paragraphs is skipped so its text is not emitted twice.
            if any(div.get('type') == 'paragraph' for div in inner_divs):
                continue
            content_list = []
            for content_div in inner_divs:
                if content_div.get('type') != 'content':
                    continue
                if content_div.find_all('table'):
                    # Search rows inside this content div only. Searching the
                    # whole document here would append every table of the
                    # file to every table-bearing content div.
                    cells = [
                        _clean(td.text) + ','
                        for tr in content_div.find_all('tr')
                        for td in tr.find_all('td')
                    ]
                    content_list.append(''.join(cells))
                else:
                    content_list.append(_clean(content_div.text))
            if content_list:
                paragraphs.append(''.join(content_list))
        return paragraphs
Пример #2
0
def getDingZeng_html(filename):
    """Flatten an HTML file into one cleaned text string.

    Walks every descendant node of the parsed document in order:

    * ``tr`` / ``td`` tags contribute a ',' separator unless the text built
      so far already ends with a character in ``CommaCharInNumberSet1``;
    * ``img`` tags are skipped;
    * tags with a non-empty ``title`` attribute contribute the cleaned title,
      terminated by ':' (this replaces any separator added for the same tag);
    * text nodes longer than two characters contribute their cleaned text.

    When the document has more than three ``<hidden>`` tags, processing stops
    at the first ``<hidden>`` whose numeric name (the ``name`` attribute
    without its first character) reaches 60% of the last one's number, and a
    diagnostic line is printed.

    :param filename: path of the UTF-8 encoded HTML file
    :return: the concatenated text
    """
    with open(filename, 'r', encoding='UTF-8') as fr:
        soup = BeautifulSoup(fr.read(), 'html.parser')

    hidden = soup.find_all('hidden')
    cut_page = len(hidden) > 3
    # Threshold is only meaningful (and only computed) when cutting is on.
    last_hidden = int(int(hidden[-1]['name'][1:]) * 0.6) if cut_page else None

    text = ""
    for child in soup.descendants:
        sentence = ""

        if (cut_page and child.name == 'hidden'
                and int(child['name'][1:]) >= last_hidden):
            print('break in\t', child['name'], '\tall\t',
                  hidden[-1]['name'])
            break

        if child.name in ('tr', 'td'):
            # Guard against empty text: a table cell can be the very first
            # node seen, and text[-1] would raise IndexError. No leading
            # separator is emitted in that case.
            if text and text[-1] not in CommaCharInNumberSet1:
                sentence += ','
        if child.name == 'img':
            continue
        if isinstance(child, bs4.element.Tag) and child.attrs.get('title'):
            sentence = clean_text(normalize(child['title']))
            if not sentence.endswith(':'):
                sentence += ':'
        elif isinstance(child,
                        bs4.NavigableString) and len(child.string) > 2:
            sentence = clean_text(normalize(child.string))
        text += sentence
    return text
Пример #3
0
def getContentFromEveryDiv(filepath):
    """Flatten an HTML file into one cleaned text string.

    Walks every descendant node of the parsed document in order:

    * ``tr`` / ``td`` tags contribute a ',' separator unless the text built
      so far already ends with a character in ``CommaCharInNumberSet1``;
    * tags with a non-empty ``title`` attribute contribute the cleaned title,
      terminated by ':' (this replaces any separator added for the same tag);
    * text nodes longer than two characters contribute their cleaned text.

    The file path and every extracted sentence are printed as a trace.

    :param filepath: path of the UTF-8 encoded HTML file
    :return: the concatenated text
    """
    with open(filepath, 'r', encoding="UTF-8") as fr:
        soup = BeautifulSoup(fr.read(), 'html.parser')

    text = ""
    print(filepath)
    for child in soup.descendants:
        sentence = ""
        if child.name in ('tr', 'td'):
            # Guard against empty text: a table cell can be the very first
            # node seen, and text[-1] would raise IndexError. No leading
            # separator is emitted in that case.
            if text and text[-1] not in CommaCharInNumberSet1:
                sentence += ','
        if isinstance(child, bs4.element.Tag) and child.attrs.get('title'):
            sentence = clean_text(normalize(child['title']))
            print("end", sentence)
            if not sentence.endswith(':'):
                sentence += ':'
        elif isinstance(child,
                        bs4.NavigableString) and len(child.string) > 2:
            sentence = clean_text(normalize(child.string))
            print("end", sentence)

        text += sentence
    return text
Пример #4
0
 def _parse_table_to_2d_dict(table):
     """Expand an HTML table into a ``{row: {col: text}}`` dict.

     Every ``<td>`` is copied into each grid position covered by its
     ``rowspan`` / ``colspan``; within a row, a cell is placed at the first
     free column at or after the column where the previous cell ended.

     :param table: a parsed table element exposing ``find_all``
     :return: ``(grid, first_cell_spans)`` where ``first_cell_spans`` is True
              when the very first ``<td>`` of the table has a rowspan or
              colspan greater than 1
     """
     def _span(cell, attr_name):
         # Missing attributes and values <= 1 both count as a span of 1.
         raw = cell.get(attr_name)
         if raw is not None and int(raw) > 1:
             return int(raw)
         return 1

     grid = {}
     first_cell_spans = False
     first_cell_pending = True
     for row_no, row_tag in enumerate(table.find_all('tr')):
         next_col = 0
         slot = 0
         for cell in row_tag.find_all('td'):
             row_span = _span(cell, 'rowspan')
             col_span = _span(cell, 'colspan')
             if first_cell_pending:
                 first_cell_spans = row_span > 1 or col_span > 1
                 first_cell_pending = False
             cell_text = text_utils.clean_text(text_utils.normalize(cell.text))
             for row_offset in range(row_span):
                 target_row = grid.setdefault(row_no + row_offset, {})
                 for _ in range(col_span):
                     # Restart from the cell's start column each time and
                     # skip positions already taken (by earlier cells or by
                     # the previous copies of this same cell).
                     slot = next_col
                     while slot in target_row:
                         slot += 1
                     target_row[slot] = cell_text
                     slot += 1
             next_col = slot
     return grid, first_cell_spans
Пример #5
0
 def _normalize(self):
     """Run ``text_utils.normalize`` over ``shangxian`` and ``xiaxian`` in place.

     Fields that are ``None`` are left untouched.
     """
     for attr_name in ('shangxian', 'xiaxian'):
         current = getattr(self, attr_name)
         if current is None:
             continue
         setattr(self, attr_name, text_utils.normalize(current))
Пример #6
0
 def _normalize(self):
     """Normalize ``shuliang`` and ``jine`` in place with their unit appended.

     ``shuliang`` is suffixed with ``self.danwei['num']`` and ``jine`` with
     ``self.danwei['money']`` before being passed to ``text_utils.normalize``.
     Fields that are ``None`` are left untouched, and ``self.danwei`` is not
     read for them.
     """
     for attr_name, unit_key in (('shuliang', 'num'), ('jine', 'money')):
         current = getattr(self, attr_name)
         if current is None:
             continue
         setattr(self, attr_name,
                 text_utils.normalize(current + self.danwei[unit_key]))
Пример #7
0
    def _extract_from_table_dict(self, table_dict):
        """
        Build one DingZengRecord per data row of a 2-D table dict.

        Row 0 is assumed to be the header. Each header cell is matched
        against every pattern in ``self.table_dict_field_pattern_dict``; a
        column is assigned to a field when ``is_match_pattern`` succeeds and
        ``is_match_col_skip_pattern`` does not. The first matching column
        wins for each field. Any data row whose cell in a matched column
        satisfies ``is_match_row_skip_pattern`` is dropped entirely.

        :param table_dict: ``{row_index: {col_index: text}}``; may be None
        :return: list of DingZengRecord, one per kept data row; empty when
                 the table is empty or no header column matches any field
        """
        rs = []
        if table_dict is None or len(table_dict) <= 0:
            return rs

        # 1. Treat the first row as the header and work out which column
        #    holds which field.
        head_row = table_dict[0]
        col_length = len(head_row)
        row_length = len(table_dict)
        field_col_dict = {}
        skip_row_set = set()
        # Unit text captured from the header, appended to each cell value.
        danwei = {'shuliang': '', 'jine': ''}
        print("    head",head_row)
        for i in range(col_length):
            head_text = head_row[i]
            for (field_name, table_dict_field_pattern) in self.table_dict_field_pattern_dict.items():
                col_good, _danwei = table_dict_field_pattern.is_match_pattern(head_text)
                if not col_good or table_dict_field_pattern.is_match_col_skip_pattern(head_text):
                    continue
                if field_name not in field_col_dict:
                    field_col_dict[field_name] = i
                if field_name in ["jine","shuliang"]:
                    danwei[field_name] = _danwei if _danwei else ""
                # Scan this column row by row; a cell matching the row-skip
                # pattern discards the whole row. The cell text uses its own
                # variable so the header text is not overwritten for the
                # remaining field patterns of this column.
                for j in range(1, row_length):
                    try:
                        cell_text = table_dict[j][i]
                    except KeyError:
                        continue
                    if table_dict_field_pattern.is_match_row_skip_pattern(cell_text):
                        skip_row_set.add(j)
        if len(field_col_dict) <= 0:
            return rs

        # 2. Walk every kept row and build its record.
        for row_index in range(1, row_length):
            if row_index in skip_row_set:
                continue
            record = DingZengRecord(None, None, None, None, None)
            for (field_name, col_index) in field_col_dict.items():
                try:
                    text = table_dict[row_index][col_index]
                    print("        text",text)
                    if field_name == 'duixiang':
                        record.name = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                    elif field_name == 'shuliang':
                        record.shuliang = self.table_dict_field_pattern_dict.get(field_name).convert(normalize(text+danwei["shuliang"]+"股"))
                    elif field_name == 'jine':
                        record.jine = self.table_dict_field_pattern_dict.get(field_name).convert(normalize(text+danwei["jine"]+"元"))
                    elif field_name == 'rengoufangshi':
                        record.money = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                        if not record.money:
                            # Fall back to the instance-level default.
                            record.money = self.money
                    elif field_name == 'suodingqi':
                        record.time = self.table_dict_field_pattern_dict.get(field_name).convert(text)
                        if not record.time:
                            # Fall back to the instance-level default.
                            record.time = self.month
                except KeyError:
                    # Missing cell (sparse row): leave this field unset.
                    pass
            # Append once per row, after all fields are filled. Appending
            # inside the field loop would add the same record once per field.
            rs.append(record)
        return rs
Пример #8
0
 def _extract_from_table_dict(self, table_dict):
     """
     Build one ZengJianChiRecord per data row of a 2-D table dict.

     Row 0 is assumed to be the header. Each header cell is matched against
     every pattern in ``self.table_dict_field_pattern_dict``; a column is
     assigned to a field when ``is_match_pattern`` succeeds and
     ``is_match_col_skip_pattern`` does not. The first matching column wins
     for each field. Any data row whose cell in a matched column satisfies
     ``is_match_row_skip_pattern`` is dropped entirely.

     :param table_dict: ``{row_index: {col_index: text}}``; may be None
     :return: list of ZengJianChiRecord, one per kept data row; empty when
              the table is empty or no header column matches any field
     """
     rs = []
     if table_dict is None or len(table_dict) <= 0:
         return rs
     row_length = len(table_dict)
     field_col_dict = {}
     skip_row_set = set()
     # 1. Treat the first row as the header and work out which column holds
     #    which field.
     head_row = table_dict[0]
     col_length = len(head_row)
     # Unit text captured from the header, appended to each cell value.
     danwei = {'sharePrice': '', 'shareNum': ''}
     for i in range(col_length):
         head_text = head_row[i]
         for (field_name, table_dict_field_pattern
              ) in self.table_dict_field_pattern_dict.items():
             col_good, _danwei = table_dict_field_pattern.is_match_pattern(
                 head_text)
             if not col_good or table_dict_field_pattern.is_match_col_skip_pattern(
                     head_text):
                 continue
             if field_name not in field_col_dict:
                 field_col_dict[field_name] = i
             if field_name in ["sharePrice", "shareNum"]:
                 danwei[field_name] = _danwei if _danwei else ""
                 if _danwei is not None:
                     print(field_name, _danwei)
             # Scan this column row by row; a cell matching the row-skip
             # pattern discards the whole row. The cell text uses its own
             # variable so the header text is not overwritten for the
             # remaining field patterns of this column.
             for j in range(1, row_length):
                 try:
                     cell_text = table_dict[j][i]
                 except KeyError:
                     continue
                 if table_dict_field_pattern.is_match_row_skip_pattern(
                         cell_text):
                     skip_row_set.add(j)
     if len(field_col_dict) <= 0:
         return rs
     # 2. Walk every kept row and build its record.
     # Fields whose converted value is stored under the field's own name
     # with no unit suffix.
     plain_fields = ('shareholderFullName', 'finishDate',
                     'shareNumAfterChg', 'sharePcntAfterChg')
     for row_index in range(1, row_length):
         if row_index in skip_row_set:
             continue
         record = ZengJianChiRecord(None, None, None, None, None, None,
                                    None)
         for (field_name, col_index) in field_col_dict.items():
             try:
                 text = table_dict[row_index][col_index]
                 if field_name in plain_fields:
                     setattr(
                         record, field_name,
                         self.table_dict_field_pattern_dict.get(
                             field_name).convert(text))
                 elif field_name == 'sharePrice':
                     record.sharePrice = self.table_dict_field_pattern_dict.get(
                         field_name).convert(
                             normalize(text + danwei["sharePrice"] + "元"))
                 elif field_name == 'shareNum':
                     record.shareNum = self.table_dict_field_pattern_dict.get(
                         field_name).convert(
                             normalize(text + danwei["shareNum"] + "股"))
             except KeyError:
                 # Missing cell (sparse row): leave this field unset.
                 pass
         rs.append(record)
     return rs