def parse_content(self, html_file_path):
    """
    Parse the paragraph text out of an HTML file.

    Only leaf paragraphs are used: a ``<div type="paragraph">`` that itself
    contains another paragraph div is skipped (its nested paragraphs are
    visited on their own). Within a leaf paragraph, every
    ``<div type="content">`` contributes one piece of text:

    * if the content div holds a table, the cleaned text of each ``<td>`` of
      that table, each followed by ``','``;
    * otherwise the cleaned text of the whole content div.

    :param html_file_path: path of the UTF-8 encoded HTML file
    :return: list of str in document order, one entry per leaf paragraph that
        has at least one content div (the pieces of a paragraph are joined
        without a separator)
    """
    with codecs.open(html_file_path, encoding='utf-8', mode='r') as fp:
        soup = BeautifulSoup(fp.read(), "html.parser")

    paragraph_divs = [
        div for div in soup.find_all('div') if div.get('type') == 'paragraph'
    ]

    rs = []
    for paragraph_div in paragraph_divs:
        inner_divs = paragraph_div.find_all('div')
        # A paragraph that wraps other paragraphs is only a container.
        if any(div.get('type') == 'paragraph' for div in inner_divs):
            continue
        contents = []
        rs.append(contents)
        for content_div in inner_divs:
            if content_div.get('type') != 'content':
                continue
            if content_div.find_all('table'):
                # Only read the rows of the table(s) inside this content div.
                # Searching the whole document here would copy every table of
                # the file into each table-bearing paragraph.
                table_text = ''.join(
                    text_utils.clean_text(text_utils.normalize(cell.text)) + ','
                    for row in content_div.find_all('tr')
                    for cell in row.find_all('td')
                )
                contents.append(table_text)
            else:
                contents.append(
                    text_utils.clean_text(text_utils.normalize(content_div.text))
                )

    # Paragraphs without any content div produce no output entry.
    return [''.join(contents) for contents in rs if contents]
def getDingZeng_html(filename):
    """Flatten an HTML file into a single cleaned text string.

    Every node of the parsed document is visited in ``soup.descendants``
    order and contributes at most one fragment:

    * a tag with a non-empty ``title`` attribute contributes the cleaned
      title, terminated with ``':'``;
    * a text node whose string is longer than two characters contributes its
      cleaned text;
    * a ``<tr>``/``<td>`` tag without a title contributes ``','`` unless the
      text collected so far is empty or already ends with a character from
      ``CommaCharInNumberSet1``;
    * ``<img>`` tags contribute nothing.

    When the document contains more than three ``<hidden>`` tags, traversal
    stops at the first ``<hidden>`` tag whose numeric id (its ``name``
    attribute without the first character) reaches 60% of the id of the last
    ``<hidden>`` tag.

    :param filename: path of the UTF-8 encoded HTML file
    :return: the concatenated text
    """
    with open(filename, 'r', encoding='UTF-8') as fr:
        soup = BeautifulSoup(fr.read(), 'html.parser')

    hidden = soup.findAll('hidden')
    cut_page = len(hidden) > 3
    last_hidden = int(int(hidden[-1]['name'][1:]) * 0.6) if cut_page else None

    text = ""
    for child in soup.descendants:
        if (cut_page and child.name == 'hidden'
                and int(child['name'][1:]) >= last_hidden):
            print('break in\t', child['name'], '\tall\t', hidden[-1]['name'])
            break
        if child.name == 'img':
            continue

        sentence = ""
        if child.name in ('tr', 'td'):
            # Guard against empty ``text``: a table cell can be the very first
            # node that produces output, and ``text[-1]`` would then raise
            # IndexError. Nothing precedes it, so no separator is needed.
            if text and text[-1] not in CommaCharInNumberSet1:
                sentence += ','
        if isinstance(child, bs4.element.Tag) and child.attrs.get('title'):
            # A title replaces any separator chosen above.
            sentence = clean_text(normalize(child['title']))
            if not sentence.endswith(':'):
                sentence += ':'
        elif isinstance(child, bs4.NavigableString) and len(child.string) > 2:
            sentence = clean_text(normalize(child.string))
        text += sentence
    return text
def getContentFromEveryDiv(filepath):
    """Flatten an HTML file into a single cleaned text string.

    Every node of the parsed document is visited in ``soup.descendants``
    order and contributes at most one fragment:

    * a tag with a non-empty ``title`` attribute contributes the cleaned
      title, terminated with ``':'``;
    * a text node whose string is longer than two characters contributes its
      cleaned text;
    * a ``<tr>``/``<td>`` tag without a title contributes ``','`` unless the
      text collected so far is empty or already ends with a character from
      ``CommaCharInNumberSet1``.

    The file path and each extracted fragment are printed as a trace.

    :param filepath: path of the UTF-8 encoded HTML file
    :return: the concatenated text
    """
    with open(filepath, 'r', encoding="UTF-8") as fr:
        soup = BeautifulSoup(fr.read(), 'html.parser')
    print(filepath)

    text = ""
    for child in soup.descendants:
        sentence = ""
        if child.name in ('tr', 'td'):
            # Guard against empty ``text``: a table cell can be the very first
            # node that produces output, and ``text[-1]`` would then raise
            # IndexError. Nothing precedes it, so no separator is needed.
            if text and text[-1] not in CommaCharInNumberSet1:
                sentence += ','
        if isinstance(child, bs4.element.Tag) and child.attrs.get('title'):
            # A title replaces any separator chosen above.
            sentence = clean_text(normalize(child['title']))
            print("end", sentence)
            if not sentence.endswith(':'):
                sentence += ':'
        elif isinstance(child, bs4.NavigableString) and len(child.string) > 2:
            sentence = clean_text(normalize(child.string))
            print("end", sentence)
        text += sentence
    return text
def _parse_table_to_2d_dict(table):
    """Convert an HTML table element into a row/column dictionary.

    Cells that span several rows or columns have their text copied into
    every grid position they cover, so the result can be addressed as
    ``rs_dict[row][col]``.

    :param table: parsed table element; must support ``find_all('tr')``, and
        its cells must support ``get()`` and ``.text``
    :return: tuple ``(rs_dict, is_head_two_rowspan)`` where ``rs_dict`` maps
        row index -> {column index -> cleaned cell text} and
        ``is_head_two_rowspan`` is True when the first cell of the table has
        a rowspan or colspan greater than 1
    """
    rs_dict = {}
    row_index = 0
    # ``is_head`` is True only until the first <td> has been inspected.
    is_head_two_rowspan, is_head = False, True
    for tr in table.find_all('tr'):
        col_index, cur_col_index = 0, 0
        for td in tr.find_all('td'):
            # Missing attributes and values <= 1 both mean "no span".
            rowspan = td.get('rowspan')
            rowspan = int(rowspan) if (rowspan is not None and int(rowspan) > 1) else 1
            colspan = td.get('colspan')
            colspan = int(colspan) if (colspan is not None and int(colspan) > 1) else 1
            if is_head:
                if rowspan > 1 or colspan > 1:
                    is_head_two_rowspan = True
                is_head = False
            for r in range(rowspan):
                # Rows below the current one are created on demand so that a
                # rowspan can pre-fill them.
                if (row_index + r) not in rs_dict:
                    rs_dict[row_index + r] = {}
                for c in range(colspan):
                    # Restart at the cell's starting column and skip every
                    # position already occupied (by an earlier span or by a
                    # previous iteration of this loop).
                    cur_col_index = col_index
                    while cur_col_index in rs_dict[row_index + r]:
                        cur_col_index += 1
                    rs_dict[row_index + r][cur_col_index] = text_utils.clean_text(text_utils.normalize(td.text))
                    cur_col_index += 1
            # The next cell of this row starts after the last filled column.
            col_index = cur_col_index
        # A <tr> without any <td> still consumes a row index, so that index
        # may be absent from rs_dict.
        row_index += 1
    return rs_dict, is_head_two_rowspan
def _normalize(self):
    """Normalize ``shangxian`` and ``xiaxian`` in place; None values are left untouched."""
    for attr in ('shangxian', 'xiaxian'):
        raw = getattr(self, attr)
        if raw is None:
            continue
        setattr(self, attr, text_utils.normalize(raw))
def _normalize(self):
    """Normalize ``shuliang`` and ``jine`` in place after appending their unit.

    The unit for ``shuliang`` is ``self.danwei['num']`` and the unit for
    ``jine`` is ``self.danwei['money']``. A field that is None is left
    untouched and its unit is not looked up.
    """
    for attr, unit_key in (('shuliang', 'num'), ('jine', 'money')):
        raw = getattr(self, attr)
        if raw is not None:
            setattr(self, attr, text_utils.normalize(raw + self.danwei[unit_key]))
def _extract_from_table_dict(self, table_dict):
    """Build one DingZengRecord per data row of a table given as a 2-D dict.

    Row 0 is treated as the header. Each header cell is tested against every
    pattern in ``self.table_dict_field_pattern_dict``; a field is bound to
    the first column whose header matches the field pattern and does not
    match its column-skip pattern. For every such match the column's data
    cells are scanned and rows matching the row-skip pattern are discarded.

    :param table_dict: mapping ``row index -> {column index -> cell text}``
    :return: list of DingZengRecord, one per data row that was not skipped;
        empty when the table is None/empty, has no header row, or no header
        cell matches any field
    """
    rs = []
    # check none
    if table_dict is None or len(table_dict) <= 0:
        return rs
    # 1. Assume the first row is the header and try to match each column to
    #    a field type. A column is accepted only when is_match_pattern is
    #    True and is_match_col_skip_pattern is False.
    try:
        head_row = table_dict[0]
    except KeyError:
        # No header row: nothing can be matched.
        return rs
    col_length = len(head_row)
    row_length = len(table_dict)
    field_col_dict = {}
    skip_row_set = set()
    # Unit suffix reported by the header pattern for the numeric fields.
    danwei = {'shuliang': '', 'jine': ''}
    print(" head", head_row)
    for i in range(col_length):
        head_text = head_row[i]
        for field_name, field_pattern in self.table_dict_field_pattern_dict.items():
            col_good, _danwei = field_pattern.is_match_pattern(head_text)
            if not col_good or field_pattern.is_match_col_skip_pattern(head_text):
                continue
            if field_name not in field_col_dict:
                field_col_dict[field_name] = i
                # Take the unit from the same header that was bound to the
                # field, so the value column and its unit always agree.
                if field_name in ("jine", "shuliang"):
                    danwei[field_name] = _danwei if _danwei else ""
            # Scan the values of this column row by row; a row matching the
            # row-skip pattern is dropped entirely. The cell text is kept in
            # its own variable so the header text is not overwritten while
            # the remaining field patterns are still being tested.
            for j in range(1, row_length):
                try:
                    cell_text = table_dict[j][i]
                    if field_pattern.is_match_row_skip_pattern(cell_text):
                        skip_row_set.add(j)
                except KeyError:
                    pass
    if len(field_col_dict) <= 0:
        return rs
    # 2. Walk every remaining row and build its record.
    for row_index in range(1, row_length):
        if row_index in skip_row_set:
            continue
        record = DingZengRecord(None, None, None, None, None)
        for field_name, col_index in field_col_dict.items():
            converter = self.table_dict_field_pattern_dict.get(field_name)
            try:
                text = table_dict[row_index][col_index]
                print(" text", text)
                if field_name == 'duixiang':
                    record.name = converter.convert(text)
                elif field_name == 'shuliang':
                    record.shuliang = converter.convert(
                        normalize(text + danwei["shuliang"] + "股"))
                elif field_name == 'jine':
                    record.jine = converter.convert(
                        normalize(text + danwei["jine"] + "元"))
                elif field_name == 'rengoufangshi':
                    record.money = converter.convert(text)
                    # Fall back to the instance-level value when the cell
                    # yields nothing.
                    if not record.money:
                        record.money = self.money
                elif field_name == 'suodingqi':
                    record.time = converter.convert(text)
                    if not record.time:
                        record.time = self.month
            except KeyError:
                # Missing cell: leave the field unset.
                pass
        rs.append(record)
    return rs
def _extract_from_table_dict(self, table_dict):
    """Build one ZengJianChiRecord per data row of a table given as a 2-D dict.

    Row 0 is treated as the header. Each header cell is tested against every
    pattern in ``self.table_dict_field_pattern_dict``; a field is bound to
    the first column whose header matches the field pattern and does not
    match its column-skip pattern. For every such match the column's data
    cells are scanned and rows matching the row-skip pattern are discarded.

    :param table_dict: mapping ``row index -> {column index -> cell text}``
    :return: list of ZengJianChiRecord, one per data row that was not
        skipped; empty when the table is None/empty, has no header row, or
        no header cell matches any field
    """
    rs = []
    if table_dict is None or len(table_dict) <= 0:
        return rs
    row_length = len(table_dict)
    field_col_dict = {}
    skip_row_set = set()
    # 1. Assume the first row is the header and try to match each column to
    #    a field type. A column is accepted only when is_match_pattern is
    #    True and is_match_col_skip_pattern is False.
    try:
        head_row = table_dict[0]
    except KeyError:
        # No header row: nothing can be matched.
        return rs
    col_length = len(head_row)
    # Unit suffix reported by the header pattern for the numeric fields.
    danwei = {'sharePrice': '', 'shareNum': ''}
    for i in range(col_length):
        head_text = head_row[i]
        for field_name, field_pattern in self.table_dict_field_pattern_dict.items():
            col_good, _danwei = field_pattern.is_match_pattern(head_text)
            if not col_good or field_pattern.is_match_col_skip_pattern(head_text):
                continue
            if field_name not in field_col_dict:
                field_col_dict[field_name] = i
                # Take the unit from the same header that was bound to the
                # field, so the value column and its unit always agree.
                if field_name in ("sharePrice", "shareNum"):
                    danwei[field_name] = _danwei if _danwei else ""
            if _danwei is not None:
                print(field_name, _danwei)
            # Scan the values of this column row by row; a row matching the
            # row-skip pattern is dropped entirely. The cell text is kept in
            # its own variable so the header text is not overwritten while
            # the remaining field patterns are still being tested.
            for j in range(1, row_length):
                try:
                    cell_text = table_dict[j][i]
                    if field_pattern.is_match_row_skip_pattern(cell_text):
                        skip_row_set.add(j)
                except KeyError:
                    pass
    if len(field_col_dict) <= 0:
        return rs
    # 2. Walk every remaining row and build its record.
    plain_fields = (
        'shareholderFullName',
        'finishDate',
        'shareNumAfterChg',
        'sharePcntAfterChg',
    )
    for row_index in range(1, row_length):
        if row_index in skip_row_set:
            continue
        record = ZengJianChiRecord(None, None, None, None, None, None, None)
        for field_name, col_index in field_col_dict.items():
            converter = self.table_dict_field_pattern_dict.get(field_name)
            try:
                text = table_dict[row_index][col_index]
                if field_name in plain_fields:
                    # These fields are stored under an attribute of the same
                    # name and converted from the raw cell text.
                    setattr(record, field_name, converter.convert(text))
                elif field_name == 'sharePrice':
                    record.sharePrice = converter.convert(
                        normalize(text + danwei["sharePrice"] + "元"))
                elif field_name == 'shareNum':
                    record.shareNum = converter.convert(
                        normalize(text + danwei["shareNum"] + "股"))
            except KeyError:
                # Missing cell: leave the field unset.
                pass
        rs.append(record)
    return rs