def post2list(ele: PageElement):
    post_list = []
    headers = ele.find('div', class_='hd')('li')
    for header in headers:
        post_list.append({
            'name': header.a.text,
            'link': header.a['href'],
            'children': []
        })
    uls = ele.find('div', class_='bd')('ul')
    for i in range(len(post_list)):
        for li in uls[i]('li'):
            post_list[i]['children'].append({
                'name': ''.join(li('a')[-1].text.split()),
                'link': li('a')[-1]['href'],
                'new': bool(li.img),
                'date': li.span.text if li.span else ''
            })
    return post_list
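# A usage sketch for post2list. The markup below is hypothetical, inferred
# from the selectors: a 'hd' block holding header links and a parallel 'bd'
# block with one <ul> of posts per header.
from bs4 import BeautifulSoup

_SAMPLE = '''
<div>
  <div class="hd"><ul><li><a href="/cat">Category</a></li></ul></div>
  <div class="bd"><ul>
    <li><span>02-01</span> <a href="/p/1">A post</a></li>
  </ul></div>
</div>
'''
# -> [{'name': 'Category', 'link': '/cat',
#      'children': [{'name': 'Apost', 'link': '/p/1', 'new': False, 'date': '02-01'}]}]
print(post2list(BeautifulSoup(_SAMPLE, 'html.parser').div))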
def _parseTable(table: PageElement):
    tb = {
        "table": table.get('name', ''),
        "numRows": table.get('numrows', ''),
        # "remarks": table.get('remarks', '')
    }
    # find_all() scopes the search to this table's children;
    # findAllNext() would also pick up the columns of later tables.
    cols = table.find_all('column')

    def _parseCol(col: PageElement):
        col_dict = {
            "col_name": col.get('name', 'unknown name'),
            "col_remarks": col.get('remarks', ''),
            "autoUpdated": col.get('autoupdated', ''),
            "nullable": col.get('nullable', ''),
        }
        return col_dict

    columns = list(map(_parseCol, cols))
    return {"tb": tb, "columns": columns}
def make_cover(soup: PageElement, options: Options):
    """ Generate a cover page.

    Arguments:
        soup {BeautifulSoup} -- target element.
        options {Options} -- the project options.
    """
    if not options.cover:
        return

    options.logger.info('Generate a cover page.')

    article = soup.new_tag('article', id='doc-cover')

    d = soup.new_tag('div', **{'class': 'wrapper'})
    article.append(d)

    box = soup.new_tag('div', **{'class': 'wrapper'})
    article.append(box)

    title = options.cover_title
    h1 = soup.new_tag('h1')
    h1.append(title)
    box.append(h1)

    sub_title = options.cover_subtitle
    if sub_title:
        h2 = soup.new_tag('h2')
        h2.append(sub_title)
        box.append(h2)

    article.append(_gen_address(soup, options))

    soup.body.insert(0, article)
def convert_for_two_columns(soup: PageElement, level: int, logger: Logger = None):
    if level == 0:
        return
    elif level != 3:
        if logger:
            logger.warning('`two_columns_level` only supports `3` for now.')
        return

    if logger:
        logger.info('Converting to a two-column layout (heading level 3).')

    ignored = []
    for el in soup.find_all('h3'):
        if el in ignored:
            continue
        els = [
            i for i in itertools.takewhile(
                lambda x: x.name not in ['h1', 'h2'], el.next_siblings)
        ]
        section = soup.new_tag('section', **{'class': 'md-typeset two-columns'})
        el.wrap(section)
        for tag in els:
            section.append(tag)
            if tag.name == 'h3':
                ignored.append(tag)
        images_size_to_half_in(section)
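# `images_size_to_half_in` is called above but is not part of this listing.
# A minimal sketch (an assumption, not the plugin's real implementation):
# shrink images so they still fit inside a two-column section.
def images_size_to_half_in(section: PageElement):
    for img in section.select('img'):
        img['style'] = (img.get('style') or '') + ' max-width: 50%;'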
def _parseCol(col: PageElement):
    col_dict = {
        "col_name": col.get('name', 'unknown name'),
        "col_remarks": col.get('remarks', ''),
        "autoUpdated": col.get('autoupdated', ''),
        "nullable": col.get('nullable', ''),
    }
    return col_dict
def replace_asset_hrefs(soup: PageElement, base_url: str) -> PageElement:
    """Makes all relative asset links absolute."""
    for link in soup.find_all('link', href=True):
        link['href'] = abs_asset_href(link['href'], base_url)
    for asset in soup.find_all(src=True):
        asset['src'] = abs_asset_href(asset['src'], base_url)
    return soup
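# `abs_asset_href` is used above but not defined in this listing. A minimal
# sketch (an assumption, not the plugin's actual code): keep URLs that are
# already absolute and resolve the rest against base_url.
import os
from urllib.parse import urljoin, urlparse

def abs_asset_href(href: str, base_url: str) -> str:
    if urlparse(href).scheme or os.path.isabs(href):
        return href
    return urljoin(base_url, href)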
def _inline_script(script_tag: PageElement, script_file: Path) -> None:
    """Replacement callable: swap an external script for an inline one."""
    script_content = NavigableString(script_file.read_text())
    new_script_tag = BeautifulSoup(features="html.parser").new_tag("script")
    new_script_tag.insert(0, script_content)
    new_script_tag["type"] = "text/javascript"
    script_tag.replaceWith(new_script_tag)
def _inline_css(style_tag: PageElement, style_file: Path) -> None:
    """Replacement callable: swap an external stylesheet for an inline one."""
    style_content = NavigableString(style_file.read_text())
    new_style_tag = BeautifulSoup(features="html.parser").new_tag("style")
    new_style_tag.insert(0, style_content)
    new_style_tag["type"] = "text/css"
    style_tag.replaceWith(new_style_tag)
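# A hypothetical driver showing how the two inliners above could be applied:
# replace every external <script src> and <link rel="stylesheet"> with its
# inline equivalent. `asset_root` and the href-to-path mapping are
# assumptions for illustration.
from pathlib import Path

def inline_all_assets(soup: BeautifulSoup, asset_root: Path) -> None:
    for script in soup.find_all('script', src=True):
        _inline_script(script, asset_root / script['src'])
    for link in soup.find_all('link', rel='stylesheet', href=True):
        _inline_css(link, asset_root / link['href'])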
def __serialize_listing(self, listing_element: PageElement):
    title_element = listing_element.find_next(class_="listing_header")
    title = title_element.text
    price_text = listing_element.find_next(class_="price").text
    price = int(re.sub("[^0-9]", "", price_text))
    body = listing_element.find_next(class_="body").text

    listing = BarnstormersClassifiedListing()
    listing.title = title
    listing.price = price
    listing.description = body
    listing.url = self.base_url + title_element["href"]
    return listing
def __parse_result_item(self, result_item: PageElement):
    result_title = result_item.find_next(id='title')
    price_text = re.sub("[^0-9]", "",
                        result_item.find_next(class_='txt-price').text)
    price = int(price_text) if price_text else 0
    description = result_item.find_next(class_='description').text.strip()
    url = self.base_url + result_title['href']
    return TradeAPlaneListing(title=result_title.text.strip(),
                              price=price,
                              description=description,
                              url=url)
def wrap_tabbed_set_content(soup: PageElement, logger: Logger = None):
    for ts in soup.select('div.tabbed-set'):
        for radio in ts.select('input'):
            els = [i for i in itertools.takewhile(
                lambda x: x.name not in ['input'], radio.next_siblings)]
            wrapper = soup.new_tag('div', **{'class': 'tabbed-content--wrap'})
            radio.wrap(wrapper)
            for tag in els:
                wrapper.append(tag)

    for d in soup.select('details'):
        d['open'] = ''
def visit_and_hyphenate(
        node: bs4.PageElement) -> Optional[List[bs4.PageElement]]:
    """Visits HTML nodes and hyphenates text.

    Returns:
        Children of tag elements that should be further processed,
        e.g., <pre> elements are skipped.
    """
    if isinstance(node, bs4.Comment):
        return None
    # We check whether `Stylesheet` is implemented, because it's a
    # relatively recent addition to BeautifulSoup
    # (https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/revision/564).
    # In case it is not, we don't skip <style> nodes. This will mangle
    # stylesheets if they exist, but that is a cost I'm willing to take.
    if (is_stylesheet_implemented()
            and isinstance(node, bs4.element.Stylesheet)):
        return None
    if isinstance(node, bs4.Tag):
        if node.name == 'pre':
            return None
        if node.name == 'style':
            return None
        return node.children
    if not isinstance(node, bs4.NavigableString):
        return None
    # Strip soft hyphens first, so that language detection works correctly.
    printable_text = only_printable(node)
    if should_ignore(printable_text):
        return None
    try:
        lang = langdetect.detect(printable_text)
        if lang == 'en':
            # Use the US dictionary for English, because it seems to be
            # richer than en_GB: for example, en_GB doesn't hyphenate
            # "format," but en_US does ("for-mat").
            lang = 'en_US'
        dic = pyphen.Pyphen(lang=lang)
    except (langdetect.lang_detect_exception.LangDetectException, KeyError):
        return None
    new_text = hyphenate_end_node(dic, node)
    node.replaceWith(new_text)
    return None
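# A sketch of a depth-first driver for visit_and_hyphenate (hypothetical,
# not part of the original module). The visitor returns the children that
# still need processing, or None, so the walk just recurses on its result.
def hyphenate_tree(node: bs4.PageElement) -> None:
    children = visit_and_hyphenate(node)
    if children is None:
        return
    # Materialize the iterator first: hyphenation replaces nodes in place.
    for child in list(children):
        hyphenate_tree(child)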
def convert_iframe(soup: PageElement, entries: list, logger: Logger = None):
    """Replace each matching <iframe> with an <a> (anchor).

    e.g.

    before:
        <iframe frameborder="0" height="100%" src="SRC"/>

    after:
        <a class="converted-iframe" href="SRC" target="_blank">
            <img src="POSTER IMAGE"/>
        </a>
    """
    if len(entries) < 1:
        return

    if logger:
        logger.info('Converting <iframe> to poster images (if available).')

    for iframe in soup.find_all('iframe', src=True):
        for entry in entries:
            if iframe['src'] != entry.get('src'):
                continue
            a = soup.new_tag('a', href=iframe['src'], target='_blank',
                             **{'class': 'converted-iframe'})
            img_src = entry.get('img')
            if img_src:
                a.append(soup.new_tag('img', src=img_src))
            text = entry.get('text')
            if text:
                span = soup.new_tag('span')
                span.string = text
                a.append(span)
            # copy selected attributes
            for key, val in iframe.attrs.items():
                if key in ['style']:
                    a[key] = val
            iframe.replace_with(a)
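# Example `entries` input for convert_iframe (all values illustrative):
# each entry maps an iframe src to an optional poster image and caption,
# matching the 'src', 'img' and 'text' keys read above.
_IFRAME_ENTRIES = [
    {
        'src': 'https://www.youtube-nocookie.com/embed/XXXX',
        'img': 'images/video-poster.png',
        'text': 'Watch the video online',
    },
]
# convert_iframe(soup, _IFRAME_ENTRIES, logger)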
def _format_vacancy(item: PageElement):
    data = {
        "site_id": int(
            item.find_next("a", {"class": "no-decoration"})['href'].split("/")[-2]),
        "title": item.find_next("a").get_text(),
        "company": item.find_next("div", {"class": "add-top-xs"})
                       .find_next("b").get_text(),
        "desc": " ".join(
            unicodedata.normalize(
                "NFKD", item.find_next("p").get_text()).split()),
        "salary": unicodedata.normalize("NFKD", a)
                  if "грн" in (a := item.find_next("b").get_text()) else None,
        "city": "|".join([
            i.replace('\xa0', ' ')
            for i in item.find_next("div", {"class": "add-top-xs"})
                         .get_text().split("·")[1:]
        ]),
        "link": "https://work.ua"
                + item.find_next("a", {"class": "no-decoration"})['href'],
    }
    return data
def get_html_table_header_and_rows(
        table: bs4.PageElement) -> Tuple[List, List]:
    """Return the header and rows of an HTML table as lists."""
    header = []
    rows = []
    table_header = table.find("tr")
    table_rows = table.find_all("tr")[1:]
    # Iterate over the header cells only; iterating over the <tr> tag
    # itself would also yield whitespace text nodes.
    for item in table_header.find_all(['th', 'td']):
        header.append(item.get_text())
    for table_row in table_rows:
        row = []
        for cell in table_row.find_all(['th', 'td']):
            row.append(cell)
        rows.append(row)
    return header, rows
def fix_twemoji(soup: PageElement, logger: Logger = None):
    """(workaround) Replace each emoji <svg> with an <img> carrying
    base64-encoded data, because these SVGs are not rendered by
    WeasyPrint 51 with mkdocs-material v4.5.0 and later.

    @see https://github.com/squidfunk/mkdocs-material/pull/1330
    """

    def fix_size(svg):
        viewbox = _parse_viewbox(svg['viewbox'])
        width, height = (
            viewbox[2] - viewbox[0],
            viewbox[3] - viewbox[1]
        )
        svg['width'] = int(width)
        svg['height'] = int(height)
        svg['style'] = 'fill: currentColor;'

    if logger:
        logger.debug('Converting emoji SVG to img (workaround).')

    for svg in soup.select('.twemoji svg'):
        try:
            fix_size(svg)
            encoded = b64encode(str(svg).encode('utf-8')).decode('ascii')
            data = "data:image/svg+xml;charset=utf-8;base64," + encoded
            img = soup.new_tag('img', src=data, **{'class': 'converted-twemoji'})
            svg.replace_with(img)
            if logger:
                logger.debug(f'> svg: {svg}')
                logger.debug(f'< img: {img}')
        except Exception as e:
            if logger:
                logger.warning(f'Failed to convert SVG: {e}')
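# `_parse_viewbox` is used above but not shown. A minimal sketch (an
# assumption): split the SVG viewBox value "min-x min-y width height" into
# four floats. Note that fix_size() treats the values as two corner points,
# which only yields the right size when min-x and min-y are 0 (as they are
# for twemoji's "0 0 24 24").
def _parse_viewbox(value: str) -> list:
    return [float(v) for v in value.replace(',', ' ').split()]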
def _gen_address(soup: PageElement, options: Options) -> PageElement:
    box = soup.new_tag('div', **{'class': 'properties'})
    address = soup.new_tag('address')
    box.append(address)

    if options.author:
        span = soup.new_tag('p', id="author")
        span.append(options.author)
        address.append(span)

    if options.copyright:
        import html
        span = soup.new_tag('p', id="copyright")
        span.append(html.unescape(options.copyright))
        address.append(span)

    return box
def get_combined(soup: PageElement, base_url: str, rel_url: str) -> PageElement:
    """Transforms page-local ids and relative hrefs into anchors
    within the combined document."""
    for element in soup.find_all(id=True):
        element['id'] = transform_id(element['id'], rel_url)
    for a in soup.find_all('a', href=True):
        if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
            continue
        a['href'] = transform_href(a['href'], rel_url)
    soup.body['id'] = get_body_id(rel_url)
    soup = replace_asset_hrefs(soup, base_url)
    return soup
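# `transform_id`, `transform_href` and `get_body_id` are used above but not
# listed. Plausible sketches (assumptions about the anchor scheme, not the
# plugin's real code): when pages are merged into one document, ids are
# namespaced by the page URL so that cross-page links become internal anchors.
import posixpath

def transform_id(id_: str, rel_url: str) -> str:
    return f'{rel_url}:{id_}'

def get_body_id(rel_url: str) -> str:
    return f'{rel_url}:'  # target for links that point at the page itself

def transform_href(href: str, rel_url: str) -> str:
    url, _, frag = href.partition('#')
    if not url:  # '#frag' on the current page
        return f'#{rel_url}:{frag}'
    target = posixpath.normpath(posixpath.join(posixpath.dirname(rel_url), url))
    return f'#{target}:{frag}'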
def parse_video_block(video_block: PageElement) -> Dict:
    video_object = {}

    video_title_el = video_block.find("h3")
    video_object["video_title"] = str(video_title_el.string) if video_title_el else None

    video_link_el = video_block.find(class_="btn-link video-sources video-download-button")
    video_object["video_link"] = video_link_el["href"] if video_link_el else None

    transcript_link_els = video_block.select(".wrapper-download-transcripts a")
    video_object["transcript_link"] = set()
    for srt_link in transcript_link_els:
        srt_url = srt_link["href"]
        # Fill in a missing scheme/host so every URL is absolute.
        u = urlparse(srt_url)
        if not u.scheme:
            u = u._replace(scheme='https')
        if not u.netloc:
            u = u._replace(netloc='courses.edx.org')
        srt_url = urlunparse(u)
        video_object["transcript_link"].add(srt_url)
    # Deduplicated via the set; convert back to a list afterwards.
    video_object["transcript_link"] = list(video_object["transcript_link"])

    return video_object
def _label(node: PageElement):
    if isinstance(node, NavigableString):
        text = node.strip()
        parent = node.parent
        if parent.name == 'a' and 'href' in parent.attrs:
            yield '[[[{}|||{}]]]'.format(text, parent.attrs['href'])
        else:
            yield text
    elif isinstance(node, Tag):
        for child in node.children:
            yield from _label(child)
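# A usage sketch for _label: flatten a node into text while tagging link
# text with a [[[text|||href]]] marker. (Sample markup is illustrative.)
from bs4 import BeautifulSoup

_doc = BeautifulSoup('<p>see <a href="/x">this</a></p>', 'html.parser')
print(' '.join(_label(_doc.p)))  # -> "see [[[this|||/x]]]"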
def _parseTr(tr: PageElement):
    # Scope the search to this row; find_all_next() would also return
    # the cells of the rows that follow.
    tds = tr.find_all("td")
    return {
        "Table": tds[0].text,
        "Column": tds[1].text,
        "Type": tds[2].text,
        "Size": tds[3].text,
        "Nulls": tds[4].text,
        "Auto": tds[5].text,
        "Default": tds[6].text
    }
def _format_vacancy(item: PageElement):
    data = {
        "site_id": int(item["_id"]),
        "title": item.find_next("a", {"class": "vt"}).get_text(),
        "company": item.find_next("a", {"class": "company"})
                       .get_text().replace('\xa0', ''),
        "desc": " ".join(
            unicodedata.normalize(
                "NFKD",
                item.find_next("div", {"class": "sh-info"}).get_text()).split()),
        "salary": unicodedata.normalize("NFKD", a.get_text())
                  if (a := item.find_next("span", {"class": "salary"})) else None,
        "city": item.find_next("span", {"class": "cities"}).get_text(),
        "link": item.find_next("a", {"class": "vt"})['href'],
    }
    return data
def _format_vacancy(item: PageElement):
    data = {
        "site_id": int(item["data-vacancy-id"]),
        "title": item.find_next("a", {"class": "ga_listing"})
                     .get_text().replace("\n", ""),
        "company": item.find_next("a", {"class": "company-profile-name"}).get_text(),
        "desc": item.find_next("div", {"class": "card-description"}).get_text(),
        "salary": a if (a := item.find_next("span", {"class": "salary"}).get_text())
                  else None,
        "city": item.find_next("span", {"class": "location"}).get_text(),
        "link": "https://robota.ua"
                + item.find_next("a", {"class": "ga_listing"})['href'],
    }
    return data
def _remove_empty_tags(self, soup: PageElement):

    def is_blank(el):
        if len(el.get_text(strip=True)) != 0:
            return False
        elif el.find(['img', 'svg']):
            return False
        else:
            return True

    includes = ['article', 'p']
    while True:
        hit = False
        for x in soup.find_all():
            if x.name in includes and is_blank(x):
                # self.logger.debug(f'Strip: {x}')
                x.extract()
                hit = True
        if not hit:
            break
def fix_image_alignment(soup: PageElement, logger: Logger = None):
    """(workaround) Convert <img align=*> to a `float` style, and move
    <img width=*> / <img height=*> into the style attribute."""
    if logger:
        logger.info('Converting <img> alignment (workaround).')

    for img in soup.select('img'):
        try:
            if img.has_attr('class') and 'twemoji' in img['class']:
                continue

            # img.get(...) reads the attribute; getattr(img, 'style')
            # would look up a child <style> tag instead.
            styles = _parse_style(img.get('style', ''))
            if logger:
                logger.debug(f' | {img}')

            if img.has_attr('align'):
                if img['align'] == 'left':
                    styles['float'] = 'left'
                    styles['padding-right'] = '1rem'
                    styles['padding-bottom'] = '0.5rem'
                    img.attrs.pop('align')
                elif img['align'] == 'right':
                    styles['float'] = 'right'
                    styles['padding-left'] = '1rem'
                    styles['padding-bottom'] = '0.5rem'
                    img.attrs.pop('align')
            if img.has_attr('width'):
                styles['width'] = _convert_dimension(img['width'])
                img.attrs.pop('width')
            if img.has_attr('height'):
                styles['height'] = _convert_dimension(img['height'])
                img.attrs.pop('height')

            img['style'] = " ".join(f'{k}: {v};' for k, v in styles.items())
        except Exception as e:
            if logger:
                logger.warning(f'Failed to convert img align: {e}')
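# `_parse_style` and `_convert_dimension` are used above but not listed.
# Minimal sketches (assumptions, not the plugin's actual helpers):
def _parse_style(style: str) -> dict:
    # 'float: left; width: 50%' -> {'float': 'left', 'width': '50%'}
    if not style:
        return {}
    pairs = (item.partition(':') for item in style.split(';') if item.strip())
    return {k.strip(): v.strip() for k, _, v in pairs}

def _convert_dimension(value: str) -> str:
    # Bare numbers are not valid CSS lengths; treat them as pixels.
    return f'{value}px' if value.isdigit() else value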
def is_group_header(element: bs4.PageElement) -> bool:
    return is_tag(element) and 'groupheader' in element.get('class', ())
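# `is_tag` (used here and in is_declaration below) is assumed to be the
# usual isinstance guard:
import bs4

def is_tag(element: bs4.PageElement) -> bool:
    return isinstance(element, bs4.Tag)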
def _get_content(self, soup: PageElement, page):

    def shift_heading(elem, page):
        for i in range(7, 0, -1):
            while True:
                h = elem.find(f'h{i}')
                if not h:
                    break
                h.name = f'h{i + 1}'

        page_path = self._page_path_for_id(page)
        h1 = soup.new_tag('h1', id=f'{page_path}')
        h1.append(page.title)
        elem.insert(0, h1)
        return elem

    def cleanup_class(classes: list):
        if classes:
            excludes = ['md-content__inner']
            return [c for c in classes if c not in excludes]
        return classes

    article = getattr(page, 'pdf-article', None)
    if article:
        page_path = self._page_path_for_id(page)
        article['id'] = f'{page_path}:'  # anchor for each page.
        article['data-url'] = f'/{page_path}'
        return article
    elif page.children:
        new_article = soup.new_tag('article')
        found = False
        for c in page.children:
            content = self._get_content(soup, c)
            if content:
                new_article.append(content)
                found = True
        if not found:
            return None

        child_classes = None
        for child_article in new_article.find_all('article'):
            child_article.name = 'section'
            classes = child_article.get('class')
            if classes and not child_classes:
                child_classes = classes
            child_article['class'] = cleanup_class(classes)

        page_path = self._page_path_for_id(page)
        new_article['id'] = f'{page_path}:'  # anchor for each page.
        new_article['data-url'] = f'/{page_path}'
        if child_classes:
            new_article['class'] = child_classes

        if self._options.heading_shift:
            return shift_heading(new_article, page)
        return new_article

    return None
def make_indexes(soup: PageElement, options: Options) -> None:
    """ Generate ordered chapter numbers and a TOC for the document.

    Arguments:
        soup {BeautifulSoup} -- DOM object of the document.
        options {Options} -- the options of this sequence.
    """

    # Step 1: (re)order headings
    _inject_heading_order(soup, options)

    # Step 2: generate the TOC page
    level = options.toc_level
    if level < 1 or level > 3:
        return

    options.logger.info(
        f'Generate a table of contents up to heading level {level}.')

    h1li = None
    h2ul = h2li = h3ul = None
    exclude_lv2 = exclude_lv3 = False

    def makeLink(h: Tag) -> Tag:
        li = soup.new_tag('li')
        ref = h.get('id', '')
        a = soup.new_tag('a', href=f'#{ref}')
        for el in h.contents:
            if el.name == 'a':
                a.append(el.contents[0])
            else:
                a.append(clone_element(el))
        li.append(a)
        options.logger.debug(f"| [{h.get_text(separator=' ')}]({ref})")
        return li

    toc = soup.new_tag('article', id='doc-toc')
    title = soup.new_tag('h1')
    title.append(options.toc_title)
    toc.append(title)

    h1ul = soup.new_tag('ul')
    toc.append(h1ul)

    headings = soup.find_all(['h1', 'h2', 'h3'])
    for h in headings:
        if h.name == 'h1':
            h1li = makeLink(h)
            h1ul.append(h1li)
            h2ul = h2li = h3ul = None
            exclude_lv2 = _is_exclude(h.get('id', None), options)
        elif not exclude_lv2 and h.name == 'h2' and level >= 2:
            if not h2ul:
                h2ul = soup.new_tag('ul')
                h1li.append(h2ul)
            h2li = makeLink(h)
            h2ul.append(h2li)
            h3ul = None
            exclude_lv3 = _is_exclude(h.get('id', None), options)
        elif not exclude_lv2 and not exclude_lv3 \
                and h.name == 'h3' and level >= 3:
            if not h2li:
                continue
            if not h3ul:
                h3ul = soup.new_tag('ul')
                h2li.append(h3ul)
            h3li = makeLink(h)
            h3ul.append(h3li)
        else:
            continue

    soup.body.insert(0, toc)
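# `_inject_heading_order`, `clone_element` and `_is_exclude` are used above
# but are not part of this listing. Hedged sketches for the latter two (the
# option name `excludes` is invented for illustration):
import copy
import re

def clone_element(el: Tag) -> Tag:
    # bs4 Tags support copy.copy(), which yields a detached copy.
    return copy.copy(el)

def _is_exclude(id_: str, options: Options) -> bool:
    if not id_:
        return False
    return any(re.fullmatch(p, id_) for p in getattr(options, 'excludes', []))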
def is_declaration(element: bs4.PageElement) -> bool:
    return (is_tag(element)
            and element.name == 'div'
            and 'memitem' in element.get('class', ()))
def get_element_with_comment(container: PageElement, comment: str) -> PageElement:
    return container.find(
        text=lambda t: _find_comment(t, comment)).find_parent()
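# `_find_comment` is referenced above but not defined here. A minimal
# sketch (an assumption): match Comment nodes whose stripped text equals
# the marker being searched for.
from bs4 import Comment

def _find_comment(text, comment: str) -> bool:
    return isinstance(text, Comment) and text.strip() == comment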
def get_separate(soup: PageElement, base_url: str) -> PageElement:
    """Transforms all relative hrefs pointing to other HTML docs
    into relative PDF hrefs."""
    for a in soup.find_all('a', href=True):
        a['href'] = rel_pdf_href(a['href'])
    soup = replace_asset_hrefs(soup, base_url)
    return soup
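# `rel_pdf_href` is used above but not shown. A plausible sketch (an
# assumption, not the plugin's real code): rewrite relative links to HTML
# pages so they point at the exported PDF instead.
import os
from urllib.parse import urlparse

def rel_pdf_href(href: str) -> str:
    if urlparse(href).scheme or href.startswith('#'):
        return href
    root, ext = os.path.splitext(href)
    return root + '.pdf' if ext in ('.html', '.htm') else href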