Exemplo n.º 1
0
def replace_asset_hrefs(soup: PageElement, base_url: str) -> PageElement:
    """Rewrite asset references in *soup* through ``abs_asset_href``.

    The ``href`` of every ``<link>`` tag, and then the ``src`` of every
    element that carries one, is replaced in place by whatever
    ``abs_asset_href`` returns for the old value and *base_url*.

    Returns the same *soup* object that was passed in.
    """
    # (positional selector arguments, attribute name), handled in this order.
    targets = (
        (('link',), 'href'),
        ((), 'src'),
    )
    for selector, attr in targets:
        for node in soup.find_all(*selector, **{attr: True}):
            node[attr] = abs_asset_href(node[attr], base_url)

    return soup
Exemplo n.º 2
0
def convert_for_two_columns(soup: PageElement,
                            level: int,
                            logger: Logger = None):
    """Wrap the content under each ``<h3>`` in a two-column ``<section>``.

    Only heading level 3 is implemented. ``level == 0`` returns silently;
    any other value except 3 logs a warning (when a logger is given) and
    returns without touching the document.

    For every ``<h3>`` found in *soup*, the heading is wrapped in a new
    ``<section class="md-typeset two-columns">`` and the siblings that
    follow it, up to but not including the next ``<h1>``/``<h2>`` sibling,
    are moved into that section. Any ``<h3>`` moved this way is not
    processed again. ``images_size_to_half_in`` is then called on the
    section.

    Arguments:
        soup -- document tree, modified in place.
        level -- heading level to convert; only ``3`` is supported.
        logger -- optional logger for progress and warning messages.
    """
    if level == 0:
        return
    if level != 3:
        if logger:
            logger.warning('`two_columns_level` is only support `3` yet.')
        return

    if logger:
        logger.info('Converting for two-column layout(heading level 3).')

    # Absorbed headings are tracked by identity: Beautiful Soup tags compare
    # equal whenever their markup is the same, so a value-based membership
    # test would also skip an unrelated <h3> that merely looks identical.
    # The ids stay valid because every tag remains referenced by the tree.
    absorbed = set()
    for heading in soup.find_all('h3'):
        if id(heading) in absorbed:
            continue

        # Materialise the siblings before wrapping changes the tree.
        followers = list(itertools.takewhile(
            lambda node: node.name not in ('h1', 'h2'),
            heading.next_siblings))

        section = soup.new_tag('section',
                               **{'class': 'md-typeset two-columns'})
        heading.wrap(section)
        for node in followers:
            section.append(node)
            if node.name == 'h3':
                absorbed.add(id(node))
        images_size_to_half_in(section)
Exemplo n.º 3
0
def get_combined(soup: PageElement, base_url: str,
                 rel_url: str) -> PageElement:
    """Rewrite ids, links and asset references of one page, in place.

    Steps, in order:
      1. every ``id`` attribute is replaced by ``transform_id(id, rel_url)``;
      2. every ``<a href>`` that is neither an absolute URL nor an absolute
         filesystem path is replaced by ``transform_href(href, rel_url)``;
      3. ``<body>`` receives the id returned by ``get_body_id(rel_url)``;
      4. asset links are rewritten by ``replace_asset_hrefs``.

    Returns the soup returned by ``replace_asset_hrefs``.
    """
    for tagged in soup.find_all(id=True):
        tagged['id'] = transform_id(tagged['id'], rel_url)

    for anchor in soup.find_all('a', href=True):
        target = anchor['href']
        # Absolute targets are left exactly as they are.
        is_absolute = urls.url_is_absolute(target) or os.path.isabs(target)
        if not is_absolute:
            anchor['href'] = transform_href(target, rel_url)

    soup.body['id'] = get_body_id(rel_url)

    return replace_asset_hrefs(soup, base_url)
Exemplo n.º 4
0
def convert_iframe(soup: PageElement, entries: list, logger: Logger = None):
    """Replace configured ``<iframe>`` elements with an anchor.

    e.g:
        ```html "before:"
        <iframe frameborder="0" height="100%" src="SRC"/>
        ```

        ```html "after:"
        <a class="converted-iframe" href="SRC" target="_blank">
          <img src="POSTER IMAGE"/>
        </a>
        ```

    Each entry is a mapping read with ``.get``: ``src`` is matched against
    the iframe's ``src``; ``img`` (optional) becomes an ``<img>`` inside the
    anchor; ``text`` (optional) becomes a ``<span>`` inside the anchor.
    The iframe's ``style`` attribute, if present, is copied to the anchor.
    Iframes whose ``src`` matches no entry are left untouched. When several
    entries share the same ``src`` the first one wins.

    Arguments:
        soup -- document tree, modified in place.
        entries -- list of replacement entries; nothing happens when empty.
        logger -- optional logger for a progress message.
    """

    if not entries:
        return

    if logger:
        logger.info('Converting <iframe> to poster image(if available).')

    copied_attrs = ('style',)

    for iframe in soup.find_all('iframe', src=True):
        src = iframe['src']
        for entry in entries:
            if src != entry.get('src'):
                continue

            a = soup.new_tag('a',
                             href=src,
                             target='_blank',
                             **{'class': 'converted-iframe'})
            img_src = entry.get('img')
            if img_src:
                a.append(soup.new_tag('img', src=img_src))
            text = entry.get('text')
            if text:
                span = soup.new_tag('span')
                span.string = text
                a.append(span)

            # copy attributes
            for key in copied_attrs:
                if key in iframe.attrs:
                    a[key] = iframe.attrs[key]

            iframe.replace_with(a)
            # The iframe is no longer part of the tree; trying to replace it
            # again for a duplicate entry would fail, so stop at first match.
            break
Exemplo n.º 5
0
    def _remove_empty_tags(self, soup: PageElement):
        """Remove blank ``<article>`` and ``<p>`` elements from *soup*.

        An element counts as blank when its stripped text is empty and it
        contains no ``<img>`` or ``<svg>`` descendant. The document is
        scanned repeatedly until a full pass removes nothing, so a
        container that only becomes blank after its children were removed
        is dropped on a later pass.
        """
        candidates = ['article', 'p']

        def has_content(node):
            if node.get_text(strip=True):
                return True
            return bool(node.find(['img', 'svg']))

        removed_any = True
        while removed_any:
            removed_any = False
            for node in soup.find_all():
                if node.name not in candidates or has_content(node):
                    continue
                node.extract()
                removed_any = True
Exemplo n.º 6
0
def get_html_table_header_and_rows(
        table: bs4.PageElement) -> Tuple[List, List]:
    """Split an HTML table into its header texts and its body rows.

    The first ``<tr>`` found in *table* is treated as the header row; every
    following ``<tr>`` is a body row.

    Returns:
        ``(header, rows)`` where ``header`` is the list of text contents of
        the ``<th>``/``<td>`` cells that are direct children of the header
        row, and ``rows`` is a list with one list of ``<th>``/``<td>``
        elements (not their text) per body row. A table without any
        ``<tr>`` yields ``([], [])``.
    """
    table_rows = table.find_all("tr")
    if not table_rows:
        # No rows at all: nothing to use as header either.
        return [], []

    header_row, *body_rows = table_rows

    # Only real cells contribute to the header; iterating the row's children
    # directly would also pick up whitespace text nodes between the cells.
    header = [
        cell.get_text()
        for cell in header_row.find_all(['th', 'td'], recursive=False)
    ]
    rows = [list(row.find_all(['th', 'td'])) for row in body_rows]

    return header, rows
Exemplo n.º 7
0
def make_indexes(soup: PageElement, options: Options) -> None:
    """ Generate ordered chapter number and TOC of document.

    First ``_inject_heading_order`` is applied to the document. Then, when
    ``options.toc_level`` is between 1 and 3, an ``<article id="doc-toc">``
    holding a nested ``<ul>`` of links to the ``<h1>``..``<h3>`` headings
    (down to that level) is inserted as the first child of ``<body>``.
    For any other ``toc_level`` no TOC is generated.

    Sub-headings of a heading for which ``_is_exclude`` returns true are
    left out of the TOC; the heading itself is still listed. An ``<h2>``
    with no preceding ``<h1>``, or an ``<h3>`` with no listed ``<h2>``
    before it, has no parent entry to hang from and is skipped.

    Arguments:
        soup {BeautifulSoup} -- DOM object of Document.
        options {Options} -- The options of this sequence.
    """

    # Step 1: (re)order headings
    _inject_heading_order(soup, options)

    # Step 2: generate toc page
    level = options.toc_level
    if level < 1 or level > 3:
        return

    options.logger.info(
        f'Generate a table of contents up to heading level {level}.')

    # Current position in the TOC tree while walking the headings.
    h1li = None
    h2ul = h2li = h3ul = None
    exclude_lv2 = exclude_lv3 = False

    def makeLink(h: Tag) -> Tag:
        """Build the ``<li><a href="#id">…</a></li>`` entry for heading *h*."""
        li = soup.new_tag('li')
        ref = h.get('id', '')
        a = soup.new_tag('a', href=f'#{ref}')
        for el in h.contents:
            if el.name == 'a':
                # Only the first child of a nested anchor is used; an empty
                # anchor (e.g. a bare named anchor) contributes nothing.
                # NOTE(review): this appends the anchor's own child rather
                # than a clone, which appears to move it out of the heading
                # -- confirm this is intended.
                if el.contents:
                    a.append(el.contents[0])
            else:
                a.append(clone_element(el))
        li.append(a)
        options.logger.debug(f"| [{h.get_text(separator=' ')}]({ref})")
        return li

    toc = soup.new_tag('article', id='doc-toc')
    title = soup.new_tag('h1')
    title.append(options.toc_title)
    toc.append(title)

    h1ul = soup.new_tag('ul')
    toc.append(h1ul)

    for h in soup.find_all(['h1', 'h2', 'h3']):

        if h.name == 'h1':

            h1li = makeLink(h)
            h1ul.append(h1li)
            # A new chapter starts: forget the nested lists of the old one.
            h2ul = h2li = h3ul = None

            exclude_lv2 = _is_exclude(h.get('id', None), options)

        elif not exclude_lv2 and h.name == 'h2' and level >= 2:

            if h1li is None:
                # No chapter entry yet to attach this section to.
                continue
            if h2ul is None:
                h2ul = soup.new_tag('ul')
                h1li.append(h2ul)
            h2li = makeLink(h)
            h2ul.append(h2li)
            h3ul = None

            exclude_lv3 = _is_exclude(h.get('id', None), options)

        elif not exclude_lv2 and not exclude_lv3 \
                and h.name == 'h3' and level >= 3:

            if h2li is None:
                # No section entry yet to attach this sub-section to.
                continue
            if h3ul is None:
                h3ul = soup.new_tag('ul')
                h2li.append(h3ul)
            h3li = makeLink(h)
            h3ul.append(h3li)

    soup.body.insert(0, toc)
Exemplo n.º 8
0
def get_separate(soup: PageElement, base_url: str) -> PageElement:
    """Rewrite links and asset references of a stand-alone page, in place.

    Every ``<a href>`` is replaced by ``rel_pdf_href(href)``; afterwards
    asset links are rewritten by ``replace_asset_hrefs``.

    Returns the soup returned by ``replace_asset_hrefs``.
    """
    anchors = soup.find_all('a', href=True)
    for anchor in anchors:
        anchor['href'] = rel_pdf_href(anchor['href'])

    return replace_asset_hrefs(soup, base_url)