示例#1
0
def normalize_url(url, query_string=None):
    """Build a full URL from ``url`` and an optional query string.

    A truthy ``query_string`` is appended after a ``?``. If the result is
    not recognised as absolute by ``url_is_absolute``, it is prefixed with
    ``http://``.

    Returns ``None`` when ``url`` is empty or ``None``.
    """
    if not url:
        return None

    target = (url + '?' + query_string) if query_string else url

    if url_is_absolute(target):
        return target

    # A bare value is treated as a web address, not as a relative filename.
    return 'http://' + target
示例#2
0
def normalize_url(url, query_string=None):
    """Return ``url`` with the query string attached and a scheme ensured.

    When ``query_string`` is truthy it is joined to ``url`` with ``?``.
    A result that ``url_is_absolute`` rejects gets ``http://`` prepended.
    A falsy ``url`` yields ``None``.
    """
    if not url:
        return None

    combined = url
    if query_string:
        combined = combined + '?' + query_string

    # Default to HTTP instead of interpreting the value as a relative file.
    return combined if url_is_absolute(combined) else 'http://' + combined
示例#3
0
def is_doc(href: str):
    """Tell whether ``href`` is a relative link to an ``.html`` file.

    Returns ``True`` only if ``href`` is neither an absolute URL nor an
    absolute filesystem path, and the extension of its last path component
    begins with ``.html``. Because the test is a prefix match, text that
    follows the extension (e.g. ``.html#anchor``) still qualifies.
    """
    extension = os.path.splitext(os.path.basename(href))[1]
    is_relative = not (urls.url_is_absolute(href) or os.path.isabs(href))
    return is_relative and extension.startswith('.html')
示例#4
0
def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str):
    """Rewrite ids and relative links of ``soup`` using ``rel_url``.

    Steps, in order:
      1. every element that has an ``id`` gets it replaced by
         ``transform_id(id, rel_url)``;
      2. every ``<a href>`` whose target is neither an absolute URL nor an
         absolute path gets it replaced by ``transform_href(href, rel_url)``;
      3. the ``<body>`` id is set to ``get_body_id(rel_url)``;
      4. the document is passed through ``replace_asset_hrefs`` with
         ``base_url`` and that result is returned.
    """
    for tag in soup.find_all(id=True):
        tag['id'] = transform_id(tag['id'], rel_url)

    for anchor in soup.find_all('a', href=True):
        target = anchor['href']
        # Absolute URLs and absolute paths are left exactly as they are.
        if not (urls.url_is_absolute(target) or os.path.isabs(target)):
            anchor['href'] = transform_href(target, rel_url)

    soup.body['id'] = get_body_id(rel_url)
    return replace_asset_hrefs(soup, base_url)
示例#5
0
        def run_test(scene: str, patterns: []):
            # Each pattern is ((href, rel_url), expected_href); every pattern
            # runs in its own subTest so one failure does not hide the others.
            for index, pattern in enumerate(patterns):
                with self.subTest(index=index, pattern=pattern):
                    (href, rel_url), x_href = pattern

                    # Same guard as the one in `preprocessor.get_combined`:
                    # absolute URLs / paths are not transformed, so there is
                    # nothing to compare for them.
                    is_absolute = (urls.url_is_absolute(href)
                                   or os.path.isabs(href))
                    if is_absolute:
                        self.assertTrue(True)
                    else:
                        actual = transform_href(href, rel_url)
                        self.assertEqual(
                            actual, x_href, f'"{scene}" at {index}')
示例#6
0
def get_combined(soup: PageElement, base_url: str,
                 rel_url: str) -> PageElement:
    """ rewrites element ids and relative hrefs for the page at `rel_url`

    Every element id goes through `transform_id`, every `<a href>` that is
    not an absolute URL or absolute path goes through `transform_href`,
    the body id is set from `get_body_id`, and the value returned is the
    result of `replace_asset_hrefs(soup, base_url)`.
    """

    for tagged in soup.find_all(id=True):
        tagged['id'] = transform_id(tagged['id'], rel_url)

    for link in soup.find_all('a', href=True):
        target = link['href']
        is_absolute = urls.url_is_absolute(target) or os.path.isabs(target)
        if not is_absolute:
            link['href'] = transform_href(target, rel_url)

    soup.body['id'] = get_body_id(rel_url)

    return replace_asset_hrefs(soup, base_url)
示例#7
0
def is_doc(href: str) -> bool:
    """check if href is a relative link to an html file

    The href qualifies only when it is neither an absolute URL nor an
    absolute filesystem path and the extension of its last path component
    starts with '.html' (a prefix match, so trailing text such as
    '.html#anchor' is accepted as well).

    Arguments:
        href {str} -- a string of URL.

    Returns:
        bool -- True for a relative html link, False otherwise
    """

    _, suffix = os.path.splitext(os.path.basename(href))

    is_absolute = urls.url_is_absolute(href) or os.path.isabs(href)
    return (not is_absolute) and suffix.startswith('.html')
示例#8
0
def get_separate(soup: BeautifulSoup, base_url: str, should_slugify=False):
    """Prepare ``soup`` by rewriting its ids, links and asset references.

    When ``should_slugify`` is true, every element id is replaced by
    ``slugify(id)``, and for each relative ``<a href>`` containing ``#`` the
    part after the last ``#`` is slugified too.

    Regardless of that flag, every ``<a href>`` is passed to
    ``remove_md_skip`` and its href is replaced by ``rel_pdf_href(href)``.
    The return value is the result of ``replace_asset_hrefs(soup, base_url)``.
    """
    if should_slugify:
        for tag in soup.find_all(id=True):
            tag['id'] = slugify(tag['id'])

        for anchor in soup.find_all('a', href=True):
            target = anchor['href']
            is_absolute = (urls.url_is_absolute(target)
                           or os.path.isabs(target))
            # Only relative links that carry a fragment need slugifying.
            if is_absolute or '#' not in target:
                continue

            section, fragment = target.rsplit('#', 1)
            anchor['href'] = '{}#{}'.format(section, slugify(fragment))

    # Second pass over all links, run after any slugifying above.
    for anchor in soup.find_all('a', href=True):
        remove_md_skip(anchor)
        anchor['href'] = rel_pdf_href(anchor['href'])

    return replace_asset_hrefs(soup, base_url)
示例#9
0
def abs_asset_href(href: str, base_url: str):
    """Resolve an asset ``href`` against ``base_url``.

    An ``href`` that is already an absolute URL or an absolute filesystem
    path is returned untouched. Anything else is joined onto ``base_url``
    with ``urls.urljoin`` and the result is passed through
    ``urls.iri_to_uri``.
    """
    already_absolute = urls.url_is_absolute(href) or os.path.isabs(href)
    if not already_absolute:
        return urls.iri_to_uri(urls.urljoin(base_url, href))

    return href