def normalize_url(url, query_string=None):
    """Attach an optional query string to *url* and ensure it is absolute.

    A falsy *url* yields ``None``.  Relative names are defaulted to the
    ``http://`` scheme rather than being treated as filenames.
    """
    if not url:
        return None
    if query_string:
        url = '{}?{}'.format(url, query_string)
    # Default to HTTP rather than relative filenames
    if not url_is_absolute(url):
        url = 'http://' + url
    return url
def is_doc(href: str) -> bool:
    """Tell whether *href* is a relative link to an HTML page.

    Absolute URLs, absolute filesystem paths, and non-``.html`` targets
    do not count as docs.
    """
    filename = os.path.basename(href)
    extension = os.path.splitext(filename)[1]
    is_absolute_url = urls.url_is_absolute(href)
    is_absolute_path = os.path.isabs(href)
    is_html = extension.startswith('.html')
    return not (is_absolute_url or is_absolute_path) and is_html
def get_combined(soup: BeautifulSoup, base_url: str, rel_url: str):
    """
    transforms all relative hrefs pointing to other html docs
    into relative pdf hrefs
    """
    # NOTE: loop variable renamed from `id` — it shadowed the builtin.
    for element in soup.find_all(id=True):
        element['id'] = transform_id(element['id'], rel_url)
    for a in soup.find_all('a', href=True):
        # Absolute URLs and absolute filesystem paths are left untouched.
        if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
            continue
        a['href'] = transform_href(a['href'], rel_url)
    soup.body['id'] = get_body_id(rel_url)
    soup = replace_asset_hrefs(soup, base_url)
    return soup
def run_test(scene: str, patterns: list):
    """Exercise `transform_href` over *patterns* as numbered sub-tests.

    Each pattern is ``((href, rel_url), expected_href)``.  Absolute URLs
    and absolute paths are skipped — they are never transformed.

    Fix: the parameter annotation was the list literal ``[]`` (evaluated
    at definition time, not a type); it is now ``list``.
    """
    for index, pattern in enumerate(patterns):
        with self.subTest(index=index, pattern=pattern):
            case, x_href = pattern
            href, rel_url = case
            # check on `preprocessor.get_combined`
            if urls.url_is_absolute(href) or os.path.isabs(href):
                # Nothing to transform; record the sub-test as passed.
                self.assertTrue(True)
                continue
            r = transform_href(href, rel_url)
            self.assertEqual(r, x_href, f'"{scene}" at {index}')
def get_combined(soup: PageElement, base_url: str, rel_url: str) -> PageElement:
    """Rewrite ids and links for the combined (single-PDF) output.

    transforms all relative hrefs pointing to other html docs
    into relative pdf hrefs
    """
    for tagged in soup.find_all(id=True):
        tagged['id'] = transform_id(tagged['id'], rel_url)
    for anchor in soup.find_all('a', href=True):
        target = anchor['href']
        is_external = urls.url_is_absolute(target) or os.path.isabs(target)
        if not is_external:
            anchor['href'] = transform_href(target, rel_url)
    soup.body['id'] = get_body_id(rel_url)
    return replace_asset_hrefs(soup, base_url)
def is_doc(href: str) -> bool:
    """Check whether *href* is a relative URL.

    If it is relative, it *should* be an html page that generates a
    PDF doc; absolute URLs and absolute filesystem paths never are.

    Arguments:
        href {str} -- a string of URL.

    Returns:
        bool -- result
    """
    _, extension = os.path.splitext(os.path.basename(href))
    if urls.url_is_absolute(href):
        return False
    if os.path.isabs(href):
        return False
    return extension.startswith('.html')
def get_separate(soup: BeautifulSoup, base_url: str, should_slugify=False):
    """Rewrite links for the separate (one-PDF-per-page) output.

    When *should_slugify* is set, every element id — and every href
    fragment pointing at one — is slugified first.
    """
    if should_slugify:
        # NOTE: loop/unpack variables renamed from `id` — they shadowed
        # the builtin.
        for element in soup.find_all(id=True):
            element['id'] = slugify(element['id'])
        for a in soup.find_all('a', href=True):
            href = a['href']
            if urls.url_is_absolute(href) or os.path.isabs(href):
                continue
            if '#' in href:
                section, fragment = href.rsplit('#', 1)
                a['href'] = '{}#{}'.format(section, slugify(fragment))
    # transforms all relative hrefs pointing to other html docs
    # into relative pdf hrefs
    for a in soup.find_all('a', href=True):
        remove_md_skip(a)
        a['href'] = rel_pdf_href(a['href'])
    soup = replace_asset_hrefs(soup, base_url)
    return soup
def abs_asset_href(href: str, base_url: str):
    """Make *href* absolute by joining it onto *base_url*.

    Hrefs that are already absolute URLs or absolute filesystem paths
    pass through unchanged.
    """
    already_absolute = urls.url_is_absolute(href) or os.path.isabs(href)
    if already_absolute:
        return href
    joined = urls.urljoin(base_url, href)
    return urls.iri_to_uri(joined)