def test_find_all_by_tag_strainer(self):
    """find_all() should accept a SoupStrainer as its matcher."""
    matches = self.tree.find_all(SoupStrainer('a'))
    self.assertSelects(matches, ['First tag.', 'Nested tag.'])
def test_soupstrainer(self):
    """Parsers should be able to work with SoupStrainers."""
    # Restrict parsing to <b> tags only; everything else is dropped.
    only_b = SoupStrainer("b")
    soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>", parse_only=only_b)
    self.assertEqual(soup.decode(), "<b>bold</b>")
def test_parse_with_soupstrainer(self):
    """Only the strained-for tags (and their children) survive parsing."""
    only_b = SoupStrainer("b")
    source = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
    soup = self.soup(source, parse_only=only_b)
    # Both <b> tags are kept, including the nested <c>; bare text and
    # the <a> wrapper are discarded.
    self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
def test_soupstrainer(self):
    """The html5lib tree builder does not support SoupStrainers."""
    markup = "<p>A <b>bold</b> statement.</p>"
    soup = self.soup(markup, parse_only=SoupStrainer("b"))
    # The strainer is silently ignored: the whole document comes back.
    self.assertEqual(soup.decode(), self.document_for(markup))
def test_soupstrainer(self):
    # NOTE(review): this test only constructs a strainer and asserts
    # nothing -- it looks like a stub; confirm whether it was meant to
    # parse markup with parse_only=only_b like its sibling tests.
    only_b = SoupStrainer("b")
def test_soupstrainer(self):
    """Parsing with a SoupStrainer keeps only the matching tags."""
    only_b = SoupStrainer("b")
    markup = "A <b>bold</b> <meta /> <i>statement</i>"
    soup = self.soup(markup, parse_only=only_b)
    self.assertEqual(soup.decode(), "<b>bold</b>")
def soupify_links(url, file_extension=None):
    """
    Returns a String list containing urls that match the specified
    file_extension

    Links are collected from <a href> and <img src> tags only.

    Args:
        url (String): the target URL
        file_extension (String, optional): when given, only links whose
            target contains ".<file_extension>" are returned

    Returns:
        [String]: A list of string URLs representing all links to content
        from <a> and <img> tags, or False when the page could not be
        fetched (NOTE(review): mixed return types -- callers must check
        for False before iterating).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }
    with requests.session() as session:
        try:
            # run a GET request on the supplied URL
            r = session.get(url, headers=headers, stream=True, timeout=1)
            r.raise_for_status()
        except HTTPError as http_err:
            click.secho(f"\nHTTP error occurred: {http_err}\n", fg="red", bold=False)
            return False
        # BUG FIX: requests raises requests.exceptions.Timeout on a
        # timed-out request, not the builtin TimeoutError, so the old
        # handler never fired and timeouts fell through to the generic
        # handler with the wrong message.
        except requests.exceptions.Timeout as timeout_err:
            click.secho(f"\nRequest timed out: {timeout_err}\n", fg="red", bold=False)
            return False
        except Exception as err:
            click.secho(f"\nOther error occurred: {err}\n", fg="red", bold=False)
            return False

        # No errors: parse just the <img> and <a> tags. Stay inside the
        # session context so the streamed response body is still readable.
        soup_a = BeautifulSoup(r.content, "lxml", parse_only=SoupStrainer("a"))
        soup_img = BeautifulSoup(r.content, "lxml", parse_only=SoupStrainer("img"))

        # build the list of hrefs
        hrefs = []
        if file_extension is not None:
            print(f"Getting links for {file_extension} files...")
            # BUG FIX: escape the dot -- a bare "." is a regex wildcard,
            # so f".{ext}" also matched e.g. "xpng" when looking for
            # "png". Compile once instead of once per tag type.
            pattern = regex.compile(rf"\.{file_extension}")
            for img_link in soup_img(src=pattern):
                if img_link.get("src") is not None:
                    hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
            for a_link in soup_a(href=pattern):
                if a_link.get("href") is not None:
                    hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))
        else:
            print("Getting links...")
            for img_link in soup_img.find_all("img"):
                if img_link.get("src") is not None:
                    hrefs.append(conv_rel_abs_addr(url, img_link.get("src")))
            for a_link in soup_a.find_all("a"):
                if a_link.get("href") is not None:
                    hrefs.append(conv_rel_abs_addr(url, a_link.get("href")))
        return hrefs