# NOTE(review): this call appears to be the tail of a `with ... as l:` block
# that begins before this chunk -- confirm its indentation against the
# preceding lines.
l.success()


def loop(xml):
    """Recursively collect user-agent strings from a parsed switcher tree.

    Iterates over the children of *xml*. ``folder`` elements are descended
    into, except the one whose ``description`` attribute equals
    ``'UA List :: About'``. For every ``useragent`` element, the stripped
    value of its ``useragent`` attribute is added to ``uas``.

    Args:
        xml: Iterable of parsed nodes exposing ``.name`` and item access to
            attributes (presumably a BeautifulSoup tag -- confirm against
            the caller).

    Note:
        ``uas`` is defined outside this chunk; only its ``add`` method is
        used here (presumably a set).
    """
    for item in xml:
        if item.name == 'folder':
            if item['description'] != 'UA List :: About':
                loop(item)
        elif item.name == 'useragent':
            uas.add(item['useragent'].strip())


# Harvest the document that was parsed into `soup` before this point.
with log.waitfor('Parsing list') as l:
    loop(soup.useragentswitcher)
    l.success()

# Second source: replace `xml` / `soup` with the user-agents.org listing.
with log.waitfor('Fetching from http://www.user-agents.org') as l:
    xml = getxml('http://www.user-agents.org/allagents.xml')
    # NOTE(review): no parser is named, so the parser is chosen by
    # BeautifulSoup at runtime -- consider pinning one explicitly.
    soup = BeautifulSoup(xml)
    l.success()

with log.waitfor('Parsing list') as l:
    # 'user-agents' is not a valid Python identifier, so attribute access is
    # impossible; use the public find() instead of calling __getattr__.
    for item in soup.find('user-agents'):
        if item.name == 'user-agent':
            # select() is used because `.string` on a tag is a BeautifulSoup
            # property, not a lookup of a child element named "string".
            ua = item.select('string')[0].string.strip()
            uas.add(ua)
    l.success()

log.info('Fetched %d user agents' % len(uas))
# One user agent per line, sorted.
write('useragents.txt', ''.join(sorted(ua + '\n' for ua in uas)))
def loop(xml):
    """Recursively collect user-agent strings from a parsed switcher tree.

    Iterates over the children of *xml*. ``folder`` elements are descended
    into, except the one whose ``description`` attribute equals
    ``'UA List :: About'``. For every ``useragent`` element, the stripped
    value of its ``useragent`` attribute is added to ``uas``.

    Args:
        xml: Iterable of parsed nodes exposing ``.name`` and item access to
            attributes (presumably a BeautifulSoup tag -- confirm against
            the caller).

    Note:
        ``uas`` is defined outside this chunk; only its ``add`` method is
        used here (presumably a set).
    """
    for item in xml:
        if item.name == 'folder':
            if item['description'] != 'UA List :: About':
                loop(item)
        elif item.name == 'useragent':
            uas.add(item['useragent'].strip())


# Harvest the document that was parsed into `soup` before this point.
with log.waitfor('Parsing list') as l:
    loop(soup.useragentswitcher)
    l.success()

# Second source: replace `xml` / `soup` with the user-agents.org listing.
with log.waitfor('Fetching from http://www.user-agents.org') as l:
    xml = getxml('http://www.user-agents.org/allagents.xml')
    # NOTE(review): no parser is named, so the parser is chosen by
    # BeautifulSoup at runtime -- consider pinning one explicitly.
    soup = BeautifulSoup(xml)
    l.success()

with log.waitfor('Parsing list') as l:
    # 'user-agents' is not a valid Python identifier, so attribute access is
    # impossible; use the public find() instead of calling __getattr__.
    for item in soup.find('user-agents'):
        if item.name == 'user-agent':
            # select() is used because `.string` on a tag is a BeautifulSoup
            # property, not a lookup of a child element named "string".
            ua = item.select('string')[0].string.strip()
            uas.add(ua)
    l.success()

log.info('Fetched %d user agents' % len(uas))
# One user agent per line, sorted.
write('useragents.txt', ''.join(sorted(ua + '\n' for ua in uas)))
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """ soup = BeautifulSoup(html_doc, "html.parser") #print(soup.prettify()) for tag in soup.find_all(re.compile("^b")): print(tag.name) for tag in soup.find_all(lambda x: x.name.startswith('b')): print(tag.name) soup2 = BeautifulSoup("“Dammit!” he said.", "html5lib") print(str(soup2)) print('title' in dir(soup)) print(hasattr(soup, 'title')) print('title' in soup.__dict__) print(soup.__getattr__('title')) markup = "<h1>Sacr\xc3\xa9 bleu!</h1>" soup = BeautifulSoup(markup, "html5lib") print(soup.h1) # <h1>Sacré bleu!</h1> print(soup.h1.string) # u'Sacr\xe9 bleu!' print(soup.original_encoding) print(soup.contains_replacement_characters) print(soup.prettify("latin-1"))