def extract_image_urls(url):
    """Fetch *url* and return a tuple ``(origin, image_urls)``.

    ``origin`` is the full-image gallery URL scraped from the page text
    (``None`` if the marker is not found); ``image_urls`` is the list of
    large-size image sources found on the page.
    """
    origin = None
    li = []
    text = get_page(url, user_agent=True, referer=True)
    soup = bs.to_soup(text)
    # Site layout changed: images now live in <img> tags inside div.pic.
    for div in soup.findCssSelect("div.pic"):
        img = div.find("img")
        if img and img.has_key("src"):
            # Thumbnails point at /small/; swap in the full-size variant.
            li.append(img["src"].replace("/small/", "/large/"))
    # The origin (source gallery) URL is embedded in the page body text.
    for div in soup.findCssSelect("html body form#aspnetForm div#main div"):
        result = re.search(r"URL: (http://.*)View full images", div.text)
        if result:
            origin = result.group(1)
    return origin, li
def test_get_links():
    """get_links with a base URL absolutizes hrefs; without one it returns them as-is."""
    soup = bs.to_soup(ex.LINKS)
    links = bs.get_links(soup, 'http://retrogames.com')
    assert links == ['http://retrogames.com',
                     'http://retrogames.com/games/elite',
                     'http://retrogames.com/games/commando']
    # Without a base URL, relative hrefs must come back unchanged.
    # (This call was commented out, leaving the assert below checking
    # the stale absolute list from above.)
    links = bs.get_links(soup)
    assert links == ['http://retrogames.com', '/games/elite', '/games/commando']
def test_make_links_absolute():
    """After make_links_absolute, every extracted link is absolute."""
    soup = bs.to_soup(ex.LINKS)
    soup = bs.make_links_absolute(soup, 'http://retrogames.com')
    # Re-extract the links; this call was commented out, so `links`
    # was undefined and the assert below raised NameError.
    links = bs.get_links(soup)
    assert links == ['http://retrogames.com',
                     'http://retrogames.com/games/elite',
                     'http://retrogames.com/games/commando']
def visit(blog, dic):
    """Fetch the hit counter of http://<blog>.wordpress.com and record it in *dic*.

    *dic* is keyed by the full blog URL; the value is the hit count as an int.
    """
    url = 'http://{name}.wordpress.com'.format(name=blog)
    text = get_page(url)
    soup = bs.to_soup(text)
    hits = soup.findCssSelect('div#blog-stats ul li')[0].text
    # "1,234 hits" -> 1234
    hits = int(hits.replace('hits', '').replace(',', '').strip())
    # Store the result; this line was commented out, so the function
    # computed the count and then silently threw it away.
    dic[url] = hits
def test_make_links_absolute():
    """After make_links_absolute, every extracted link is absolute."""
    soup = bs.to_soup(ex.LINKS)
    soup = bs.make_links_absolute(soup, 'http://retrogames.com')
    # Re-extract the links; this call was commented out, so `links`
    # was undefined and the assert below raised NameError.
    links = bs.get_links(soup)
    assert links == [
        'http://retrogames.com',
        'http://retrogames.com/games/elite',
        'http://retrogames.com/games/commando'
    ]
def test_get_links():
    """get_links with a base URL absolutizes hrefs; without one it returns them as-is."""
    soup = bs.to_soup(ex.LINKS)
    links = bs.get_links(soup, 'http://retrogames.com')
    assert links == [
        'http://retrogames.com',
        'http://retrogames.com/games/elite',
        'http://retrogames.com/games/commando'
    ]
    # Without a base URL, relative hrefs must come back unchanged.
    # (This call was commented out, leaving the assert below checking
    # the stale absolute list from above.)
    links = bs.get_links(soup)
    assert links == [
        'http://retrogames.com',
        '/games/elite',
        '/games/commando'
    ]
def get_slogan(word, times=1):
    """Generate *times* slogans (1..10) for *word* and return them as a list."""
    assert 1 <= times <= 10  # be nice with the server
    # Must be initialized before the loop; it was commented out, which
    # made li.append() below raise NameError.
    li = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        # Lowercase fragment with a single period -> drop the period.
        # (str.count replaces the deprecated string.count function.)
        if slogan.count('.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        # "...!." -> "...!" (strip the stray period after an exclamation)
        if len(slogan) >= 2 and slogan[-1] == '.' and slogan[-2] == '!':
            slogan = slogan[:-1]
        li.append(slogan)
    return li
def get_slogan(word, times=1):
    """Generate *times* slogans (1..10) for *word* and return them as a list."""
    assert 1 <= times <= 10  # be nice with the server
    # Must be initialized before the loop; it was commented out, which
    # made li.append() below raise NameError.
    li = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        # Lowercase fragment with a single period -> drop the period.
        # (str.count replaces the deprecated string.count function.)
        if slogan.count('.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        # "...!." -> "...!" (strip the stray period after an exclamation)
        if len(slogan) >= 2 and slogan[-1] == '.' and slogan[-2] == '!':
            slogan = slogan[:-1]
        li.append(slogan)
    return li
def extract_list():
    """Extract the proxy list from BASE and return it as a list of Proxy objects."""
    sys.stdout.write('# extracting list')
    proxies = []
    text = get_page(BASE, user_agent=True)
    soup = bs.to_soup(text)
    proxylist = soup.findCssSelect('table.proxylist')[0]
    for tr in proxylist.findAll('tr', {'class': True}):
        # Data rows are striped 'odd'/'even'; anything else is a header/ad row.
        if tr['class'] in ('odd', 'even'):
            cols = tr.findAll('td')
            ip = cols[0].text
            proxy_type = cols[1].text  # renamed from `type`, which shadowed the builtin
            country = cols[2].text
            proxies.append(Proxy(ip, proxy_type, country))
            sys.stdout.write('.')  # progress tick per extracted proxy
    return proxies
def test_prettify(): soup = bs.to_soup(ex.UGLY) assert soup.prettify() == """<html>
def test_to_soup(): soup = bs.to_soup(ex.HTML_1) assert isinstance(soup, BeautifulSoup) assert str(soup) == """
def demo9(): url = "http://python.org/" text = get_page(url) soup = bs.to_soup(text) title = soup.findCssSelect("html head title")[0] print title.text