def extract_image_urls(url):
    """Scrape a gallery page for full-size image URLs.

    Returns a tuple ``(origin, image_urls)`` where *origin* is the source
    URL embedded in the page text (or None when the marker is absent) and
    *image_urls* is the list of large-image links found in ``div.pic``.
    """
    origin = None
    image_urls = []

    text = get_page(url, user_agent=True, referer=True)
    soup = bs.to_soup(text)

    # The site layout changed: the thumbnail now sits in an <img src> inside
    # each div.pic, and the full-size file lives under /large/ not /small/.
    for div in soup.findCssSelect("div.pic"):
        img = div.find("img")
        # Tag.get() replaces the deprecated Tag.has_key() (removed in BS4)
        if img and img.get("src"):
            image_urls.append(img["src"].replace("/small/", "/large/"))

    # The origin link is embedded in free text inside the main form div.
    for div in soup.findCssSelect("html body form#aspnetForm div#main div"):
        result = re.search(r"URL: (http://.*)View full images", div.text)
        if result:
            origin = result.group(1)

    return origin, image_urls
Exemplo n.º 2
0
def test_get_links():
    """get_links absolutizes hrefs when a base URL is given, else returns them raw."""
    soup = bs.to_soup(ex.LINKS)
    expected_absolute = [
        'http://retrogames.com',
        'http://retrogames.com/games/elite',
        'http://retrogames.com/games/commando',
    ]
    assert bs.get_links(soup, 'http://retrogames.com') == expected_absolute
    # without a base URL, relative hrefs come back unchanged
    expected_raw = ['http://retrogames.com', '/games/elite', '/games/commando']
    assert bs.get_links(soup) == expected_raw
Exemplo n.º 3
0
def test_make_links_absolute():
    """After make_links_absolute, every extracted link carries the base URL."""
    base = 'http://retrogames.com'
    soup = bs.make_links_absolute(bs.to_soup(ex.LINKS), base)
    assert bs.get_links(soup) == [
        base,
        base + '/games/elite',
        base + '/games/commando',
    ]
    
Exemplo n.º 4
0
def visit(blog, dic):
    """Fetch the stats widget of *blog*.wordpress.com and store its hit count in *dic*."""
    url = 'http://{name}.wordpress.com'.format(name=blog)
    soup = bs.to_soup(get_page(url))
    raw = soup.findCssSelect('div#blog-stats ul li')[0].text
    # drop the trailing "hits" label and the thousands separators before parsing
    dic[url] = int(raw.replace('hits', '').replace(',', '').strip())
Exemplo n.º 5
0
def test_make_links_absolute():
    """Links gain the base URL once make_links_absolute has been applied."""
    soup = bs.to_soup(ex.LINKS)
    soup = bs.make_links_absolute(soup, 'http://retrogames.com')
    expected = [
        'http://retrogames.com',
        'http://retrogames.com/games/elite',
        'http://retrogames.com/games/commando',
    ]
    assert bs.get_links(soup) == expected
Exemplo n.º 6
0
def test_get_links():
    """Verify link extraction both with and without URL absolutization."""
    base = 'http://retrogames.com'
    soup = bs.to_soup(ex.LINKS)
    # passing a base URL makes every extracted link absolute
    assert bs.get_links(soup, base) == [
        base,
        base + '/games/elite',
        base + '/games/commando',
    ]
    # omitting the base leaves relative hrefs untouched
    assert bs.get_links(soup) == [base, '/games/elite', '/games/commando']
Exemplo n.º 7
0
def get_slogan(word, times=1):
    """Fetch *times* generated slogans for *word* and return them as a list.

    Each slogan is lightly cleaned: a lone mid-sentence period in a
    lowercase slogan is removed, and a stray trailing period after an
    exclamation mark ("...!.") is trimmed. *times* is capped at 10.
    """
    assert 1 <= times <= 10     # be nice with the server
    #
    slogans = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        # str.count replaces the deprecated string.count() module function
        if slogan.count('.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        # "...!." -> "...!" (endswith implies len >= 2, as the old check did)
        if slogan.endswith('!.'):
            slogan = slogan[:-1]
        slogans.append(slogan)

    return slogans
Exemplo n.º 8
0
def get_slogan(word, times=1):
    """Return a list of *times* cleaned slogans generated for *word*.

    Cleaning: a single mid-sentence period in a lowercase slogan is
    dropped, and a trailing "!." is reduced to "!". *times* is capped
    at 10 to avoid hammering the server.
    """
    assert 1 <= times <= 10  # be nice with the server
    #
    slogans = []
    url = BASE + urllib.urlencode({'user': word})
    for _ in xrange(times):
        text = get_page(url, user_agent=True)
        soup = bs.to_soup(text)
        slogan = soup.findCssSelect('html body div p')[0].text
        # str.count replaces the deprecated string.count() module function
        if slogan.count('.') == 1 and not slogan[0].isupper():
            slogan = slogan.replace('.', '')
        # "...!." -> "...!" (endswith implies len >= 2, as the old check did)
        if slogan.endswith('!.'):
            slogan = slogan[:-1]
        slogans.append(slogan)

    return slogans
Exemplo n.º 9
0
def extract_list():
    """
    Extract proxy list from base url.
    """
    sys.stdout.write('# extracting list')
    proxies = []
    text = get_page(BASE, user_agent=True)
    soup = bs.to_soup(text)
    proxylist = soup.findCssSelect('table.proxylist')[0]
    for tr in proxylist.findAll('tr', {'class': True}):
        if tr['class'] in ('odd', 'even'):
            cols = tr.findAll('td')
            ip = cols[0].text
            type = cols[1].text
            country = cols[2].text
            proxies.append(Proxy(ip, type, country))
            sys.stdout.write('.')
    #
    print 'done.'
    return proxies
Exemplo n.º 10
0
def test_prettify():
    soup = bs.to_soup(ex.UGLY)
    assert soup.prettify() == """<html>
Exemplo n.º 11
0
def test_to_soup():
    soup = bs.to_soup(ex.HTML_1)
    assert isinstance(soup, BeautifulSoup)
    assert str(soup) == """
Exemplo n.º 12
0
def test_prettify():
    soup = bs.to_soup(ex.UGLY)
    assert soup.prettify() == """<html>
Exemplo n.º 13
0
def test_to_soup():
    soup = bs.to_soup(ex.HTML_1)
    assert isinstance(soup, BeautifulSoup)
    assert str(soup) == """
Exemplo n.º 14
0
def demo9():
    url = "http://python.org/"
    text = get_page(url)
    soup = bs.to_soup(text)
    title = soup.findCssSelect("html head title")[0]
    print title.text