示例#1
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib

from beautifulsoup import BeautifulSoup

# Overview article of the series; it is fetched and parsed so that the links
# to the per-season articles can be pulled out of it.
wiki_url = 'https://en.wikipedia.org/wiki/Game_of_Thrones'
wiki_html = urllib.urlopen(wiki_url).read()
wiki_content = BeautifulSoup(wiki_html)

# Raw string: the backslashes in the pattern are meant for the regex engine,
# not for Python's string-literal parser (``\/`` and ``\(`` are not valid
# string escapes and only worked because unknown escapes are kept verbatim).
season_link_pattern = re.compile(
    r'\/wiki\/Game_of_Thrones_\(season_?[0-9]+\)')

# ``find`` returns the first table whose class is "wikitable"; every anchor in
# it whose href matches the season pattern is collected.
seasons_table = wiki_content.find('table', attrs={'class': 'wikitable'})
seasons = seasons_table.findAll('a', attrs={'href': season_link_pattern})

# Counter initialised before the per-season loop.
# NOTE(review): presumably a running total of viewer numbers; the code that
# updates it is not part of the visible excerpt - confirm.
views = 0

# Fetch and parse the article of every season link collected in ``seasons``.
for season in seasons:
    # The hrefs are expected to be site-relative ("/wiki/..."), so the host is
    # prefixed to build an absolute URL.
    season_url = 'https://en.wikipedia.org' + season['href']
    season_html = urllib.urlopen(season_url).read()
    season_content = BeautifulSoup(season_html)

    # Look the episode table up by its exact class attribute value.
    episodes_table = season_content.find(
        'table', attrs={'class': 'wikitable plainrowheaders wikiepisodetable'})

    # Pages without such a table are skipped.
    if episodes_table:
        # Only rows carrying the "vevent" class are selected from the table.
        episode_rows = episodes_table.findAll('tr', attrs={'class': 'vevent'})

        # NOTE(review): the body of this branch is missing from the excerpt.
        if episode_rows:
示例#2
0
# Submit the login form first, then request the gallery page through the same
# opener (presumably so the session established by the login is reused).
login_data = urllib.urlencode({
    'user_email': username,
    'user_password': password,
    'autologin': autologin,
    'submit': login,
})
log = opener.open('http://www.autosport.com/subs/login.php', login_data)
gallery = opener.open(url)

# The last path component of the gallery URL is used as the gallery id.
gal_id = url.split('/')[-1]

# Read the response once: keep a copy on disk for debugging and parse those
# very same bytes, instead of re-reading the dump through a file handle that
# was never closed.
gallery_html = gallery.read()
with open('debug.html', 'wb') as html_gal:
    html_gal.write(gallery_html)

soup = BeautifulSoup(gallery_html)
links = soup.findAll('div', 'boxes')
descriptions = soup.findAll('img', 'black')

# Directory name: text of the <option> whose value is the gallery id, with
# spaces replaced by underscores, followed by the id itself.
dir_name = soup.find('option', value=str(gal_id))
gal_dir = dir_name.contents[0].replace(' ', '_') + '_' + gal_id

# For every "boxes" div take the href of its last anchor and keep the part
# that follows the first '/dir'.
images = [link.findAll('a')[-1]['href'].split('/dir')[1] for link in links]

# The caption is embedded in the onmouseover handler as
# "return overlib('<caption>');" - cut out the text between the two markers.
titles = [
    description['onmouseover'].split("return overlib('")[1].split("');")[0]
    for description in descriptions
]


def gallery_directory():
    """Create the directory named by ``gal_dir`` unless that path exists.

    Reads the module-level ``gal_dir`` name and returns ``None``.
    """
    if not os.path.exists(gal_dir):
        os.mkdir(gal_dir)
示例#3
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from beautifulsoup import BeautifulSoup
from urllib2 import urlopen

url = "https://scrapebook22.appspot.com/"
response = urlopen(url).read()
soup = BeautifulSoup(response)

# Single-argument print with parentheses behaves the same as the print
# statement under Python 2.
print(soup.html.head.title.string)

# Open the output once, around the whole loop. Re-opening it in "w" mode for
# every profile truncated the file each time, so only the last person ended
# up in list.csv.
with open("list.csv", "w") as csv_file:
    for link in soup.findAll("a"):
        # Only anchors labelled "See full profile" lead to a profile page.
        if link.string != "See full profile":
            continue

        person_url = "https://scrapebook22.appspot.com" + link["href"]
        person_html = urlopen(person_url).read()
        person_soup = BeautifulSoup(person_html)

        email = person_soup.find("span", attrs={"class": "email"}).string
        name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
        city = person_soup.find("span", attrs={"data-city": True}).string

        # The same comma-separated row goes to stdout and to the CSV file.
        row = name + "," + email + "," + city
        print(row)
        csv_file.write(row + "\n")
示例#4
0
# Submit the login form first, then request the gallery page through the same
# opener (presumably so the session established by the login is reused).
login_data = urllib.urlencode({
    'user_email': username,
    'user_password': password,
    'autologin': autologin,
    'submit': login,
})
log = opener.open('http://www.autosport.com/subs/login.php', login_data)
gallery = opener.open(url)

# The last path component of the gallery URL is used as the gallery id.
gal_id = url.split('/')[-1]

# Read the response once: keep a copy on disk for debugging and parse those
# very same bytes, instead of re-reading the dump through a file handle that
# was never closed.
gallery_html = gallery.read()
with open('debug.html', 'wb') as html_gal:
    html_gal.write(gallery_html)

soup = BeautifulSoup(gallery_html)
items = soup.findAll('ul', id='mycarousel')
descriptions = soup.findAll('img', {'class': re.compile(r'\bthumbnail\b')})

# Directory name: text of the first <h1>, with spaces replaced by
# underscores, followed by the gallery id.
dir_name = soup.find('h1')
gal_dir = dir_name.contents[0].replace(' ', '_') + '_' + gal_id

# Collect the anchors of *every* carousel list. The link loop used to sit
# after the item loop, so it only saw the anchors of the last item and raised
# NameError when no carousel was found at all.
images = []
for item in items:
    images.extend(link['href'] for link in item.findAll('a'))

# The alt text of each thumbnail image is used as its title.
titles = [description['alt'] for description in descriptions]

print(len(titles))