#!/usr/bin/env python # -*- coding: utf-8 -*- import re import urllib from beautifulsoup import BeautifulSoup wiki_url = 'https://en.wikipedia.org/wiki/Game_of_Thrones' wiki_html = urllib.urlopen(wiki_url).read() wiki_content = BeautifulSoup(wiki_html) seasons_table = wiki_content.find('table', attrs={'class': 'wikitable'}) seasons = seasons_table.findAll( 'a', attrs={'href': re.compile('\/wiki\/Game_of_Thrones_\(season_?[0-9]+\)')}) views = 0 for season in seasons: season_url = 'https://en.wikipedia.org' + season['href'] season_html = urllib.urlopen(season_url).read() season_content = BeautifulSoup(season_html) episodes_table = season_content.find( 'table', attrs={'class': 'wikitable plainrowheaders wikiepisodetable'}) if episodes_table: episode_rows = episodes_table.findAll('tr', attrs={'class': 'vevent'}) if episode_rows:
# Fragment of an autosport.com gallery scraper (Python 2 / BeautifulSoup 3).
# Relies on names defined earlier in the file: username, password, autologin,
# login, opener, url, os, urllib, BeautifulSoup.

# Authenticate against the subscribers area, then fetch the gallery page.
login_data = urllib.urlencode({
    'user_email': username,
    'user_password': password,
    'autologin': autologin,
    'submit': login,
})
log = opener.open('http://www.autosport.com/subs/login.php', login_data)
gallery = opener.open(url)
gal_id = url.split('/')[-1]

# Cache the raw HTML to disk (kept for offline debugging), then parse the
# cached copy. Context managers replace the original unclosed file handles.
with open('debug.html', 'wb') as html_gal:
    html_gal.write(gallery.read())
with open('debug.html', 'r') as cached:
    soup = BeautifulSoup(cached.read())

links = soup.findAll('div', 'boxes')
descriptions = soup.findAll('img', 'black')
images = []
titles = []

# Build the download directory name from the gallery title, e.g. "Some_Race_123".
# str.replace replaces the deprecated string.replace() module function.
dir_name = soup.find('option', value=str(gal_id))
gal_dir = dir_name.contents[0].replace(' ', '_') + '_' + gal_id

# The full-size image path lives in the last <a> of each "boxes" div;
# keep only the portion after '/dir'.
for link in links:
    hrefs = link.findAll('a')
    images.append(hrefs[-1]['href'].split('/dir')[1])

# Captions are embedded in the overlib('...') JavaScript call of onmouseover.
for description in descriptions:
    title = description['onmouseover']
    titles.append(title.split("return overlib('")[1].split("');")[0])


def gallery_directory():
    """Create the gallery download directory if it does not already exist."""
    if not os.path.exists(gal_dir):
        os.mkdir(gal_dir)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python 2 / BeautifulSoup 3 script: scrapes the scrapebook22 demo site,
# follows every "See full profile" link, and writes name,email,city rows
# to list.csv.
from beautifulsoup import BeautifulSoup
from urllib2 import urlopen

url = "https://scrapebook22.appspot.com/"
response = urlopen(url).read()
soup = BeautifulSoup(response)
print(soup.html.head.title.string)

# BUG FIX: the original re-opened list.csv in "w" mode for every profile and
# closed it again, truncating the file each time so only the last person's
# row survived. Open the file once for the whole crawl; the context manager
# also guarantees it is closed.
with open("list.csv", "w") as csv_file:
    for link in soup.findAll("a"):
        if link.string == "See full profile":
            person_url = "https://scrapebook22.appspot.com" + link["href"]
            person_html = urlopen(person_url).read()
            person_soup = BeautifulSoup(person_html)
            email = person_soup.find("span", attrs={"class": "email"}).string
            name = person_soup.find("div", attrs={"class": "col-md-8"}).h1.string
            city = person_soup.find("span", attrs={"data-city": True}).string
            # Parenthesized print works identically under Python 2 and 3.
            print(name + "," + email + "," + city)
            csv_file.write(name + "," + email + "," + city + "\n")
# Fragment of an autosport.com "carousel" gallery scraper (Python 2 /
# BeautifulSoup 3). Relies on names defined earlier in the file: username,
# password, autologin, login, opener, url, re, urllib, BeautifulSoup.

# Authenticate against the subscribers area, then fetch the gallery page.
login_data = urllib.urlencode({
    'user_email': username,
    'user_password': password,
    'autologin': autologin,
    'submit': login,
})
log = opener.open('http://www.autosport.com/subs/login.php', login_data)
gallery = opener.open(url)
gal_id = url.split('/')[-1]

# Cache the raw HTML to disk (kept for offline debugging), then parse the
# cached copy. Context managers replace the original unclosed file handles.
with open('debug.html', 'wb') as html_gal:
    html_gal.write(gallery.read())
with open('debug.html', 'r') as cached:
    soup = BeautifulSoup(cached.read())

items = soup.findAll('ul', id='mycarousel')
descriptions = soup.findAll('img', {'class': re.compile(r'\bthumbnail\b')})
images = []
titles = []

# Build the download directory name from the page heading, e.g. "Some_Race_123".
# str.replace replaces the deprecated string.replace() module function.
dir_name = soup.find('h1')
gal_dir = dir_name.contents[0].replace(' ', '_') + '_' + gal_id

# Collect every link href inside each carousel list.
for item in items:
    for link in item.findAll('a'):
        images.append(link['href'])

# Image captions come from the thumbnails' alt text.
for description in descriptions:
    titles.append(description['alt'])

# Parenthesized print works identically under Python 2 and 3.
print(len(titles))