import urlparse

from BeautifulSoup import BeautifulSoup as b


def get_similar_image_urls(html):
    """Yield image URLs from the 'similar images' block of a Google Images results page."""
    soup = b(html)
    for item in soup.find('div', {'id': 'iur'}).findAll('a', {'class': 'bia uh_rl'}):
        url = item.get('href')
        # parse_qs returns a list per key; take the first 'imgurl' value
        yield urlparse.parse_qs(urlparse.urlparse(url).query)['imgurl'][0]
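# Illustrative usage (assumptions: `results_page_url` is a Google Images
# results page you have already found; Google's markup changes often, so the
# 'iur' / 'bia uh_rl' selectors above may not match current pages):
#
#   html = urllib2.urlopen(results_page_url).read()
#   for img_url in get_similar_image_urls(html):
#       print img_url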
def find_party_get_party_party_on(url):
    """Fetch a Wikipedia biography and return (party, gender) for its subject."""
    response = urllib2.urlopen(url)
    html = response.read()
    soup = b(html)
    party = 'Unknown'
    # Walk the infobox rows looking for the 'Political party' header.
    for tr in soup.findAll('tr'):
        headers = tr.findAll('th')
        if len(headers) >= 1 and headers[0].text == 'Political party':
            links = tr.findAll('td')[0].findAll('a')
            if len(links) >= 1:
                # The last link may be a footnote marker such as '[1]'; skip it.
                if links[-1].text == '[1]':
                    party = links[-2].text
                else:
                    party = links[-1].text
            break
    # Crude gender guess: compare 'he' vs. 'she' counts in the first five paragraphs.
    text = [p.getText() for p in soup.findAll('p')[:5]]
    counts = Counter("".join(text).lower().split())
    if counts['he'] > counts['she']:
        gender = 'Male'
    elif counts['she'] > counts['he']:
        gender = 'Female'
    else:
        gender = 'Not Found'
    return party, gender
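# Illustrative usage (any Wikipedia biography URL works the same way; this
# particular article is only an example, not from the original script):
#
#   party, gender = find_party_get_party_party_on('http://en.wikipedia.org/wiki/Tony_Blair')
#   print party, gender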
#! /usr/bin/python2
# Download every image linked from a 4chan thread into a per-thread directory.
import os
import sys
import tkMessageBox
import urllib2 as u

from BeautifulSoup import BeautifulSoup as b

while True:
    try:
        args = sys.argv[1:]
        page = u.urlopen(args[0])
        # Default directory name: "<board>-<thread id>" taken from the URL.
        newpath = args[0].split('/')[-3] + '-' + args[0].split('/')[-1]
        if len(args) > 1:
            newpath = args[1]
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        os.chdir(newpath)
        soup = b(page)
        images = set()
        for link in soup("a"):
            href = link.get('href')
            if href is not None and "//images.4chan.org" in href:
                images.add("https:" + href)
        for x in images:
            print 'saving image %s' % x
            os.system('wget -c --limit-rate=30k %s' % x)
        if not tkMessageBox.askyesno("Done", "done saving 4chan thread %s, do it again?" % args[0]):
            break
    except Exception as e:
        if not tkMessageBox.askyesno("Error", "error %s in saving 4chan thread %s, do it again?" % (str(e), args[0])):
            break
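# Invocation, per the argument handling above (the script name here is
# hypothetical; wget must be on PATH for the download step):
#
#   ./save_4chan_thread.py <thread_url> [output_dir]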
# Scrape the Wikipedia list of Question Time episodes and record panellists'
# party and gender in a local SQLite database.
from collections import Counter

import sqlite3 as lite
import urllib2

import matplotlib.pyplot as plt
import numpy
import seaborn as sns
from BeautifulSoup import BeautifulSoup as b

# Creates or opens a file called QTdb with a SQLite3 DB
db = lite.connect('QTdb')

response = urllib2.urlopen('http://en.wikipedia.org/wiki/List_of_Question_Time_episodes')
html = response.read()
soup = b(html)

people_all = []
genders_all = []
parties_all = []
people_multi = []
genders_multi = []
parties_multi = []

tables = soup.findAll('table', 'wikitable')[2:]  # first two tables are other content
year_headers = soup.findAll('h2')[2:-4]          # likewise with headers
years = []
#!/usr/bin/env python
# This will return Dell warranty information. Tested and works as of
# 2012-02-16; no promises if Dell updates their page and breaks stuff.
import sys

import requests
from BeautifulSoup import BeautifulSoup as b

serviceTag = sys.argv[1]
url = "http://www.dell.com/support/troubleshooting/us/en/usgen1/Index?c=us&l=en&s=gen&cs=&t=warranty&servicetag="

r = requests.get(url + serviceTag)
if r.ok:
    soup = b(r.content)
    # Search for the CSS class inside an <li>.
    x = soup.find("li", "TopTwoWarrantyListItem")
    # Only want the first sentence that we match on.
    days = x.text.split('.')[0]
    print days
else:
    print "Error retrieving url"
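# Invocation (the script name is hypothetical; the service tag comes from
# sys.argv[1] above):
#
#   ./dell_warranty.py <service_tag>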
class docx:
    # NOTE: the opening of this class was cut off in the original source; this
    # __init__ is an assumed minimal reconstruction inferred from the usage in
    # __main__ below (a title plus body text, stored as a list of lines).
    def __init__(self, title, data):
        self.title = title
        self.data = data.splitlines()

    def export(self, filename, append=True):
        # split('.') yields 'txt' (no dot), so compare against that.
        if filename.split('.')[-1] != 'txt':
            filename += '.txt'
        mode = 'a' if append else 'w'
        with open(filename, mode) as f:
            for p in self.data:
                f.write(p + "\n")


class json:
    # Note: this class name shadows the standard library json module.
    def __init__(self, data):
        self.data = data

    def export(self, filename, append=True):
        import demjson
        js = demjson.encode(self.data)
        if filename.split('.')[-1] != 'json':
            filename += '.json'
        mode = 'a' if append else 'w'
        with open(filename, mode) as f:
            f.write(js)


if __name__ == '__main__':
    import urllib
    from BeautifulSoup import BeautifulSoup as b
    bb = b(urllib.urlopen("http://www.thehindu.com/sci-tech/science/irnss1d-launch/article7043608.ece?homepage=true").read())
    bs = bb.findAll("p")
    docx(bb.find("title").text, "\n".join([s.text for s in bs])).export("a")
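# The json exporter works the same way (illustrative call; the dict keys are
# made up for the example, and demjson must be installed):
#
#   json({'title': 'some title', 'body': 'some text'}).export('a')  # writes a.json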