/
GoogleResults.py
43 lines (37 loc) · 1.4 KB
/
GoogleResults.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from BeautifulSoup import BeautifulSoup
from supporting import struc
import urllib, urllib2
# Number of results you need to store from each search engine
numresults = 30
def google_scrape(query):
    """Scrape the first `numresults` Google results for `query`.

    Side effect: writes "<query>_Google.txt" with one
    "rank title description" line per result.

    Returns a list of `struc` objects with rank, title, description
    and url attributes, in result order.
    """
    address = ("http://www.google.com/search?q=" + urllib.quote_plus(query)
               + "&num=" + str(numresults + 2) + "&hl=en&start=0")
    # Spoof a real browser User-Agent; Google blocks the default urllib2 agent.
    request = urllib2.Request(address, None, {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'})
    urlfile = urllib2.urlopen(request)
    try:
        page = urlfile.read()
    finally:
        urlfile.close()  # was leaked in the original
    soup = BeautifulSoup(page)
    titles = []
    urls = []
    # Each result header is a <div class="rc"> whose first <a> holds
    # the title text and the target URL.
    for header in soup.findAll('div', 'rc'):
        titles.append(header.a.string.encode('utf-8'))
        urls.append(header.a.get('href'))
    # Result snippets live in <span class="st"> elements.
    descriptions = [desc.text.encode('utf-8')
                    for desc in soup.findAll('span', 'st')]
    googleResults = []
    # Google sometimes returns fewer snippet spans than result headers;
    # clamp to the shortest list to avoid the IndexError the original
    # hit when descriptions was shorter than titles.
    size = min(len(titles), len(descriptions), len(urls))
    # `with` guarantees the file is closed (the original leaked it, and
    # also shadowed the builtin name `file`).
    with open(query + '_Google.txt', "w") as outfile:
        for i in range(size):
            result = struc()
            result.rank = i + 1
            result.title = titles[i]
            result.description = descriptions[i]
            result.url = urls[i]
            outfile.write(str(i + 1) + ' ' + titles[i] + ' '
                          + descriptions[i] + '\n')
            googleResults.append(result)
    return googleResults