# get_data.py -- forked from hannawallach/web-scraping
import os, re, sys

# this script targets Python 2: it relies on mechanize, BeautifulSoup 3,
# file(), and the print >> syntax
from BeautifulSoup import BeautifulSoup
from glob import glob
from json import loads
from mechanize import Browser

from iterview import iterview
from utilities import download_url, makedir, safe_write  # local helpers (assumed contracts sketched below)

DATA_DIR = 'data'
CACHE = os.path.join(DATA_DIR, 'cache')
LISTING_URLS_FILE = os.path.join(CACHE, 'listing_urls.txt')
HTML_DIR = os.path.join(CACHE, 'html')
SEARCH_RESULTS_DIR = os.path.join(HTML_DIR, 'search_results')
LISTING_PAGES_DIR = os.path.join(HTML_DIR, 'listing_pages')
CSV_FILE = os.path.join(DATA_DIR, 'data.csv')
SEARCH_URL = 'http://streeteasy.com/rentals'
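
# The helpers imported from the local utilities module are not shown in this
# file. Based on how they are used below (makedir creates a cache directory,
# safe_write is used as a context manager yielding a writable file), a minimal
# sketch of the assumed contracts might look like this; the bodies are
# assumptions, not the actual utilities module:
#
#     def makedir(dirname):
#         if not os.path.exists(dirname):   # like mkdir -p
#             os.makedirs(dirname)
#
#     from contextlib import contextmanager
#
#     @contextmanager
#     def safe_write(filename):
#         tmp = filename + '.tmp'           # write to a temporary file, then
#         with open(tmp, 'wb') as f:        # rename, so an interrupted run
#             yield f                       # never leaves a partial file
#         os.rename(tmp, filename)
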
def get_listing_urls(br):
    """
    Searches StreetEasy for all rental apartment listings in
    Williamsburg, caches each page of search results to the directory
    whose name is stored in the variable SEARCH_RESULTS_DIR, and
    caches the URLs for the listings (one per line) to the file whose
    name is stored in the variable LISTING_URLS_FILE.

    Arguments:

    br -- Browser object
    """

    # if the listing URLs have already been cached, there's nothing to do
    if os.path.exists(LISTING_URLS_FILE):
        return

    makedir(os.path.dirname(LISTING_URLS_FILE))

    # submit the search form, restricting results to Williamsburg (area 302)
    br.open(SEARCH_URL)
    br.select_form(nr=1)
    # print br.form
    br.form['area[]'] = ['302']

    response = br.submit()
    results_url = response.geturl()

    with safe_write(LISTING_URLS_FILE) as f:
        while True:

            # cache this page of search results and parse it
            filename = download_url(br, results_url, SEARCH_RESULTS_DIR)
            soup = BeautifulSoup(file(filename).read())

            # extract the URL of each listing on this page
            results = soup.findAll('div', attrs={'class': 'details_title'})

            urls = []
            for r in results:
                r = r.find('h5')
                r = r.find('a')
                r = r.get('href')
                urls.append('http://streeteasy.com' + r)

            # equivalently:
            # urls = ['http://www.streeteasy.com' + r.find('h5').find('a').get('href') for r in soup.findAll('div', attrs={'class': 'details_title'})]

            f.write('\n'.join(urls))
            f.write('\n')
            f.flush()

            # follow the "next page" link; stop when there isn't one
            nav = soup.find('a', attrs={'class': 'next_page'})
            try:
                results_url = 'http://www.streeteasy.com' + nav.get('href')
            except AttributeError:
                break

def get_listing_pages(br):
    """
    Caches the contents of each URL in the file whose name is stored
    in the variable LISTING_URLS_FILE to the directory whose name is
    stored in the variable LISTING_PAGES_DIR. The contents of each URL
    will be stored in a file whose name is that URL's md5 hash (a
    sketch of the assumed download_url helper follows this function).

    Arguments:

    br -- Browser object
    """

    listing_urls = [url.strip() for url in file(LISTING_URLS_FILE)]

    for url in iterview(listing_urls):
        try:
            download_url(br, url, LISTING_PAGES_DIR)
        except Exception as e:
            # report the failure but keep downloading the remaining listings
            print >> sys.stderr, '\n', (url, e)

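# download_url (also from utilities) is what actually implements the caching
# described in the docstrings above: each page is saved under a filename
# derived from its URL's md5 hash. A minimal sketch under the assumption that
# it also skips URLs already on disk (not the real helper):
#
#     from hashlib import md5
#
#     def download_url(br, url, dirname):
#         makedir(dirname)
#         filename = os.path.join(dirname, md5(url).hexdigest())
#         if not os.path.exists(filename):
#             with safe_write(filename) as f:
#                 f.write(br.open(url).read())
#         return filename
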
def get_listing_data():
    """
    Extracts the price and number of bedrooms from each cached listing
    page and writes them, tab-separated and keyed by the cached page's
    filename, to the file whose name is stored in the variable CSV_FILE.
    """

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            contents = file(filename).read()
            # print contents

            try:
                [obj] = re.findall(r'dataLayer\s*=\s*\[(.*)\];', contents)
                obj = loads(obj)
            except ValueError:
                continue  # skip pages whose dataLayer JSON can't be extracted

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']), str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()

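# The regular expression above pulls the single JSON object out of a script
# block embedded in each StreetEasy listing page; an illustrative, made-up
# example of the kind of line it matches:
#
#     dataLayer = [{"listPrice": 3200, "listBed": 2, ...}];
#
# json.loads then turns the captured object into a dict, from which listPrice
# and listBed are written to the CSV.
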
def main():

    br = Browser()
    br.set_handle_robots(False)  # ignore robots.txt

    get_listing_urls(br)
    get_listing_pages(br)
    get_listing_data()


if __name__ == '__main__':
    main()

# Quick visualization of the scraped data: scatter of price (column 1 of
# data/data.csv) against number of bedrooms (column 2).
from numpy import loadtxt
from pylab import plot, show, xlim

data = loadtxt('data/data.csv', usecols=[1, 2])

plot(data[:, 0], data[:, 1], 'x')
# xlim(0, 8000)
show()