# scraper.py -- forked from scrapers/ccb_licenses
# -*- coding: utf-8 -*-
import argparse
import json
import re
import urllib
import helper
# Base URI of the CCB search site; main() passes it to LicenseScraper.
BASE_URI = 'http://search.ccb.state.or.us'
def main():
    """Command-line entry point: run one CCB search and print the result.

    Reads a positional search term and an optional ``--name`` flag from the
    command line, runs the search through a LicenseScraper built on
    BASE_URI, and prints the string that ``search`` returns.
    """
    parser = argparse.ArgumentParser(description='Scrape CCB License Information')
    parser.add_argument('term', help='The search term')
    parser.add_argument('--name', action='store_true',
                        help='Indicates the search term is a person\'s name')
    args = parser.parse_args()

    client = LicenseScraper(BASE_URI)
    # The flag only decides which keyword of search() receives the term.
    if args.name:
        output = client.search(name=args.term)
    else:
        output = client.search(term=args.term)

    # Single-argument parenthesised form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(output)
class LicenseScraper(helper.Scraper):
    """Scraper for the CCB license search results page.

    Relies on ``get`` and ``resolve`` inherited from ``helper.Scraper``.
    NOTE(review): ``get`` is assumed to return a BeautifulSoup-style document
    (``findAll``/``find``/``text`` are called on it) -- confirm in helper.
    """

    # Punctuation stripped from every query: apostrophe, hyphen, parentheses.
    _PUNCTUATION = re.compile(r'[\'\-\(\)]')

    # Number of <td> cells that make up a single result inside a row.
    _CELLS_PER_RESULT = 5

    def search(self, term=None, name=None):
        """Scrape CCB License Information

        CCB WEB SITE SEARCH TIPS

        - The most accurate search is by CCB license number. If you have that
          number in your records, you will save time by finding it.
        - Make the search fairly broad and include a key word. So, search for
          "Schmitt" instead of "Schmitt Contracting Inc" or search for
          "concrete" instead of "Joes Concrete".
        - Leave out punctuation such as apostrophes and hyphens.
        - If you're searching for a phone number, search on the numbers only,
          i.e. 5039991234 instead of (503) 999-1234.
        - Verify your spelling. A simple misspelling of a word will cause your
          search to fail.

        :param term: generic search term; used only when ``name`` is empty.
        :param name: person's name; sent prefixed with ``%``
            (presumably a wildcard understood by the site -- confirm).
        :returns: JSON string (``indent=4``) encoding a list with one entry
            per result table that produced at least one row; see ``parse``
            for the shape of each entry.
        """
        query = '%%%s' % name if name else (term or '')
        # The site's own tips say to leave punctuation out, so drop it here.
        query = self._PUNCTUATION.sub('', query)

        content = self.get('/search/search_results_list.asp',
                           params=dict(search_criteria=query))
        tables = content.findAll('table', attrs={'class': 'bodyText'})

        # parse() yields None for tables without usable results; skip those.
        parsed = [group for group in (self.parse(table) for table in tables)
                  if group]
        return json.dumps(parsed, indent=4)

    def parse(self, table):
        """Extract all data from a table of results

        :param table: a ``<table>`` element taken from the results page.
        :returns: ``{'description': ..., 'total': ..., 'results': [...]}``
            where each result maps the header keys to one row's values, or
            ``None`` when the table has no title/header rows or no valid
            result rows.
        """
        trs = table.findAll('tr')
        # A results table needs a title row and a header row. Anything
        # shorter is skipped instead of raising IndexError, which would
        # abort the whole search.
        if len(trs) < 2:
            return None
        title_cell = trs[0].find('td')
        if title_cell is None:
            return None

        description = title_cell.text.split('&mdash')[0].strip()
        header = self.header(trs[1])  # the header is the second row all the time

        rows = []
        for tr in table.findAll('tr', attrs={'valign': 'top'}):
            cells = tr.findAll('td')
            rows.extend(self.row(tds)
                        for tds in chunks(cells, self._CELLS_PER_RESULT))

        # row() returns None for cell groups that are not real results.
        results = [dict(zip(header, row)) for row in rows if row]
        if not results:
            return None
        return {'description': description,
                'total': len(results),
                'results': results}

    def header(self, tr):
        """Extract the header from the table (used as keys for the data items)

        :param tr: the header ``<tr>`` element.
        :returns: tuple of the slugified text of every cell, followed by the
            extra key ``'url'`` that pairs with the link added by ``row``.
        """
        return tuple([helper.slugify(td.text) for td in tr.findAll('td')] + ['url'])

    def row(self, tds):
        """Parse CCB/OCHI/ID #, status, name, address, detail_url.

        :param tds: sequence of up to five ``<td>`` elements; the fifth is
            expected to contain the link to the detail page.
        :returns: tuple of the four stripped cell texts plus the resolved
            detail URL, or ``None`` when the cells do not form a valid row.
        """
        try:
            href = tds[4].find('a')['href']
            values = [td.text.strip('\r\n ') for td in tds[:4]]
        except (AttributeError, IndexError, KeyError, TypeError):
            # Not a valid row: fewer than five cells, no link in the fifth
            # cell, or a link without an href.
            return None
        return tuple(values + [self.resolve('/search/%s' % href)])

    def summary(self, url):
        """Placeholder for scraping a detail page; not implemented yet."""
        pass
def chunks(items, length=1):
    """Split ``items`` into consecutive slices of at most ``length`` elements.

    Some result rows carry more than five cells, so callers use this to
    break a long run of cells into fixed-size groups. The final slice is
    shorter when ``len(items)`` is not a multiple of ``length``.
    """
    pieces = []
    for start in range(0, len(items), length):
        stop = start + length
        pieces.append(items[start:stop])
    return pieces
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()