import re
import time

import requests
from bs4 import BeautifulSoup

# URL_ROOT, SEARCH_PARAMS, SECONDS_THROTTLE, and PAGE_SIZE are module-level
# constants defined elsewhere in this repo, as are the get_value() and
# get_csv_writer() helpers.


def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'austin.csv'
    csv_writer = get_csv_writer(filename, fields)

    print "Starting with url %s" % URL_ROOT
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)

    # Pull the result count out of the header paragraph.
    content = soup.find(id='col3_content')
    resultsHeader = content.p.text
    m = re.search(r'(?P<count>\d+) records found', resultsHeader)
    total_results = m.group('count').strip()
    print "Total Results: %s " % total_results

    data_table = content.table
    for data_row in data_table.find_all('tr'):
        # Skip any row without data cells (e.g. the header row).
        if not data_row.td:
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        # Synthesize a unique URL from the search URL plus name/location.
        facility['url'] = "%s#%s@%s" % (
            URL_ROOT, facility['name'], facility['location'])
        facility['city'] = 'Austin'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[4])
        facility['score'] = get_value(cells[5])
        print "Facility: %s" % facility
        csv_writer.writerow(facility)
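# All three scrapers lean on get_value() and get_csv_writer(), whose bodies
# are not in this excerpt. A minimal sketch of what they might look like,
# assuming get_value() just extracts stripped text from a tag and
# get_csv_writer() wraps csv.DictWriter; the real helpers may differ.
import csv


def get_value(cell):
    # Hypothetical helper: collapse a tag's text to a clean string.
    return cell.get_text().strip()


def get_csv_writer(filename, fields):
    # Hypothetical helper: open the output file and emit a header row.
    # 'wb' mode is what the Python 2 csv module expects; the handle is
    # left open for the life of the script.
    f = open(filename, 'wb')
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    return writer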
def _scrape_content(content, csv_writer):
    # Print the search-results header text, omitting its last two tokens.
    resultsHeader = content.find(id='searchResultsHeader')
    print ' '.join(list(resultsHeader.stripped_strings)[:-2])

    facility_links = content.find(id='searchResults').find_all(
        'a', {'class': 'resultMore'})
    for facility_link in facility_links:
        facility_url = "%s%s" % (URL_ROOT, facility_link['href'])
        time.sleep(SECONDS_THROTTLE)
        facility_resp = requests.get(facility_url)
        facility_soup = BeautifulSoup(facility_resp.content)

        # Skip facilities with no recorded inspections.
        inspection_history = facility_soup.find(id='inspectionHistory')
        if inspection_history is None or inspection_history.ul is None:
            continue
        if inspection_history.ul.li is None:
            continue

        # Follow the most recent inspection's detail page.
        latest_inspection = inspection_history.ul.li
        latest_inspection_link = latest_inspection.a
        inspection_url = "%s%s" % (URL_ROOT, latest_inspection_link['href'])
        time.sleep(SECONDS_THROTTLE)
        inspection_resp = requests.get(inspection_url)
        inspection_soup = BeautifulSoup(inspection_resp.content)
        inspection_detail = inspection_soup.find(id='inspectionDetail')

        facility = {}
        facility['name'] = get_value(inspection_detail.h3)
        m = re.search(r'Location: (?P<location>.*)[\r\n\t]+Smoking',
                      inspection_detail.text)
        facility['location'] = m.group('location').strip()
        facility['url'] = inspection_url
        facility['city'] = 'Tulsa'
        m = re.search(r'.*(?P<zip>\d{5})$', facility['location'])
        facility['zip'] = m.group('zip').strip()

        inspection_info = inspection_detail.find(id='inspectionInfo')
        inspection_date_row = inspection_info.table.tr
        facility['date'] = get_value(inspection_date_row.find_all('td')[1])

        inspection_violations = inspection_detail.find(
            id='inspectionViolations')
        facility['score'] = _get_inspection_score(inspection_violations)

        print "Facility: %s" % facility
        csv_writer.writerow(facility)

    # Recurse into the next page of search results, if any.
    next_page_link = content.find('a', text='Next %s' % PAGE_SIZE)
    if next_page_link:
        time.sleep(SECONDS_THROTTLE)
        next_url = "%s%s" % (URL_ROOT, next_page_link['href'])
        search_resp = requests.get(next_url)
        soup = BeautifulSoup(search_resp.content)
        content = soup.find(id='content')
        _scrape_content(content, csv_writer)
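# The Tulsa scraper calls _get_inspection_score(), whose body is also not in
# this excerpt. A speculative placeholder, assuming the violations block is a
# list of <li> items and the "score" is just the violation count; the real
# scoring logic may be entirely different.
def _get_inspection_score(inspection_violations):
    if inspection_violations is None:
        return 0
    return len(inspection_violations.find_all('li'))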
def write_data_table(data_table, csv_writer):
    for data_row in data_table.find_all('tr'):
        # Skip the header row (its first cell reads 'Name').
        if data_row.td.string == u'Name':
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        # Synthesize a unique URL from the search URL plus name/location.
        facility['url'] = "%s#%s@%s" % (
            URL_ROOT, facility['name'], facility['location'])
        facility['city'] = 'Dallas'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[5])
        facility['score'] = get_value(cells[6])
        print "Facility: %s" % facility
        csv_writer.writerow(facility)
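# A hypothetical driver for write_data_table(), mirroring the Austin
# scraper's main() above. The actual Dallas entry point (search URL, POST
# params, and how the results table is located) is not shown in this
# excerpt and may differ.
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    csv_writer = get_csv_writer('dallas.csv', fields)
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    # Assumption: the first table in the response holds the results.
    data_table = soup.find('table')
    write_data_table(data_table, csv_writer)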