示例#1
0
def main(argv=None):
    """Scrape Austin restaurant inspection results into austin.csv.

    POSTs the search form once, parses the single results table, and
    writes one CSV row per facility.

    Args:
        argv: Unused; kept for the conventional script-entry signature.
    """
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'austin.csv'
    csv_writer = get_csv_writer(filename, fields)

    print("Starting with url %s" % URL_ROOT)
    print("POST params %s" % SEARCH_PARAMS)
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='col3_content')

    # The first paragraph reads e.g. "123 records found".
    results_header = content.p.text
    m = re.search(r'(?P<count>\d+) records found', results_header)
    if m is None:
        # Page layout changed; bail out instead of crashing on m.group().
        print("Could not parse result count from: %s" % results_header)
        return
    total_results = m.group('count').strip()
    print("Total Results: %s " % total_results)

    data_table = content.table
    for data_row in data_table.find_all('tr'):
        # Skip header/separator rows that contain no <td> cells.
        if data_row.td is None:
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        # No per-facility detail page exists; synthesize a unique URL.
        facility['url'] = "%s#%s@%s" % (URL_ROOT, facility['name'],
                                        facility['location'])
        facility['city'] = 'Austin'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[4])
        facility['score'] = get_value(cells[5])
        print("Facility: %s" % facility)
        csv_writer.writerow(facility)
示例#2
0
def main(argv=None):
    """Scrape Austin restaurant inspection results into austin.csv.

    Submits the search form via POST, parses the results table, and
    writes one CSV row per facility found.

    Args:
        argv: Unused; present for the standard script-entry convention.
    """
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'austin.csv'
    csv_writer = get_csv_writer(filename, fields)

    print("Starting with url %s" % URL_ROOT)
    print("POST params %s" % SEARCH_PARAMS)
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='col3_content')

    # First paragraph of the content div reads e.g. "123 records found".
    results_header = content.p.text
    m = re.search(r'(?P<count>\d+) records found', results_header)
    if m is None:
        # Guard against a layout change instead of crashing on m.group().
        print("Could not parse result count from: %s" % results_header)
        return
    total_results = m.group('count').strip()
    print("Total Results: %s " % total_results)

    data_table = content.table
    for data_row in data_table.find_all('tr'):
        # Header rows have no <td> children; skip them.
        if data_row.td is None:
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        # There is no detail page, so build a synthetic unique URL.
        facility['url'] = "%s#%s@%s" % (URL_ROOT,
                                        facility['name'],
                                        facility['location'])
        facility['city'] = 'Austin'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[4])
        facility['score'] = get_value(cells[5])
        print("Facility: %s" % facility)
        csv_writer.writerow(facility)
示例#3
0
def _scrape_content(content, csv_writer):
    """Scrape Tulsa search-result pages, writing one CSV row per facility.

    For each facility link on the page, fetches the facility page,
    follows its most recent inspection, extracts the fields, and writes
    a row. Pagination is followed iteratively (the original recursed,
    which risks hitting the recursion limit on many result pages).

    Args:
        content: BeautifulSoup tag for the page's main content element.
        csv_writer: csv.DictWriter-like object accepting facility dicts.
    """
    while content is not None:
        results_header = content.find(id='searchResultsHeader')
        print(' '.join(list(results_header.stripped_strings)[:-2]))

        facility_links = content.find(id='searchResults').find_all(
            'a', {'class': 'resultMore'})
        for facility_link in facility_links:
            facility_url = "%s%s" % (URL_ROOT, facility_link['href'])
            time.sleep(SECONDS_THROTTLE)
            facility_resp = requests.get(facility_url)
            facility_soup = BeautifulSoup(facility_resp.content)
            inspection_history = facility_soup.find(id='inspectionHistory')
            # Skip facilities with no recorded inspections.
            if not getattr(inspection_history, 'ul', False):
                continue
            if not getattr(inspection_history.ul, 'li', False):
                continue
            # The first list item is the most recent inspection.
            latest_inspection_link = inspection_history.ul.li.a
            inspection_url = "%s%s" % (URL_ROOT,
                                       latest_inspection_link['href'])
            time.sleep(SECONDS_THROTTLE)
            inspection_resp = requests.get(inspection_url)
            inspection_soup = BeautifulSoup(inspection_resp.content)
            inspection_detail = inspection_soup.find(id='inspectionDetail')

            facility = {}
            facility['name'] = get_value(inspection_detail.h3)
            m = re.search(r'Location:  (?P<location>.*)[\r\n\t]+Smoking',
                          inspection_detail.text)
            if m is None:
                # Layout changed; skip this facility instead of crashing.
                print("Could not parse location for %s" % inspection_url)
                continue
            facility['location'] = m.group('location').strip()
            facility['url'] = inspection_url
            facility['city'] = 'Tulsa'
            m = re.search(r'.*(?P<zip>\d{5})$', facility['location'])
            # Keep the row even when no zip is present in the location.
            facility['zip'] = m.group('zip').strip() if m else ''
            inspection_info = inspection_detail.find(id='inspectionInfo')
            inspection_date_row = inspection_info.table.tr
            facility['date'] = get_value(
                inspection_date_row.find_all('td')[1])
            inspection_violations = inspection_detail.find(
                id='inspectionViolations')
            facility['score'] = _get_inspection_score(inspection_violations)
            print("Facility: %s" % facility)
            csv_writer.writerow(facility)

        next_page_link = content.find('a', text='Next %s' % PAGE_SIZE)
        if not next_page_link:
            return
        time.sleep(SECONDS_THROTTLE)
        next_url = "%s%s" % (URL_ROOT, next_page_link['href'])
        search_resp = requests.get(next_url)
        soup = BeautifulSoup(search_resp.content)
        content = soup.find(id='content')
示例#4
0
def _scrape_content(content, csv_writer):
    """Scrape Tulsa search-result pages into CSV rows, one per facility.

    Visits each facility link, then its latest inspection page, pulls
    the facility fields out, and writes a row. "Next" pagination links
    are followed in a loop (the original used recursion, which can
    exhaust the recursion limit on large result sets).

    Args:
        content: BeautifulSoup tag for the page's main content element.
        csv_writer: csv.DictWriter-like object accepting facility dicts.
    """
    while content is not None:
        results_header = content.find(id='searchResultsHeader')
        print(' '.join(list(results_header.stripped_strings)[:-2]))

        facility_links = content.find(id='searchResults').find_all(
            'a', {'class': 'resultMore'})
        for facility_link in facility_links:
            facility_url = "%s%s" % (URL_ROOT, facility_link['href'])
            time.sleep(SECONDS_THROTTLE)
            facility_resp = requests.get(facility_url)
            facility_soup = BeautifulSoup(facility_resp.content)
            inspection_history = facility_soup.find(id='inspectionHistory')
            # No inspection list at all -> nothing to record.
            if not getattr(inspection_history, 'ul', False):
                continue
            if not getattr(inspection_history.ul, 'li', False):
                continue
            # First <li> holds the most recent inspection's link.
            latest_inspection = inspection_history.ul.li
            latest_inspection_link = latest_inspection.a
            inspection_url = "%s%s" % (URL_ROOT,
                                       latest_inspection_link['href'])
            time.sleep(SECONDS_THROTTLE)
            inspection_resp = requests.get(inspection_url)
            inspection_soup = BeautifulSoup(inspection_resp.content)
            inspection_detail = inspection_soup.find(id='inspectionDetail')

            facility = {}
            facility['name'] = get_value(inspection_detail.h3)
            m = re.search(r'Location:  (?P<location>.*)[\r\n\t]+Smoking',
                          inspection_detail.text)
            if m is None:
                # Unexpected page layout; skip rather than crash.
                print("Could not parse location for %s" % inspection_url)
                continue
            facility['location'] = m.group('location').strip()
            facility['url'] = inspection_url
            facility['city'] = 'Tulsa'
            m = re.search(r'.*(?P<zip>\d{5})$', facility['location'])
            # Still write the row when the location carries no zip code.
            facility['zip'] = m.group('zip').strip() if m else ''
            inspection_info = inspection_detail.find(id='inspectionInfo')
            inspection_date_row = inspection_info.table.tr
            facility['date'] = get_value(
                inspection_date_row.find_all('td')[1])
            inspection_violations = inspection_detail.find(
                id='inspectionViolations')
            facility['score'] = _get_inspection_score(inspection_violations)
            print("Facility: %s" % facility)
            csv_writer.writerow(facility)

        next_page_link = content.find('a', text='Next %s' % PAGE_SIZE)
        if not next_page_link:
            return
        time.sleep(SECONDS_THROTTLE)
        next_url = "%s%s" % (URL_ROOT, next_page_link['href'])
        search_resp = requests.get(next_url)
        soup = BeautifulSoup(search_resp.content)
        content = soup.find(id='content')
示例#5
0
def write_data_table(data_table, csv_writer):
    """Write one CSV row per facility row of the Dallas results table.

    Args:
        data_table: BeautifulSoup <table> tag of search results.
        csv_writer: csv.DictWriter-like object accepting facility dicts.
    """
    for data_row in data_table.find_all('tr'):
        # Skip rows without any <td> (the original raised AttributeError
        # on such rows) and the 'Name' header row.
        if data_row.td is None or data_row.td.string == u'Name':
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        # No per-facility detail page; synthesize a unique URL.
        facility['url'] = "%s#%s@%s" % (URL_ROOT,
                                        facility['name'],
                                        facility['location'])
        facility['city'] = 'Dallas'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[5])
        facility['score'] = get_value(cells[6])
        print("Facility: %s" % facility)
        csv_writer.writerow(facility)