def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'austin.csv'
    csv_writer = get_csv_writer(filename, fields)

    print "Starting with url %s" % URL_ROOT
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)

    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='col3_content')
    results_header = content.p.text
    m = re.search(r'(?P<count>\d+) records found', results_header)
    total_results = m.group('count').strip()
    print "Total Results: %s " % total_results

    data_table = content.table
    for data_row in data_table.find_all('tr'):
        # Skip header rows, which have no <td> cells
        if not data_row.td:
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        # Synthesize a unique URL from the root URL, name, and location
        facility['url'] = "%s#%s@%s" % (URL_ROOT, facility['name'], facility['location'])
        facility['city'] = 'Austin'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[4])
        facility['score'] = get_value(cells[5])
        print "Facility: %s" % facility
        csv_writer.writerow(facility)
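# All of these scrapers call two small helpers that are defined elsewhere in
# the repo. A minimal sketch of what they presumably look like, assuming
# get_csv_writer wraps a csv.DictWriter over the given field names and
# get_value just pulls stripped text out of a table cell:
import csv

def get_csv_writer(filename, fields):
    # Hypothetical implementation: open the output file, write the header
    # row up front, and return a writer keyed on the scraper's field list
    csv_file = open(filename, 'wb')
    writer = csv.DictWriter(csv_file, fieldnames=fields)
    writer.writeheader()
    return writer

def get_value(cell):
    # Hypothetical implementation: collapse a BeautifulSoup <td> to its text
    return cell.get_text().strip()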
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'whoodat.csv'
    csv_writer = get_csv_writer(filename, fields)

    page_number = 0
    more_to_do = True
    # Have to use mechanize to get past the pain in the ASPX form handling
    br = mechanize.Browser()
    br.user_agent_alias = 'Linux Firefox'

    while more_to_do:
        print BRK
        search_url = "%s%s%i%s" % (URL_ROOT, '/Results.aspx?pageIndex=', page_number, '&pageSize=150')
        search_resp = br.open(search_url)
        content = search_resp.read()
        soup = BeautifulSoup(content)
        if soup.find('div', text=re.compile('No records to display')):
            # An empty results page means we have run out of pages
            more_to_do = False
        else:
            _scrape_content(content, csv_writer)
            time.sleep(SECONDS_THROTTLE)
            page_number += 1

    print "============= DONE ==============="
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'tulsa.csv'
    csv_writer = get_csv_writer(filename, fields)

    search_url = "%s%s" % (URL_ROOT, 'index.cfm')
    print "Starting with url %s" % search_url
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(search_url, data=SEARCH_PARAMS)

    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='content')
    _scrape_content(content, csv_writer)
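# Several scrapers (Tulsa above, plus the paginated ones below) hand their
# results markup to a per-city _scrape_content helper that is not shown
# here. Each city's version knows its own table layout, so this sketch is
# purely illustrative: the column indices and city name are made up, and
# the real helpers follow the same row loop as the Austin scraper above.
def _scrape_content(content, csv_writer):
    # Accept either raw HTML or an already-parsed BeautifulSoup element
    soup = content if hasattr(content, 'find_all') else BeautifulSoup(content)
    for data_row in soup.find_all('tr'):
        if not data_row.td:  # skip header rows with no <td> cells
            continue
        cells = data_row.find_all('td')
        facility = {
            'url': URL_ROOT,
            'name': get_value(cells[0]),
            'location': get_value(cells[1]),
            'city': 'Tulsa',  # hypothetical: each scraper hardcodes its city
            'zip': get_value(cells[2]),
            'date': get_value(cells[3]),
            'score': get_value(cells[4]),
        }
        print "Facility: %s" % facility
        csv_writer.writerow(facility)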
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'lasvegas.csv'
    csv_writer = get_csv_writer(filename, fields)

    more_to_do = True
    rest_start = 0
    while more_to_do:
        SEARCH_PARAMS['start'] = rest_start
        search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
        print BRK

        # The response is almost-valid JSON: the two top-level keys come
        # back unquoted, so quote them before handing the blob to json.loads
        json_blob = search_resp.content
        json_blob = json_blob.replace('total', '"total"')
        json_blob = json_blob.replace('restaurants', '"restaurants"')
        response_list = json.loads(json_blob)

        response_count = len(response_list['restaurants'])
        if response_count == 0:
            more_to_do = False
        else:
            for rest in response_list['restaurants']:
                facility = {}
                facility['url'] = 'http://www.southernnevadahealthdistrict.org/restaurants/inspections.php'
                facility['name'] = rest['restaurant_name']
                facility['location'] = "%s" % rest['address']
                facility['city'] = "%s, %s" % (rest['city_name'], rest['state'])
                facility['zip'] = rest['zip_code']
                facility['date'] = rest['date_current'].split(' ')[0]
                # The feed reports demerits; convert to a 100-point score
                facility['score'] = 100 - int(rest['demerits'])
                print "Facility: %s" % facility
                csv_writer.writerow(facility)
            rest_start = rest_start + response_count
            time.sleep(SECONDS_THROTTLE)
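# The quoting trick above exists because the endpoint returns a
# JavaScript-style object literal whose two top-level keys are bare
# identifiers, which json.loads rejects. A toy illustration (the blob
# content here is made up, not real response data):
#
#     blob = '{total: 2, restaurants: [{"restaurant_name": "Example Diner"}]}'
#     fixed = blob.replace('total', '"total"').replace('restaurants', '"restaurants"')
#     json.loads(fixed)['total']  # -> 2
#
# Note that a blind str.replace would also mangle any value that happened
# to contain those substrings; it only holds up because the feed's field
# names and data avoid them.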
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'dallas.csv'
    csv_writer = get_csv_writer(filename, fields)

    print "Starting with url %s" % URL_ROOT
    print "POST params %s" % SEARCH_PARAMS
    resp = requests.post("%s%s" % (URL_ROOT, 'SearchScoresAction.cfm'), data=SEARCH_PARAMS)

    soup = BeautifulSoup(resp.content)
    body = soup.find('body')
    results_header = body.find('span', {'class': 'style14'}).text
    m = re.search(r'Found (?P<count>\d+) records', results_header)
    total_results = m.group('count').strip()
    print "Total Results: %s " % total_results

    # The second <table> in the page body holds the inspection results
    data_table = body.find_all('table')[1]
    write_data_table(data_table, csv_writer)
    # Results span multiple pages; hand the session cookies along so the
    # follow-up page requests stay within the same search
    check_for_next_page(body, resp.cookies, csv_writer)
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'sanantonio.csv'
    csv_writer = get_csv_writer(filename, fields)

    search_url = "%s%s" % (URL_ROOT, 'search.cfm')
    print "Starting with url %s" % search_url
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(search_url, data=SEARCH_PARAMS)

    soup = BeautifulSoup(search_resp.content)
    content = soup.find('td')
    # The first results page links out to every page of the search, so
    # walk each "search" link and scrape the page it points at
    page_links = content.find_all(href=re.compile("search"))
    for page_link in page_links:
        next_page_url = "%s%s" % (URL_ROOT, page_link['href'])
        next_page_resp = requests.get(next_page_url)
        next_page_soup = BeautifulSoup(next_page_resp.content)
        content = next_page_soup.find('td')
        _scrape_content(content, csv_writer)
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'sandiego.csv'
    csv_writer = get_csv_writer(filename, fields)

    # Have to use mechanize to get past ASPX form handling
    br = mechanize.Browser()
    br.user_agent_alias = 'Linux Firefox'
    start_search_url = URL_ROOT + '/default.aspx'
    start_search = br.open(start_search_url)
    soup = BeautifulSoup(start_search.read())

    city_list = soup.find('select', id='lbCity').find_all('option')
    city_list.pop(0)  # get rid of 'Select a City'
    for city in city_list:
        print BRK
        br.select_form(name='Form1')
        br.form.set_all_readonly(False)  # allow changing the .value of all controls
        br["__EVENTTARGET"] = 'lbCity'
        br['lbCity'] = [city.get_text()]
        city_results = br.submit()

        br.select_form(name='Form1')
        try:
            more_pages = br.form.find_control("btnNext")
        except mechanize.ControlNotFoundError:
            more_pages = False
        if more_pages:
            # Fire the Linkbutton3 postback before scraping the results
            br.form.set_all_readonly(False)  # allow changing the .value of all controls
            br["__EVENTTARGET"] = 'Linkbutton3'
            city_results = br.submit()

        _scrape_content(city_results, csv_writer)
        time.sleep(SECONDS_THROTTLE)
        br.open(start_search_url)
    return
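# A note on the __EVENTTARGET writes above: ASP.NET WebForms pages carry a
# hidden __EVENTTARGET field that client-side JavaScript (__doPostBack)
# fills in to tell the server which control fired the postback. mechanize
# treats hidden inputs as read-only, hence the set_all_readonly(False)
# calls; writing 'lbCity' or 'Linkbutton3' into the field and submitting
# replicates what clicking those controls would do in a real browser.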