Example #1
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'austin.csv'
    csv_writer = get_csv_writer(filename, fields)

    print "Starting with url %s" % URL_ROOT
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='col3_content')

    resultsHeader = content.p.text
    m = re.search(r'(?P<count>\d+) records found', resultsHeader)
    total_results = m.group('count').strip()
    print "Total Results: %s " % total_results

    data_table = content.table
    for data_row in data_table.find_all('tr'):
        if not getattr(data_row, 'td', False):
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        facility['url'] = "%s#%s@%s" % (URL_ROOT,
                                        facility['name'],
                                        facility['location'])
        facility['city'] = 'Austin'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[4])
        facility['score'] = get_value(cells[5])
        print "Facility: %s" % facility
        csv_writer.writerow(facility)
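
This and the following examples rely on module-level helpers and constants that the excerpts do not show (get_csv_writer, get_value, URL_ROOT, SEARCH_PARAMS, and later _scrape_content, BRK and SECONDS_THROTTLE). As a rough sketch only, and not the original implementations, get_csv_writer and get_value might look something like this, given the Python 2 csv.DictWriter usage seen above:

import csv

def get_csv_writer(filename, fields):
    # Assumed behaviour: open the output file (binary mode for the Python 2
    # csv module) and return a DictWriter keyed by the field list, header written.
    out_file = open(filename, 'wb')
    writer = csv.DictWriter(out_file, fieldnames=fields)
    writer.writeheader()
    return writer

def get_value(cell):
    # Assumed behaviour: collapse a BeautifulSoup table cell to its stripped text.
    return cell.get_text(strip=True)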
Example #2
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'austin.csv'
    csv_writer = get_csv_writer(filename, fields)

    print "Starting with url %s" % URL_ROOT
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='col3_content')

    resultsHeader = content.p.text
    m = re.search(r'(?P<count>\d+) records found', resultsHeader)
    total_results = m.group('count').strip()
    print "Total Results: %s " % total_results

    data_table = content.table
    for data_row in data_table.find_all('tr'):
        if not getattr(data_row, 'td', False):
            continue
        cells = data_row.find_all('td')
        facility = {}
        facility['name'] = get_value(cells[0])
        facility['location'] = get_value(cells[1])
        facility['url'] = "%s#%s@%s" % (URL_ROOT, facility['name'],
                                        facility['location'])
        facility['city'] = 'Austin'
        facility['zip'] = get_value(cells[3])
        facility['date'] = get_value(cells[4])
        facility['score'] = get_value(cells[5])
        print "Facility: %s" % facility
        csv_writer.writerow(facility)
Example #3
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'whoodat.csv'
    csv_writer = get_csv_writer(filename, fields)
    
    page_number = 0
    more_to_do = True

    # Have to use mechanize to get past the pain in the ASPX form handling
    br = mechanize.Browser()
    br.user_agent_alias = 'Linux Firefox'

    while more_to_do:

        print BRK
        # SEARCH_PARAMS['pageIndex'] = page_number
        search_url = "%s%s%i%s" % (URL_ROOT, '/Results.aspx?pageIndex=',
                                   page_number, '&pageSize=150')
        search_resp = br.open(search_url)
        content = search_resp.read()
        soup = BeautifulSoup(content)

        # An empty results page means we have paged past the last record
        if soup.find('div', text=re.compile('No records to display')):
            more_to_do = False
        else:
            facilities = _scrape_content(content, csv_writer)
            time.sleep(SECONDS_THROTTLE)
            page_number += 1

    print "============= DONE ==============="
Example #4
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'tulsa.csv'
    csv_writer = get_csv_writer(filename, fields)

    search_url = "%s%s" % (URL_ROOT, 'index.cfm')
    print "Starting with url %s" % search_url
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(search_url, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    content = soup.find(id='content')
    _scrape_content(content, csv_writer)
Example #5
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'lasvegas.csv'
    csv_writer = get_csv_writer(filename, fields)
    more_to_do = True
    rest_start = 0

    while more_to_do:

        SEARCH_PARAMS['start'] = rest_start
        search_resp = requests.post(URL_ROOT, data=SEARCH_PARAMS)
        print BRK

        # we have to fix this almost valid JSON response
        json_blob = search_resp.content
        json_blob = json_blob.replace('total', '"total"')
        json_blob = json_blob.replace('restaurants', '"restaurants"')
        response_list = json.loads(json_blob)
        response_count = len(response_list['restaurants'])

        if response_count == 0:
            more_to_do = False
        else:
            for rest in response_list['restaurants']:
                facility = {}
                facility['url'] = 'http://www.southernnevadahealthdistrict.org/restaurants/inspections.php'
                facility['name'] = rest['restaurant_name']
                facility['location'] = "%s" % rest['address']
                facility['city'] = "%s, %s" % (rest['city_name'], rest['state'])
                facility['zip'] = rest['zip_code']
                facility['date'] = rest['date_current'].split(' ')[0]
                facility['score'] = 100 - int(rest['demerits'])

                print "Facility: %s" % facility
                csv_writer.writerow(facility)

            rest_start += response_count

        time.sleep(SECONDS_THROTTLE)
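
The endpoint in this example returns a blob that is almost JSON: the top-level keys total and restaurants come back unquoted, so the two replace() calls quote them before json.loads can parse the result. A made-up blob (not the real response) illustrating the repair:

import json

# Made-up response shaped like the one the replacements above expect;
# only the two top-level keys are unquoted.
raw = '{total: 2, restaurants: [{"restaurant_name": "A"}, {"restaurant_name": "B"}]}'
fixed = raw.replace('total', '"total"').replace('restaurants', '"restaurants"')
data = json.loads(fixed)
print len(data['restaurants'])  # -> 2

Because these are plain substring replacements, they would also mangle any value that happened to contain the words total or restaurants; the original code accepts that risk.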
Example #6
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'dallas.csv'
    csv_writer = get_csv_writer(filename, fields)

    print "Starting with url %s" % URL_ROOT
    print "POST params %s" % SEARCH_PARAMS
    resp = requests.post("%s%s" % (URL_ROOT, 'SearchScoresAction.cfm'),
                         data=SEARCH_PARAMS)
    soup = BeautifulSoup(resp.content)
    body = soup.find('body')

    resultsHeader = body.find('span', {'class': 'style14'}).text
    m = re.search(r'Found (?P<count>\d+) records', resultsHeader)
    total_results = m.group('count').strip()
    print "Total Results: %s " % total_results

    data_table = body.find_all('table')[1]
    write_data_table(data_table, csv_writer)
    check_for_next_page(body, resp.cookies, csv_writer)
Example #7
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'sanantonio.csv'
    csv_writer = get_csv_writer(filename, fields)

    search_url = "%s%s" % (URL_ROOT, 'search.cfm')
    print "Starting with url %s" % search_url
    print "POST params %s" % SEARCH_PARAMS
    search_resp = requests.post(search_url, data=SEARCH_PARAMS)
    soup = BeautifulSoup(search_resp.content)
    content = soup.find('td')

    page_links = content.find_all(href=re.compile("search"))

    for page_link in page_links:
        next_page_url = "%s%s" % (URL_ROOT, page_link['href'])
        next_page_content = requests.get(next_page_url)
        next_page_soup = BeautifulSoup(next_page_content.content)
        content = next_page_soup.find('td')
        _scrape_content(content, csv_writer)
Example #8
def main(argv=None):
    fields = ['url', 'name', 'location', 'city', 'zip', 'date', 'score']
    filename = 'sandiego.csv'
    csv_writer = get_csv_writer(filename, fields)

    # Have to use mechanize to get past ASPX form handling
    br = mechanize.Browser()
    br.user_agent_alias = 'Linux Firefox'
    start_search_url = URL_ROOT + '/default.aspx'
    start_search = br.open(start_search_url)

    soup = BeautifulSoup(start_search.read())
    city_list = soup.find('select', id='lbCity').find_all('option')
    city_list.pop(0)  # get rid of 'Select a City'

    for city in city_list:
        print BRK
        br.select_form(name='Form1')
        br.form.set_all_readonly(False)  # allow changing the .value of all controls
        br["__EVENTTARGET"] = 'lbCity'
        br['lbCity'] = [city.get_text()]
        city_results = br.submit()

        # Check whether this city's results have a 'Next' page button
        br.select_form(name='Form1')
        try:
            more_pages = br.form.find_control("btnNext")
        except mechanize.ControlNotFoundError:
            more_pages = False

        if more_pages:
            br.form.set_all_readonly(False)  # allow changing the .value of all controls
            br["__EVENTTARGET"] = 'Linkbutton3'
            city_results = br.submit()

        facilities = _scrape_content(city_results, csv_writer)

        time.sleep(SECONDS_THROTTLE)
        br.open(start_search_url)
    return
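
The __EVENTTARGET dance in this example mimics what the page's own JavaScript does: WebForms controls post back by calling __doPostBack(target, argument), which fills the hidden __EVENTTARGET/__EVENTARGUMENT fields and submits the form along with __VIEWSTATE. mechanize carries those hidden fields automatically; a requests-only equivalent, sketched here under the assumption that the page uses the standard WebForms hidden fields and not as part of the original scraper, would have to echo them back by hand:

import requests
from bs4 import BeautifulSoup

def aspx_postback(session, url, page_html, event_target, overrides=None):
    # Collect every hidden input (__VIEWSTATE, __EVENTVALIDATION, ...) from the
    # current page and post it back, naming the control that "fired" the event.
    soup = BeautifulSoup(page_html)
    data = {}
    for hidden in soup.find_all('input', type='hidden'):
        if hidden.get('name'):
            data[hidden['name']] = hidden.get('value', '')
    data['__EVENTTARGET'] = event_target
    data['__EVENTARGUMENT'] = ''
    data.update(overrides or {})
    return session.post(url, data=data)

Hypothetical usage mirroring the city selection above: aspx_postback(sess, URL_ROOT + '/default.aspx', first_page.content, 'lbCity', {'lbCity': city_name}).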