def get_session_details():
    """
    Fetch the list of available sessions from the 'bill locator' page.

    We won't get legislators for all these sessions, but all bills for
    these sessions are available and we want to be able to get to them.
    Results are accumulated into the module-level ``metadata`` dict
    (``sessions`` list plus per-year ``session_details`` entries).
    """
    http = Scraper()
    locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with http.urlopen(locator_url) as raw_page:
        soup = BeautifulSoup(raw_page)
        # Skip the first `tr`, which is just the 'Bill Locator' header row.
        rows = soup.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
        for row in rows:
            link = row.find('a')
            label = ' '.join(span.string.strip() for span in link('span')).strip()
            year, sub_name = SESSION_NAME_RE.match(label).groups()
            if year in metadata['sessions']:
                # Known year: record the sub-session if it's new.
                subs = metadata['session_details'][year]['sub_sessions']
                if sub_name not in subs:
                    subs.append(sub_name)
            else:
                # First time we see this year: register it with its sub-session.
                metadata['sessions'].append(year)
                metadata['session_details'][year] = {'years': year,
                                                     'sub_sessions': [sub_name]}
def lxmlize(url, encoding="utf-8", user_agent=requests.utils.default_user_agent()):
    """Fetch *url* and return it as an lxml HTML document.

    The response is re-encoded with *encoding* unless it is already
    utf-8 unicode text. A ``<meta http-equiv="refresh">`` redirect, if
    present, is followed recursively; otherwise all links in the
    document are made absolute against *url* before it is returned.
    """
    fetcher = Scrapelib(follow_robots=False, requests_per_minute=0)
    fetcher.user_agent = user_agent
    body = fetcher.urlopen(url)
    # Only utf-8 text that is already unicode is passed through as-is.
    if not isinstance(body, unicode) or encoding != "utf-8":
        body = body.encode(encoding)
    doc = lxml.html.fromstring(body)
    refresh = doc.xpath('//meta[@http-equiv="refresh"]')
    if refresh:
        # Content looks like "N; url=<target>" -- follow the redirect.
        _, target = refresh[0].attrib["content"].split("=", 1)
        return lxmlize(target, encoding)
    doc.make_links_absolute(url)
    return doc
def import_bills(state, last_updated, cache_dir, data_dir):
    """Import bills for *state*, either from the API or from local JSON.

    When *last_updated* (a 'YYYY-MM-DD' string) is given, bills updated
    since that date are fetched from the remote bill index API (using
    *cache_dir* for the scraper cache). Otherwise every bill JSON file
    under ``data_dir/<state>/bills/<state>/`` is loaded from disk, with
    a simple self-imposed rate limit between files. Each bill dict is
    handed to ``process_bill``.
    """
    if last_updated:
        scraper = Scraper(cache_dir=cache_dir)
        url = BILL_INDEX + "?%s"
        query = {'state': state,
                 'updated_since': last_updated,  # YYYY-MM-DD
                 'apikey': settings.SUNLIGHT_API_KEY}
        query = urllib.urlencode(query)
        url = url % query
        with scraper.urlopen(url) as bill_index:
            bills = json.loads(bill_index)
        for b in bills:
            # Bill ids can contain spaces/slashes, so quote them for the URL.
            url = BILL_INDEX + "%s/%s/%s/?apikey=%s" % (b['state'],
                                                        b['session'],
                                                        urllib.quote(b['bill_id']),
                                                        settings.SUNLIGHT_API_KEY)
            with scraper.urlopen(url) as bill_page:
                bill = json.loads(bill_page)
            process_bill(bill)
    else:
        pattern = os.path.join(data_dir, state, 'bills', state)
        sessions = Session.objects.values_list('name')
        _request_frequency = 1
        _last_request = 0
        for session in sessions:
            for chamber in ('upper', 'lower'):
                # values_list() rows are 1-tuples; session[0] is the name.
                paths = glob.glob(os.path.join(pattern, session[0], chamber, '*'))
                for path in sorted(paths):
                    # Throttle ourselves to at most one file per
                    # _request_frequency seconds.
                    now = time.time()
                    diff = _request_frequency - (now - _last_request)
                    if diff > 0:
                        print("sleeping for %fs" % diff)
                        time.sleep(diff)
                        _last_request = time.time()
                    else:
                        _last_request = now
                    # FIX: use a context manager so the file is closed even
                    # if json.load raises (original leaked the handle then).
                    with open(path, 'rb') as page:
                        bill = json.load(page)
                    process_bill(bill)
def lxmlize(url, encoding='utf-8', user_agent=requests.utils.default_user_agent()):
    """Fetch *url* and return it as an lxml HTML document.

    The response body is re-encoded with *encoding* unless it is already
    utf-8 unicode text. A ``<meta http-equiv="refresh">`` redirect, if
    present, is followed recursively; otherwise links are made absolute
    against *url* before the document is returned.
    """
    scraper = Scrapelib(follow_robots=False, requests_per_minute=0)
    scraper.user_agent = user_agent
    entry = scraper.urlopen(url)
    if encoding != 'utf-8' or not isinstance(entry, unicode):
        entry = entry.encode(encoding)
    page = lxml.html.fromstring(entry)
    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        # The refresh content looks like "N; url=<target>".
        _, url = meta[0].attrib['content'].split('=', 1)
        # FIX: propagate user_agent — the original recursed with
        # lxmlize(url, encoding), silently reverting redirected
        # requests to the default user agent.
        return lxmlize(url, encoding, user_agent)
    else:
        page.make_links_absolute(url)
        return page