Example #1
def get_session_details():
    """
    We will fetch a list of available sessions from the 'bill locator' page.
    We won't get legislators for all these sessions, but all bills for these
    sessions are available and we want to be able to get to them.
    """
    scraper = Scraper()

    nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with scraper.urlopen(nm_locator_url) as page:
        page = BeautifulSoup(page)

        # The first `tr` is just the 'Bill Locator' header row; skip it.
        data_table = page.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
        for session in data_table:
            session_tag = session.find('a')
            session_name = ' '.join([tag.string.strip() for tag in session_tag('span')]).strip()

            session_year, sub_session_name = SESSION_NAME_RE.match(session_name).groups()
            if session_year in metadata['sessions']:
                if sub_session_name not in metadata['session_details'][session_year]['sub_sessions']:
                    metadata['session_details'][session_year]['sub_sessions'].append(sub_session_name)
            else:
                metadata['sessions'].append(session_year)
                metadata['session_details'][session_year] = dict(
                    years=session_year, sub_sessions=[sub_session_name])
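The snippet above leans on module-level names it never defines. A minimal sketch of what it appears to assume, with the regex pattern and the metadata layout as illustrative guesses rather than the project's actual definitions:

import re

# Guessed pattern: split a locator label such as "2011 Regular" into a
# year and the rest of the session name (two groups, as the snippet's
# .groups() call expects).
SESSION_NAME_RE = re.compile(r'^(\d{4})\s*(.*)$')

# The snippet only requires these two keys to exist up front.
metadata = {
    'sessions': [],          # e.g. ['2011']
    'session_details': {},   # e.g. {'2011': {'years': '2011', 'sub_sessions': [...]}}
}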
Example #2
def lxmlize(url, encoding="utf-8", user_agent=requests.utils.default_user_agent()):
    scraper = Scrapelib(follow_robots=False, requests_per_minute=0)
    scraper.user_agent = user_agent
    entry = scraper.urlopen(url)
    if encoding != "utf-8" or not isinstance(entry, unicode):
        entry = entry.encode(encoding)
    page = lxml.html.fromstring(entry)
    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        _, url = meta[0].attrib["content"].split("=", 1)
        return lxmlize(url, encoding)
    else:
        page.make_links_absolute(url)
        return page
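A short usage sketch; the URL is only a placeholder, and requests, lxml.html, unicode, and Scrapelib (presumably an alias for scrapelib's Scraper class) are assumed to be available in the surrounding Python 2 module:

page = lxmlize('http://legis.state.nm.us/lcs/locator.aspx')
# make_links_absolute() has already run, so hrefs can be followed directly.
for href in page.xpath('//a/@href'):
    print href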
Example #3
def import_bills(state, last_updated, cache_dir, data_dir):
    if last_updated:
        scraper = Scraper(cache_dir=cache_dir)
        url = BILL_INDEX + "?%s"
        query = {'state': state, 'updated_since': last_updated, # YYYY-MM-DD
                 'apikey': settings.SUNLIGHT_API_KEY}
        query = urllib.urlencode(query)
        url = url % query
        with scraper.urlopen(url) as bill_index:
            bills = json.loads(bill_index)
            for b in bills:
                url = BILL_INDEX + "%s/%s/%s/?apikey=%s" % (b['state'], b['session'],
                                                  urllib.quote(b['bill_id']), settings.SUNLIGHT_API_KEY)
                with scraper.urlopen(url) as bill_page:
                    bill = json.loads(bill_page)
                    process_bill(bill)
    else:
        pattern = os.path.join(data_dir, state, 'bills', state)
        sessions = Session.objects.values_list('name')
        # Simple throttle: keep successive iterations at least
        # _request_frequency seconds apart.
        _request_frequency = 1
        _last_request = 0
        for session in sessions:
            for chamber in ('upper', 'lower'):
                paths = glob.glob(os.path.join(pattern, session[0], chamber, '*'))
                for path in sorted(paths):
                    now = time.time()
                    diff = _request_frequency - (now - _last_request)
                    if diff > 0:
                        print "sleeping for %fs" % diff
                        time.sleep(diff)
                        _last_request = time.time()
                    else:
                        _last_request = now
                    with open(path, 'rb') as page:
                        bill = json.load(page)
                    process_bill(bill)
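A hedged sketch of the two ways this function can be driven; the paths are placeholders and settings.SUNLIGHT_API_KEY is assumed to be configured as the code expects:

# Incremental mode: query the bill index API for bills updated since a date.
import_bills('nm', '2011-01-01', cache_dir='/tmp/billy-cache', data_dir=None)

# Full mode: no timestamp, so walk previously scraped JSON files on disk.
import_bills('nm', None, cache_dir=None, data_dir='/path/to/scraped/data')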