Example #1
def get_session_details():
    """
    We will fetch a list of available sessions from the 'bill locator' page.
    We won't get legislators for all these sessions, but all bills for these
    sessions are available and we want to be able to get to them.
    """
    scraper = Scraper()

    nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with scraper.urlopen(nm_locator_url) as page:
        page = BeautifulSoup(page)

        # The first `tr` is just the 'Bill Locator' header row; skip it.
        data_table = page.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
        for session in data_table:
            session_tag = session.find('a')
            session_name = ' '.join([tag.string.strip() for tag in session_tag('span')]).strip()

            session_year, sub_session_name = SESSION_NAME_RE.match(session_name).groups()
            if session_year in metadata['sessions']:
                if sub_session_name not in metadata['session_details'][session_year]['sub_sessions']:
                    metadata['session_details'][session_year]['sub_sessions'].append(sub_session_name)
            else:
                metadata['sessions'].append(session_year)
                metadata['session_details'][session_year] = dict(
                    years=session_year, sub_sessions=[sub_session_name])
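The snippet above leans on module-level names it never defines. A minimal sketch of what it appears to assume, with the regex pattern and the metadata layout as illustrative guesses rather than the project's actual definitions:

import re

# Guessed pattern: split a locator label such as "2011 Regular" into a
# year and the rest of the session name (two groups, as the snippet's
# .groups() call expects).
SESSION_NAME_RE = re.compile(r'^(\d{4})\s*(.*)$')

# The snippet only requires these two keys to exist up front.
metadata = {
    'sessions': [],          # e.g. ['2011']
    'session_details': {},   # e.g. {'2011': {'years': '2011', 'sub_sessions': [...]}}
}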
Example #2
def lxmlize(url, encoding="utf-8", user_agent=requests.utils.default_user_agent()):
    scraper = Scrapelib(follow_robots=False, requests_per_minute=0)
    scraper.user_agent = user_agent
    entry = scraper.urlopen(url)
    if encoding != "utf-8" or not isinstance(entry, unicode):
        entry = entry.encode(encoding)
    page = lxml.html.fromstring(entry)
    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        _, url = meta[0].attrib["content"].split("=", 1)
        return lxmlize(url, encoding)
    else:
        page.make_links_absolute(url)
        return page
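A short usage sketch; the URL is only a placeholder, and requests, lxml.html, unicode, and Scrapelib (presumably an alias for scrapelib's Scraper class) are assumed to be available in the surrounding Python 2 module:

page = lxmlize('http://legis.state.nm.us/lcs/locator.aspx')
# make_links_absolute() has already run, so hrefs can be followed directly.
for href in page.xpath('//a/@href'):
    print href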
Example #3
def import_bills(state, last_updated, cache_dir, data_dir):
    if last_updated:
        scraper = Scraper(cache_dir=cache_dir)
        url = BILL_INDEX + "?%s"
        query = {'state': state, 'updated_since': last_updated, # YYYY-MM-DD
                 'apikey': settings.SUNLIGHT_API_KEY}
        query = urllib.urlencode(query)
        url = url % query
        with scraper.urlopen(url) as bill_index:
            bills = json.loads(bill_index)
            for b in bills:
                url = BILL_INDEX + "%s/%s/%s/?apikey=%s" % (b['state'], b['session'],
                                                  urllib.quote(b['bill_id']), settings.SUNLIGHT_API_KEY)
                with scraper.urlopen(url) as bill_page:
                    bill = json.loads(bill_page)
                    process_bill(bill)
    else:
        pattern = os.path.join(data_dir, state, 'bills', state)
        sessions = Session.objects.values_list('name')
        # Simple throttle: keep successive iterations at least
        # _request_frequency seconds apart.
        _request_frequency = 1
        _last_request = 0
        for session in sessions:
            for chamber in ('upper', 'lower'):
                paths = glob.glob(os.path.join(pattern, session[0], chamber, '*'))
                for path in sorted(paths):
                    now = time.time()
                    diff = _request_frequency - (now - _last_request)
                    if diff > 0:
                        print "sleeping for %fs" % diff
                        time.sleep(diff)
                        _last_request = time.time()
                    else:
                        _last_request = now
                    with open(path, 'rb') as page:
                        bill = json.load(page)
                    process_bill(bill)
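A hedged sketch of the two ways this function can be driven; the paths are placeholders and settings.SUNLIGHT_API_KEY is assumed to be configured as the code expects:

# Incremental mode: query the bill index API for bills updated since a date.
import_bills('nm', '2011-01-01', cache_dir='/tmp/billy-cache', data_dir=None)

# Full mode: no timestamp, so walk previously scraped JSON files on disk.
import_bills('nm', None, cache_dir=None, data_dir='/path/to/scraped/data')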