def get_session_details():
    """
    Fetch the list of available sessions from the 'bill locator' page.

    We won't get legislators for all these sessions, but all bills for
    these sessions are available and we want to be able to get to them.
    Results are accumulated into the module-level ``metadata`` dict
    (``sessions`` list plus per-year ``session_details`` entries).
    """
    http = Scraper()
    locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with http.urlopen(locator_url) as raw_page:
        soup = BeautifulSoup(raw_page)
        # Skip the first `tr`, which is just the 'Bill Locator' header row.
        rows = soup.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
        for row in rows:
            link = row.find('a')
            label = ' '.join(span.string.strip() for span in link('span')).strip()
            year, sub_name = SESSION_NAME_RE.match(label).groups()
            if year in metadata['sessions']:
                # Known year: record the sub-session if it's new.
                subs = metadata['session_details'][year]['sub_sessions']
                if sub_name not in subs:
                    subs.append(sub_name)
            else:
                # First time we see this year: register it with its sub-session.
                metadata['sessions'].append(year)
                metadata['session_details'][year] = {'years': year,
                                                     'sub_sessions': [sub_name]}
def lxmlize(url, encoding="utf-8", user_agent=requests.utils.default_user_agent()):
    """Fetch *url* and return it as an lxml HTML document.

    The response is re-encoded with *encoding* unless it is already
    utf-8 unicode text. A ``<meta http-equiv="refresh">`` redirect, if
    present, is followed recursively; otherwise all links in the
    document are made absolute against *url* before it is returned.
    """
    fetcher = Scrapelib(follow_robots=False, requests_per_minute=0)
    fetcher.user_agent = user_agent
    body = fetcher.urlopen(url)
    # Only utf-8 text that is already unicode is passed through as-is.
    if not isinstance(body, unicode) or encoding != "utf-8":
        body = body.encode(encoding)
    doc = lxml.html.fromstring(body)
    refresh = doc.xpath('//meta[@http-equiv="refresh"]')
    if refresh:
        # Content looks like "N; url=<target>" -- follow the redirect.
        _, target = refresh[0].attrib["content"].split("=", 1)
        return lxmlize(target, encoding)
    doc.make_links_absolute(url)
    return doc
def import_bills(state, last_updated, cache_dir, data_dir):
    """Import bills for *state*, either from the API or from local JSON.

    When *last_updated* (a 'YYYY-MM-DD' string) is given, bills updated
    since that date are fetched from the remote bill index API (using
    *cache_dir* for the scraper cache). Otherwise every bill JSON file
    under ``data_dir/<state>/bills/<state>/`` is loaded from disk, with
    a simple self-imposed rate limit between files. Each bill dict is
    handed to ``process_bill``.
    """
    if last_updated:
        scraper = Scraper(cache_dir=cache_dir)
        url = BILL_INDEX + "?%s"
        query = {'state': state,
                 'updated_since': last_updated,  # YYYY-MM-DD
                 'apikey': settings.SUNLIGHT_API_KEY}
        query = urllib.urlencode(query)
        url = url % query
        with scraper.urlopen(url) as bill_index:
            bills = json.loads(bill_index)
        for b in bills:
            # Bill ids can contain spaces/slashes, so quote them for the URL.
            url = BILL_INDEX + "%s/%s/%s/?apikey=%s" % (b['state'],
                                                        b['session'],
                                                        urllib.quote(b['bill_id']),
                                                        settings.SUNLIGHT_API_KEY)
            with scraper.urlopen(url) as bill_page:
                bill = json.loads(bill_page)
            process_bill(bill)
    else:
        pattern = os.path.join(data_dir, state, 'bills', state)
        sessions = Session.objects.values_list('name')
        _request_frequency = 1
        _last_request = 0
        for session in sessions:
            for chamber in ('upper', 'lower'):
                # values_list() rows are 1-tuples; session[0] is the name.
                paths = glob.glob(os.path.join(pattern, session[0], chamber, '*'))
                for path in sorted(paths):
                    # Throttle ourselves to at most one file per
                    # _request_frequency seconds.
                    now = time.time()
                    diff = _request_frequency - (now - _last_request)
                    if diff > 0:
                        print("sleeping for %fs" % diff)
                        time.sleep(diff)
                        _last_request = time.time()
                    else:
                        _last_request = now
                    # FIX: use a context manager so the file is closed even
                    # if json.load raises (original leaked the handle then).
                    with open(path, 'rb') as page:
                        bill = json.load(page)
                    process_bill(bill)
def lxmlize(url, encoding='utf-8', user_agent=requests.utils.default_user_agent()):
    """Fetch *url* and return it as an lxml HTML document.

    The response body is re-encoded with *encoding* unless it is already
    utf-8 unicode text. A ``<meta http-equiv="refresh">`` redirect, if
    present, is followed recursively; otherwise links are made absolute
    against *url* before the document is returned.
    """
    scraper = Scrapelib(follow_robots=False, requests_per_minute=0)
    scraper.user_agent = user_agent
    entry = scraper.urlopen(url)
    if encoding != 'utf-8' or not isinstance(entry, unicode):
        entry = entry.encode(encoding)
    page = lxml.html.fromstring(entry)
    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        # The refresh content looks like "N; url=<target>".
        _, url = meta[0].attrib['content'].split('=', 1)
        # FIX: propagate user_agent — the original recursed with
        # lxmlize(url, encoding), silently reverting redirected
        # requests to the default user agent.
        return lxmlize(url, encoding, user_agent)
    else:
        page.make_links_absolute(url)
        return page