def get_session_details():
    """
    We will fetch a list of available sessions from the 'bill locator'
    page.  We won't get legislators for all these sessions, but all bills
    for these sessions are available and we want to be able to get to them.
    """
    scraper = Scraper()
    nm_locator_url = 'http://legis.state.nm.us/lcs/locator.aspx'
    with scraper.urlopen(nm_locator_url) as page:
        page = BeautifulSoup(page)
        # The first `tr` is simply 'Bill Locator'. Ignoring that
        session_rows = page.find('table', id='ctl00_mainCopy_Locators')('tr')[1:]
        for row in session_rows:
            link = row.find('a')
            # Session label is split across <span> tags; stitch it back together.
            label = ' '.join(span.string.strip() for span in link('span')).strip()
            session_year, sub_session_name = SESSION_NAME_RE.match(label).groups()
            details = metadata['session_details']
            if session_year in metadata['sessions']:
                # Known year: record the sub-session if it's new.
                sub_sessions = details[session_year]['sub_sessions']
                if sub_session_name not in sub_sessions:
                    sub_sessions.append(sub_session_name)
            else:
                # First time we've seen this year: register it.
                metadata['sessions'].append(session_year)
                details[session_year] = dict(years=session_year,
                                             sub_sessions=[sub_session_name])
def newfunc(
    header: typing.List[str],
    retries: int,
    retry_wait: int,
    rpm: int,
    timeout: int,
    user_agent: str,
    verbosity: int,
    verify: bool,
    fastmode: bool,
    **kwargs: str,
) -> None:
    """
    Configure a ``Scraper`` and logging from CLI options, then invoke the
    wrapped ``func`` (captured from the enclosing scope) with the remaining
    keyword arguments plus the configured scraper.

    :param header: raw ``"Name: value"`` header strings to merge into the
        scraper's default headers
    :param retries: number of retry attempts for failed requests
    :param retry_wait: seconds to wait between retries
    :param rpm: request throttle, in requests per minute
    :param timeout: per-request timeout in seconds
    :param user_agent: User-Agent string for outgoing requests
    :param verbosity: -1 = auto, 0 = errors only, 1 = info, >=2 = debug
    :param verify: whether to verify TLS certificates
    :param fastmode: enable a local SQLite response cache
    :param kwargs: forwarded unchanged to ``func``
    """
    scraper = Scraper(
        requests_per_minute=rpm,
        retry_attempts=retries,
        retry_wait_seconds=retry_wait,
        verify=verify,
    )
    scraper.timeout = timeout
    scraper.user_agent = user_agent
    # only update headers, don't overwrite defaults
    # BUGFIX: split on the first ':' only, so header *values* may contain
    # colons themselves (e.g. "Referer: http://example.com").
    scraper.headers.update(
        {k.strip(): v.strip() for k, v in [h.split(":", 1) for h in header]})
    if fastmode:
        scraper.cache_storage = SQLiteCache("spatula-cache.db")
        scraper.cache_write_only = False

    # BUGFIX: original `if verbosity == -1` left `level` unbound for any
    # verbosity < -1, raising NameError below; `<=` covers that case.
    if verbosity <= -1:
        # auto: quiet by default, but verbose for the `test` command
        level = logging.INFO if func.__name__ != "test" else logging.DEBUG
    elif verbosity == 0:  # pragma: no cover
        level = logging.ERROR
    elif verbosity == 1:  # pragma: no cover
        level = logging.INFO
    else:  # pragma: no cover  (verbosity >= 2)
        level = logging.DEBUG
    if verbosity < 3:
        # replace parent library logging
        logging.getLogger("scrapelib").setLevel(logging.ERROR)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.basicConfig(level=level)
    return func(**kwargs, scraper=scraper)
def test_to_items_scout():
    # In scout mode each item carries its data plus the next page it would visit.
    scraper = Scraper()
    items = list(FirstPage()._to_items(scraper, scout=True))
    expected = [
        {"data": {"first": n}, "__next__": "SecondPage source=NullSource"}
        for n in (1, 2, 3)
    ]
    assert items == expected
SYNONYMS = [('isis', 'isil', 'islamic state'), ('veteran', 'veterans', 'shinseki', 'va'), ('affordable care act', 'obamacare', 'healthcare', 'health care', 'insurance'), ('ukraine', 'ukrainian', 'crimea'), ('unemployed', 'unemployment')] #('palestine', 'palestinians'), #('israel', 'israeli', 'palestine', 'palestinians'), #('iraq', 'iraqis', 'iraqs'), #('executive order', 'executive action'), #('economy', 'economic'), ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings' CSV_PATH = 'briefing_links.csv' s = Scraper(requests_per_minute=60) s.cache_storage = FileCache('press_briefing_cache') s.cache_write_only = False @task(default=True) def update(): """ Stub function for updating app-specific data. """ #update_featured_social() @task def scrape_briefings(): for index in range(0, 22):
# reading in what was output on the last run can make things more efficient. # import unicodecsv from lxml.html import fromstring from urlparse import urljoin from urllib import quote_plus import re import os, os.path OUTPUT_FILE = "zctas.csv" from scrapelib import Scraper, FileCache, HTTPError BASE_URL = 'https://en.wikipedia.org/wiki/' # would like to follow robots, but think that the robots parser is broken... s = Scraper(requests_per_minute=90, follow_robots=False) s.cache_storage = FileCache('../wikipedia_cache') s.cache_write_only = False def test_zips(): existing = {} if os.path.exists(OUTPUT_FILE): for row in unicodecsv.DictReader(open(OUTPUT_FILE)): existing[row['zip']] = row os.rename(OUTPUT_FILE, OUTPUT_FILE + '.bak') r = unicodecsv.DictReader(open("2013_Gaz_zcta_national.txt"), delimiter="\t") f = r.fieldnames writer = unicodecsv.DictWriter(open(OUTPUT_FILE, "w"), ['zip', 'wiki_url']) writer.writerow({'zip': 'zip', 'wiki_url': 'wiki_url'})
def test_source_no_timeout():
    # A 1-second delay endpoint should succeed when no timeout is set.
    response = URL("https://httpbin.org/delay/1").get_response(Scraper())
    assert response.status_code == 200
def test_source_timeout():
    # With a 0.1s timeout, a 1-second delay endpoint must raise a timeout
    # error (surfaced as an OSError subclass).
    slow_source = URL("https://httpbin.org/delay/1", timeout=0.1)
    scraper = Scraper()
    with pytest.raises(OSError):
        slow_source.get_response(scraper)