Example #1
    def newfunc(
        header: typing.List[str],
        retries: int,
        retry_wait: int,
        rpm: int,
        timeout: int,
        user_agent: str,
        verbosity: int,
        verify: bool,
        fastmode: bool,
        **kwargs: str,
    ) -> None:
        scraper = Scraper(
            requests_per_minute=rpm,
            retry_attempts=retries,
            retry_wait_seconds=retry_wait,
            verify=verify,
        )
        scraper.timeout = timeout
        scraper.user_agent = user_agent
        # only update headers, don't overwrite defaults;
        # split on the first ":" only, so header values may themselves contain colons
        scraper.headers.update(
            {k.strip(): v.strip()
             for k, v in [h.split(":", 1) for h in header]})
        if fastmode:
            scraper.cache_storage = SQLiteCache("spatula-cache.db")
            scraper.cache_write_only = False

        if verbosity == -1:
            level = logging.INFO if func.__name__ != "test" else logging.DEBUG
        elif verbosity == 0:  # pragma: no cover
            level = logging.ERROR
        elif verbosity == 1:  # pragma: no cover
            level = logging.INFO
        elif verbosity >= 2:  # pragma: no cover
            level = logging.DEBUG

        if verbosity < 3:
            # replace parent library logging
            logging.getLogger("scrapelib").setLevel(logging.ERROR)
            logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.basicConfig(level=level)

        return func(**kwargs, scraper=scraper)
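Since newfunc closes over func and ends with func(**kwargs, scraper=scraper), it is evidently the inner function of a decorator that injects a configured Scraper into a command. scrapelib's Scraper behaves like a requests.Session, so the receiving command can use it directly; a minimal sketch of such a command (the function name and arguments are illustrative assumptions, not from the original):

import scrapelib

def list_page(url: str, scraper: scrapelib.Scraper) -> None:
    # requests-style call; the rate limit, retries, and (in fastmode)
    # the SQLite cache configured above all apply transparently
    response = scraper.get(url)
    print(response.status_code, len(response.text))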
Example #2
            ('affordable care act', 'obamacare', 'healthcare', 'health care',
             'insurance'), ('ukraine', 'ukrainian', 'crimea'),
            ('unemployed', 'unemployment')]

#('palestine', 'palestinians'),
#('israel', 'israeli', 'palestine', 'palestinians'),
#('iraq', 'iraqis', 'iraqs'),
#('executive order', 'executive action'),
#('economy', 'economic'),

ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'
CSV_PATH = 'briefing_links.csv'

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
s.cache_write_only = False


@task(default=True)
def update():
    """
    Stub function for updating app-specific data.
    """
    #update_featured_social()


@task
def scrape_briefings():
    for index in range(0, 22):
        page_url = '%s?page=%i' % (ROOT_URL, index)  # renamed: don't shadow the list builtin
        print('parsing %s' % page_url)
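The loop above only builds the per-page listing URLs; the rest of the example is cut off before any page is fetched. A hedged sketch of what one iteration might look like with this cached scraper (the helper name and XPath are assumptions, not from the original script):

from lxml.html import fromstring

def scrape_listing_page(page_url):
    # responses land in press_briefing_cache, so re-runs hit the disk cache
    response = s.urlopen(page_url)
    doc = fromstring(response)
    # hypothetical XPath; the real page structure is not shown above
    return [a.get('href') for a in doc.xpath('//h3/a')]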
Example #3
#!/usr/bin/env python

import unicodecsv
import statestyle
import re
from lxml.html import fromstring
from urlparse import urljoin

# I copied this code from scrape_states.py

from scrapelib import Scraper, FileCache

s = Scraper(requests_per_minute=60, follow_robots=False)
s.cache_storage = FileCache('wikipedia_cache')
s.cache_write_only = False

# My Stuff

CD_LIST = 'https://en.wikipedia.org/wiki/List_of_United_States_congressional_districts'
NON_VOTING = ['American Samoa', 'District of Columbia', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'United States Virgin Islands']
NOT_STATES = ['Philippines', 'U.S. Virgin Islands']

def parse_cd_file():
    """Scrape the Wikipedia list of congressional districts and write
    each district's geoid and wiki URL to a CSV."""
    writer = unicodecsv.writer(open('cd_wiki_data.csv', 'w'))
    writer.writerow(['full_geoid', 'wiki_url'])
    response = s.urlopen(CD_LIST)
    doc = fromstring(response)
    # the h2 headings between indexes 2 and 58 are the per-state sections
    for h2 in doc.findall('.//h2')[2:59]:
        for span in h2.find_class('mw-headline'):
            if span.text_content() in NOT_STATES:
                break
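One detail shared by the last two examples: pairing FileCache with cache_write_only = False makes the scraper read previously stored responses as well as write them, so re-running a script replays cached pages instead of re-downloading. A small sketch of that behavior (not part of the original script):

# With cache_write_only = False, a repeated request for the same URL is
# served from the wikipedia_cache directory instead of being re-fetched.
first = s.urlopen(CD_LIST)    # first run: fetched and written to the cache
second = s.urlopen(CD_LIST)   # later runs: read back from the on-disk cache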