# Inner wrapper produced by a decorator: `func` (the wrapped command), along with
# typing, logging, Scraper, and SQLiteCache, comes from the enclosing module.
def newfunc(
    header: typing.List[str],
    retries: int,
    retry_wait: int,
    rpm: int,
    timeout: int,
    user_agent: str,
    verbosity: int,
    verify: bool,
    fastmode: bool,
    **kwargs: str,
) -> None:
    scraper = Scraper(
        requests_per_minute=rpm,
        retry_attempts=retries,
        retry_wait_seconds=retry_wait,
        verify=verify,
    )
    scraper.timeout = timeout
    scraper.user_agent = user_agent
    # only update headers, don't overwrite defaults
    scraper.headers.update(
        {k.strip(): v.strip() for k, v in [h.split(":") for h in header]}
    )
    if fastmode:
        scraper.cache_storage = SQLiteCache("spatula-cache.db")
        scraper.cache_write_only = False

    if verbosity == -1:
        level = logging.INFO if func.__name__ != "test" else logging.DEBUG
    elif verbosity == 0:  # pragma: no cover
        level = logging.ERROR
    elif verbosity == 1:  # pragma: no cover
        level = logging.INFO
    elif verbosity >= 2:  # pragma: no cover
        level = logging.DEBUG
    if verbosity < 3:
        # replace parent library logging
        logging.getLogger("scrapelib").setLevel(logging.ERROR)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.basicConfig(level=level)

    return func(**kwargs, scraper=scraper)
    ('veteran', 'veterans', 'shinseki', 'va'),
    ('affordable care act', 'obamacare', 'healthcare', 'health care', 'insurance'),
    ('ukraine', 'ukrainian', 'crimea'),
    ('unemployed', 'unemployment')]
#('palestine', 'palestinians'),
#('israel', 'israeli', 'palestine', 'palestinians'),
#('iraq', 'iraqis', 'iraqs'),
#('executive order', 'executive action'),
#('economy', 'economic'),

ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'
CSV_PATH = 'briefing_links.csv'

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
s.cache_write_only = False


@task(default=True)
def update():
    """
    Stub function for updating app-specific data.
    """
    #update_featured_social()


@task
def scrape_briefings():
    for index in range(0, 22):
        list = '%s?page=%i' % (ROOT_URL, index)
        print 'parsing %s' % list
        write_corpus(list)
import unicodecsv
from lxml.html import fromstring
from urlparse import urljoin
from urllib import quote_plus
import re
import os, os.path

OUTPUT_FILE = "zctas.csv"

from scrapelib import Scraper, FileCache, HTTPError

BASE_URL = 'https://en.wikipedia.org/wiki/'

# would like to follow robots, but think that the robots parser is broken...
s = Scraper(requests_per_minute=90, follow_robots=False)
s.cache_storage = FileCache('../wikipedia_cache')
s.cache_write_only = False


def test_zips():
    existing = {}
    if os.path.exists(OUTPUT_FILE):
        # load any previously written rows, then back up the old output file
        for row in unicodecsv.DictReader(open(OUTPUT_FILE)):
            existing[row['zip']] = row
        os.rename(OUTPUT_FILE, OUTPUT_FILE + '.bak')

    r = unicodecsv.DictReader(open("2013_Gaz_zcta_national.txt"), delimiter="\t")
    f = r.fieldnames
    writer = unicodecsv.DictWriter(open(OUTPUT_FILE, "w"), ['zip', 'wiki_url'])
    writer.writerow({'zip': 'zip', 'wiki_url': 'wiki_url'})
    hits = misses = 0
import codecs
import csv
import urlparse

from scrapelib import Scraper, FileCache, HTTPError

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('walmart_cache')
s.cache_write_only = False


def read_csv():
    with open('scrapedsearch.csv', 'rb') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "link", "date", "description"])
        for row in reader:
            print row
            if row['link'] != 'link':
                scrape_release(row)


def scrape_release(row):
    path = urlparse.urlparse(row['link'])[2]
    components = path.split('/')
    if len(components) > 4:
        year = components[-4]
        month = components[-3]
        day = components[-2]
        slug = components[-1]
        filename = '%s-%s-%s-%s' % (year, month, day, slug)
#!/usr/bin/env python
import unicodecsv
import statestyle
import re
from lxml.html import fromstring
from urlparse import urljoin

# I copied this code from scrape_states.py
from scrapelib import Scraper, FileCache

s = Scraper(requests_per_minute=60, follow_robots=False)
s.cache_storage = FileCache('wikipedia_cache')
s.cache_write_only = False

# My Stuff
CD_LIST = 'https://en.wikipedia.org/wiki/List_of_United_States_congressional_districts'
NON_VOTING = ['American Samoa', 'District of Columbia', 'Guam',
              'Northern Mariana Islands', 'Puerto Rico',
              'United States Virgin Islands']
NOT_STATES = ['Philippines', 'U.S. Virgin Islands']


def parse_cd_file():
    writer = unicodecsv.writer(open('cd_wiki_data.csv', 'w'))
    writer.writerow(['full_geoid', 'wiki_url'])
    response = s.urlopen(CD_LIST)
    doc = fromstring(response)
    for h2 in doc.findall('.//h2')[2:59]:
        for span in h2.find_class('mw-headline'):
            if span.text_content() in NOT_STATES:
                break