Example #1
    def newfunc(
        header: typing.List[str],
        retries: int,
        retry_wait: int,
        rpm: int,
        timeout: int,
        user_agent: str,
        verbosity: int,
        verify: bool,
        fastmode: bool,
        **kwargs: str,
    ) -> None:
        scraper = Scraper(
            requests_per_minute=rpm,
            retry_attempts=retries,
            retry_wait_seconds=retry_wait,
            verify=verify,
        )
        scraper.timeout = timeout
        scraper.user_agent = user_agent
        # only update headers, don't overwrite defaults
        scraper.headers.update(
            {k.strip(): v.strip()
             for k, v in [h.split(":") for h in header]})
        if fastmode:
            scraper.cache_storage = SQLiteCache("spatula-cache.db")
            scraper.cache_write_only = False

        if verbosity == -1:
            level = logging.INFO if func.__name__ != "test" else logging.DEBUG
        elif verbosity == 0:  # pragma: no cover
            level = logging.ERROR
        elif verbosity == 1:  # pragma: no cover
            level = logging.INFO
        elif verbosity >= 2:  # pragma: no cover
            level = logging.DEBUG

        if verbosity < 3:
            # replace parent library logging
            logging.getLogger("scrapelib").setLevel(logging.ERROR)
            logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.basicConfig(level=level)

        return func(**kwargs, scraper=scraper)
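
This snippet is the inner wrapper of a decorator (it closes over func), building a configured Scraper and handing it to the wrapped CLI command. The header-merge step takes each "Name: value" string, splits it on the colon, strips whitespace, and layers the result over the scraper's default headers. Below is a standalone sketch of just that step; the sample header strings are hypothetical, and maxsplit=1 is used here so a value that itself contains a colon survives intact:

# Standalone sketch of the header-merge step above; sample strings are hypothetical.
header = ["Accept: application/json", "X-Api-Key: abc123"]
parsed = {k.strip(): v.strip() for k, v in (h.split(":", 1) for h in header)}
print(parsed)  # {'Accept': 'application/json', 'X-Api-Key': 'abc123'}
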
Example #2
            ('veteran', 'veterans', 'shinseki', 'va'),
            ('affordable care act', 'obamacare', 'healthcare', 'health care', 'insurance'),
            ('ukraine', 'ukrainian', 'crimea'),
            ('unemployed', 'unemployment')]

#('palestine', 'palestinians'),
#('israel', 'israeli', 'palestine', 'palestinians'),
#('iraq', 'iraqis', 'iraqs'),
#('executive order', 'executive action'),
#('economy', 'economic'),

ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'
CSV_PATH = 'briefing_links.csv'

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
s.cache_write_only = False


@task(default=True)
def update():
    """
    Stub function for updating app-specific data.
    """
    #update_featured_social()


@task
def scrape_briefings():
    for index in range(0, 22):
        list = '%s?page=%i' % (ROOT_URL, index)
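
With cache_write_only set to False, the FileCache above is read as well as written, so re-running the Fabric task serves already-fetched briefing pages from the press_briefing_cache directory instead of downloading them again. A minimal sketch of that behavior, assuming an older scrapelib release where FileCache is importable from the top-level package and the Scraper still exposes urlopen(), as in these snippets:

# Sketch of the cache behavior; the URL is one of the listing pages built above.
from scrapelib import Scraper, FileCache

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
s.cache_write_only = False   # read from the cache, not just write to it

url = 'http://www.whitehouse.gov/briefing-room/press-briefings?page=0'
first = s.urlopen(url)       # network request; response stored in press_briefing_cache
second = s.urlopen(url)      # served from the cache, no second network request
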
Example #3
#
import unicodecsv
from lxml.html import fromstring
from urlparse import urljoin
from urllib import quote_plus
import re
import os, os.path

OUTPUT_FILE = "zctas.csv"

from scrapelib import Scraper, FileCache, HTTPError
BASE_URL = 'https://en.wikipedia.org/wiki/'

# would like to follow robots, but think that the robots parser is broken...
s = Scraper(requests_per_minute=90, follow_robots=False)
s.cache_storage = FileCache('../wikipedia_cache')
s.cache_write_only = False


def test_zips():
    existing = {}
    if os.path.exists(OUTPUT_FILE):
        for row in unicodecsv.DictReader(open(OUTPUT_FILE)):
            existing[row['zip']] = row
        os.rename(OUTPUT_FILE, OUTPUT_FILE + '.bak')
    r = unicodecsv.DictReader(open("2013_Gaz_zcta_national.txt"),
                              delimiter="\t")
    f = r.fieldnames
    writer = unicodecsv.DictWriter(open(OUTPUT_FILE, "w"), ['zip', 'wiki_url'])
    writer.writerow({'zip': 'zip', 'wiki_url': 'wiki_url'})
    hits = misses = 0
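
The output columns above are 'zip' and 'wiki_url', and HTTPError is imported alongside the scraper, which suggests the elided remainder maps each ZCTA from the Census gazetteer file to a Wikipedia URL and treats an error response as a miss. A sketch of that lookup pattern under those assumptions; the wiki_url_for_zip helper and its URL scheme are hypothetical, and scrapelib raises HTTPError for 4xx/5xx responses when raise_errors is left at its default:

# Hypothetical lookup helper illustrating the Scraper + HTTPError pattern above.
from scrapelib import Scraper, FileCache, HTTPError

s = Scraper(requests_per_minute=90)
s.cache_storage = FileCache('../wikipedia_cache')
s.cache_write_only = False

def wiki_url_for_zip(zipcode):
    url = 'https://en.wikipedia.org/wiki/' + zipcode   # illustrative URL scheme
    try:
        s.urlopen(url)
    except HTTPError:
        return None          # no article found for this ZCTA
    return url
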
Example #4
import codecs
import csv
import urlparse

from scrapelib import Scraper, FileCache, HTTPError

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('walmart_cache')
s.cache_write_only = False


def read_csv():
    with open('scrapedsearch.csv', 'rb') as f:
        reader = csv.DictReader(
            f, fieldnames=["title", "link", "date", "description"])
        for row in reader:
            print row
            if row['link'] != 'link':
                scrape_release(row)


def scrape_release(row):
    path = urlparse.urlparse(row['link'])[2]
    components = path.split('/')
    if len(components) > 4:
        year = components[-4]
        month = components[-3]
        day = components[-2]
        slug = components[-1]

        filename = '%s-%s-%s-%s' % (year, month, day, slug)
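
The path handling above pulls the year, month, day, and slug out of a press-release URL and joins them into a filename. A Python 3 restatement of the same indexing on a made-up URL (the snippet itself is Python 2 and uses the old urlparse module):

# Python 3 restatement of the path-splitting step; the sample URL is made up.
from urllib.parse import urlparse

link = 'http://news.walmart.com/news-archive/2014/05/06/sample-release-slug'
path = urlparse(link).path                 # '/news-archive/2014/05/06/sample-release-slug'
components = path.split('/')               # 6 components, so len(components) > 4
year, month, day, slug = components[-4:]
filename = '%s-%s-%s-%s' % (year, month, day, slug)
print(filename)                            # 2014-05-06-sample-release-slug
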
Example #5
#!/usr/bin/env python

import unicodecsv
import statestyle
import re
from lxml.html import fromstring
from urlparse import urljoin

# I copied this code from scrape_states.py

from scrapelib import Scraper, FileCache

s = Scraper(requests_per_minute=60, follow_robots=False)
s.cache_storage = FileCache('wikipedia_cache')
s.cache_write_only = False

# My Stuff

CD_LIST = 'https://en.wikipedia.org/wiki/List_of_United_States_congressional_districts'
NON_VOTING = ['American Samoa', 'District of Columbia', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'United States Virgin Islands']
NOT_STATES = ['Philippines', 'U.S. Virgin Islands']

def parse_cd_file():
	writer = unicodecsv.writer(open('cd_wiki_data.csv', 'w'))
	writer.writerow(['full_geoid', 'wiki_url'])
	response = s.urlopen(CD_LIST)
	doc = fromstring(response)
	for h2 in doc.findall('.//h2')[2:59]:
		for span in h2.find_class('mw-headline'):
			if span.text_content() in NOT_STATES:
				break	
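
The parsing loop above relies on a common lxml.html pattern: findall('.//h2') to walk the state headings, find_class('mw-headline') to get each heading span, and text_content() to read its text. A self-contained sketch of that pattern on an inline HTML fragment rather than the live Wikipedia page:

# Minimal sketch of the findall / find_class / text_content pattern used above.
from lxml.html import fromstring

html = '<html><body><h2><span class="mw-headline">Alabama</span></h2></body></html>'
doc = fromstring(html)
for h2 in doc.findall('.//h2'):
    for span in h2.find_class('mw-headline'):
        print(span.text_content())   # Alabama
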
Example #6
    ('affordable care act', 'obamacare', 'healthcare', 'health care', 'insurance'),
    ('ukraine', 'ukrainian', 'crimea'),
    ('unemployed', 'unemployment')
]
    
    #('palestine', 'palestinians'),
    #('israel', 'israeli', 'palestine', 'palestinians'),
    #('iraq', 'iraqis', 'iraqs'),
    #('executive order', 'executive action'),
    #('economy', 'economic'),

ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'
CSV_PATH = 'briefing_links.csv'

s = Scraper(requests_per_minute=60)
s.cache_storage = FileCache('press_briefing_cache')
s.cache_write_only = False

@task(default=True)
def update():
    """
    Stub function for updating app-specific data.
    """
    #update_featured_social()

@task
def scrape_briefings():
    for index in range(0, 22):
        list = '%s?page=%i' % (ROOT_URL, index)
        print 'parsing %s' % list
        write_corpus(list)
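
The loop above walks 22 pages of the press-briefing index by appending ?page=N to ROOT_URL (note that the variable named list shadows the Python built-in; it works, but a more descriptive name avoids the shadowing). A short sketch of the URLs it generates, leaving out write_corpus, which belongs to the elided remainder of the fabfile:

# Sketch of the listing URLs generated by the loop above.
ROOT_URL = 'http://www.whitehouse.gov/briefing-room/press-briefings'

for index in range(0, 22):
    listing_url = '%s?page=%i' % (ROOT_URL, index)
    print(listing_url)   # ...?page=0 through ...?page=21
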