def scrape(self, session):
    """Iterates through all the sites in url_dict to extract new documents.

    This is implemented as follows:
      1. Download each of the pages.
      2. Extract the URLs from the pages.
      3. Check which of those URLs are not yet in our database.
      4. For each of the URLs that are not yet in our database, add them as
         a new Bid object to the database.
    """
    scraper = scrapelib.Scraper()
    for url, xpaths in self.url_dict.items():
        page = scraper.get(URL_PREFIX + url)
        # doc_ids is a dictionary: relative URL => title of doc
        doc_ids = results_page_scraper.scrape_results_page(
            page.content, xpaths)
        log.info("Found docs: {}".format(doc_ids))
        new_urls = get_new_urls(
            session,
            doc_ids.keys(),  # relative URL is the identifier
            self.get_site()
        )
        log.info("New docs: {}".format(new_urls))
        new_docs = self.add_new_documents(new_urls, doc_ids)
        session.add_all(new_docs)
        # Save all the new docs from this results page in one db call.
        session.commit()

def import_versions(state, rpm=60):
    scraper = scrapelib.Scraper(requests_per_minute=rpm)

    for bill in db.bills.find({'state': state}):
        logging.info("Importing %s" % bill['bill_id'])

        bill_changed = False
        for version in bill['versions']:
            if 'document_id' in version or 'url' not in version:
                continue

            doc = scraper.urlopen(version['url'])
            metadata = {'bill': {'state': bill['state'],
                                 'chamber': bill['chamber'],
                                 'session': bill['session'],
                                 'bill_id': bill['bill_id'],
                                 'title': bill['title']},
                        'name': version['name'],
                        'url': version['url']}
            content_type = doc.response.headers['content-type']

            version['document_id'] = put_document(doc, content_type,
                                                  metadata)
            bill_changed = True

        if bill_changed:
            db.bills.save(bill, safe=True)

def scrape_legislator_list(self, session_num):
    url = ("http://www.legis.state.ak.us/publicservice/basis/members"
           "?minifyresult=false&session=" + session_num)
    xml = scrapelib.Scraper().get(url).content
    for line in lxml.etree.fromstring(xml).xpath("//Member/MemberDetails"):
        person = self.handle_list_item(line, session_num)
        yield person

def start_data(search_url, params=None):
    browser = webdriver.Chrome(options=OPTIONS)
    browser.set_page_load_timeout(-1)
    browser.implicitly_wait(15)

    if params is None:
        browser.get(BASE_URL + search_url)
    else:
        browser.get(BASE_URL + search_url + "?" +
                    urllib.parse.urlencode(params))

    download_link = browser.find_element_by_id('download-button')
    payload = {
        'cacheId': download_link.get_attribute('data-cacheid'),
        'typeOfReport': download_link.get_attribute('data-typeofreport'),
        'token': str(datetime.datetime.now())
    }

    scraper = scrapelib.Scraper(retry_attempts=20)
    response = scraper.post(
        'https://www.nlrb.gov/nlrb-downloads/start-download/' +
        payload['typeOfReport'] + '/' +
        payload['cacheId'] + '/' +
        payload['token'])
    result = response.json()['data']
    return result

def __init__(self, username, password):
    self.headers = {'Referer': self._REFERER,
                    'User-Agent': self._USER_AGENT}
    self.cookies = dict()
    self.client_data = {'client_id': self._CLIENT_ID,
                        'client_secret': self._CLIENT_SECRET}
    self.scraper = scrapelib.Scraper(retry_attempts=3)
    self._authenticate(username=username, password=password)

def dump(self, abbr, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600, retry_attempts=3)
    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED,
                          allowZip64=True)

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

    # write out metadata
    response = scraper.get(api_url('metadata/%s' % abbr)).content
    zip.writestr('metadata.json', response)

    logging.info('exporting %s legislators...' % abbr)
    for legislator in db.legislators.find({settings.LEVEL_FIELD: abbr}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)
        response = scraper.get(url).content
        if validate:
            validictory.validate(json.loads(response), legislator_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    logging.info('exporting %s committees...' % abbr)
    for committee in db.committees.find({settings.LEVEL_FIELD: abbr}):
        path = 'committees/%s' % committee['_id']
        url = api_url(path)
        response = scraper.get(url).content
        if validate:
            validictory.validate(json.loads(response), committee_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    logging.info('exporting %s bills...' % abbr)
    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}, timeout=False):
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)
        response = scraper.get(url).content
        if validate:
            validictory.validate(json.loads(response), bill_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    zip.close()

def dump_json(abbr, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600, follow_robots=False)
    level = metadata(abbr)['level']
    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

    logging.info('dumping %s bills...' % abbr)
    for bill in db.bills.find({'level': level, level: abbr}, timeout=False):
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)
        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response), bill_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    logging.info('dumping %s legislators...' % abbr)
    for legislator in db.legislators.find({'level': level, level: abbr}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)
        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response), legislator_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    logging.info('dumping %s committees...' % abbr)
    for committee in db.committees.find({'level': level, level: abbr}):
        path = 'committees/%s' % committee['_id']
        url = api_url(path)
        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response), committee_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    zip.close()

def get_latest():
    """
    Get and load the latest SQL dumps from the California legislature.
    """
    scraper = scrapelib.Scraper()

    meta = db.metadata.find_one({'_id': 'ca'})
    last_update = meta['_last_update']
    base_url = "ftp://www.leginfo.ca.gov/pub/bill/"

    with scraper.urlopen(base_url) as page:
        next_day = last_update + datetime.timedelta(days=1)
        while next_day.date() < datetime.date.today():
            for f in parse_directory_listing(page):
                if (re.match(r'pubinfo_(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\.zip',
                             f['filename'])
                        and f['mtime'].date() == next_day.date()):
                    url = base_url + f['filename']
                    print "Getting %s" % url
                    get_and_load(url)

                    meta['_last_update'] = next_day
                    db.metadata.save(meta, safe=True)
                    break
            else:
                print "Couldn't find entry for %s" % str(next_day.date())
                break

            next_day = next_day + datetime.timedelta(days=1)

def session_list():
    html = scrapelib.Scraper().get('http://www.sdlegislature.gov/'
                                   'Legislative_Session/Menu.aspx').text
    doc = lxml.html.fromstring(html)
    sessions = doc.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_BlueBoxLeft"]//ul/li'
        '/a/div/text()')
    return sessions

def run(self):
    # We first will ensure the cache and data directories exist
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    # We use scrapelib as we are unsure of the integrity of the server we
    # will be pulling from
    s = scrapelib.Scraper(retry_wait_seconds=5, retry_attempts=10)
    # Enable caching so we don't repeat downloads
    s.cache_storage = scrapelib.FileCache(CACHE_DIR)
    s.cache_write_only = False

    # Simple download function
    def download_entity(s, filename):
        """ Download an asset """
        logging.info('Downloading %s from %s' %
                     (filename, join(SOURCE_URL, filename)))
        s.urlretrieve('%s/%s' % (SOURCE_URL, filename),
                      '%s/%s' % (self.output().path, filename))

    # Download the data!
    os.system('mkdir -p "%s"' % self.output().path)
    for filename in CANDIDATE_SOURCE_FILES.values():
        download_entity(s, filename)
    for filename in COMMITTEE_SOURCE_FILES.values():
        download_entity(s, filename)
    for filename in META_SOURCE_FILES.values():
        download_entity(s, filename)

def session_list():
    import scrapelib
    import lxml.html
    html = scrapelib.Scraper().get(
        'http://legis.sd.gov/Legislative_Session/Menu.aspx').text
    doc = lxml.html.fromstring(html)
    return doc.xpath(
        '//div[@id="ContentPlaceHolder1_BlueBoxLeft"]//ul/li/a/div/text()')

def get_session_list(self):
    import scrapelib
    import lxml.html
    url = 'http://www.legis.nd.gov/assembly/'
    html = scrapelib.Scraper().get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    return doc.xpath("//div[@class='view-content']//a/text()")

def session_list():
    import scrapelib
    text = scrapelib.Scraper().get('ftp://ftp.cga.ct.gov').text
    sessions = [line.split()[-1] for line in text.splitlines()]
    for not_session_name in ('incoming', 'pub', 'CGAAudio', 'rba', 'NCSL',
                             "apaac"):
        sessions.remove(not_session_name)
    return sessions

def download(url):
    scraper = scrapelib.Scraper()
    with scraper.urlopen(url) as resp:
        (fd, path) = tempfile.mkstemp('.zip')
        with os.fdopen(fd, 'wb') as w:
            w.write(resp)
        return path

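# A minimal sketch of the same helper against the newer, requests-based
# scrapelib API, where get() returns a requests.Response and the urlopen()
# context manager is not needed.  The target scrapelib version is an
# assumption; only the temp-file handling mirrors the snippet above.
import os
import tempfile

import scrapelib


def download(url):
    scraper = scrapelib.Scraper()
    response = scraper.get(url)           # requests.Response
    fd, path = tempfile.mkstemp('.zip')   # temp file for the downloaded zip
    with os.fdopen(fd, 'wb') as w:
        w.write(response.content)         # raw bytes of the response body
    return path
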
def get_session_list(self):
    html = scrapelib.Scraper().get('http://www.sdlegislature.gov/'
                                   'Legislative_Session/archive.aspx').text
    doc = lxml.html.fromstring(html)
    sessions = [
        x.strip()
        for x in doc.xpath('//table//td[@data-title="Year"]/text()')
    ]
    return sessions

def dump_json(state, filename, validate, schema_dir):
    scraper = scrapelib.Scraper(requests_per_minute=600)
    zip = zipfile.ZipFile(filename, 'w')

    if not schema_dir:
        cwd = os.path.split(__file__)[0]
        schema_dir = os.path.join(cwd, "../schemas/api/")

    with open(os.path.join(schema_dir, "bill.json")) as f:
        bill_schema = json.load(f)

    with open(os.path.join(schema_dir, "legislator.json")) as f:
        legislator_schema = json.load(f)

    with open(os.path.join(schema_dir, "committee.json")) as f:
        committee_schema = json.load(f)

    for bill in db.bills.find({'state': state}):
        path = "bills/%s/%s/%s/%s" % (state, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)
        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response), bill_schema,
                                 validator_cls=APIValidator)
        # reuse the already-fetched response instead of requesting it again
        zip.writestr(path, response)

    for legislator in db.legislators.find({'state': state}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)
        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response), legislator_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    for committee in db.committees.find({'state': state}):
        path = 'committees/%s' % committee['_id']
        url = api_url(path)
        response = scraper.urlopen(url)
        if validate:
            validictory.validate(json.loads(response), committee_schema,
                                 validator_cls=APIValidator)
        zip.writestr(path, response)

    zip.close()

def __init__(self, initial_page, page_processors=None, *,
             scraper: scrapelib.Scraper = None):
    self.initial_page = initial_page
    if not isinstance(page_processors, (list, tuple)):
        self.page_processors = [page_processors]
    else:
        self.page_processors = page_processors
    if not scraper:
        self.scraper = scrapelib.Scraper()
    else:
        # keep the caller-supplied scraper instead of creating a default one
        self.scraper = scraper

def do_scrape(
    self, scraper: typing.Optional[scrapelib.Scraper] = None
) -> typing.Iterable[typing.Any]:
    """
    yield results from this page and any subpages

    :param scraper: Optional `scrapelib.Scraper` instance to use for
        running scrape.
    :returns: Generator yielding results from the scrape.
    """
    if scraper is None:
        scraper = scrapelib.Scraper()
    yield from self._to_items(scraper)

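# Usage sketch for the do_scrape() interface above.  EmployeeListPage and
# its constructor argument are hypothetical stand-ins for a concrete page
# object; only the do_scrape()/scraper interaction comes from the snippet.
import scrapelib

scraper = scrapelib.Scraper(requests_per_minute=60)
page = EmployeeListPage("https://example.com/employees")  # hypothetical page object
# Pass one shared, rate-limited scraper instead of letting do_scrape()
# build a default Scraper on every call.
for item in page.do_scrape(scraper=scraper):
    print(item)
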
def get_session_list(self):
    import scrapelib
    import lxml.html

    url = "http://www.legis.nd.gov/assembly/"
    html = scrapelib.Scraper().get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    sessions = doc.xpath("//div[@class='view-content']//a/text()")
    sessions = [
        session for session in sessions
        if "Territorial Assembly" not in session
    ]
    return sessions

def download_files(self):
    s = scrapelib.Scraper(requests_per_minute=self.req_per_min,
                          retry_attempts=2)

    # enable cache on scrapelib
    cache_dir = os.path.join(os.getcwd(), 'cache')
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    s.cache_storage = scrapelib.FileCache(cache_dir)
    s.cache_write_only = False
    # TODO : update scrapelib to check last modified header

    with closing(
            shelve.open(os.path.join(self.import_dir,
                                     self.shelf_file))) as db:
        for key in db.keys():
            dir_for_solnbr = self.create_dir_by_solnbr(key)

            attachments = db[key]['attachments']
            for (i, a) in enumerate(attachments):
                self.log.info("Downloading file ({}: {}) from {}".format(
                    a['filename'], a['desc'], a['url']))

                # parse URL into components
                u = urlparse(a['url'])

                # match main portion to dict of special cases, get function
                # to use
                downloader_func = downloaders.func_map.get(
                    u.netloc, downloaders.default)

                try:
                    local_file_path = downloader_func(s, a['url'],
                                                      dir_for_solnbr,
                                                      solnbr=key)
                    a.update({'local_file_path': local_file_path})
                except Exception:
                    self.log.exception(
                        "Attachment couldn't be retrieved for unknown "
                        "reasons. URL: {} Continuing.".format(a['url']))
                    a.update({'exception': 1})
                    continue
                finally:
                    attachments[i] = a

            meta = {'dl_complete': True, 'num_dl': len(attachments)}
            db[key] = {'attachments': attachments, 'meta': meta}

def get_session_list(self):
    html = scrapelib.Scraper().get('http://www.sdlegislature.gov/'
                                   'Legislative_Session/archive.aspx').text
    doc = lxml.html.fromstring(html)
    sessions = [x.strip() for x in
                doc.xpath('//table//td[@data-title="Year"]/text()')]

    # Archive page lacks the latest session
    current_session_url = doc.xpath(
        '//*[@id="ctl00_divHeader_mnuMain"]/li[6]/ul/li[1]/a/@href')[0]
    current_session = current_session_url.replace(
        '/Legislative_Session/Bills/Default.aspx?Session=', '')
    if current_session not in sessions:
        sessions.append(current_session)

    return sessions

def get_data(file_name, search_url, params=None):
    attempts = 0
    while True:
        print("Attempt " + str(attempts + 1))
        try:
            result = start_data(search_url=search_url, params=params)
            break
        except Exception:
            attempts += 1
            sleep(10)
            if attempts > 40:
                print("Unable to download")
                raise

    previous = 0
    s = scrapelib.Scraper(retry_attempts=20)
    with tqdm.tqdm(total=result['total'],
                   desc='NLRB.gov preparing download') as pbar:
        while not result['finished']:
            response = s.get(BASE_URL + '/nlrb-downloads/progress/' +
                             str(result['id']))
            result = response.json()['data']
            # update progress bar
            current = result['processed']
            pbar.update(current - previous)
            previous = current
            # sleep(2)

    print(BASE_URL + result['filename'])
    attempts = 0
    while True:
        file_out = s.get(BASE_URL + result['filename'])
        if (file_out.content[0:6] == b'Region' or
                file_out.content[0:5] == b'"Case'):
            break
        else:
            if attempts > 30:
                raise Exception("Cannot download")
            attempts += 1
            sleep(10)

    with open(DOWNLOAD_FOLDER + "/" + file_name, 'wb') as f:
        f.write(file_out.content)

def main(abbr):
    request_defaults = {
        # 'proxies': {"http": "localhost:8888"},
        'timeout': 5.0,
        'headers': {
            'Accept': ('text/html,application/xhtml+xml,application/'
                       'xml;q=0.9,*/*;q=0.8'),
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-us,en;q=0.5',
            'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0.2) '
                           'Gecko/20100101 Firefox/10.0.2'),
        },
        'follow_robots': False,
        # Note: this script needs to run in the same dir as billy_settings.py
    }

    logger = logbook.Logger()
    DATA = join(settings.BILLY_DATA_DIR, abbr, 'billtext')
    try:
        os.makedirs(DATA)
    except OSError:
        pass
    logger.info('writing files to %r' % DATA)

    session = scrapelib.Scraper(cache_obj=scrapelib.FileCache('cache'),
                                cache_write_only=False,
                                use_cache_first=True,
                                requests_per_minute=0,
                                **request_defaults)

    for bill in db.bills.find({'state': abbr}):
        if len(bill['versions']):
            bill_id = bill['bill_id']
            url = bill['versions'][0]['url']
            logger.info('trying %r: %r' % (bill_id, url))
            text = session.get(url).text
            with open(join(DATA, bill['_id']), 'w') as f:
                f.write(text.encode('utf-8'))

def scrape(old_data_path, no_download):
    """
    Main function -- read in existing data, scrape new data, merge the two
    sets, and save to the output location.
    """
    scraper = scrapelib.Scraper(requests_per_minute=180,
                                retry_attempts=5,
                                retry_wait_seconds=15)
    if old_data_path:
        existing_parolees = get_existing_parolees(old_data_path)
    else:
        existing_parolees = {}

    if no_download:
        new_parolees = []
    else:
        new_parolees = scrape_interviews(scraper)
        new_parolees = scrape_details(scraper, new_parolees)

    for parolee in new_parolees:
        din = parolee[u"din"]
        interview_date = parolee[u"parole board interview date"]
        key = (din, interview_date)

        # Clear out any hearing date that corresponds to a hearing that
        # hadn't yet happened.  This occurs because specific dates aren't
        # set in advance -- only a month and year.  This is notated via the
        # date "YYYY-MM-*".  However, once the interview has happened, we
        # have a date and should replace that row.
        scheduled_date = '-'.join(interview_date.split('-')[0:2]) + '-*'
        scheduled_key = (din, scheduled_date)
        if key != scheduled_key:
            if scheduled_key in existing_parolees:
                del existing_parolees[scheduled_key]

        existing_parolees[key] = parolee

    print_data(existing_parolees.values())

def dump(self, abbr, filename):
    scraper = scrapelib.Scraper(requests_per_minute=600, retry_attempts=3)
    zip = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED,
                          allowZip64=True)

    # write out metadata
    response = scraper.get(api_url('metadata/%s' % abbr)).content
    zip.writestr('metadata.json', response)

    logging.info('exporting %s legislators...' % abbr)
    for legislator in db.legislators.find({settings.LEVEL_FIELD: abbr}):
        path = 'legislators/%s' % legislator['_id']
        url = api_url(path)
        response = scraper.get(url).content
        zip.writestr(path, response)

    logging.info('exporting %s committees...' % abbr)
    for committee in db.committees.find({settings.LEVEL_FIELD: abbr}):
        path = 'committees/%s' % committee['_id']
        url = api_url(path)
        response = scraper.get(url).content
        zip.writestr(path, response)

    logging.info('exporting %s bills...' % abbr)
    for bill in db.bills.find({settings.LEVEL_FIELD: abbr}, timeout=False):
        path = "bills/%s/%s/%s/%s" % (abbr, bill['session'],
                                      bill['chamber'], bill['bill_id'])
        url = api_url(path)
        response = scraper.get(url).content
        zip.writestr(path, response)

    zip.close()

def handle(self, *args, **options):
    self.scraper = scrapelib.Scraper()
    self.base_url = "https://efile.fara.gov/pls/apex/"
    # Right now we have no way of displaying anything but 15 at a time;
    # this shouldn't be changed without a much better understanding of
    # how they're passing parameters.
    self.per_page = 15

    start_record = 0
    end_record = 0
    total_records = 1

    while end_record < total_records:
        page_param = "pgR_min_row={}max_rows={}rows_fetched={}"
        page_param = page_param.format(end_record + 1, self.per_page,
                                       self.per_page)
        next_page = self.get_page(page_param)

        data_table = next_page.find('div', {'id': 'apexir_DATA_PANEL'})
        self.process_records(data_table.findAll("tr"))

        page_info = next_page.find("td", {"class": "pagination"}).text
        page_info = page_info.replace('of', '-')
        record_info = page_info.split('-')
        start_record, end_record, total_records = [
            int(r.strip()) for r in record_info
        ]
        num_records_on_page = end_record - start_record + 1

# read in an opt-in config file for changing directories and supplying
# email settings; returns None if it's not there, and this should always
# be handled gracefully
path = "config.yml"
if os.path.exists(path):
    # Don't use a cached config file, just in case, and direct_yaml_load
    # is not yet defined.
    import yaml
    config = yaml.load(open(path))
else:
    config = None

eastern_time_zone = timezone('US/Eastern')

# scraper should be instantiated at class-load time, so that it can
# rate limit appropriately
scraper = scrapelib.Scraper(requests_per_minute=120, follow_robots=False,
                            retry_attempts=3)
scraper.user_agent = "unitedstates/congress (https://github.com/unitedstates/congress)"


def format_datetime(obj):
    if isinstance(obj, datetime.datetime):
        return eastern_time_zone.localize(
            obj.replace(microsecond=0)).isoformat()
    elif isinstance(obj, datetime.date):
        return obj.isoformat()
    elif isinstance(obj, (str, unicode)):
        return obj
    else:
        return None

import logging
import re
import yaml
from bs4 import BeautifulSoup
from datetime import datetime
import ssl
import requests
import urllib.parse
import io
import gzip
import certifi

from . import admin

# scraper should be instantiated at class-load time, so that it can
# rate limit appropriately
import scrapelib
scraper = scrapelib.Scraper(requests_per_minute=120, retry_attempts=3)
scraper.user_agent = "unitedstates/inspectors-general (https://github.com/unitedstates/inspectors-general)"


class Soft404HttpAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter that checks all responses against a blacklist of
    "file not found" pages that are served with 200 status codes."""

    SOFT_404_URLS_RE = re.compile(
        r"^(http://www\.dodig\.mil/errorpages/index\.html|http://www\.fec\.gov/404error\.shtml|http://www\.gpo\.gov/maintenance/error\.htm)$"
    )
    SOFT_404_BODY_SIGNATURES = {
        "cftc.gov": b"<title>404 Page Not Found - CFTC</title>",
        "cpb.org": b"<title>CPB: Page Not Found</title>",
        "ncua.gov": b"Redirect.aspx?404",
        "si.edu": b"<title>Page Not Found Smithsonian</title>",

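# Sketch only: the (truncated) adapter above does nothing until it is
# attached to the session.  scrapelib.Scraper subclasses requests.Session,
# so the standard Session.mount() call can route all HTTP(S) traffic through
# it; whether the original module mounts it exactly this way is an
# assumption.
scraper.mount("http://", Soft404HttpAdapter())
scraper.mount("https://", Soft404HttpAdapter())
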
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
        'Connection': 'keep-alive',
    },
    'user_agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0.2) '
                   'Gecko/20100101 Firefox/10.0.2'),
    'follow_robots': False,
}

if __name__ == '__main__':

    session = scrapelib.Scraper(**request_defaults)

    def fetch(url):
        logger.info('trying %r' % url)
        try:
            return session.get(url)
        except Exception as e:
            logger.exception(e)

    filenames = os.listdir(join(PATH, 'urls'))
    filenames = filter(lambda s: '~' not in s, filenames)
    for urls_filename in filenames:
        abbr = urls_filename.lower().replace('.txt', '')
        if sys.argv[1:] and (abbr not in sys.argv[1:]):
            continue

        with open(join(PATH, 'urls', urls_filename)) as urls:

import csv
import math
import warnings

import click
import dj_database_url
import django
import scrapelib
from django.contrib.postgres.search import SearchVector
from django.db import transaction
from django.db.models import Count
from extract.utils import jid_to_abbr, abbr_to_jid
from extract import get_extract_func, DoNotDownload, CONVERSION_FUNCTIONS

# disable SSL validation and ignore warnings
scraper = scrapelib.Scraper(verify=False)
scraper.user_agent = "Mozilla"
warnings.filterwarnings("ignore", module="urllib3")

MIMETYPES = {
    "application/pdf": "pdf",
    "text/html": "html",
    "application/msword": "doc",
    "application/rtf": "rtf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
}


def init_django():
    from django.conf import settings