def download_sopr(options): if options.get("loglevel", None): log.setLevel(options["loglevel"]) def _url_to_loc(url): fname = requests.utils.urlparse(url).path.split("/")[-1] year, quarter = fname.split(".")[0].split("_") output_dir = os.path.join(CACHE_DIR, "sopr", year, "Q" + quarter) if not os.path.exists(output_dir): mkdir_p(output_dir) output_loc = os.path.join(output_dir, fname) return (url, output_loc) _url_template = "http://soprweb.senate.gov/downloads/{year}_{quarter}.zip" _urls = [ _url_template.format(year=year, quarter=quarter) for year, quarter in product(xrange(1999, 2015), xrange(1, 5)) ] downloaded = _urls >> st_map(_url_to_loc) >> st_filter(is_not_cached) >> ThreadPool(download_all, poolsize=4) for url, output_loc, content_length in downloaded: log.info( "successfully downloaded {url} to {output_loc}({size})".format( url=url, output_loc=output_loc, size=content_length ) ) for url, exception in downloaded.failure: log.error("downloading from {url} failed: {exception}".format(url=url, exception=exception))
def download_house_xml(options): FORM_URL = "http://disclosures.house.gov/ld/LDDownload.aspx?KeepThis=true" if options.get("loglevel", None): log.setLevel(options["loglevel"]) OUT_DIR = os.path.join(CACHE_DIR, "house_clerk") if not os.path.exists(OUT_DIR): mkdir_p(OUT_DIR) jar = cookielib.CookieJar() form_page = requests.get(FORM_URL, cookies=jar) d = pq(form_page.text, parser="html") form_data = {input.attr("name"): input.val() for input in d("input[name]").items()} filing_selector = d("select#selFilesXML") space = r"(\ )" filing_type = r"(?P<filing_type>(?P<filing_type_year>\d{4})\ (?P<filing_type_form>MidYear|Registrations|YearEnd|(1st|2nd|3rd|4th)Quarter))" xml = "(XML)" date = r"(\(\ (?P<updated_date>(?P<updated_date_day>\d{1,2})\/(?P<updated_date_month>\d{2})\/(?P<updated_date_year>\d{4})))" time = r"(?P<updated_time>(?P<updated_time_hour>\d{1,2}):(?P<updated_time_min>\d{2}):(?P<updated_time_sec>\d{2})\ (?P<updated_time_am_pm>PM|AM)\))" option_rgx = re.compile(filing_type + space + xml + space + date + space + time) dl_options = filing_selector.find("option") def _get_request_loc_pair(value): info = re.match(option_rgx, value).groupdict() fields = dict(form_data, **{filing_selector.attr("name"): value}) output_dir = os.path.join(CACHE_DIR, "house_clerk") mkdir_p(output_dir) output_name = os.path.join(output_dir, "%s_%s_XML.zip" % (info["filing_type_year"], info["filing_type_form"])) log.info("starting download of {output_loc}".format(output_loc=output_name)) response = requests.post(FORM_URL, data=fields, stream=True, cookies=jar) return (response, output_name) def _download_all(q): for result in q: yield result[1], response_download(result) downloaded = ( (option.attr("value") for option in dl_options.items()) >> st_map(_get_request_loc_pair) >> st_filter(response_is_not_cached) >> ThreadPool(_download_all, poolsize=4) ) for output_loc, content_length in downloaded: log.info("successfully downloaded to {output_loc}({size})".format(output_loc=output_loc, size=content_length)) for (response, output_loc), exception in downloaded.failure: log.error("downloading to {output_loc} failed: {exception}".format(output_loc=output_loc, exception=exception))