def url_for(options, page=1):
    """Build the document-library search URL for one page of results.

    The date filter starts on January 1 of the first requested year. The
    optional ``types`` option is a comma-separated list of category names
    (default: audit, congress, research); the ids of the matching CATEGORIES
    entries are sent as ``field_doc_cat_tid[]`` parameters.

    ``page`` is 1-based; the site's own ``page`` parameter is 0-based and is
    only appended for pages after the first.
    """
    years = inspector.year_range(options, archive)

    # there's always a first year, and it defaults to current year
    since = datetime(year=years[0], month=1, day=1)

    wanted = (options.get('types') or "audit,congress,research").split(",")
    category_params = [
        "field_doc_cat_tid[]=%s" % category_id
        for (name, category_id) in CATEGORIES
        if name in wanted
    ]

    pieces = [
        "https://uspsoig.gov/document-library?",
        # Expected date format: Monday, September 1, 2014
        "&field_doc_date_value[value][date]=%s" % since.strftime("%A, %B %d, %Y"),
        "&%s" % "&".join(category_params),
    ]

    # page is 0-indexed
    if page > 1:
        # they added this crazy thing
        annoying_prefix = "0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C"
        pieces.append("&page=%s%i" % (annoying_prefix, page - 1))

    return "".join(pieces)
def run(options):
    """Scrape every listing in REPORT_URLS and save each distinct report.

    A page is read as a bulleted list when ``section#content ul li`` matches;
    otherwise its ``section#content p`` paragraphs are used. Reports are
    de-duplicated on ``(report_id, url)``; for list pages a non-empty URL is
    unquoted before it goes into the key.

    Raises:
        inspector.NoReportsFoundError: a page has neither list items nor
            paragraphs.
    """
    year_range = inspector.year_range(options, archive)
    seen = set()

    def save_once(key, report):
        # Only remember the key once the report has actually been saved.
        if key in seen:
            return
        inspector.save_report(report)
        seen.add(key)

    # Pull the reports
    for report_type, url in REPORT_URLS:
        doc = utils.beautifulsoup_from_url(url)

        items = doc.select("section#content ul li")
        if items:
            for item in items:
                report = report_from_list(item, url, report_type, year_range)
                if not report:
                    continue
                link = report["url"]
                key_url = unquote(link) if link else link
                save_once((report["report_id"], key_url), report)
            continue

        paragraphs = doc.select("section#content p")
        if not paragraphs:
            raise inspector.NoReportsFoundError("Federal Labor Relations Authority (%s)" % report_type)
        for paragraph in paragraphs:
            report = report_from_paragraph(paragraph, url, report_type, year_range)
            if report:
                save_once((report["report_id"], report["url"]), report)
def url_for(options, page, category_id):
    """Build the document-library URL for one category and result page.

    The date filter spans the smallest to the largest requested year, except
    for categories "94", "91", "93" and "96", where the lower bound is always
    ``archive``. ``page`` is 1-based; the site's ``page`` parameter is
    0-based and only appended after the first page. A timestamp parameter is
    always appended as a cache buster.
    """
    years = inspector.year_range(options, archive)
    year_start = min(years)
    year_end = max(years)

    # Always get all semiannual reports to congress, testimonies, news, and
    # "other" documents
    # This avoids false positives from the "no reports found" heuristic
    if category_id in ("94", "91", "93", "96"):
        year_start = archive

    pieces = [
        "https://uspsoig.gov/document-library",
        "?field_doc_date_value_op=between",
        "&field_doc_date_value[min][date]=%d" % year_start,
        "&field_doc_date_value[max][date]=%d" % year_end,
        "&field_document_type_tid[]=%s" % category_id,
    ]

    # page is 0-indexed
    if page > 1:
        pieces.append("&page=%i" % (page - 1))

    # Add a cache buster, this helps once we start retrying pages
    pieces.append("&t=%i" % int(time.time()))
    return "".join(pieces)
def run(options):
    """Save the yearly audit reports, semiannual reports and the peer review.

    Audit pages are fetched per requested year from 2006 onward; the
    semiannual and peer-review pages are fetched once each.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2006:  # The oldest year for audit reports
            continue
        audit_url = AUDIT_REPORTS_URL.format(year=year)
        audit_page = BeautifulSoup(utils.download(audit_url))
        for item in audit_page.select("div#content li"):
            report = audit_report_from(item, audit_url, year, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    semiannual_page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    for item in semiannual_page.select("div#content li"):
        report = semiannual_report_from(item, year_range)
        if report:
            inspector.save_report(report)

    # Pull the Peer Review
    peer_page = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
    peer_link = peer_page.find("div", id='content').find("a", text=True)
    # NOTE(review): unlike the sections above, this result is saved without
    # a truthiness check -- confirm peer_review_from never returns None.
    inspector.save_report(peer_review_from(peer_link, year_range))
def run(options):
    """Save the general, archived and semiannual reports.

    Raises:
        inspector.NoReportsFoundError: one of the three listing pages yields
            no matching links.
    """
    year_range = inspector.year_range(options, archive)
    list_links = "div#mainContent li.mainContenttext a"
    span_links = "div#mainContent span.mainContenttext a"

    # Pull the general reports
    general_links = utils.beautifulsoup_from_url(REPORTS_URL).select(list_links)
    if not general_links:
        raise inspector.NoReportsFoundError("Farm Credit Administration (reports)")
    for link in general_links:
        report = report_from(link, REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)

    # Pull the archive reports
    archive_doc = utils.beautifulsoup_from_url(REPORT_ARCHIVE_URL)
    archive_links = archive_doc.select(list_links) + archive_doc.select(span_links)
    if not archive_links:
        raise inspector.NoReportsFoundError("Farm Credit Administration (archive)")
    for link in archive_links:
        # Links with no text are skipped
        if link.text:
            report = report_from(link, REPORT_ARCHIVE_URL, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    semiannual_links = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL).select(list_links)
    if not semiannual_links:
        raise inspector.NoReportsFoundError("Farm Credit Administration (semiannual reports)")
    for link in semiannual_links:
        report = semiannual_report_from(link, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save reports from each paginated listing and the semiannual page.

    Each listing in PAGINATED_REPORT_FORMATS is walked from page 0 until a
    page has no ``li.views-row`` entries (at most 999 pages).

    Raises:
        Exception: the first "audit" page lacks the expected report-number
            element.
        inspector.NoReportsFoundError: a listing's first page, or the
            semiannual page, has no rows.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the reports with pagination
    for report_type, url_template in PAGINATED_REPORT_FORMATS:
        page = 0
        while page < 999:
            url = url_template.format(page=page)
            doc = utils.beautifulsoup_from_url(url)
            first_page = page == 0

            if report_type == "audit" and first_page:
                if not doc.select("div.views-field-field-auditreport-doc-1"):
                    raise Exception("Report number CSS class has changed")

            rows = doc.select("li.views-row")
            if not rows:
                if first_page:
                    raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
                break

            for row in rows:
                report = report_from(row, url, report_type, year_range)
                if report:
                    inspector.save_report(report)
            page += 1

    # Pull the semiannual reports (no pagination)
    rows = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL).select("li.views-row")
    if not rows:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for row in rows:
        report = semiannual_report_from(row, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save the paginated yearly audit reports and the semiannual reports.

    Only the first requested year before 1998 is fetched; any further
    pre-1998 years are skipped. Each fetched year is paged from 0 until a
    page has no ``ol li`` entries.

    Raises:
        inspector.NoReportsFoundError: a year's first page, or the semiannual
            page, has no results.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    handled_pre_1998 = False
    for year in year_range:
        if year < 1998:
            if handled_pre_1998:
                continue
            handled_pre_1998 = True

        for page_number in range(10000):
            year_url = url_for(year, page_number)
            items = beautifulsoup_from_url(year_url).select("ol li")
            if not items:
                if page_number == 0:
                    raise inspector.NoReportsFoundError("Department of Labor (%s)" % year_url)
                break
            for item in items:
                report = report_from(item, year_url)
                if report:
                    inspector.save_report(report)

    # Pull the semiannual reports
    links = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL).select("p > a:nth-of-type(1)")
    if not links:
        raise inspector.NoReportsFoundError("Department of Labor (semiannal reports)")
    for link in links:
        report = semiannual_report_from(link, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save every table row of each requested year's result pages.

    Paging for a year continues while the pager shows a link whose text is
    the next page number and that number does not exceed the ``pages``
    option (default 1).
    """
    year_range = inspector.year_range(options)
    max_pages = int(options.get('pages', 1))

    for year in year_range:
        page = 1
        while True:
            doc = BeautifulSoup(utils.download(url_for(options, page, year)))

            next_page = page + 1
            pager_links = doc.select("li.pager-item a.active")
            has_next = any(link.text == str(next_page) for link in pager_links)
            finished = (not has_next) or next_page > max_pages

            for row in doc.select("table.views-table > tbody > tr"):
                inspector.save_report(report_from(row))

            page = next_page
            if finished:
                break
            print('Moving to next page (%d)' % page)
def run(options):
    """Save the audit reports and the semiannual reports.

    Semiannual reports come from every page linked in the sidebar list and
    then from the semiannual landing page itself.

    Raises:
        inspector.NoReportsFoundError: the audit table or the sidebar list is
            empty.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    rows = utils.beautifulsoup_from_url(REPORTS_URL).select("#rounded-corner > tr")
    if not rows:
        raise inspector.NoReportsFoundError("Federal Reserve (audit reports)")
    for row in rows:
        report = report_from(row, year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    links = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL).select(
        "div.style-aside ul > li > a")
    if not links:
        raise inspector.NoReportsFoundError(
            "Federal Reserve (semiannual reports)")

    semiannual_urls = [urljoin(BASE_PAGE_URL, link.get('href')) for link in links]
    # The most recent semiannual report will be embedded on the main page
    semiannual_urls.append(SEMIANNUAL_REPORTS_URL)

    for report_url in semiannual_urls:
        report = semiannual_report_from(report_url, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Process each component landing page, then save the collected reports.

    With a known ``component`` option only that component's page is
    processed; otherwise the landing-page links are read from the components
    index page. Pages are processed in sorted URL order.
    """
    year_range = inspector.year_range(options, archive)

    # Can limit search to any of the components listed at the top of this script
    component = options.get('component')
    if component and component in components:
        only_link = "%s/oig/reports/%s.htm" % (base_url, component)
        source_links = {only_link: components[component]}
    else:
        # Otherwise, get links to each component's landing page from main page.
        starting_point = "http://www.justice.gov/oig/reports/components.htm"
        source_links = {
            base_url + anchor.get("href"): anchor.string
            for section in get_content(starting_point)
            for anchor in section.find_all("a")
        }

    # For each component's landing page, run the processor over it
    for link in sorted(source_links):
        extract_info(get_content(link), source_links[link], year_range)

    # `report` is a module-level mapping -- presumably filled in by
    # extract_info; confirm against its definition.
    logging.info("Found %i reports, for year %i to %i" % (len(report), year_range[0], year_range[-1]))

    # Iterate over a snapshot of the keys, as the original did
    for key in list(report):
        inspector.save_report(report[key])
def run(options):
    """Save the audit, semiannual and other reports, in that order.

    Each listing page is searched inside its ``<article>`` element: table
    rows for the audit and semiannual pages, paragraphs for the other page.

    Raises:
        inspector.NoReportsFoundError: a listing page has no matching
            elements.
    """
    year_range = inspector.year_range(options, archive)

    def scrape(url, tag, label, parse):
        elements = utils.beautifulsoup_from_url(url).article.find_all(tag)
        if not elements:
            raise inspector.NoReportsFoundError(label)
        for element in elements:
            report = parse(element, url, year_range)
            if report:
                inspector.save_report(report)

    scrape(AUDIT_REPORTS_URL, "tr", "FCC (audit reports)", report_from)
    scrape(SEMIANNUAL_REPORTS_URL, "tr", "FCC (semiannual reports)", semiannual_report_from)
    scrape(OTHER_REPORTS_URL, "p", "FCC (other)", other_report_from)
def run(options):
    """Collect reports across topics, merge duplicates, then save them.

    ``topics`` is an optional comma-separated option; by default every key of
    TOPIC_TO_URL is scraped. A report id seen under several topics is kept
    once, with the later topic names appended to its "topic" field.

    Raises:
        inspector.NoReportsFoundError: a page lacks the
            ``.view-business-areas`` container.
    """
    year_range = inspector.year_range(options, archive)

    requested = options.get('topics')
    topics = requested.split(",") if requested else TOPIC_TO_URL.keys()

    collected = {}
    for topic in topics:
        for year_url in urls_for(year_range, topic):
            logging.debug("Scraping %s" % year_url)
            doc = BeautifulSoup(utils.download(year_url))
            if not doc.select(".view-business-areas"):
                raise inspector.NoReportsFoundError("DOT (%s)" % topic)

            for row in doc.select(".view-business-areas .views-row"):
                report = report_from(row, year_range, topic, options)
                if not report:
                    continue
                report_id = report["report_id"]
                if report_id not in collected:
                    collected[report_id] = report
                    continue
                earlier = collected[report_id]
                earlier["topic"] = earlier["topic"] + ", " + topic

    for report in collected.values():
        inspector.save_report(report)
def run(options):
    """Walk the report listing and then the testimony listing page by page.

    Up to ``pages`` pages (default ALL_PAGES) are requested per listing,
    starting at page 0; a listing stops early when
    ``extract_reports_for_page`` returns a falsy result.
    """
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    def crawl(url_template, message, listing_xpath):
        for page in range(0, int(pages)):
            logging.debug(message % page)
            found = extract_reports_for_page(
                url_template.format(page=page), page, year_range,
                listing_xpath=listing_xpath)
            if not found:
                break

    crawl(BASE_URL, "## Downloading page %i", "div.row.report-listings-copy")
    crawl(TESTIMONY_BASE_URL, "## Downloading testimony page %i",
          "div.row.report-listings-data")
def run(options):
    """Save reports from every paginated listing and the semiannual page.

    PAGINATED_REPORT_FORMATS maps a report type to a URL template with a
    ``page`` placeholder; pages are fetched from 0 until one has no
    ``li.views-row`` entries (at most 999 pages).

    Raises:
        inspector.NoReportsFoundError: a listing's first page, or the
            semiannual page, has no rows.
    """
    year_range = inspector.year_range(options, archive)

    def save_each(rows, parse):
        for row in rows:
            report = parse(row)
            if report:
                inspector.save_report(report)

    # Pull the reports with pagination
    for report_type, url_template in PAGINATED_REPORT_FORMATS.items():
        for page in range(999):
            url = url_template.format(page=page)
            rows = BeautifulSoup(utils.download(url)).select("li.views-row")
            if rows:
                save_each(rows, lambda row: report_from(row, url, report_type, year_range))
            elif page == 0:
                raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
            else:
                break

    # Pull the semiannual reports (no pagination)
    rows = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)).select("li.views-row")
    if not rows:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    save_each(rows, lambda row: semiannual_report_from(row, year_range))
def run(options):
    """Save every search result from the earliest requested year onward.

    The last page number is read from each page's "Go to last page" link and
    drives how many pages are fetched.

    Raises:
        inspector.NoReportsFoundError: a result page has no rows.
        Exception: no last-page number was ever found.
    """
    year_range = inspector.year_range(options, archive)
    min_year = min(year_range)

    current = 0
    final = 0
    while current <= final:
        doc = utils.beautifulsoup_from_url(
            REPORT_SEARCH_URL.format(min_year, current))

        last_link = doc.find("a", title="Go to last page")
        match = None
        if last_link:
            match = re.search("[?&]page=([0-9]+)(?:&|$)", last_link["href"])
        if match:
            final = int(match.group(1))

        rows = doc.select(".view-reports-advanced-search .views-row")
        if not rows:
            raise inspector.NoReportsFoundError("Department of the Interior")
        for row in rows:
            report = report_from(row, year_range)
            if report:
                inspector.save_report(report)

        current += 1

    if final == 0:
        raise Exception("Did not find last page link")
def url_for(options, page, category_id):
    """Build the document-library URL for one category and result page.

    The date filter starts on January 1 of the first requested year,
    formatted as YYYY-MM-DD. ``page`` is 1-based; the site's ``page``
    parameter is 0-based, carries a fixed prefix, and is only appended after
    the first page. A timestamp parameter is always appended as a cache
    buster.
    """
    years = inspector.year_range(options, archive)

    # there's always a first year, and it defaults to current year
    since = datetime(year=years[0], month=1, day=1)

    pieces = [
        "https://uspsoig.gov/document-library",
        # hidden input, always the same
        "?type=All",
        # Expected date format: 2015-02-26
        "&field_doc_date_value[value][date]=%s" % since.strftime("%Y-%m-%d"),
        "&field_doc_cat_tid[]=%s" % category_id,
    ]

    # page is 0-indexed
    if page > 1:
        # they added this crazy thing
        annoying_prefix = "0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C0%2C"
        pieces.append("&page=%s%i" % (annoying_prefix, page - 1))

    # Add a cache buster, this helps once we start retrying pages
    pieces.append("&t=%i" % int(time.time()))
    return "".join(pieces)
def run(options):
    """Save reports from each paginated listing and the semiannual page.

    Raises:
        Exception: the first "audit" page lacks the expected report-number
            element.
        inspector.NoReportsFoundError: a listing's first page, or the
            semiannual page, has no ``li.views-row`` entries.
    """
    year_range = inspector.year_range(options, archive)

    def scrape_page(report_type, url_template, page):
        """Save one listing page; return False once the listing is exhausted."""
        url = url_template.format(page=page)
        doc = utils.beautifulsoup_from_url(url)

        checking_audit_markup = report_type == "audit" and page == 0
        if checking_audit_markup and not doc.select(
                "div.views-field-field-auditreport-doc-1"):
            raise Exception("Report number CSS class has changed")

        entries = doc.select("li.views-row")
        if not entries:
            if page == 0:
                raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
            return False

        for entry in entries:
            report = report_from(entry, url, report_type, year_range)
            if report:
                inspector.save_report(report)
        return True

    # Pull the reports with pagination
    for report_type, url_template in PAGINATED_REPORT_FORMATS:
        for page in range(0, 999):
            if not scrape_page(report_type, url_template, page):
                break

    # Pull the semiannual reports (no pagination)
    semiannual_doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    entries = semiannual_doc.select("li.views-row")
    if not entries:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for entry in entries:
        report = semiannual_report_from(entry, year_range)
        if report:
            inspector.save_report(report)
def scrape_restricted_reports(options):
    """Restricted Products.

    A single HTML page lists unreleased reports since 2014, with no links.
    Every ``div.listing`` on that page is processed and saved when it yields
    a report.
    """
    # These reports are unreleased -- we could make this the text?
    #
    # The following products have been determined to contain either
    # classified information or controlled unclassified information by the
    # audited agencies and cannot be publicly released. Members of Congress
    # or congressional staff who wish to obtain one or more of these products
    # should call or e-mail the Congressional Relations Office. All others
    # who wish to obtain one or more of these products should follow the
    # instructions found on Requesting Restricted Products.
    REPORTS_URL = 'http://www.gao.gov/restricted/restricted_reports'
    archive = 2014

    year_range = inspector.year_range(options, archive)
    listings = utils.beautifulsoup_from_url(REPORTS_URL).select("div.listing")

    for listing in listings:
        report = process_restricted_report(listing, year_range, REPORTS_URL)
        if report:
            inspector.save_report(report)
def run(options):
    """Page through the report search, saving every report found.

    The search starts at the earliest requested year. Each page may update
    the known last page number via its "Go to last page" link.

    Raises:
        inspector.NoReportsFoundError: a result page has no rows.
        Exception: the last-page number was never found.
    """
    year_range = inspector.year_range(options, archive)
    min_year = min(year_range)

    def last_page_from(doc, fallback):
        """Return the page number in the last-page link, else ``fallback``."""
        link = doc.find("a", title="Go to last page")
        if not link:
            return fallback
        found = re.search("[?&]page=([0-9]+)(?:&|$)", link["href"])
        if not found:
            return fallback
        return int(found.group(1))

    page, last_page = 0, 0
    while page <= last_page:
        doc = utils.beautifulsoup_from_url(REPORT_SEARCH_URL.format(min_year, page))
        last_page = last_page_from(doc, last_page)

        entries = doc.select(".view-reports-advanced-search .views-row")
        if not entries:
            raise inspector.NoReportsFoundError("Department of the Interior")

        for entry in entries:
            report = report_from(entry, year_range)
            if report:
                inspector.save_report(report)
        page += 1

    if last_page == 0:
        raise Exception("Did not find last page link")
def run(options):
    """Save the yearly audit reports (2005 onward) and the semiannual reports.

    Raises:
        inspector.NoReportsFoundError: a year page has no ``div.content``
            blocks, or the semiannual document has no ``report`` elements.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    # 2005 is the earliest audits go back
    for year in (y for y in year_range if not y < 2005):
        url = AUDIT_REPORTS_URL.format(year=year)
        blocks = BeautifulSoup(utils.download(url)).select("div.content")
        if not blocks:
            raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
        for block in blocks:
            report = audit_report_from(block, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    entries = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)).select("report")
    if not entries:
        raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
    for entry in entries:
        report = semiannual_report_from(entry, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save the yearly audit reports, the other report pages and the
    semiannual reports.

    Requested years before 2006 are served by the 2006 page. Each distinct
    page year is fetched once: previously the 2006 page was downloaded and
    its reports re-saved once for every requested year before 2006.
    """
    year_range = inspector.year_range(options, archive)
    rich_text_links = "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p > a"

    # Pull the audit reports
    fetched_years = set()
    for year in year_range:
        if year < 2006:  # This is the oldest year for these reports
            year = 2006
        if year in fetched_years:
            # Already handled this page; the parser does not depend on the
            # loop year, so a repeat would only duplicate the work.
            continue
        fetched_years.add(year)

        url = AUDIT_REPORTS_BASE_URL.format(year)
        doc = beautifulsoup_from_url(url)
        # Odd rows first, then even rows, as before
        rows = []
        rows.extend(doc.select("tr.ms-rteTableOddRow-default"))
        rows.extend(doc.select("tr.ms-rteTableEvenRow-default"))
        for row in rows:
            report = audit_report_from(row, url, year_range)
            if report:
                inspector.save_report(report)

    for report_type, url in OTHER_URLS.items():
        doc = beautifulsoup_from_url(url)
        for link in doc.select(rich_text_links):
            report = report_from(link, url, report_type, year_range)
            if report:
                inspector.save_report(report)

    doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    for link in doc.select(rich_text_links):
        report = semiannual_report_from(link, SEMIANNUAL_REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save the semiannual and other reports from the single reports page.

    The two ``td`` cells of the second table row hold the semiannual list and
    the list of other reports; both cells are passed through ``merge_items``
    before their ``li`` entries are parsed.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    second_row = doc.select("table tr")[1]
    semiannual_cell, other_cell = second_row.select("td")

    # NOTE(review): other scrapers raise inspector.NoReportsFoundError --
    # confirm that NoReportsFoundException exists on the inspector module.
    if not semiannual_cell:
        raise inspector.NoReportsFoundException("EEOC (semiannual reports)")
    if not other_cell:
        raise inspector.NoReportsFoundException("EEOC (other reports)")

    merge_items(semiannual_cell)
    merge_items(other_cell)

    for item in semiannual_cell.select("li"):
        report = semiannual_report_from(item, year_range, title_prefix="Semiannual Report - ")
        if report:
            inspector.save_report(report)

    for item in other_cell.select("li"):
        report = report_from(item, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save the reports listed under each in-range year section.

    Every ``<section>`` whose ``title`` attribute parses as a year inside the
    requested range is processed; each tag inside the section's table body
    becomes one report. The ``report_id`` option limits saving to that one
    report, for debugging convenience.

    Sections with a missing or non-numeric title are skipped. A missing
    title used to crash the run: ``int(None)`` raises TypeError, which the
    original ``except ValueError`` did not catch.
    """
    year_range = inspector.year_range(options)
    only_id = options.get('report_id')

    print("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

    doc = BeautifulSoup(utils.download(url_for()))

    for section in doc.select("section"):
        try:
            year = int(section.get("title"))
        except (TypeError, ValueError):
            # No title attribute, or a title that is not a year
            continue

        # check that the fetched year is in the range
        if year not in year_range:
            continue
        print("## Downloading year %i " % year)

        # gets each table entry and generates a report from it
        for item in section.div.table.tbody.contents:
            # Skip non-tag children such as whitespace strings
            if not isinstance(item, bs4.element.Tag):
                continue
            report = report_from(item)

            # can limit it to just one report, for debugging convenience
            if only_id and only_id != report['report_id']:
                continue

            inspector.save_report(report)
def run(options):
    """Save reports from the RSS feed and the three HTML listing pages.

    The recent-audit, archive and other-report pages are parsed with the same
    ``report_from`` helper but different link selectors.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the RSS feed
    feed = BeautifulSoup(utils.download(RSS_URL))
    for item in feed.select("item"):
        report = rss_report_from(item, year_range)
        if report:
            inspector.save_report(report)

    listings = (
        # Pull the recent audit reports.
        (RECENT_AUDITS_URL, "div.block > a"),
        # Pull the archive audit reports
        (AUDIT_ARCHIVE_URL, "div.block a"),
        # Pull the other reports
        (OTHER_REPORTS_URl, "div.block > a"),
    )
    for listing_url, link_selector in listings:
        page = BeautifulSoup(utils.download(listing_url))
        for link in page.select(link_selector):
            report = report_from(link, year_range)
            if report:
                inspector.save_report(report)
def run(options):
    """Save the audit, other, semiannual and plan reports.

    Audit pages are fetched per requested year from 2002 onward (2018 uses
    LATEST_AUDIT_REPORTS_URL). A year whose page comes back as None is
    skipped; the run fails only if no year produced any table rows.

    Raises:
        inspector.NoReportsFoundError: no audit rows at all, or one of the
            other three pages has no matching elements.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    found_audit_rows = False
    for year in year_range:
        if year < 2002:  # The oldest page for audit reports
            continue

        audit_url = (LATEST_AUDIT_REPORTS_URL if year == 2018
                     else AUDIT_REPORTS_URL.format(year=year))
        doc = utils.beautifulsoup_from_url(audit_url)
        if doc is None:
            # Next year's audit page may not be published yet
            continue

        rows = doc.select("div.mainCenter table tr")
        if rows:
            found_audit_rows = True
        # Skip the header row
        for row in rows[1:]:
            report = report_from(row, report_type='audit', year_range=year_range)
            if report:
                inspector.save_report(report)

    if not found_audit_rows:
        raise inspector.NoReportsFoundError("NCUA (audit reports)")

    # Pull the other reports
    paragraphs = utils.beautifulsoup_from_url(OTHER_REPORTS_URL).select("div.mainCenter p")
    if not paragraphs:
        raise inspector.NoReportsFoundError("NCUA (other)")
    for paragraph in paragraphs:
        report = other_report_from(paragraph, year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    links = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL).select(
        "div#mainColumns div.mainCenter a")
    if not links:
        raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
    for link in links:
        report = semiannual_report_from(link, year_range)
        if report:
            inspector.save_report(report)

    # Pull the performance and strategic plans
    paragraphs = utils.beautifulsoup_from_url(PLANS_URL).select("div.mainCenter p")
    if not paragraphs:
        raise inspector.NoReportsFoundError("NCUA (performance/strategic plans)")
    for paragraph in paragraphs:
        report = plan_from(paragraph, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save the audit reports listed under each year heading.

    Year headings are the parents of anchors whose id is four digits, plus
    every ``p.Ptitle1`` element; they are visited newest year first. The
    reports for a heading are the ``li`` items of the next sibling ``ul``.

    Raises:
        inspector.NoReportsFoundError: no year headings were found.
    """
    year_range = inspector.year_range(options, archive)

    def heading_year(tag):
        return int(tag.text.strip())

    # Pull the audit reports
    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)

    year_anchors = doc.find_all("a", id=re.compile("^[0-9]{4}$"))
    headings = {anchor.parent for anchor in year_anchors}
    headings.update(doc.find_all("p", class_="Ptitle1"))
    headings = sorted(headings, key=heading_year, reverse=True)
    if not headings:
        raise inspector.NoReportsFoundError("ITC")

    for heading in headings:
        year = heading_year(heading)
        for item in heading.findNextSibling("ul").select("li"):
            if not inspector.sanitize(item.text):
                logging.debug("Skipping empty list item.")
                continue
            report = audit_report_from(year, item, AUDIT_REPORTS_URL, year_range)
            if report:
                inspector.save_report(report)
def run(options):
    """Process the yearly audit and inspection feeds, then save the
    congressional testimony and semiannual reports.

    For each requested year the audit URL is handled before the inspection
    URL; a year without a URL for a feed is skipped for that feed.
    """
    year_range = inspector.year_range(options)

    # Pull the audit reports
    yearly_feeds = (
        (audit_report_url, "auditreports"),
        (inspection_report_url, "iereports"),
    )
    for year in year_range:
        for url_for_year, feed_name in yearly_feeds:
            url = url_for_year(year)
            if url:
                parse_result_from_js_url(url, feed_name, year, year_range)

    # Pull the congressional testimony
    page = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
    first_list = page.findAll("ul", type='disc')[0]
    for item in first_list.select("li"):
        report = congressional_testimony_report_from(item, year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    first_list = page.findAll("ul", type='disc')[0]
    for item in first_list.select("li"):
        report = semiannual_report_from(item, year_range)
        if report:
            inspector.save_report(report)
def run(options):
    """Save the reports linked from every page in REPORT_URLS.

    Pages differ in markup, so several link selectors are tried in order and
    the first one that matches anything is used.

    Raises:
        inspector.NoReportsFoundError: none of the selectors match on a page.
    """
    year_range = inspector.year_range(options, archive)

    link_selectors = (
        "blockquote > ul > a",
        "blockquote > ul > li > a",
        "blockquote > font > ul > a",
        "blockquote > a",
    )

    # Pull the audit reports
    for url, report_type in REPORT_URLS.items():
        markup = utils.download(url)
        # This typo confuses BS4 and interferes with our selectors
        markup = markup.replace('<h4>2015</h3>', '<h4>2015</h4>')
        doc = BeautifulSoup(markup)

        for selector in link_selectors:
            links = doc.select(selector)
            if links:
                break
        else:
            raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % url)

        for link in links:
            report = report_from(link, url, report_type, year_range)
            if report:
                inspector.save_report(report)
def run(options):
    """Save every report from the paged grid view.

    The page count is read from the "Page 1 of N" pagination text of the
    initial page; each page is then requested with a form POST that sets the
    grid view's event target and a ``Page$<n>`` event argument. Paging stops
    early if a response contains no table rows.

    The page-count pattern is a raw string: the original non-raw ``"\\d"``
    is an invalid escape sequence that newer Python versions warn about. The
    compiled pattern is unchanged.
    """
    year_range = inspector.year_range(options, archive)

    # Find the number of pages to iterate
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    page_count_text = doc.select("div.AspNet-GridView-Pagination")[0].text
    page_count = int(re.search(r"Page 1 of (\d+)", page_count_text).group(1))

    # Iterate over those pages
    for page in range(1, page_count + 1):
        response = utils.scraper.post(
            REPORTS_URL,
            data={
                "__EVENTTARGET": "ctl00$ctl00$MainContent$NavTreeSubContent$sv$GridViewSummary",
                "__EVENTARGUMENT": "Page${page_number}".format(page_number=page),
            },
            cookies=COOKIES,
        )
        doc = BeautifulSoup(response.content)
        rows = doc.select("div.AspNet-GridView table tr")
        if not rows:
            break
        # Skip the header row
        for row in rows[1:]:
            report = report_from(row, year_range)
            if report:
                inspector.save_report(report)
def run(options):
    """Save the paginated audit reports and the other report listings.

    ``pages`` (default ALL_PAGES) is the maximum number of audit listing
    pages to fetch, starting at page 0. The previous range,
    ``range(0, int(pages) - 1)``, was off by one: ``--pages=N`` fetched only
    N-1 pages and ``--pages=1`` fetched nothing.

    An empty first audit page, or an other-report page with no matching
    elements, raises the inspector's no-reports error.
    """
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # Pull the audit reports. Pages are 0-indexed.
    for page in range(int(pages)):
        doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(page=page))
        results = doc.select("span.field-content")
        if not results:
            if page == 0:
                # NOTE(review): other scrapers raise
                # inspector.NoReportsFoundError -- confirm that
                # NoReportsFound exists on the inspector module.
                raise inspector.NoReportsFound("FHFA (audit reports)")
            # No more results, we must have hit the last page
            break

        for result in results:
            report = report_from(result, year_range, report_type='audit')
            if report:
                inspector.save_report(report)

    # Grab the other reports
    for report_type, url in OTHER_REPORT_URLS.items():
        doc = utils.beautifulsoup_from_url(url)
        # Fall back to whole rows when the page has no field elements
        results = doc.select(".views-field")
        if not results:
            results = doc.select(".views-row")
        if not results:
            raise inspector.NoReportsFound("FHFA (%s)" % report_type)
        for result in results:
            report = report_from(result, year_range, report_type)
            if report:
                inspector.save_report(report)
def run(self, options):
    """Save the reports found on every URL returned by ``self.urls_for()``.

    Stores the options, the year range and the first/last day of that range
    on the instance before scraping. Report nodes are located with the first
    selector that matches anything on the page.
    """
    self.options = options
    self.year_range = inspector.year_range(self.options, archive)
    self.first_date = datetime.datetime(self.year_range[0], 1, 1)
    self.last_date = datetime.datetime(self.year_range[-1], 12, 31)

    node_selectors = (
        '.energy-listing__results .node',
        '.field-items .node',
        '.node',
    )

    for url in self.urls_for():
        page = BeautifulSoup(utils.download(url))

        for selector in node_selectors:
            nodes = page.select(selector)
            if nodes:
                break
        else:
            # NOTE(review): other scrapers raise
            # inspector.NoReportsFoundError -- confirm this name exists.
            raise inspector.NoReportsFoundException(
                "Department of Energy (%s)" % url)

        for node in nodes:
            report = self.report_from(node)
            # Empty report indicates a report out of the date range, or not the ID.
            if report:
                inspector.save_report(report)
def run(options):
    """Save the paginated audit reports and the other report listings.

    ``pages`` (default ALL_PAGES) is the maximum number of audit listing
    pages to fetch, starting at page 0; paging stops early at the first page
    without results. The previous range, ``range(0, int(pages) - 1)``, was
    off by one: ``--pages=N`` fetched only N-1 pages and ``--pages=1``
    fetched nothing.
    """
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # Pull the audit reports. Pages are 0-indexed.
    for page in range(int(pages)):
        doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
        results = doc.select("span.field-content")
        if not results:
            # No more results, we must have hit the last page
            break

        for result in results:
            report = report_from(result, year_range, report_type='audit')
            if report:
                inspector.save_report(report)

    # Grab the other reports
    for report_type, url in OTHER_REPORT_URLS.items():
        doc = BeautifulSoup(utils.download(url))
        # Fall back to whole rows when the page has no field elements
        results = doc.select(".views-field")
        if not results:
            results = doc.select(".views-row")
        for result in results:
            report = report_from(result, year_range, report_type)
            if report:
                inspector.save_report(report)
def run(options):
    """Gather reports for each topic, merge repeats, then save them all.

    Topics come from the comma-separated ``topics`` option, defaulting to
    every key of TOPIC_TO_URL. When a report id shows up again under another
    topic, the first copy is kept and the new topic name is appended to its
    "topic" field.

    Raises:
        inspector.NoReportsFoundError: a page lacks the
            ``.view-business-areas`` container.
    """
    year_range = inspector.year_range(options, archive)

    topics = options.get('topics')
    topics = topics.split(",") if topics else TOPIC_TO_URL.keys()

    reports_by_id = {}

    def remember(report, topic):
        report_id = report["report_id"]
        if report_id in reports_by_id:
            known = reports_by_id[report_id]
            known["topic"] = known["topic"] + ", " + topic
        else:
            reports_by_id[report_id] = report

    for topic in topics:
        for year_url in urls_for(year_range, topic):
            logging.debug("Scraping %s" % year_url)
            doc = utils.beautifulsoup_from_url(year_url)
            if not doc.select(".view-business-areas"):
                raise inspector.NoReportsFoundError("DOT (%s)" % topic)

            for entry in doc.select(".view-business-areas .views-row"):
                report = report_from(entry, year_range, topic, options)
                if report:
                    remember(report, topic)

    for report in reports_by_id.values():
        inspector.save_report(report)
def run(options):
    """Process each component landing page, then save the collected reports.

    With a known ``component`` option only that component's page is
    processed; otherwise the landing-page links are read from the components
    index page. Pages are processed in sorted URL order.
    """
    year_range = inspector.year_range(options, archive)

    # Can limit search to any of the components listed at the top of this script
    component = options.get('component')
    if component and component in components:
        landing_pages = {
            urljoin(base_url, "%s.htm" % component): components[component],
        }
    else:
        # Otherwise, get links to each component's landing page from main page.
        starting_point = "https://oig.justice.gov/reports/components.htm"
        landing_pages = {}
        for section in get_content(starting_point):
            for anchor in section.find_all("a"):
                landing_pages[urljoin(base_url, anchor.get("href"))] = anchor.string

    # For each component's landing page, run the processor over it
    for page_url in sorted(landing_pages):
        extract_info(get_content(page_url), landing_pages[page_url], year_range)

    # `report` is a module-level mapping -- presumably filled in by
    # extract_info; confirm against its definition.
    found = len(report)
    logging.info("Found %i reports, for year %i to %i" % (found, year_range[0], year_range[-1]))

    # Iterate over a snapshot of the keys, as the original did
    for key in list(report):
        inspector.save_report(report[key])
def run(options):
    """Download result pages in order and save the reports in range.

    Up to ``pages`` pages (default ALL_PAGES) are fetched, stopping once the
    page number passes the last page reported by the most recent download.
    The year filter is applied client-side to every parsed report.
    """
    year_range = inspector.year_range(options)
    page_limit = int(options.get('pages', ALL_PAGES))

    last_known_page = None
    for page in range(1, page_limit + 1):
        if last_known_page and page > last_known_page:
            print("End of pages!")
            break

        print("## Downloading page %i" % page)
        doc = BeautifulSoup(utils.download(url_for(options, page)))
        last_known_page = last_page_for(doc)

        for row in doc.select(".views-row"):
            report = report_from(row)

            # inefficient enforcement of --year arg, USPS doesn't support it server-side
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) in year_range:
                inspector.save_report(report)
            else:
                print("[%s] Skipping report, not in requested range." % report['report_id'])
def scrape_reports(options):
    """Pull reports from "Reports and Testimonies - Browse by date" web page.

    Each year in the requested range is walked 50 rows at a time; paging
    continues for as long as the last ``a.non-current_page`` link on the page
    starts with "Next".
    """
    REPORTS_URL = (
        'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/'
        '%s&adv_end_date=12/31/%s&rows=50&o=%s'
    )  # % (year, year, offset)

    # Amazingly, reports go back to 1940, though those are unlikely to be
    # legible enough to OCR. Also very cool, even 1950s-era reports seem to
    # have a highlightable embedded text layer in them. Of course, it was the
    # General Accounting Office back then and less oversighty.
    archive = 1970
    year_range = inspector.year_range(options, archive)

    for year in year_range:
        offset = 0
        while True:
            doc = utils.beautifulsoup_from_url(REPORTS_URL % (year, year, offset))

            for listing in doc.select("div.listing"):
                report = process_report(listing, year_range)
                if report:
                    inspector.save_report(report)

            page_links = doc.select("a.non-current_page")
            has_next = len(page_links) and page_links[-1].text.startswith('Next')
            if not has_next:
                break
            offset += 50
def run(options):
    """Save the Tennessee Valley Authority audit and semiannual reports.

    Audit listings are fetched one page per year (2005 onward) from
    ``AUDIT_REPORTS_URL``; semiannual reports come from the single page at
    ``SEMIANNUAL_REPORTS_URL``.

    Raises:
        inspector.NoReportsFoundError: if a fetched page yields no matching
            elements.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    # 2005 is the earliest audits go back
    audit_years = [year for year in year_range if year >= 2005]
    for year in audit_years:
        audit_url = AUDIT_REPORTS_URL.format(year=year)
        audit_doc = BeautifulSoup(utils.download(audit_url))
        blocks = audit_doc.select("div.content")
        if not blocks:
            raise inspector.NoReportsFoundError(
                "Tennessee Valley Authority (%d)" % year)
        for block in blocks:
            audit = audit_report_from(block, audit_url, year_range)
            if audit:
                inspector.save_report(audit)

    # Pull the semiannual reports
    semiannual_doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    entries = semiannual_doc.select("report")
    if not entries:
        raise inspector.NoReportsFoundError(
            "Tennessee Valley Authority (semiannual reports)")
    for entry in entries:
        semiannual = semiannual_report_from(entry, year_range)
        if semiannual:
            inspector.save_report(semiannual)
def run(options):
    """Save semiannual, special and per-year audit reports.

    Semiannual reports are the ``option`` entries of the first
    ``#AnnualManagementReports select`` element on ``REPORTS_URL``. Special
    reports are the rows of the table with ``bordercolor="#808080"`` on the
    same page. Audit reports are read from one ``AUDIT_REPORTS_URL`` page per
    year, starting with 2001. The first row of each table is treated as a
    header and skipped.
    """
    year_range = inspector.year_range(options, archive)

    landing = BeautifulSoup(utils.download(REPORTS_URL))

    # Pull the semiannual reports
    semiannual_select = landing.select("#AnnualManagementReports select")[0]
    for option in semiannual_select.select("option"):
        semiannual = semiannual_report_from(option, year_range)
        if semiannual:
            inspector.save_report(semiannual)

    # Pull the special reports
    special_table = landing.find("table", attrs={"bordercolor": "#808080"})
    for position, row in enumerate(special_table.select("tr")):
        if position == 0:
            # Skip the header row
            continue
        special = report_from(row, REPORTS_URL, report_type='other', year_range=year_range)
        if special:
            inspector.save_report(special)

    # Pull the audit reports
    # 2001 is the oldest fiscal year page available
    audit_years = [year for year in year_range if year >= 2001]
    for year in audit_years:
        year_url = AUDIT_REPORTS_URL.format(year=year)
        year_doc = BeautifulSoup(utils.download(year_url))
        for position, row in enumerate(year_doc.select("#main table tr")):
            if position == 0:
                # Skip the header row
                continue
            audit = report_from(row, year_url, report_type='audit', year_range=year_range)
            if audit:
                inspector.save_report(audit)
def run(options):
    """Walk the paged Social Security Administration report listings.

    Audit reports are paged per year; the other report types listed in
    ``OTHER_REPORT_URLS`` are paged once each. Paging for a listing stops at
    the first page on which ``reports_from_page`` finds nothing.

    Raises:
        inspector.NoReportsFoundError: if the very first page of a listing
            has no reports.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        report_type = 'audit'
        for page in range(0, ALL_PAGES):
            if reports_from_page(AUDIT_REPORTS_URL, page, report_type, year_range, year):
                continue
            if page == 0:
                raise inspector.NoReportsFoundError(
                    "Social Security Administration (%d)" % year)
            break

    # Pull the other reports
    for report_type, report_format in OTHER_REPORT_URLS.items():
        for page in range(0, ALL_PAGES):
            if reports_from_page(report_format, page, report_type, year_range):
                continue
            if page == 0:
                raise inspector.NoReportsFoundError(
                    "Social Security Administration (%s)" % report_type)
            break
def run(options):
    """Fetch the report table for each component and save the merged result.

    Options read: ``component`` (restrict to one component, otherwise every
    key of ``COMPONENTS``), ``report_id`` (keep only that report) and
    ``limit`` (maximum accepted rows per component, 0 meaning no limit).
    Reports sharing the same (report_id, title) across components are saved
    once, with ``agency`` and ``agency_name`` joined by ", ".

    Raises:
        inspector.NoReportsFoundError: if a component page has no body rows.
    """
    year_range = inspector.year_range(options, archive)

    only_component = options.get('component')
    components = [only_component] if only_component else list(COMPONENTS.keys())

    report_id = options.get('report_id')
    limit = int(options.get('limit', 0))

    merged = {}
    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        doc = BeautifulSoup(utils.download(url))

        # accept only trs that look like body tr's (no 'align' attribute)
        # note: HTML is very inconsistent. cannot rely on thead or tbody
        rows = [
            row
            for row in doc.select("table.contentpaneopen table[border=1] tr")
            if row.get('align') is None
        ]
        if not rows:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for row in rows:
            report = report_from(row, component, url)
            if not report:
                continue
            if report_id and (report_id != report['report_id']):
                continue
            if inspector.year_from(report) not in year_range:
                # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
                continue

            key = (report["report_id"], report["title"])
            if key not in merged:
                merged[key] = report
            else:
                # Same report listed under another component: extend agencies.
                seen = merged[key]
                seen["agency"] = seen["agency"] + ", " + report["agency"]
                seen["agency_name"] = seen["agency_name"] + ", " + report["agency_name"]

            count += 1
            if limit and (count >= limit):
                break

        logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

    for report in merged.values():
        inspector.save_report(report)
def run(options):
    """Save the NASA audit reports and the reports on the "other" page.

    Audit pages are addressed by the two-digit year substituted into
    ``AUDITS_REPORTS_URL``; the first table row and any row with blank text
    are ignored.

    Raises:
        inspector.NoReportsFoundError: if a page yields no rows/items.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        two_digit_year = str(year)[2:4]
        url = AUDITS_REPORTS_URL.format(two_digit_year)
        rows = BeautifulSoup(utils.download(url)).select("tr")
        if not rows:
            raise inspector.NoReportsFoundError("NASA (%d)" % year)
        for position, row in enumerate(rows):
            is_header = position == 0
            if is_header or not row.text.strip():
                # Skip the header row and any empty rows
                continue
            audit = audit_report_from(row, url, year_range)
            if audit:
                inspector.save_report(audit)

    # Pull the other reports
    other_doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
    items = other_doc.select("#subContainer ul li")
    if not items:
        raise inspector.NoReportsFoundError("NASA (other)")
    for item in items:
        other = other_report_from(item, year_range)
        if other:
            inspector.save_report(other)
def run(options):
    """Save National Archives audit, semiannual and peer review reports.

    Audit reports are read from one ``AUDIT_REPORTS_URL`` page per year
    (2006 onward), semiannual reports from ``SEMIANNUAL_REPORTS_URL``, and
    the peer review from the first text-bearing link inside the ``content``
    div of ``PEER_REVIEWS_URL``.

    Raises:
        inspector.NoReportsFoundError: if an audit or semiannual page has no
            ``div#content li`` items.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    # 2006 is the oldest year for audit reports
    audit_years = [year for year in year_range if year >= 2006]
    for year in audit_years:
        url = AUDIT_REPORTS_URL.format(year=year)
        items = BeautifulSoup(utils.download(url)).select("div#content li")
        if not items:
            raise inspector.NoReportsFoundError(
                "National Archives and Records Administration audit reports")
        for item in items:
            audit = audit_report_from(item, url, year, year_range)
            if audit:
                inspector.save_report(audit)

    # Pull the semiannual reports
    semiannual_doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    items = semiannual_doc.select("div#content li")
    if not items:
        raise inspector.NoReportsFoundError(
            "National Archives and Records Administration semiannual reports")
    for item in items:
        semiannual = semiannual_report_from(item, year_range)
        if semiannual:
            inspector.save_report(semiannual)

    # Pull the Peer Review
    peer_doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL))
    content_div = peer_doc.find("div", id='content')
    peer_link = content_div.find("a", text=True)
    peer_review = peer_review_from(peer_link, year_range)
    if peer_review:
        inspector.save_report(peer_review)
def run(options):
    """Save reports linked from the general, archive and semiannual pages.

    The general and semiannual pages contribute the links inside
    ``li.mainContenttext``; the archive page additionally contributes links
    inside ``span.mainContenttext`` and ignores links with empty text.
    """
    year_range = inspector.year_range(options, archive)

    list_links = "div#mainContent li.mainContenttext a"
    span_links = "div#mainContent span.mainContenttext a"

    # Pull the general reports
    general_doc = BeautifulSoup(utils.download(REPORTS_URL))
    for link in general_doc.select(list_links):
        general = report_from(link, REPORTS_URL, year_range)
        if general:
            inspector.save_report(general)

    # Pull the archive reports
    archive_doc = BeautifulSoup(utils.download(REPORT_ARCHIVE_URL))
    archive_links = archive_doc.select(list_links) + archive_doc.select(span_links)
    for link in archive_links:
        if link.text:
            archived = report_from(link, REPORT_ARCHIVE_URL, year_range)
            if archived:
                inspector.save_report(archived)

    # Pull the semiannual reports
    semiannual_doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    for link in semiannual_doc.select(list_links):
        semiannual = semiannual_report_from(link, year_range)
        if semiannual:
            inspector.save_report(semiannual)
def run(options):
    """Save the National Labor Relations Board reports.

    Each (report_type, url) pair in ``REPORT_URLS`` is scraped for
    ``div.field-item`` elements, followed by the semiannual reports page.

    Raises:
        inspector.NoReportsFoundError: if a page has no ``div.field-item``.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit and inspections reports
    for report_type, reports_url in REPORT_URLS:
        items = BeautifulSoup(utils.download(reports_url)).select("div.field-item")
        if not items:
            raise inspector.NoReportsFoundError(
                "National Labor Relations Board (%s)" % report_type)
        for item in items:
            found = report_from(item, report_type, reports_url, year_range)
            if found:
                inspector.save_report(found)

    # Pull the semiannual reports
    semiannual_doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    items = semiannual_doc.select("div.field-item")
    if not items:
        raise inspector.NoReportsFoundError(
            "National Labor Relations Board (semiannual reports)")
    for item in items:
        semiannual = semiannual_report_from(item, year_range)
        if semiannual:
            inspector.save_report(semiannual)
def run(options):
    """Save the VA audit reports and semiannual reports.

    Audit pages are requested as ``REPORTS_URL?RS=<page>`` for pages 1..999,
    stopping at the first page without any ``div.leadin`` element.

    Raises:
        inspector.NoReportsFoundError: if the first audit page or the
            semiannual page has no ``div.leadin`` element.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for page in range(1, 1000):
        page_url = "{}?RS={}".format(REPORTS_URL, page)
        entries = beautifulsoup_from_url(page_url).select("div.leadin")
        if not entries:
            if page != 1:
                break
            raise inspector.NoReportsFoundError("VA (audit reports)")
        for entry in entries:
            audit = report_from(entry, year_range)
            if audit:
                inspector.save_report(audit)

    # Pull the semiannual reports
    entries = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL).select("div.leadin")
    if not entries:
        raise inspector.NoReportsFoundError("VA (semiannual reports)")
    for entry in entries:
        semiannual = semiannual_report_from(entry, year_range)
        if semiannual:
            inspector.save_report(semiannual)
def run(options):
    """Save audit, FOIA and semiannual reports.

    Audit reports come from one ``AUDIT_REPORTS_URL`` page per year (2002
    onward) and FOIA reports from ``FOIA_REPORTS_URL``; in both, the first
    row of ``div.content table`` is treated as a header and skipped.
    Semiannual reports are the links inside ``div.content`` on
    ``SEMIANNUAL_REPORTS_URL``.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    # 2002 is the oldest page for audit reports
    audit_years = [year for year in year_range if year >= 2002]
    for year in audit_years:
        audit_doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(year=year)))
        for position, row in enumerate(audit_doc.select("div.content table tr")):
            if position == 0:
                # Skip the header row
                continue
            audit = report_from(row, report_type="audit", year_range=year_range)
            if audit:
                inspector.save_report(audit)

    # Pull the FOIA reports
    foia_doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL))
    for position, row in enumerate(foia_doc.select("div.content table tr")):
        if position == 0:
            # Skip the header row
            continue
        foia = report_from(row, report_type="other", year_range=year_range)
        if foia:
            inspector.save_report(foia)

    # Pull the semiannual reports
    semiannual_doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    for link in semiannual_doc.select("div.content a"):
        semiannual = semiannual_report_from(link, year_range)
        if semiannual:
            inspector.save_report(semiannual)
def run(options):
    """Save audit, inspection, testimony and semiannual reports.

    Audit and inspection listings are located per year through
    ``audit_report_url`` / ``inspection_report_url`` and handed to
    ``parse_result_from_js_url``; years for which no URL is returned are
    skipped. Testimony and semiannual reports are the ``li`` items of the
    first ``<ul type="disc">`` on their respective pages.
    """
    year_range = inspector.year_range(options, archive)

    # October, November, and December fall into the next fiscal year
    # Add next year to year_range to compensate
    in_next_fiscal_year = datetime.datetime.now().month >= 10
    if in_next_fiscal_year:
        year_range.append(max(year_range) + 1)

    # Pull the audit reports
    for year in year_range:
        audit_url = audit_report_url(year)
        if audit_url:
            parse_result_from_js_url(audit_url, "auditreports", year, year_range, report_type='audit')

        inspection_url = inspection_report_url(year)
        if inspection_url:
            parse_result_from_js_url(inspection_url, "iereports", year, year_range, report_type='inspection')

    # Pull the congressional testimony
    testimony_doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_REPORTS_URL)
    testimony_list = testimony_doc.findAll("ul", type='disc')[0]
    for item in testimony_list.select("li"):
        testimony = congressional_testimony_report_from(item, year_range)
        if testimony:
            inspector.save_report(testimony)

    # Pull the semiannual reports
    semiannual_doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    semiannual_list = semiannual_doc.findAll("ul", type='disc')[0]
    for item in semiannual_list.select("li"):
        semiannual = semiannual_report_from(item, year_range)
        if semiannual:
            inspector.save_report(semiannual)
def run(options):
    """Page through the AJAX report listing and save every report returned.

    Up to ``pages`` pages (the ``pages`` option, defaulting to ``ALL_PAGES``)
    are requested from ``REPORTS_AJAX_URL`` with a form-encoded POST. The
    HTML fragment is taken from element 1 of the JSON response. Paging stops
    early as soon as a fragment contains no ``tr`` rows; the first row of
    each fragment is treated as a header and skipped.
    """
    year_range = inspector.year_range(options, archive)

    # Suggested flow, for an IG which paginates results.
    pages = options.get('pages', ALL_PAGES)

    # Pages are numbered from 0: the first request carries no `page` field
    # and later requests send page=1, 2, ... Starting this loop at 1 made the
    # `if page` guard below always true and never requested the un-paged
    # first listing. The number of pages requested is unchanged.
    for page in range(int(pages)):
        data = {
            'view_name': 'oig_nodes',
            'view_display_id': 'block_search_oig_reports',
        }
        if page:
            # Only add page= if page > 0
            data['page'] = page

        response = utils.scraper.post(
            REPORTS_AJAX_URL,
            data=data,
            headers={
                "Content-Type": "application/x-www-form-urlencoded",
            },
        )
        page_html = response.json()[1]['data']
        doc = BeautifulSoup(page_html)

        results = doc.select("tr")
        if not results:
            # An empty fragment means we have run past the last page.
            break

        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
def run(options):
    """Save the Federal Reserve audit reports and semiannual reports.

    Audit reports are the ``#rounded-corner > tr`` rows of ``REPORTS_URL``.
    Semiannual reports are reached through the sidebar links on
    ``SEMIANNUAL_REPORTS_URL``, plus that page itself.

    Raises:
        inspector.NoReportsFoundError: if the audit rows or the semiannual
            sidebar links are missing.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    audit_rows = utils.beautifulsoup_from_url(REPORTS_URL).select("#rounded-corner > tr")
    if not audit_rows:
        raise inspector.NoReportsFoundError("Federal Reserve (audit reports)")
    for row in audit_rows:
        audit = report_from(row, year_range)
        if audit:
            inspector.save_report(audit)

    # Pull the semiannual reports
    semiannual_doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    sidebar_links = semiannual_doc.select("div.style-aside ul > li > a")
    if not sidebar_links:
        raise inspector.NoReportsFoundError("Federal Reserve (semiannual reports)")
    for anchor in sidebar_links:
        report_url = urljoin(BASE_PAGE_URL, anchor.get('href'))
        semiannual = semiannual_report_from(report_url, year_range)
        if semiannual:
            inspector.save_report(semiannual)

    # The most recent semiannual report will be embedded on the main page
    latest = semiannual_report_from(SEMIANNUAL_REPORTS_URL, year_range)
    if latest:
        inspector.save_report(latest)
def run(options):
    """Walk the paged Social Security Administration report listings.

    Audit reports are paged per year and only need to turn up on at least
    one page across all requested years. Each other report type in
    ``OTHER_REPORT_URLS`` must have reports on its first page.

    Raises:
        inspector.NoReportsFoundError: if no audit page in any year had
            reports, or if the first page of another report type is empty.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    found_any_audit = False
    for year in year_range:
        report_type = 'audit'
        for page in range(0, ALL_PAGES):
            if not reports_from_page(AUDIT_REPORTS_URL, page, report_type, year_range, year):
                break
            found_any_audit = True

    if not found_any_audit:
        raise inspector.NoReportsFoundError("Social Security Administration (audit)")

    # Pull the other reports
    for report_type, report_format in OTHER_REPORT_URLS.items():
        for page in range(0, ALL_PAGES):
            if reports_from_page(report_format, page, report_type, year_range):
                continue
            if page == 0:
                raise inspector.NoReportsFoundError("Social Security Administration (%s)" % report_type)
            break
def run(options):
    """Fetch each configured report page and hand it to its parser.

    ``REPORT_PAGES_INFO`` supplies (url, report_type, parse_func) triples;
    the first ``section.article-content`` element of each page is passed to
    ``parse_func`` along with the URL, report type and year range.
    """
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for page_url, report_type, parse_func in REPORT_PAGES_INFO:
        page = utils.beautifulsoup_from_url(page_url)
        article = page.select("section.article-content")[0]
        parse_func(article, page_url, report_type, year_range)
def run(options):
    """Fetch the report table for each component and save the merged result.

    Options read: ``component`` (restrict to one component, otherwise every
    key of ``COMPONENTS`` in sorted order), ``report_id`` (keep only that
    report) and ``limit`` (maximum accepted rows per component, 0 meaning no
    limit). Reports sharing the same (report_id, title) across components
    are saved once, with ``agency`` and ``agency_name`` joined by ", ".

    Raises:
        inspector.NoReportsFoundError: if a component page has no
            ``#content-area tbody tr`` rows.
    """
    year_range = inspector.year_range(options, archive)

    only_component = options.get('component')
    components = [only_component] if only_component else sorted(COMPONENTS.keys())

    report_id = options.get('report_id')
    limit = int(options.get('limit', 0))

    merged = {}
    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        rows = utils.beautifulsoup_from_url(url).select("#content-area tbody tr")
        if not rows:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for row in rows:
            report = report_from(row, component, url)
            if not report:
                continue
            if report_id and (report_id != report['report_id']):
                continue
            if inspector.year_from(report) not in year_range:
                # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
                continue

            key = (report["report_id"], report["title"])
            if key not in merged:
                merged[key] = report
            else:
                # Same report listed under another component: extend agencies.
                seen = merged[key]
                seen["agency"] = "{}, {}".format(seen["agency"], report["agency"])
                seen["agency_name"] = "{}, {}".format(
                    seen["agency_name"], report["agency_name"])

            count += 1
            if limit and (count >= limit):
                break

        logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

    for report in merged.values():
        inspector.save_report(report)