def run(options):
  """Scrape SSA OIG reports: paginated audits per year, then other types."""
  year_range = inspector.year_range(options, archive)

  # Audit reports: walk pages for each year until a page comes back empty.
  for year in year_range:
    for page_number in range(ALL_PAGES):
      found = reports_from_page(AUDIT_REPORTS_URL, page_number, 'audit',
                                year_range, year)
      if found:
        continue
      if page_number == 0:
        # An empty first page means the scraper is broken; fail loudly.
        raise inspector.NoReportsFoundError(
          "Social Security Administration (%d)" % year)
      break

  # Other report types: same pagination pattern, no per-year URL component.
  for report_type, report_format in OTHER_REPORT_URLS.items():
    for page_number in range(ALL_PAGES):
      found = reports_from_page(report_format, page_number, report_type,
                                year_range)
      if found:
        continue
      if page_number == 0:
        raise inspector.NoReportsFoundError(
          "Social Security Administration (%s)" % report_type)
      break
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for year in year_range: if year < 2005: # This is the earliest audits go back continue url = AUDIT_REPORTS_URL.format(year=year) doc = BeautifulSoup(utils.download(url)) results = doc.select("div.content") if not results: raise inspector.NoReportsFoundError( "Tennessee Valley Authority (%d)" % year) for result in results: report = audit_report_from(result, url, year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)) results = doc.select("report") if not results: raise inspector.NoReportsFoundError( "Tennessee Valley Authority (semiannual reports)") for result in results: report = semiannual_report_from(result, year_range) if report: inspector.save_report(report)
def run(options):
  """Scrape FCA OIG reports: general, archived, and semiannual listings."""
  year_range = inspector.year_range(options, archive)

  # General reports
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  links = doc.select("div#mainContent li.mainContenttext a")
  if not links:
    raise inspector.NoReportsFoundError("Farm Credit Administration (reports)")
  for link in links:
    report = report_from(link, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Archived reports: links appear in both <li> and <span> containers.
  doc = utils.beautifulsoup_from_url(REPORT_ARCHIVE_URL)
  links = (doc.select("div#mainContent li.mainContenttext a")
           + doc.select("div#mainContent span.mainContenttext a"))
  if not links:
    raise inspector.NoReportsFoundError("Farm Credit Administration (archive)")
  for link in links:
    # Anchors with no text are skipped.
    if link.text:
      report = report_from(link, REPORT_ARCHIVE_URL, year_range)
      if report:
        inspector.save_report(report)

  # Semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  links = doc.select("div#mainContent li.mainContenttext a")
  if not links:
    raise inspector.NoReportsFoundError("Farm Credit Administration (semiannual reports)")
  for link in links:
    report = semiannual_report_from(link, year_range)
    if report:
      inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the reports with pagination for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items(): for page in range(0, 999): url = report_url_format.format(page=page) doc = BeautifulSoup(utils.download(url)) results = doc.select("li.views-row") if not results: if page == 0: raise inspector.NoReportsFoundError("USAID (%s)" % report_type) else: break for result in results: report = report_from(result, url, report_type, year_range) if report: inspector.save_report(report) # Pull the semiannual reports (no pagination) doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)) results = doc.select("li.views-row") if not results: raise inspector.NoReportsFoundError("USAID (semiannual reports)") for result in results: report = semiannual_report_from(result, year_range) if report: inspector.save_report(report)
def run(options):
  """Scrape NCUA OIG audits, other reports, semiannual reports, and plans."""
  year_range = inspector.year_range(options, archive)
  # Set once any year's audit page yields rows; checked after the loop so a
  # single missing year page doesn't abort the whole run.
  results_flag = False

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    if year == 2018:
      # 2018 reports live on a dedicated "latest" page.
      doc = utils.beautifulsoup_from_url(LATEST_AUDIT_REPORTS_URL)
    else:
      doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(year=year))
    if doc is None:
      # Next year's audit page may not be published yet
      continue
    results = doc.select("div.mainCenter table tr")
    if results:
      results_flag = True
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)
  if not results_flag:
    raise inspector.NoReportsFoundError("NCUA (audit reports)")

  # Pull the other reports
  doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (other)")
  for result in results:
    report = other_report_from(result, year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div#mainColumns div.mainCenter a")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the performance and strategic plans
  doc = utils.beautifulsoup_from_url(PLANS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (performance/strategic plans)")
  for result in results:
    report = plan_from(result, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  """Scrape VA OIG reports: paginated audits plus semiannual reports."""
  year_range = inspector.year_range(options, archive)

  # Audit reports are paginated via the RS query parameter, starting at 1.
  page = 1
  while page < 1000:
    doc = beautifulsoup_from_url("{}?RS={}".format(REPORTS_URL, page))
    entries = doc.select("div.leadin")
    if not entries:
      if page == 1:
        # An empty first page means the scraper is broken; fail loudly.
        raise inspector.NoReportsFoundError("VA (audit reports)")
      break
    for entry in entries:
      report = report_from(entry, year_range)
      if report:
        inspector.save_report(report)
    page += 1

  # Semiannual reports live on a single page.
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  entries = doc.select("div.leadin")
  if not entries:
    raise inspector.NoReportsFoundError("VA (semiannual reports)")
  for entry in entries:
    report = semiannual_report_from(entry, year_range)
    if report:
      inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for year in year_range: if year < 2006: # The oldest year for audit reports continue url = AUDIT_REPORTS_URL.format(year=year) doc = BeautifulSoup(utils.download(url)) results = doc.select("div#content li") if not results: raise inspector.NoReportsFoundError( "National Archives and Records Administration audit reports") for result in results: report = audit_report_from(result, url, year, year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)) results = doc.select("div#content li") if not results: raise inspector.NoReportsFoundError( "National Archives and Records Administration semiannual reports") for result in results: report = semiannual_report_from(result, year_range) if report: inspector.save_report(report) # Pull the Peer Review doc = BeautifulSoup(utils.download(PEER_REVIEWS_URL)) result = doc.find("div", id='content').find("a", text=True) report = peer_review_from(result, year_range) if report: inspector.save_report(report)
def run(options):
  """Scrape NLRB OIG audit/inspection reports and semiannual reports."""
  year_range = inspector.year_range(options, archive)

  # Audit and inspection listing pages share the same markup.
  for report_type, reports_url in REPORT_URLS:
    page = BeautifulSoup(utils.download(reports_url))
    items = page.select("div.field-item")
    if not items:
      raise inspector.NoReportsFoundError(
        "National Labor Relations Board (%s)" % report_type)
    for item in items:
      report = report_from(item, report_type, reports_url, year_range)
      if report:
        inspector.save_report(report)

  # Semiannual reports.
  page = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  items = page.select("div.field-item")
  if not items:
    raise inspector.NoReportsFoundError(
      "National Labor Relations Board (semiannual reports)")
  for item in items:
    report = semiannual_report_from(item, year_range)
    if report:
      inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for year in year_range: url = AUDITS_REPORTS_URL.format(str(year)[2:4]) doc = BeautifulSoup(utils.download(url)) results = doc.select("tr") if not results: raise inspector.NoReportsFoundError("NASA (%d)" % year) for index, result in enumerate(results): if not index or not result.text.strip(): # Skip the header row and any empty rows continue report = audit_report_from(result, url, year_range) if report: inspector.save_report(report) # Pull the other reports doc = BeautifulSoup(utils.download(OTHER_REPORT_URL)) results = doc.select("#subContainer ul li") if not results: raise inspector.NoReportsFoundError("NASA (other)") for result in results: report = other_report_from(result, year_range) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the reports with pagination for report_type, report_url_format in PAGINATED_REPORT_FORMATS: for page in range(0, 999): url = report_url_format.format(page=page) doc = utils.beautifulsoup_from_url(url) if report_type == "audit" and page == 0 and not doc.select( "div.views-field-field-auditreport-doc-1"): raise Exception("Report number CSS class has changed") results = doc.select("li.views-row") if not results: if page == 0: raise inspector.NoReportsFoundError("USAID (%s)" % report_type) else: break for result in results: report = report_from(result, url, report_type, year_range) if report: inspector.save_report(report) # Pull the semiannual reports (no pagination) doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL) results = doc.select("li.views-row") if not results: raise inspector.NoReportsFoundError("USAID (semiannual reports)") for result in results: report = semiannual_report_from(result, year_range) if report: inspector.save_report(report)
def run(options):
  """Scrape FCC OIG audit, semiannual, and other reports."""
  year_range = inspector.year_range(options, archive)

  # Audit reports: table rows inside the page's <article>.
  page = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
  rows = page.article.find_all("tr")
  if not rows:
    raise inspector.NoReportsFoundError("FCC (audit reports)")
  for row in rows:
    report = report_from(row, AUDIT_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Semiannual reports: same table-row layout.
  page = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  rows = page.article.find_all("tr")
  if not rows:
    raise inspector.NoReportsFoundError("FCC (semiannual reports)")
  for row in rows:
    report = semiannual_report_from(row, SEMIANNUAL_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Other reports: one paragraph per report.
  page = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
  paragraphs = page.article.find_all("p")
  if not paragraphs:
    raise inspector.NoReportsFoundError("FCC (other)")
  for paragraph in paragraphs:
    report = other_report_from(paragraph, OTHER_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports doc = utils.beautifulsoup_from_url(REPORTS_URL) results = doc.select("#rounded-corner > tr") if not results: raise inspector.NoReportsFoundError("Federal Reserve (audit reports)") for result in results: report = report_from(result, year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL) results = doc.select("div.style-aside ul > li > a") if not results: raise inspector.NoReportsFoundError( "Federal Reserve (semiannual reports)") for result in results: report_url = urljoin(BASE_PAGE_URL, result.get('href')) report = semiannual_report_from(report_url, year_range) if report: inspector.save_report(report) # The most recent semiannual report will be embedded on the main page report = semiannual_report_from(SEMIANNUAL_REPORTS_URL, year_range) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL) results = doc.select("table tr") if not results: raise inspector.NoReportsFoundError( "Federal Maritime Commission (audits)") for result in results: if result.th: # Skip the header row continue report = report_from(result, AUDIT_REPORTS_URL, report_type='audit', year_range=year_range) if report: inspector.save_report(report) # Pull historical audits audit_year_links = doc.select("div.col-2-3 ul li a") for year_link in audit_year_links: audit_year_url = urljoin(AUDIT_REPORTS_URL, year_link.get('href')) doc = utils.beautifulsoup_from_url(audit_year_url) results = doc.select("table tr") if not results: # Grab results other than first and last (header and extra links) results = doc.select("div.col-2-2 ul")[1:-1] if not results: raise inspector.NoReportsFoundError( "Federal Maritime Commission (%s)" % audit_year_url) for result in results: if result.th: # Skip the header row continue report = report_from(result, AUDIT_REPORTS_URL, report_type='audit', year_range=year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL) results = doc.select("div.col-2-2 p a") + doc.select("div.col-2-2 li a") if not results: raise inspector.NoReportsFoundError( "Federal Maritime Commission (semiannual reports)") for result in results: report = report_from(result.parent, AUDIT_REPORTS_URL, report_type='semiannual_report', year_range=year_range) if report: inspector.save_report(report)
def run(options):
  """Scrape NSF OIG audit, semiannual, case, and testimony reports."""
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    # BUG FIX: the original message was missing its closing parenthesis.
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports)")
  for result in results:
    # ignore divider lines
    if result.select("img"):
      continue
    report = report_from(result, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      # Skip empty rows
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports — this endpoint requires a POST with form data.
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content)
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():
      # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = BeautifulSoup(utils.download(TESTIMONY_REPORTS_URL))
  results = doc.select("td.text table tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range)
    if report:
      inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for year in year_range: if year < 1998: # The earliest year for audit reports continue year_url = AUDIT_REPORTS_URL.format(year=year) doc = utils.beautifulsoup_from_url(year_url) results = doc.select("tr") if not results: raise inspector.NoReportsFoundError( "Pension Benefit Guaranty Corporation (audit reports)") for result in results: report = report_from(result, report_type='audit', year_range=year_range) if report: inspector.save_report(report) # Pull the congressional requests doc = utils.beautifulsoup_from_url(CONGRESSIONAL_REQUESTS_URL) results = doc.select("tr") if not results: raise inspector.NoReportsFoundError( "Pension Benefit Guaranty Corporation (congressional requests)") for result in results: report = report_from(result, report_type='congress', year_range=year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL) results = doc.select("div.holder a") if not results: raise inspector.NoReportsFoundError( "Pension Benefit Guaranty Corporation (semiannual reports)") for result in results: report = semiannual_report_from(result, year_range) if report: inspector.save_report(report) # Pull the congressional testimony doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_URL) results = doc.select("div.holder a") if not results: raise inspector.NoReportsFoundError( "Pension Benefit Guaranty Corporation (congressional testimony)") for result in results: report = testimony_report_from(result, year_range) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for year in year_range: if year < 2002: # The oldest page for audit reports continue doc = BeautifulSoup(utils.download( AUDIT_REPORTS_URL.format(year=year))) # if it's a 404 page (200 response code), move on if not_found(doc): continue results = doc.select("div.content table tr") if not results: raise inspector.NoReportsFoundError("NCUA (%d)" % year) for index, result in enumerate(results): if not index: # Skip the header row continue report = report_from(result, report_type='audit', year_range=year_range) if report: inspector.save_report(report) # Pull the FOIA reports doc = BeautifulSoup(utils.download(FOIA_REPORTS_URL)) results = doc.select("div.content table tr") if not results: raise inspector.NoReportsFoundError("NCUA (FOIA)") for index, result in enumerate(results): if not index: # Skip the header row continue report = report_from(result, report_type='other', year_range=year_range) if report: inspector.save_report(report) # Pull the semiannual reports doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)) results = doc.select("div.content a") if not results: raise inspector.NoReportsFoundError("NCUA (semiannual reports)") for result in results: report = semiannual_report_from(result, year_range) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the reports doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL) rows = doc.select("div.content > div > div > div > div > div.row") row_audits = rows[0] # Audit reports results = row_audits.select("ul li.pdf") if not results: raise inspector.NoReportsFoundError("CPB (audits)") for result in results: report = report_from(result, AUDIT_REPORTS_URL, "audit", year_range) if report: inspector.save_report(report) doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL) rows = doc.select("div.content > div > div > div > div.row") row_peer_review = rows[0] col_plans = rows[1].select("div.col-md-6")[0] col_congress = rows[1].select("div.col-md-6")[1] # Peer review results = row_peer_review.select("ul li.pdf") if not results: raise inspector.NoReportsFoundError("CPB (peer reviews)") for result in results: report = report_from(result, OTHER_REPORTS_URL, "other", year_range) if report: inspector.save_report(report) # Plans results = col_plans.select("ul li.pdf") if not results: raise inspector.NoReportsFoundError("CPB (plans)") for result in results: report = report_from(result, OTHER_REPORTS_URL, "other", year_range) if report: inspector.save_report(report) # Semiannual reports to congress results = col_congress.select("ul li.pdf") if not results: raise inspector.NoReportsFoundError("CPB (semiannual reports)") for result in results: report = report_from(result, OTHER_REPORTS_URL, "semiannual_report", year_range) if report: inspector.save_report(report)
def run(options):
  """Scrape Treasury OIG audits, other report types, and semiannual reports."""
  year_range = inspector.year_range(options, archive)
  if datetime.datetime.now().month >= 10:
    # October, November, and December fall into the next fiscal year
    # Add next year to year_range to compensate
    year_range.append(max(year_range) + 1)

  # Pull the audit reports
  for year in year_range:
    if year < 2006:  # This is the oldest year for these reports
      continue
    url = AUDIT_REPORTS_BASE_URL.format(year)
    doc = utils.beautifulsoup_from_url(url)
    results = doc.find_all(
      "tr", class_=["ms-rteTableOddRow-default", "ms-rteTableEvenRow-default"])
    if not results:
      # The page for the upcoming fiscal year may legitimately be empty,
      # so only fail for years that should have reports.
      if year != datetime.datetime.now().year + 1:
        raise inspector.NoReportsFoundError("Treasury (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other report types
  for report_type, url in OTHER_URLS.items():
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select(
      "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p a"
    )
    if not results:
      raise inspector.NoReportsFoundError("Treasury (%s)" % report_type)
    for result in results:
      # When this is the only link in its parent, hand the parent element to
      # report_from instead of the bare anchor.
      if len(result.parent.find_all("a")) == 1:
        result = result.parent
      report = report_from(result, url, report_type, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select(
    "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p > a"
  )
  if not results:
    raise inspector.NoReportsFoundError("Treasury (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, SEMIANNUAL_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  """Scrape NRC OIG audits (current plus archived/prior-pending pages),
  semiannual reports, and other reports."""
  year_range = inspector.year_range(options, archive)

  urls = [ARCHIVED_REPORTS_URL, PRIOR_PENDING_REPORTS_URL]
  for year in year_range:
    if year >= 2005:
      urls.append(AUDIT_REPORTS_URL.format(year))

  # Pull the audit reports
  for url in urls:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      # BUG FIX: the original message interpolated the stale `year` variable
      # left over from the loop above (always the last year in year_range),
      # which misidentified the failing page. Report the URL instead.
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (%s)" % url)
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the congressional testimony
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  semiannual_reports_table = doc.find("table", border="1")
  results = semiannual_reports_table.select("tr")
  if not results:
    raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (congressional testimony)")
  for index, result in enumerate(results):
    if index < 2:
      # Skip the first two header rows
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  for reports_url, id_prefix in OTHER_REPORT_URLS:
    doc = utils.beautifulsoup_from_url(reports_url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (other)")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = other_report_from(result, year_range, id_prefix, reports_url)
      if report:
        inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for year in year_range: url = AUDITS_REPORTS_URL.format(year) doc = BeautifulSoup(utils.download(url)) results = doc.find("table", border="1").select("tr") if not results: raise inspector.NoReportsFoundError( "Nuclear Regulatory Commission (%d)" % year) for index, result in enumerate(results): if not index: # Skip the header row continue report = audit_report_from(result, url, year_range) if report: inspector.save_report(report) # Pull the congressional testimony doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL)) semiannual_reports_table = doc.find("table", border="1") results = semiannual_reports_table.select("tr") if not results: raise inspector.NoReportsFoundError( "Nuclear Regulatory Commission (congressional testimony)") for index, result in enumerate(results): if index < 2: # Skip the first two header rows continue report = semiannual_report_from(result, year_range) if report: inspector.save_report(report) # Pull the other reports for reports_url, id_prefix in OTHER_REPORT_URLS: doc = BeautifulSoup(utils.download(reports_url)) results = doc.find("table", border="1").select("tr") if not results: raise inspector.NoReportsFoundError( "Nuclear Regulatory Commission (other)") for index, result in enumerate(results): if not index: # Skip the header row continue report = other_report_from(result, year_range, id_prefix) if report: inspector.save_report(report)
def run(options): year_range = inspector.year_range(options, archive) # Pull the audit reports for url, report_type in REPORT_URLS.items(): page_content = utils.download(url) # This typo confuses BS4 and interferes with our selectors page_content = page_content.replace('<h4>2015</h3>', '<h4>2015</h4>') doc = BeautifulSoup(page_content) results = doc.select("blockquote > ul > a") if not results: results = doc.select("blockquote > ul > li > a") if not results: results = doc.select("blockquote > font > ul > a") if not results: results = doc.select("blockquote > a") if not results: raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % url) for result in results: report = report_from(result, url, report_type, year_range) if report: inspector.save_report(report)
def parse_investigation(content, landing_url, report_type, year_range):
  """Walk the children of an investigations page container, saving reports.

  The page is split by an <hr>: items before it are parsed from <ul> tags
  (counted as "doj" reports), items after it from any non-<h3> tag with
  text. Raises NoReportsFoundError if either section produced no reports.
  """
  doj_flag = True  # True while we are still above the <hr> separator
  doj_report_counter = 0
  other_report_counter = 0
  for child in content.children:
    if isinstance(child, Tag) and child.name == 'hr':
      # The horizontal rule marks the boundary between the two sections.
      doj_flag = False
      continue
    if doj_flag:
      if isinstance(child, Tag) and child.name == 'ul':
        report = report_from(child.li, landing_url, report_type, year_range)
        if report:
          inspector.save_report(report)
          doj_report_counter = doj_report_counter + 1
    else:
      if isinstance(child, Tag):
        if child.name != 'h3' and child.text.strip():
          report = report_from(child, landing_url, report_type, year_range)
          if report:
            inspector.save_report(report)
            other_report_counter = other_report_counter + 1
      elif isinstance(child, Comment):
        continue
      elif isinstance(child, NavigableString):
        if child.strip():
          # Stray non-whitespace text means the page layout changed.
          raise Exception("Unexpected text!: " + child)
  if doj_report_counter == 0 or other_report_counter == 0:
    raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % landing_url)
def run(options):
  """Scrape DHS OIG reports per component, merging duplicates across
  components and honoring optional component/report_id/limit filters."""
  year_range = inspector.year_range(options, archive)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')
  limit = int(options.get('limit', 0))

  # Keyed by (report_id, title); a report listed under several components is
  # stored once with its agency fields concatenated.
  all_audit_reports = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    body = utils.download(url)
    doc = BeautifulSoup(body)

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    # note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]
    if not results:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for result in results:
      report = report_from(result, component, url)
      if not report:
        continue
      if report_id and (report_id != report['report_id']):
        continue
      if inspector.year_from(report) not in year_range:
        # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
        continue
      key = (report["report_id"], report["title"])
      if key in all_audit_reports:
        # Merge agency info from this component into the existing entry.
        all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
            ", " + report["agency"]
        all_audit_reports[key]["agency_name"] = \
            all_audit_reports[key]["agency_name"] + ", " + \
            report["agency_name"]
      else:
        all_audit_reports[key] = report
      count += 1
      if limit and (count >= limit):
        break
    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in all_audit_reports.values():
    inspector.save_report(report)
def parse_mapping(content, landing_url, report_type, year_range):
  """Parse an LSC mapping-project page, special-casing each known link.

  Unrecognized links raise, so layout changes are noticed immediately.
  """
  links = content.find_all("a")
  if not links:
    raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % landing_url)
  for link in links:
    href = link.get("href")
    href = urljoin(landing_url, href)
    # `result` is the element handed to report_from; which ancestor of the
    # link is used depends on the specific page markup around it.
    result = None
    if href == "https://www.oig.lsc.gov/images/mapping/mapping.zip":
      continue
    elif href == MAPPING_PROJECT_ARCHIVE_GRANTEE_URL:
      continue
    elif href.startswith("mailto:"):
      continue
    elif href == "https://www.oig.lsc.gov/evaluation-of-legal-services-mapping-prsentation":
      # Rewrite this broken link to point at the actual PDF.
      link["href"] = "https://oig.lsc.gov/mapping/phaseIIbriefing.pdf"
      result = link.parent
    elif href in (
        "https://www.oig.lsc.gov/images/pdfs/mapping/MeekerOIGMappingReport.pdf",
        "https://www.oig.lsc.gov/core-legal-services",
    ):
      result = link.parent
    elif href == "https://www.oig.lsc.gov/images/mapping/Mapping_Evaluation_Phase_I_Volume_I_Final_Report.pdf":
      result = link.parent.parent
    elif (href.startswith("https://oig.lsc.gov/mapping/references/eval")
          and href.endswith(".pdf")):
      result = link
    else:
      raise Exception(
        "Unexpected link found on a mapping project page: %s" % href)
    report = report_from(result, landing_url, report_type, year_range)
    if report:
      inspector.save_report(report)
def run(options):
  """Scrape DOI OIG reports via the paginated report search."""
  year_range = inspector.year_range(options, archive)
  min_year = min(year_range)
  page = 0
  last_page = 0
  while page <= last_page:
    doc = utils.beautifulsoup_from_url(
      REPORT_SEARCH_URL.format(min_year, page))
    # Learn the total page count from the "Go to last page" pager link.
    last_page_link = doc.find("a", title="Go to last page")
    if last_page_link:
      href = last_page_link["href"]
      page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
      if page_match:
        last_page = int(page_match.group(1))
    results = doc.select(".view-reports-advanced-search .views-row")
    if not results:
      raise inspector.NoReportsFoundError("Department of the Interior")
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)
    page += 1
  if last_page == 0:
    # NOTE(review): this also fires when all results fit on a single page
    # (no "Go to last page" link at all) — confirm that cannot happen.
    raise Exception("Did not find last page link")
def run(options):
  """Scrape DOT OIG reports by topic and year, merging duplicate reports."""
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  # Keyed by report_id; a report appearing under several topics is stored
  # once with its topic field concatenated.
  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      body = utils.download(year_url)
      doc = BeautifulSoup(body)
      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)
      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
          if report_id in all_reports:
            # Merge this topic into the already-seen report.
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)
def get_content(url):
  """Download *url* and return its ".content-left" elements.

  Raises inspector.NoReportsFoundError when the selector matches nothing.
  """
  soup = BeautifulSoup(utils.download(url))
  matches = soup.select(".content-left")
  if not matches:
    raise inspector.NoReportsFoundError("DOJ (%s)" % url)
  return matches
def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
  """Scrape one HHS OIG subtopic page, trying several known page layouts."""
  doc = beautifulsoup_from_url(subtopic_url)
  if not doc:
    raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

  results = None

  # This URL is different than the rest and needs to find the "p > a"s first.
  if subtopic_url == TOPIC_TO_URL['TMPC']:
    results = doc.select("#leftContentInterior > p > a")
  # Fall through increasingly general selectors until one matches.
  if not results:
    results = doc.select("#leftContentInterior dl dd")
  if not results:
    results = doc.select("#leftContentInterior ul li")
  if not results:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)
  for result in results:
    # Skip entries nested inside 'crossref' or 'related' containers.
    if 'crossref' in result.parent.parent.attrs.get('class', []):
      continue
    if result.parent.parent.attrs.get('id') == 'related':
      continue
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      deduplicate_save_report(report)
def run(self, options):
  """Scrape Department of Energy report listings for the configured range."""
  self.options = options
  self.year_range = inspector.year_range(self.options, archive)
  # Date bounds derived from the year range, used elsewhere on the instance.
  self.first_date = datetime.datetime(self.year_range[0], 1, 1)
  self.last_date = datetime.datetime(self.year_range[-1], 12, 31)

  for url in self.urls_for():
    page = utils.beautifulsoup_from_url(url)
    # The listing markup varies; fall back through progressively broader
    # selectors until one matches.
    nodes = page.select('.energy-listing__results .node')
    if not nodes:
      nodes = page.select('.field-items .node')
    if not nodes:
      nodes = page.select('.node')
    if not nodes:
      raise inspector.NoReportsFoundError(
        "Department of Energy (%s)" % url)
    for node in nodes:
      report = self.report_from(node)
      if report:
        inspector.save_report(report)
      else:
        # Empty report indicates a report out of the date range, or not the ID.
        continue
def run(options):
  """Scrape FLRA OIG reports, de-duplicating by (report_id, url)."""
  year_range = inspector.year_range(options, archive)
  # Tracks (report_id, url) pairs already saved, so a report listed on
  # multiple pages is only saved once.
  keys = set()

  # Pull the reports
  for report_type, url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select("section#content ul li")
    if results:
      # List-style layout.
      for result in results:
        report = report_from_list(result, url, report_type, year_range)
        if report:
          if report["url"]:
            # URL-decode so differently-escaped duplicates compare equal.
            key = (report["report_id"], unquote(report["url"]))
          else:
            key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)
    else:
      # Paragraph-style layout.
      results = doc.select("section#content p")
      if not results:
        raise inspector.NoReportsFoundError("Federal Labor Relations Authority (%s)" % report_type)
      for result in results:
        report = report_from_paragraph(result, url, report_type, year_range)
        if report:
          key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)