def other_report_from(result, year_range):
  link = result.find("a")
  report_id = inspector.sanitize(clean_text("-".join(
    link.text.replace("/", "-").replace("'", "").replace(":", "").split())))
  report_id = re.sub('--*', '-', report_id)
  report_url = urljoin(OTHER_REPORTS_URL, link.get('href'))
  match = OTHER_REPORT_RE.match(inspector.sanitize(clean_text(link.text)))
  title = match.group(1)
  published_on_text = match.group(2)
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "ncua",
    'inspector_url': "http://www.ncua.gov/about/Leadership/Pages/page_oig.aspx",
    'agency': "ncua",
    'agency_name': "National Credit Union Administration",
    'type': "other",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
def semiannual_report_from(result, year_range):
  link = result.find("a")
  report_url = link.get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)
  title = "Semiannual report - {}".format(link.text.strip())
  link_text = inspector.sanitize(link.text)
  published_on_text = link_text.split("-")[-1].strip().replace(".pdf", "")
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'nlrb',
    'inspector_url': "https://www.nlrb.gov/who-we-are/inspector-general",
    'agency': 'nlrb',
    'agency_name': "National Labor Relations Board",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
def fetch_from_landing_page(self, landing_url):
  """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
  unreleased = False
  page = BeautifulSoup(utils.download(landing_url))

  summary = None
  field_items = page.select('.field-items')
  if field_items:
    text = [node.strip() for node in field_items[0].findAll(text=True)]
    summary = '\n\n'.join(text).strip()
  if not summary:
    logging.info('\tno summary text found')
  else:
    # sanitize now instead of later, to compare to regexes
    summary = inspector.sanitize(summary)

  if (summary and (RE_NOT_AVAILABLE.search(summary)
                   or RE_NOT_AVAILABLE_2.search(summary)
                   or RE_NOT_AVAILABLE_3.search(summary)
                   or RE_NOT_AVAILABLE_4.search(summary)
                   or RE_WITHDRAWN.search(summary)
                   or RE_CLASSIFIED.search(summary))):
    unreleased = True

  report_url = None
  pdf_link = page.select('.file a')
  if not pdf_link:
    logging.warn('No pdf link found on page: {0}'.format(landing_url))
  else:
    report_url = pdf_link[0]['href']

  return report_url, summary, unreleased
def fetch_from_landing_page(self, landing_url):
  """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
  unreleased = False
  page = utils.beautifulsoup_from_url(landing_url)

  summary = None
  field_items = page.select('.field-items')
  if field_items:
    text = [node.strip() for node in field_items[0].findAll(text=True)]
    summary = '\n\n'.join(text).strip()
  if not summary:
    logging.info('\tno summary text found')
  else:
    # sanitize now instead of later, to compare to regexes
    summary = inspector.sanitize(summary)

  if (summary and (RE_NOT_AVAILABLE.search(summary)
                   or RE_NOT_AVAILABLE_2.search(summary)
                   or RE_NOT_AVAILABLE_3.search(summary)
                   or RE_NOT_AVAILABLE_4.search(summary)
                   or RE_WITHDRAWN.search(summary)
                   or RE_CLASSIFIED.search(summary))):
    unreleased = True

  report_url = None
  pdf_link = page.select('.field-name-field-download-files a')
  if not pdf_link:
    logging.warn('No pdf link found on page: {0}'.format(landing_url))
  else:
    report_url = pdf_link[0]['href']

  return report_url, summary, unreleased
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
  headers = set([a.parent for a in
                 doc.find_all("a", id=re.compile("^[0-9]{4}$"))])
  headers.update(doc.find_all("p", class_="Ptitle1"))
  headers = sorted(headers, key=lambda p: int(p.text.strip()), reverse=True)
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")
    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)
def parse_year_accordion(content, landing_url, report_type, year_range):
  accordions = content.select("div.accordion-group")
  if not accordions:
    raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                        landing_url)
  for accordion in accordions:
    heading = accordion.select("div.accordion-heading")[0]
    year_text = inspector.sanitize(heading.text)
    body = accordion.select("div.accordion-body div.accordion-inner")[0]
    if year_text == "FY1995" and body.text.strip() == "FY1995":
      continue
    results = [a for a in body.find_all("a") if a.text.strip()]
    if not results:
      raise inspector.NoReportsFoundError(
        "Legal Services Corporation (%s)" % landing_url)
    for result in results:
      report = report_from(result, landing_url, report_type, year_range)
      if report:
        inspector.save_report(report)
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL))
  headers = doc.select("p.Ptitle1")
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")
    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
  # Ignore links to other subsections
  if result.get('class') and result['class'][0] == 'crossref':
    return

  if result.name == 'a':
    # Sometimes we already have a link
    result_link = result
  else:
    result_link = result.find("a")

  # No link found, this is probably just an extra <li> on the page.
  if result_link is None:
    return

  # If this is just an anchor link on the same page, skip
  if not strip_url_fragment(result_link['href']):
    return

  title = result_link.text
  title = title.replace("\xe2\x80\x93", "-")
  title = inspector.sanitize(title)
  title = re.sub('\s+', ' ', title)
  if title in TITLE_NORMALIZATION:
    title = TITLE_NORMALIZATION[title]

  if title in BLACKLIST_TITLES:
    return

  report_url = urljoin(subtopic_url, result_link['href']).strip()

  if report_url in REPORT_URL_MAPPING:
    report_url = REPORT_URL_MAPPING[report_url]

  # Fix copy-paste error in link
  if (title == "Medicare Compliance Review of Altru Hospital for "
      "2012 and 2013" and
      report_url == "http://oig.hhs.gov/oas/reports/region4/41408036.asp"):
    report_url = "http://oig.hhs.gov/oas/reports/region7/71505070.asp"

  # Ignore reports from other sites
  if BASE_URL not in report_url:
    return

  if report_url in BLACKLIST_REPORT_URLS:
    return

  if report_url in OEI_COMBINED_LANDING_PAGES:
    report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  if report_filename == "11302505.pdf":
    report_id = report_id + "_early_alert"

  # Try a quick check from the listing page to see if we can bail out based on
  # the year
  try:
    published_on_text = result.find_previous("dt").text.strip()
    published_on = datetime.datetime.strptime(published_on_text, "%m-%d-%Y")
  except (AttributeError, ValueError):
    published_on = None

  if published_on and published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # This report is listed twice, once with the wrong date
  if published_on and published_on.year == 2012 and published_on.month == 1 \
      and published_on.day == 12 and report_id == "20901002":
    return

  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  else:
    # Process reports with landing pages
    if extension.lower() != '.pdf':
      report_url, published_on = report_from_landing_url(report_url)
    else:
      published_on = published_on_from_inline_link(
        result,
        report_filename,
        title,
        report_id,
        report_url,
      )

  if not published_on:
    admin.log_no_date("hhs", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  result = {
    'inspector': 'hhs',
    'inspector_url': 'http://oig.hhs.gov',
    'agency': 'hhs',
    'agency_name': 'Health & Human Services',
    'report_id': report_id,
    'topic': topic.strip(),
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if subtopic:
    result['subtopic'] = subtopic
  return result
def report_from(result, year_range):
  # walk backwards through the doc to find the header title
  for element in result.previous_elements:
    if element and \
        isinstance(element, Tag) and \
        element.name == "span" and \
        element.has_attr("class") and \
        "collapseomatic" in element["class"]:
      header = element.text.strip().lower()
      break
  else:
    raise Exception("Couldn't find the header for %s" % result)

  if header.startswith("inspection"):
    category = "inspection"
  elif header.startswith("semiannual"):
    category = "semiannual_report"
  else:
    category = "other"

  report_id = os.path.splitext(os.path.basename(result['href']))[0]
  report_url = urljoin(REPORTS_URL, result['href'].strip())
  title = inspector.sanitize(result.text)

  # Each financial/performance report is linked twice, once for the IG's
  # transmittal letter and independent auditor's report, and once for
  # the IG's "Perspective on Management and Performance Challenges."
  # Skip the first one and save the second
  if "IG's Transmittal Letter and Independent Auditor's Report" in title \
      and "(pages" in title:
    return None
  elif title == "Hotline Poster":
    return None

  published_on = REPORT_PUBLISHED_MAPPING.get(title)
  if not published_on:
    published_on = REPORT_PUBLISHED_MAPPING.get(report_id)
  if not published_on:
    date_match = DATE_RE.match(title)
    if date_match:
      published_on = datetime.datetime.strptime(date_match.group(1), "%Y.%m")
      if date_match.lastindex == 2:
        title = date_match.group(2)
      elif header.startswith("semiannual"):
        title = published_on.strftime("Semiannual Report to Congress, %B %Y")
      else:
        raise Exception("No good title for %s" % report_id)

  if not published_on:
    admin.log_no_date("denali", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "denali",
    'inspector_url': "http://www.oig.denali.gov",
    'agency': "denali",
    'agency_name': "Denali Commission",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': category,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
def report_from(result, year_range, report_type, title_prefix=None):
  report_url = urljoin(REPORTS_URL, result.select("a")[-1].get("href"))

  # Temporary hacks to account for link mistakes
  if report_url == "http://www.fec.gov/fecig/documents/Semi14a_000.pdf":
    report_url = "http://www.fec.gov/fecig/documents/Semi14a.pdf"
  if report_url == "http://www.fec.gov/fecig/documents/ReviewofOutstanding" \
      "RecommendationsasofJune2014_001.pdf":
    report_url = "http://www.fec.gov/general/documents/ReviewofOutstanding" \
      "RecommendationsasofJune2014.pdf"

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  published_on = None
  if report_url.endswith(".pdf"):
    # Inline report
    title = inspector.sanitize(result.contents[0].strip().rstrip("-"))
    title = re.sub("\\s+", " ", title)
    if title.endswith((" 200", " 201")):
      # some years are split up by a <span> tag
      title = title + result.contents[1].text
  else:
    # Some pages have separate landing pages.
    doc = utils.beautifulsoup_from_url(report_url)
    title = doc.select("h3")[1].text.strip()
    try:
      published_on_text = doc.select("h3")[2].text.strip()
    except IndexError:
      published_on_text = doc.select("h3")[1].text.strip()
    published_on_text = published_on_text.replace("Period ending ", "")
    published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if title == "November 2016" and report_url == "http://www.fec.gov/fecig/documents/OIGSemiannualReporttoCongress-May2016-FinalPublicDistribution.pdf":
    # Fix copy-paste error
    report_url = "http://www.fec.gov/fecig/documents/OIGFall2016SARFINAL.pdf"
    report_filename = report_url.split("/")[-1]
    report_id, extension = os.path.splitext(report_filename)

  if not published_on:
    if report_id in REPORT_PUBLISHED_MAPPING:
      published_on = REPORT_PUBLISHED_MAPPING[report_id]
  if not published_on:
    try:
      published_on_text = "-".join(re.search('(\w+)\s+(\d{4})', title).groups())
      published_on = datetime.datetime.strptime(published_on_text, '%B-%Y')
    except (ValueError, AttributeError):
      pass

  if title_prefix:
    title = "{}{}".format(title_prefix, title)

  if not published_on:
    admin.log_no_date("fec", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "fec",
    'inspector_url': "http://www.fec.gov/fecig/fecig.shtml",
    'agency': "fec",
    'agency_name': "Federal Election Commission",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }
  return report
def audit_report_from(result, landing_url, year, year_range):
  if not result.text.strip():
    return

  link = result.find("a")
  report_url = urljoin(landing_url, link['href'])
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  try:
    title = result.select("blockquote")[0].contents[0]
  except IndexError:
    title = result.text

  title_prefixer = re.compile(
    "(Advisory|Management|Audit)\\s*(Letter|Report)\\s*[\\d\\-]+:\\s*",
    re.I)
  title = title_prefixer.sub("", title)

  estimated_date = False
  published_on = None

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  cleaned_text = re.sub("\s+", " ", inspector.sanitize(result.text))

  if not published_on:
    try:
      published_on_text = re.search('(\w+ \d+, \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      published_on_text = re.search('(\w+ \d+ , \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d , %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      response = utils.scraper.request(method="HEAD", url=report_url)
      last_modified = response.headers["Last-Modified"]
      published_on = datetime.datetime.strptime(
        last_modified, "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
      pass

  if not published_on:
    admin.log_no_date("archives", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'archives',
    'inspector_url': 'https://www.archives.gov/oig/',
    'agency': 'archives',
    'agency_name': 'National Archives and Records Administration',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'audit',
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report
def report_from(result, report_type, year_range):
  tds = result.select("td")
  if len(tds) > 0:
    title = inspector.sanitize(tds[0].text)
  else:
    return
  if (not title) or (title in HEADER_ROW_TEXT):
    # Skip the header rows
    return

  published_on_text = tds[2].text
  try:
    published_on = datetime.datetime.strptime(published_on_text, '%m/%d/%Y')
  except ValueError:
    published_on = datetime.datetime.strptime(published_on_text, '%m/%Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  unreleased = False
  link = result.find("a")
  landing_url = urljoin(BASE_REPORT_URL, link.get('href'))
  if landing_url.endswith(".pdf"):
    # Inline report
    report_url = landing_url
    landing_url = None
    summary = None
  else:
    landing_page = utils.beautifulsoup_from_url(landing_url)
    summary = " ".join(landing_page.select("div.holder")[0].text.split())
    report_link = landing_page.find("a", href=PDF_REGEX)
    if report_link:
      report_url = urljoin(landing_url, report_link.get('href'))
    else:
      unreleased = True
      report_url = None

  report_id = tds[1].text.strip().replace("/", "-").replace(" ", "-")
  if report_id == "N-A":
    report_id = tds[0].text.strip().replace("/", "-").replace(" ", "-")
  if report_id == "":
    if report_url:
      report_id = os.path.splitext(os.path.basename(report_url))[0]
    else:
      report_id = os.path.splitext(os.path.basename(landing_url))[0]

  if report_url:
    # OIG MAR-2012-10/PA-12-87 is posted under both Audits/Evaluations/MARs and
    # Congressional Requests.
    if report_url in saved_report_urls:
      return
    saved_report_urls.add(report_url)

  report = {
    'inspector': "pbgc",
    'inspector_url': "http://oig.pbgc.gov",
    'agency': "pbgc",
    'agency_name': "Pension Benefit Guaranty Corporation",
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if summary:
    report['summary'] = summary
  if unreleased:
    report['unreleased'] = unreleased
  if landing_url:
    report['landing_url'] = landing_url
  return report
def report_from(result, base_url):
  report = {
    'inspector': 'gsa',
    'inspector_url': 'https://www.gsaig.gov/',
    'agency': 'gsa',
    'agency_name': 'General Services Administration'
  }

  title_h4 = result.find("div", property="dc:title").h4
  title = inspector.sanitize(title_h4.text)
  if title_h4.a:
    report['landing_url'] = urljoin(base_url, title_h4.a["href"])
  else:
    report['landing_url'] = base_url

  description = result.find("div", class_="field-name-field-description")
  if description:
    report['summary'] = inspector.sanitize(description.text)

  unreleased = False
  url = None
  file_section = result.find("span", class_="file")
  if file_section:
    file_links = file_section.find_all("a")
    if len(file_links) > 1:
      raise Exception("Multiple file links for %s" % title)
    link = file_links[0]
    url = link.get('href')
    url = urljoin(base_url, url)
    if url == "https://www.gsaig.gov/sites/default/files/recovery-reports/FINAL%20TESTIMONY%20FOR%20APRIL%2021.pdf":
      # This testimony is also posted in the testimony section, so we can skip
      # the one posted under recovery reports
      return
    report_id = os.path.splitext(os.path.basename(unquote_plus(url)))[0]
    report_id = re.sub('[-/\\s]+', '-', inspector.sanitize(report_id))
  else:
    unreleased = report['unreleased'] = True
    report_id = re.sub('[-/\\s]+', '-', inspector.sanitize(title))

  published_date_div = result.find("div", class_="field-name-post-date")
  if published_date_div:
    published_date = published_date_div.text
    date = datetime.strptime(published_date, "%B %d, %Y")
  else:
    # get last match
    match = None
    for match in DATE_RE.finditer(title):
      pass
    published_date = match.group(0)
    date = datetime.strptime(published_date, "%B %d, %Y")

  report_type = type_for(base_url)
  report['type'] = report_type
  report['published_on'] = datetime.strftime(date, "%Y-%m-%d")

  if not unreleased:
    report['url'] = url
    if url.lower().endswith(".pdf"):
      report['file_type'] = "pdf"
    elif url.lower().endswith(".doc"):
      report['file_type'] = "doc"
    elif url.lower().endswith(".xls"):
      report['file_type'] = "xls"
    elif url.lower().endswith(".ppt"):
      report['file_type'] = "ppt"
    else:
      raise Exception("Unexpected filetype for %s" % url)

  report['report_id'] = report_id
  report['title'] = title.strip()
  return report
def report_from(result, landing_url, report_type, year_range):
  if not result.text or result.text in BLACKLIST_REPORT_TITLES:
    # There are a few empty links due to bad html and some links for
    # alternative formats (PDF) that we will just ignore.
    return

  link_text = None
  if result.name == 'a':
    report_url = result.get('href')
    link_text = inspector.sanitize(result.text)
    title = inspector.sanitize("%s %s" % (result.text, result.next_sibling))
  else:
    links = [link for link in result.find_all('a') if link.text.strip()]
    report_url = links[0].get('href')
    link_text = inspector.sanitize(result.a.text)
    title = inspector.sanitize(result.text)
  report_url = urljoin(landing_url, report_url)
  report_filename = os.path.basename(report_url)

  if title.endswith("PDF"):
    title = title[:-3]
  title = title.rstrip(" .")

  prev = result.previous_sibling
  if isinstance(prev, NavigableString) and "See, also:" in prev:
    return None

  report_no_match = REPORT_NO_RE.match(link_text)
  if report_no_match:
    report_id = report_no_match.group(0)
    if "fraud" in report_url.lower():
      report_id = "fraud-alert-" + report_id
    elif "Client_Trust_Fund" in report_url:
      report_id = "CTF-" + report_id
    elif report_filename.startswith("sr"):
      report_id = "special-report-" + report_id
  else:
    report_id, _ = os.path.splitext(report_filename)
    report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("\\", "")  # strip backslashes

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  elif link_text == "June 2015":
    published_on = datetime.datetime(2015, 6, 1)
  else:
    published_on_text = None
    try:
      published_on_text = re.search('(\d+/\d+/\d+)', title).groups()[0]
    except AttributeError:
      pass
    if not published_on_text:
      try:
        published_on_text = re.search('(\w+ \d+, \d+)', title).groups()[0]
      except AttributeError:
        pass
    if not published_on_text:
      try:
        published_on_text = re.search('(\d+/\d+)', title).groups()[0]
      except AttributeError:
        pass
    if not published_on_text:
      admin.log_no_date("lsc", report_id, title, report_url)
      return

  if not published_on:
    datetime_formats = [
      '%B %d, %Y',
      '%m/%d/%Y',
      '%m/%d/%y',
      '%m/%Y',
      '%m/%y'
    ]
    for datetime_format in datetime_formats:
      try:
        published_on = datetime.datetime.strptime(published_on_text,
                                                  datetime_format)
      except ValueError:
        pass
      else:
        break

  if not published_on:
    admin.log_no_date("lsc", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'lsc',
    'inspector_url': 'https://www.oig.lsc.gov',
    'agency': 'lsc',
    'agency_name': 'Legal Services Corporation',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if report_url in ("https://www.oig.lsc.gov/core-legal-services"):
    report['file_type'] = "html"
  if report_url.startswith("https://oig.lsc.gov/mapping/references/eval"):
    report['unreleased'] = True
    report['missing'] = True
  return report
def report_from_list(result, landing_url, report_type, year_range):
  missing = False
  title = re.sub("\\s+", " ", inspector.sanitize(result.text))
  report_id = None
  report_id_match = REPORT_ID_RE_1.search(title)
  if report_id_match:
    report_id = report_id_match.group(1) or report_id_match.group(2)

  if 'Non-Public Report' in title:
    unreleased = True
    report_url = None
    if report_id in ("ER-11-01", "ER-12-01", "ER-13-01", "ER-14-01",
                     "ER-15-01", "ER-16-01", "ER-17-01"):
      # These reports are listed in two places, once with a PDF, once without
      return
    if not report_id:
      report_id = "-".join(title.split())
      report_id = report_id.replace(":", "")
  else:
    unreleased = False
    link = result.find("a")
    if not link:
      return None
    # Some reports have incorrect relative paths
    relative_report_url = link.get('href').replace("../", "")
    report_url = urljoin(landing_url, relative_report_url)
    if report_url == "https://www.flra.gov/system/files/webfm/Inspector%20General/FLRA%20IPERA%20Compliance%202011.pdf" \
        and report_id == "ER-12-02":
      report_url = "https://www.flra.gov/system/files/webfm/Inspector%20General/IPERA%20March%202012.pdf"
    if not report_id:
      report_filename = report_url.split("/")[-1]
      report_id, _ = os.path.splitext(report_filename)
      report_id = "-".join(unquote(report_id).split())

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  if not published_on:
    try:
      published_on = datetime.datetime.strptime(title, '%B %Y')
    except ValueError:
      pass
  if not published_on:
    admin.log_no_date("flra", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  if published_on.year <= 2011 and not unreleased and not report_url:
    # Some older reports aren't posted
    unreleased = True
    missing = True

  report = {
    'inspector': 'flra',
    'inspector_url': 'https://www.flra.gov/components-offices/offices/office-inspector-general',
    'agency': 'flra',
    'agency_name': 'Federal Labor Relations Authority',
    'file_type': 'pdf',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if unreleased:
    report['unreleased'] = unreleased
    report['landing_url'] = landing_url
  if missing:
    report['missing'] = missing
  return report
def clean_text(text):
  return re.sub("[ \n]+", " ", inspector.sanitize(text))
def audit_report_from(result, landing_url, year, year_range):
  link = result.find("a")
  report_url = urljoin(landing_url, link.get('href'))
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  try:
    title = result.select("blockquote")[0].contents[0]
  except IndexError:
    title = result.text

  title_prefixer = re.compile(
    "(Advisory|Management|Audit)\\s*(Letter|Report)\\s*[\\d\\-]+:\\s*", re.I)
  title = title_prefixer.sub("", title)

  estimated_date = False
  published_on = None

  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]

  cleaned_text = re.sub("\s+", " ", inspector.sanitize(result.text))

  if not published_on:
    try:
      published_on_text = re.search('(\w+ \d+, \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      published_on_text = re.search('(\w+ \d+ , \d+)', cleaned_text).groups()[0]
      published_on = datetime.datetime.strptime(published_on_text, '%B %d , %Y')
    except AttributeError:
      pass

  if not published_on:
    try:
      response = utils.scraper.request(method="HEAD", url=report_url)
      last_modified = response.headers["Last-Modified"]
      published_on = datetime.datetime.strptime(last_modified,
                                                "%a, %d %b %Y %H:%M:%S %Z")
    except ValueError:
      pass

  if not published_on:
    admin.log_no_date("archives", report_id, title, report_url)
    return

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'archives',
    'inspector_url': 'https://www.archives.gov/oig/',
    'agency': 'archives',
    'agency_name': 'National Archives and Records Administration',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': 'audit',
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report
def report_from(result, landing_url, report_type, year_range, year=None):
  if not result.text or result.text in BLACKLIST_REPORT_TITLES:
    # There are a few empty links due to bad html and some links for
    # alternative formats (PDF) that we will just ignore.
    return

  link_text = None
  if result.name == 'a':
    report_url = result.get('href')
    link_text = inspector.sanitize(result.text)
    title = inspector.sanitize("%s %s" % (result.text, result.next_sibling))
  else:
    links = [link for link in result.find_all('a') if link.text.strip()]
    report_url = links[0].get('href')
    link_text = inspector.sanitize(result.a.text)
    title = inspector.sanitize(result.text)
  report_url = urljoin(landing_url, report_url)
  report_filename = os.path.basename(report_url)

  prev = result.previous_sibling
  if isinstance(prev, NavigableString) and "See, also:" in prev:
    return None

  report_no_match = REPORT_NO_RE.match(link_text)
  if report_no_match:
    report_id = report_no_match.group(0)
    if "fraud" in report_url.lower():
      report_id = "fraud-alert-" + report_id
    elif "Client_Trust_Fund" in report_url:
      report_id = "CTF-" + report_id
    elif report_filename.startswith("sr"):
      report_id = "special-report-" + report_id
  else:
    report_id, _ = os.path.splitext(report_filename)
    report_id = unquote(report_id)
  report_id = "-".join(report_id.split())
  report_id = report_id.replace("\\", "")  # strip backslashes

  estimated_date = False
  published_on = None
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  elif link_text == "June 2015":
    published_on = datetime.datetime(2015, 6, 1)
  else:
    try:
      published_on_text = re.search('(\d+/\d+/\d+)', title).groups()[0]
    except AttributeError:
      try:
        published_on_text = re.search('(\w+ \d+, \d+)', title).groups()[0]
      except AttributeError:
        try:
          published_on_text = re.search('(\d+/\d+)', title).groups()[0]
        except AttributeError:
          if year is None:
            raise Exception(
              "No date or year was detected for %s (%s)" % (report_id, title))
          # Since we only have the year, set this to Nov 1st of that year
          published_on = datetime.datetime(year, 11, 1)
          estimated_date = True

  if not published_on:
    datetime_formats = [
      '%B %d, %Y',
      '%m/%d/%Y',
      '%m/%d/%y',
      '%m/%Y',
      '%m/%y'
    ]
    for datetime_format in datetime_formats:
      try:
        published_on = datetime.datetime.strptime(
          published_on_text, datetime_format)
      except ValueError:
        pass
      else:
        break

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'lsc',
    'inspector_url': 'https://www.oig.lsc.gov',
    'agency': 'lsc',
    'agency_name': 'Legal Services Corporation',
    'type': report_type,
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if estimated_date:
    report['estimated_date'] = estimated_date
  if report_url in ("https://www.oig.lsc.gov/core-legal-services"):
    report['file_type'] = "html"
  return report
def report_from(result, year_range):
  # walk backwards through the doc to find the header title
  for element in result.previous_elements:
    if element and \
        isinstance(element, Tag) and \
        element.name == "span" and \
        element.has_attr("class") and \
        "collapseomatic" in element["class"]:
      header = element.text.strip().lower()
      break
  else:
    raise Exception("Couldn't find the header for %s" % result)

  if header.startswith("inspection"):
    category = "inspection"
  elif header.startswith("semiannual"):
    category = "semiannual_report"
  else:
    category = "other"

  report_id = os.path.splitext(os.path.basename(result['href']))[0]
  report_url = urljoin(REPORTS_URL, result['href'])
  title = inspector.sanitize(result.text)

  # Each financial/performance report is linked twice, once for the IG's
  # transmittal letter and independent auditor's report, and once for
  # the IG's "Perspective on Management and Performance Challenges."
  # Skip the first one and save the second
  if "IG's Transmittal Letter and Independent Auditor's Report" in title \
      and "(pages" in title:
    return None
  elif title == "Hotline Poster":
    return None

  published_on = REPORT_PUBLISHED_MAPPING.get(title)
  if not published_on:
    published_on = REPORT_PUBLISHED_MAPPING.get(report_id)
  if not published_on:
    date_match = DATE_RE.match(title)
    if date_match:
      published_on = datetime.datetime.strptime(date_match.group(1), "%Y.%m")
      if date_match.lastindex == 2:
        title = date_match.group(2)
      elif header.startswith("semiannual"):
        title = published_on.strftime(
          "Semiannual Report to Congress, %B %Y")
      else:
        raise Exception("No good title for %s" % report_id)

  if not published_on:
    raise Exception("Couldn't find date: %s, %s" % (title, report_id))

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': "denali",
    'inspector_url': "http://www.oig.denali.gov",
    'agency': "denali",
    'agency_name': "Denali Commission",
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': category,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
def report_from(result, reports_page, report_type, year_range):
  unreleased = False
  summary = None
  landing_url = None
  estimated_date = False

  # audits have some data, but link to landing page for summary and URL
  if report_type == "audit":
    landing_a = result.select(".cell3 a")[0]
    landing_url = urljoin(reports_page, landing_a['href'])
    long_title = landing_a.text.strip()

    # https://www.cncsoig.gov/news-entry/97-09 and
    # https://www.cncsoig.gov/news-entry/97-09-0 are duplicates of each other
    if landing_url == "https://www.cncsoig.gov/news-entry/97-09-0":
      return

    # PDF URL and summary are on the report's landing page
    report_url, summary, title = extract_from_release_page(landing_url)
    if not report_url:
      unreleased = True
    if not title:
      title = long_title

    # the report PDF URL can be pulled from the comments
    # we're ignoring this since we're going to the landing page anyhow.
    # re.search("href=\"(/sites/default/files/.*?)\">GO", str(result))

    report_id = result.select(".cell1")[0].text.strip()
    stamp = result.select(".cell2")[0].text.strip()
    published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")

  elif report_type == "investigation":
    stamp = result.select(".cell2")[0].text.strip()
    published_on = datetime.datetime.strptime(stamp, "%Y-%m-%d")

    title = result.select(".cell3 p")[0].text.strip()
    report_url = result.select(".cell3 a")[0]['href']
    report_url = urljoin(reports_page, report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

  elif report_type == "semiannual_report":
    report_url = result.select(".cell4 a")[0]['href']
    report_url = urljoin(reports_page, report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    stamps = result.select(".cell2")[0].text.strip().split()
    # the agency can mess up the date order
    if stamps[2] == "09.30.2013":
      stamp = stamps[0]
    else:
      stamp = stamps[2]
    published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")
    title = str.join(" ", stamps)

  elif report_type == "case":
    report_type = "investigation"
    title = result.select("div")[0].text

    id_text = None
    summary = ""
    for p in result.select("p"):
      text = inspector.sanitize(p.text.strip())
      summary += text + "\n\n"
      if text.lower().strip("-").strip().startswith(("case id", "case d")):
        id_text = text
    summary = summary.strip()
    if not id_text:
      for div in result.select("div"):
        text = inspector.sanitize(div.text.strip())
        if text.lower().strip("-").strip().startswith(("case id", "case d")):
          id_text = text
    if not id_text:
      match = re.search("Case I?D: ([0-9]{4}-[0-9]{3})", title)
      if match:
        id_text = match.group(1)
    if not id_text:
      raise Exception("Could not find Case ID for an investigation\n%s"
                      % result.text)

    # note that some cases have more than one id. We are taking only the
    # last id.
    report_id = re.sub(r'\([^)]*\)', '', id_text).strip().split(" ")[-1]
    landing_url = reports_page
    unreleased = True
    report_url = None

    date_match = DATE_RE.match(result.text.replace(title, "").strip())
    if date_match:
      published_on = datetime.datetime.strptime(date_match.group(0),
                                                "%Y-%m-%d")
    else:
      year = int(report_id.replace("\u2010", "-").split("-")[0])
      published_on = datetime.date(year, 1, 1)
      estimated_date = True

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'cncs',
    'inspector_url': 'https://www.cncsoig.gov',
    'agency': 'cncs',
    'agency_name': 'Corporation for National and Community Service',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }
  if unreleased:
    report['unreleased'] = True
  if summary:
    report['summary'] = summary
  if landing_url:
    report['landing_url'] = landing_url
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report
def report_from(result, reports_page, report_type, year_range):
  unreleased = False
  summary = None
  landing_url = None
  estimated_date = False

  # audits have some data, but link to landing page for summary and URL
  if report_type == "audit":
    landing_a = result.select(".cell3 a")[0]
    landing_url = urljoin(reports_page, landing_a['href'])
    long_title = landing_a.text.strip()

    # https://www.cncsoig.gov/news-entry/97-09 and
    # https://www.cncsoig.gov/news-entry/97-09-0 are duplicates of each other
    if landing_url == "https://www.cncsoig.gov/news-entry/97-09-0":
      return

    # PDF URL and summary are on the report's landing page
    report_url, summary, title = extract_from_release_page(landing_url)
    if not report_url:
      unreleased = True
    if not title:
      title = long_title

    # the report PDF URL can be pulled from the comments
    # we're ignoring this since we're going to the landing page anyhow.
    # re.search("href=\"(/sites/default/files/.*?)\">GO", str(result))

    report_id = result.select(".cell1")[0].text.strip()
    stamp = result.select(".cell2")[0].text.strip()
    published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")

  elif report_type == "investigation":
    stamp = result.select(".cell2")[0].text.strip()
    published_on = datetime.datetime.strptime(stamp, "%Y-%m-%d")

    title = result.select(".cell3 p")[0].text.strip()
    report_url = result.select(".cell3 a")[0]['href']
    report_url = urljoin(reports_page, report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

  elif report_type == "semiannual_report":
    report_url = result.select(".cell4 a")[0]['href']
    report_url = urljoin(reports_page, report_url)
    report_id = os.path.splitext(report_url.split("/")[-1])[0]

    stamps = result.select(".cell2")[0].text.strip().split()
    # the agency can mess up the date order
    if stamps[2] == "09.30.2013":
      stamp = stamps[0]
    else:
      stamp = stamps[2]
    published_on = datetime.datetime.strptime(stamp, "%m.%d.%Y")
    title = str.join(" ", stamps)

  elif report_type == "case":
    report_type = "investigation"
    title = result.select("div")[0].text

    id_text = None
    summary = ""
    for p in result.select("p"):
      text = inspector.sanitize(p.text.strip())
      summary += text + "\n\n"
      if text.lower().strip("-").strip().startswith("case id"):
        id_text = text
    summary = summary.strip()
    if not id_text:
      for div in result.select("div"):
        text = inspector.sanitize(div.text.strip())
        if text.lower().strip("-").strip().startswith("case id"):
          id_text = text
    if not id_text:
      raise Exception("Could not find Case ID for an investigation\n%s"
                      % result.text)

    # note that some cases have more than one id. We are taking only the
    # last id.
    report_id = re.sub(r'\([^)]*\)', '', id_text).strip().split(" ")[-1]
    landing_url = reports_page
    unreleased = True
    report_url = None

    year = int(report_id.replace("\u2010", "-").split("-")[0])
    published_on = datetime.date(year, 1, 1)
    estimated_date = True

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  report = {
    'inspector': 'cncs',
    'inspector_url': 'https://www.cncsoig.gov',
    'agency': 'cncs',
    'agency_name': 'Corporation for National and Community Service',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'type': report_type,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),  # Date of publication
  }
  if unreleased:
    report['unreleased'] = True
  if summary:
    report['summary'] = summary
  if landing_url:
    report['landing_url'] = landing_url
  if estimated_date:
    report['estimated_date'] = estimated_date
  return report
def report_from(result, year_range, topic, subtopic_url, subtopic=None):
  # Ignore links to other subsections
  if result.get('class') and result['class'][0] == 'crossref':
    return

  if result.name == 'a':
    # Sometimes we already have a link
    result_link = result
  else:
    result_link = result.find("a")

  # No link found, this is probably just an extra <li> on the page.
  if result_link is None:
    return

  # If this is just an anchor link on the same page, skip
  if not strip_url_fragment(result_link['href']):
    return

  title = result_link.text
  title = title.replace("\xe2\x80\x93", "-")
  title = inspector.sanitize(title)
  title = re.sub('\s+', ' ', title)
  if title in TITLE_NORMALIZATION:
    title = TITLE_NORMALIZATION[title]

  if title in BLACKLIST_TITLES:
    return

  report_url = urljoin(subtopic_url, result_link['href']).strip()

  if report_url in REPORT_URL_MAPPING:
    report_url = REPORT_URL_MAPPING[report_url]

  # Ignore reports from other sites
  if BASE_URL not in report_url:
    return

  if report_url in BLACKLIST_REPORT_URLS:
    return

  if report_url in OEI_COMBINED_LANDING_PAGES:
    report_url = OEI_COMBINED_LANDING_PAGES[report_url][title]

  report_filename = report_url.split("/")[-1]
  report_id, extension = os.path.splitext(report_filename)

  if report_filename == "11302505.pdf":
    report_id = report_id + "_early_alert"

  # Try a quick check from the listing page to see if we can bail out based on
  # the year
  try:
    published_on_text = result.find_previous("dt").text.strip()
    published_on = datetime.datetime.strptime(published_on_text, "%m-%d-%Y")
  except (AttributeError, ValueError):
    published_on = None

  if published_on and published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  # This report is listed twice, once with the wrong date
  if published_on and published_on.year == 2012 and published_on.month == 1 \
      and published_on.day == 12 and report_id == "20901002":
    return

  if report_id in REPORT_PUBLISHED_MAPPING:
    published_on = REPORT_PUBLISHED_MAPPING[report_id]
  else:
    # Process reports with landing pages
    if extension.lower() != '.pdf':
      report_url, published_on = report_from_landing_url(report_url)
    else:
      published_on = published_on_from_inline_link(
        result,
        report_filename,
        title,
        report_id,
        report_url,
      )

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return

  result = {
    'inspector': 'hhs',
    'inspector_url': 'http://oig.hhs.gov',
    'agency': 'hhs',
    'agency_name': 'Health & Human Services',
    'report_id': report_id,
    'topic': topic.strip(),
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  if subtopic:
    result['subtopic'] = subtopic
  return result
def remove_linebreaks(s):
  # Lots of weird tabs, etc. inside HTML strings. We would replace all at
  # once, but since utils.beautifulsoup_from_url is taking the html straight
  # to soup, we'll do it individually for the fields we need.
  return inspector.sanitize(s.replace('\n', '').replace('\t', '').replace('\r', ''))