Example #1
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  results = doc.select("#rounded-corner > tr")
  if not results:
    raise inspector.NoReportsFoundError("Federal Reserve (audit reports)")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div.style-aside ul > li > a")
  if not results:
    raise inspector.NoReportsFoundError("Federal Reserve (semiannual reports)")
  for result in results:
    report_url = urljoin(BASE_PAGE_URL, result.get('href'))
    report = semiannual_report_from(report_url, year_range)
    if report:
      inspector.save_report(report)

  # The most recent semiannual report will be embedded on the main page
  report = semiannual_report_from(SEMIANNUAL_REPORTS_URL, year_range)
  if report:
    inspector.save_report(report)
Example #2
  def get_reports_by_year(self):
    # This page contains semiannual reports, as well as a few audit reports for
    # Fiscal Year 2014, and links to sub-pages that contain links for other
    # fiscal years.
    doc = utils.beautifulsoup_from_url(REPORTS_BY_YEAR_URL)

    # Get the semiannual reports to Congress.
    self.get_semiannual_reports_to_congress(doc)

    # Reports that are 'bare' on the page, listed explicitly
    (bare_report_ul_1,
     bare_report_ul_2) = self.get_uls_past_audit_header(doc)
    self.get_bare_reports(bare_report_ul_1)
    self.get_bare_reports(bare_report_ul_2)

    # Links on the page to audit reports from past fiscal years
    link_subpage_ul = doc.select(".submenu-submenu")[0]
    for li in link_subpage_ul.find_all('li'):
      link = li.find('a')
      if link:
        next_url = urljoin(REPORTS_BY_YEAR_URL, link['href'])
        doc = utils.beautifulsoup_from_url(next_url)
        uls = self.get_uls_past_audit_header(doc)
        assert len(uls) == 1, ('Mysterious additional ul data on page: %s' %
                               next_url)
        self.get_bare_reports(uls[0])
Example #3
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(report_url)
    results = doc.select("td.mainInner div.ms-WPBody > div > ul > li")

    if not results:
      raise inspector.NoReportsFoundError("SIGTARP ({})".format(report_type))

    for result in results:
      report = report_from(result, report_type, year_range)
      if report:
        inspector.save_report(report)

  doc = utils.beautifulsoup_from_url(QUARTERLY_REPORTS_URL)
  results = doc.select("#MSOZoneCell_WebPartWPQ3 .s4-wpTopTable a")

  if not results:
    raise inspector.NoReportsFoundError("SIGTARP (quarterly reports)")

  for result in results:
    report = quarterly_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #4
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = utils.beautifulsoup_from_url(OTHER_REPORT_URL)
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #5
def run(options):
    year_range = inspector.year_range(options, archive)

    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
    results = doc.article.find_all("tr")
    if not results:
        raise inspector.NoReportsFoundError("FCC (audit reports)")
    for result in results:
        report = report_from(result, AUDIT_REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)

    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.article.find_all("tr")
    if not results:
        raise inspector.NoReportsFoundError("FCC (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, SEMIANNUAL_REPORTS_URL,
                                        year_range)
        if report:
            inspector.save_report(report)

    doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
    results = doc.article.find_all("p")
    if not results:
        raise inspector.NoReportsFoundError("FCC (other)")
    for result in results:
        report = other_report_from(result, OTHER_REPORTS_URL, year_range)
        if report:
            inspector.save_report(report)
Example #6
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports with pagination
    for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
        for page in range(0, 999):
            url = report_url_format.format(page=page)
            doc = utils.beautifulsoup_from_url(url)
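            # Canary check: if the report-number class is missing from the
            # first page, the site markup has changed.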
            if report_type == "audit" and page == 0 and not doc.select(
                    "div.views-field-field-auditreport-doc-1"):
                raise Exception("Report number CSS class has changed")
            results = doc.select("li.views-row")
            if not results:
                if page == 0:
                    raise inspector.NoReportsFoundError("USAID (%s)" %
                                                        report_type)
                else:
                    break

            for result in results:
                report = report_from(result, url, report_type, year_range)
                if report:
                    inspector.save_report(report)

    # Pull the semiannual reports (no pagination)
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("li.views-row")
    if not results:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #7
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages) - 1):
    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(page=page))
    results = doc.select("span.field-content")
    if not results:
      if page == 0:
        raise inspector.NoReportsFoundError("FHFA (audit reports)")
      else:
        # No more results, we must have hit the last page
        break

    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    if not results:
      raise inspector.NoReportsFoundError("FHFA (%s)" % report_type)
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)
Example #8
    def urls_for_topics(self, topics):
        for topic in topics:
            # Topic might be a tuple for ADDITIONAL_TOPICS (not ones from command
            # line).
            self.report_type = None
            if isinstance(topic, tuple):
                topic, report_type = topic
                self.report_type = report_type

            last_page = False

            url = TOPIC_TO_URL[topic]
            page = utils.beautifulsoup_from_url(url)
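            # Yield pages only between the pages flagged by is_first_page()
            # and is_last_page().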
            page_started = self.is_first_page(page)
            if page_started:
                yield url

            for link in page.select('li.pager-item a'):
                next_url = urljoin(url, link['href'])
                next_page = utils.beautifulsoup_from_url(next_url)
                if not page_started:
                    page_started = self.is_first_page(next_page)
                if page_started:
                    yield next_url
                last_page = self.is_last_page(next_page)
                if last_page:
                    break
            if last_page:
                continue
        self.report_type = None  # Clear this out afterwards
Example #9
  def urls_for(self):
    only = self.options.get('topics')
    if only: # if only...
      only = set(only.split(','))
      only = [(o, TOPIC_TO_REPORT_TYPE[o]) if o in TOPIC_TO_REPORT_TYPE else o
              for o in only]
      yield from self.urls_for_topics(only)
      # If there are topics selected, ONLY yield URLs for those.
      return

    # First yield the URLs for the topics that are tangential to the main
    # Calendar Year reports.
    yield from self.urls_for_topics(ADDITIONAL_TOPICS)

    # Not getting reports from specific topics, iterate over all Calendar Year
    # reports.
    page = utils.beautifulsoup_from_url(BASE_URL)

    # Iterate over each "Calendar Year XXXX" link
    for li in page.select('.field-items li'):
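      # Pull the four-digit year out of the "Calendar Year XXXX" link text.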
      md = RE_CALENDAR_YEAR.search(li.text)
      if md:
        cur_year = int(md.group(1))
        if cur_year >= self.year_range[0] and cur_year <= self.year_range[-1]:
          href = li.select('a')[0]['href']
          next_url = urljoin(BASE_URL, href)
          # The first page of reports is yielded.
          yield next_url

          # Next, read all the pagination links for the page and yield those. So
          # far, I haven't seen a page that doesn't have all of the following
          # pages enumerated.
          next_page = utils.beautifulsoup_from_url(next_url)
          for link in next_page.select('li.pager-item a'):
            yield urljoin(BASE_URL, link['href'])
Example #10
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit and inspections reports
    for report_type, reports_url in REPORT_URLS:
        doc = utils.beautifulsoup_from_url(reports_url)
        results = doc.select("div.field-item")
        if not results:
            raise inspector.NoReportsFoundError(
                "National Labor Relations Board (%s)" % report_type)
        for result in results:
            report = report_from(result, report_type, reports_url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("div.field-item")
    if not results:
        raise inspector.NoReportsFoundError(
            "National Labor Relations Board (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #11
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the general reports
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  results = doc.select("div#mainContent li.mainContenttext a")
  if not results:
    raise inspector.NoReportsFoundError("Farm Credit Administration (reports)")
  for result in results:
    report = report_from(result, REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the archive reports
  doc = utils.beautifulsoup_from_url(REPORT_ARCHIVE_URL)
  results = doc.select("div#mainContent li.mainContenttext a") + doc.select("div#mainContent span.mainContenttext a")
  if not results:
    raise inspector.NoReportsFoundError("Farm Credit Administration (archive)")
  for result in results:
    if not result.text:
      continue
    report = report_from(result, REPORT_ARCHIVE_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div#mainContent li.mainContenttext a")
  if not results:
    raise inspector.NoReportsFoundError("Farm Credit Administration (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #12
def run(options):
  year_range = inspector.year_range(options, archive)
  results_flag = False

  # Pull the audit reports
  for year in year_range:
    if year < 2002:  # The oldest page for audit reports
      continue
    if year == 2018:
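      # The most recent year's audit reports live on a separate "latest" page.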
      doc = utils.beautifulsoup_from_url(LATEST_AUDIT_REPORTS_URL)
    else:
      doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(year=year))

    if doc is None:
      # Next year's audit page may not be published yet
      continue

    results = doc.select("div.mainCenter table tr")
    if results:
      results_flag = True
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)

  if not results_flag:
    raise inspector.NoReportsFoundError("NCUA (audit reports)")

  # Pull the other reports
  doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (other)")
  for result in results:
    report = other_report_from(result, year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div#mainColumns div.mainCenter a")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the performance and strategic plans
  doc = utils.beautifulsoup_from_url(PLANS_URL)
  results = doc.select("div.mainCenter p")
  if not results:
    raise inspector.NoReportsFoundError("NCUA (performance/strategic plans)")
  for result in results:
    report = plan_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #13
def run(options):
  year_range = inspector.year_range(options, archive)
  if datetime.datetime.now().month >= 10:
    # October, November, and December fall into the next fiscal year
    # Add next year to year_range to compensate
    year_range.append(max(year_range) + 1)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range, report_type='audit')
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range, report_type='inspection')

  # Pull the congressional testimony
  doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_REPORTS_URL)
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
Example #14
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
    results = doc.select("table tr")
    if not results:
        raise inspector.NoReportsFoundError(
            "Federal Maritime Commission (audits)")
    for result in results:
        if result.th:
            # Skip the header row
            continue
        report = report_from(result,
                             AUDIT_REPORTS_URL,
                             report_type='audit',
                             year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull historical audits
    audit_year_links = doc.select("div.col-2-3 ul li a")
    for year_link in audit_year_links:
        audit_year_url = urljoin(AUDIT_REPORTS_URL, year_link.get('href'))
        doc = utils.beautifulsoup_from_url(audit_year_url)
        results = doc.select("table tr")
        if not results:
            # Grab results other than first and last (header and extra links)
            results = doc.select("div.col-2-2 ul")[1:-1]
        if not results:
            raise inspector.NoReportsFoundError(
                "Federal Maritime Commission (%s)" % audit_year_url)
        for result in results:
            if result.th:
                # Skip the header row
                continue
            report = report_from(result,
                                 AUDIT_REPORTS_URL,
                                 report_type='audit',
                                 year_range=year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("div.col-2-2 p a") + doc.select("div.col-2-2 li a")
    if not results:
        raise inspector.NoReportsFoundError(
            "Federal Maritime Commission (semiannual reports)")
    for result in results:
        report = report_from(result.parent,
                             AUDIT_REPORTS_URL,
                             report_type='semiannual_report',
                             year_range=year_range)
        if report:
            inspector.save_report(report)
Example #15
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
  results = doc.select("#inner-content tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (audit reports)")
  for result in results:
    # ignore divider lines
    if result.select("img"): continue

    report = report_from(result, report_type='audit', year_range=year_range, base_url=AUDIT_REPORTS_URL)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("#inner-content li")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (semiannual reports)")
  for result in results:
    if not result.text.strip():
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the case reports
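  # This listing requires a POST with form data rather than a plain GET.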
  response = utils.scraper.post(
    url=CASE_REPORTS_URL,
    data=CASE_REPORTS_DATA,
  )
  doc = BeautifulSoup(response.content, "lxml")
  results = doc.select("#inner-content tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (case reports)")
  for index, result in enumerate(results):
    if not index or not result.text.strip():  # Skip the header row and empty rows
      continue
    report = case_report_from(result, CASE_REPORTS_URL, year_range)
    if report:
      inspector.save_report(report)

  # Pull the testimony
  doc = utils.beautifulsoup_from_url(TESTIMONY_REPORTS_URL)
  results = doc.select("#inner-content tr")
  if not results:
    raise inspector.NoReportsFoundError("National Science Foundation (testimony)")
  for result in results:
    if not result.text.strip():
      continue
    report = report_from(result, report_type='testimony', year_range=year_range, base_url=TESTIMONY_REPORTS_URL)
    if report:
      inspector.save_report(report)
Example #16
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 1998:  # The earliest year for audit reports
            continue
        year_url = AUDIT_REPORTS_URL.format(year=year)
        doc = utils.beautifulsoup_from_url(year_url)
        results = doc.select("tr")
        if not results:
            raise inspector.NoReportsFoundError(
                "Pension Benefit Guaranty Corporation (audit reports)")
        for result in results:
            report = report_from(result,
                                 report_type='audit',
                                 year_range=year_range)
            if report:
                inspector.save_report(report)

    # Pull the congressional requests
    doc = utils.beautifulsoup_from_url(CONGRESSIONAL_REQUESTS_URL)
    results = doc.select("tr")
    if not results:
        raise inspector.NoReportsFoundError(
            "Pension Benefit Guaranty Corporation (congressional requests)")
    for result in results:
        report = report_from(result,
                             report_type='congress',
                             year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the semiannual reports
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("div.holder a")
    if not results:
        raise inspector.NoReportsFoundError(
            "Pension Benefit Guaranty Corporation (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the congressional testimony
    doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_URL)
    results = doc.select("div.holder a")
    if not results:
        raise inspector.NoReportsFoundError(
            "Pension Benefit Guaranty Corporation (congressional testimony)")
    for result in results:
        report = testimony_report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #17
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
    rows = doc.select("div.content > div > div > div > div > div.row")
    row_audits = rows[0]

    # Audit reports
    results = row_audits.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (audits)")
    for result in results:
        report = report_from(result, AUDIT_REPORTS_URL, "audit", year_range)
        if report:
            inspector.save_report(report)

    doc = utils.beautifulsoup_from_url(OTHER_REPORTS_URL)
    rows = doc.select("div.content > div > div > div > div.row")
    row_peer_review = rows[0]
    col_plans = rows[1].select("div.col-md-6")[0]
    col_congress = rows[1].select("div.col-md-6")[1]

    # Peer review
    results = row_peer_review.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (peer reviews)")
    for result in results:
        report = report_from(result, OTHER_REPORTS_URL, "other", year_range)
        if report:
            inspector.save_report(report)

    # Plans
    results = col_plans.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (plans)")
    for result in results:
        report = report_from(result, OTHER_REPORTS_URL, "other", year_range)
        if report:
            inspector.save_report(report)

    # Semiannual reports to congress
    results = col_congress.select("ul li.pdf")
    if not results:
        raise inspector.NoReportsFoundError("CPB (semiannual reports)")
    for result in results:
        report = report_from(result, OTHER_REPORTS_URL, "semiannual_report",
                             year_range)
        if report:
            inspector.save_report(report)
Example #18
def run(options):
  year_range = inspector.year_range(options, archive)

  urls = [ARCHIVED_REPORTS_URL, PRIOR_PENDING_REPORTS_URL]
  for year in year_range:
    if year >= 2005:
        urls.append(AUDIT_REPORTS_URL.format(year))

  # Pull the audit reports
  for url in urls:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (%s)" % url)
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  semiannual_reports_table = doc.find("table", border="1")
  results = semiannual_reports_table.select("tr")
  if not results:
    raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (semiannual reports)")
  for index, result in enumerate(results):
    if index < 2:
      # Skip the first two header rows
      continue
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the other reports
  for reports_url, id_prefix in OTHER_REPORT_URLS:
    doc = utils.beautifulsoup_from_url(reports_url)
    results = doc.find("table", border="1").select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Nuclear Regulatory Commission (other)")
    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = other_report_from(result, year_range, id_prefix, reports_url)
      if report:
        inspector.save_report(report)
Example #19
def run(options):
    year_range = inspector.year_range(options, archive)
    if datetime.datetime.now().month >= 10:
        # October, November, and December fall into the next fiscal year
        # Add next year to year_range to compensate
        year_range.append(max(year_range) + 1)

    # Pull the audit reports
    for year in year_range:
        if year < 2006:  # This is the oldest year for these reports
            continue
        url = AUDIT_REPORTS_BASE_URL.format(year)
        doc = utils.beautifulsoup_from_url(url)
        results = doc.find_all(
            "tr",
            class_=["ms-rteTableOddRow-default", "ms-rteTableEvenRow-default"])
        if not results:
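            # The upcoming fiscal year's page may not be published yet, so
            # only treat missing results as an error for other years.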
            if year != datetime.datetime.now().year + 1:
                raise inspector.NoReportsFoundError("Treasury (%d)" % year)
        for result in results:
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    for report_type, url in OTHER_URLS.items():
        doc = utils.beautifulsoup_from_url(url)
        results = doc.select(
            "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p a"
        )
        if not results:
            raise inspector.NoReportsFoundError("Treasury (%s)" % report_type)
        for result in results:
            if len(result.parent.find_all("a")) == 1:
                result = result.parent
            report = report_from(result, url, report_type, year_range)
            if report:
                inspector.save_report(report)

    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select(
        "#ctl00_PlaceHolderMain_ctl05_ctl01__ControlWrapper_RichHtmlField > p > a"
    )
    if not results:
        raise inspector.NoReportsFoundError("Treasury (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, SEMIANNUAL_REPORTS_URL,
                                        year_range)
        if report:
            inspector.save_report(report)
Example #20
def scrape_reports(options):
  """Pull reports from "Reports and Testimonies - Browse by date" web page."""

  REPORTS_URL = 'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/' +\
    '%s&adv_end_date=12/31/%s&rows=50&o=%s' # % (year, year, offset)
  archive = 1970
  # Amazingly, reports go back to 1940, though those are unlikely to be
  # legible enough to OCR. Also very cool, even 1950s-era reports seem to have
  # a highlightable embedded text layer in them. Of course, it was the
  # General Accounting Office back then and less oversighty.

  year_range = inspector.year_range(options, archive)
  for year in year_range:
    is_next_page = True
    offset = 0
    while is_next_page:
      doc = utils.beautifulsoup_from_url(
        REPORTS_URL % (year, year, offset))
      results = doc.select("div.listing")
      for result in results:
        report = process_report(result, year_range)
        if report:
          inspector.save_report(report)
      page_links = doc.select("a.non-current_page")
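      # A trailing "Next" link means there is another page of 50 results.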
      if len(page_links) and page_links[-1].text.startswith('Next'):
        offset += 50
      else:
        is_next_page = False
Example #21
def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
  doc = utils.beautifulsoup_from_url(subtopic_url)
  if not doc:
    raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

  results = None

  # This URL is different than the rest and needs to find the "p > a"s first.
  if subtopic_url == TOPIC_TO_URL['TMPC']:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    results = doc.select("#leftContentInterior dl dd")
  if not results:
    results = doc.select("#leftContentInterior ul li")
  if not results:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)
  for result in results:
    if 'crossref' in result.parent.parent.attrs.get('class', []):
      continue
    if result.parent.parent.attrs.get('id') == 'related':
      continue
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      deduplicate_save_report(report)
Example #22
    def fetch_from_landing_page(self, landing_url):
        """Returns a tuple of (pdf_link, summary_text, is_unreleased)."""
        unreleased = False
        page = utils.beautifulsoup_from_url(landing_url)

        summary = None
        field_items = page.select('.field-items')
        if field_items:
            text = [node.strip() for node in field_items[0].findAll(text=True)]
            summary = '\n\n'.join(text).strip()
        if not summary:
            logging.info('\tno summary text found')
        else:
            # sanitize now instead of later, to compare to regexes
            summary = inspector.sanitize(summary)

        if (summary and (RE_NOT_AVAILABLE.search(summary)
                         or RE_NOT_AVAILABLE_2.search(summary)
                         or RE_NOT_AVAILABLE_3.search(summary)
                         or RE_NOT_AVAILABLE_4.search(summary)
                         or RE_WITHDRAWN.search(summary)
                         or RE_CLASSIFIED.search(summary))):
            unreleased = True

        report_url = None
        pdf_link = page.select('.field-name-field-download-files a')
        if not pdf_link:
            logging.warn('No pdf link found on page: {0}'.format(landing_url))
        else:
            report_url = pdf_link[0]['href']

        return report_url, summary, unreleased
Example #23
def run(options):
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      doc = utils.beautifulsoup_from_url(year_url)

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
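          # The same report can appear under several topics; accumulate the
          # topic names on a single record.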
          if report_id in all_reports:
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)
Example #24
def run(options):
  year_range = inspector.year_range(options, archive)
  keys = set()

  # Pull the reports
  for report_type, url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select("section#content ul li")
    if results:
      for result in results:
        report = report_from_list(result, url, report_type, year_range)
        if report:
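          # Percent-decode URLs when building the key so one report isn't
          # saved twice under different encodings.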
          if report["url"]:
            key = (report["report_id"], unquote(report["url"]))
          else:
            key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)
    else:
      results = doc.select("section#content p")
      if not results:
        raise inspector.NoReportsFoundError("Federal Labor Relations Authority (%s)" % report_type)
      for result in results:
        report = report_from_paragraph(result, url, report_type, year_range)
        if report:
          key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)
Example #25
def semiannual_report_from(result, year_range):
  link = result.find("a")

  title = link.text

  # Parse the report title. Ex:
  # 'OIG Semiannual Report to the Congress: October 1, 2013 - March 31, 2014 (incl. MCC)'
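  # Keep the text after the last hyphen/en dash and before any parenthetical.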
  published_on_text = title.split("-")[-1].split("–")[-1].split("(")[0].strip()
  published_on_text = published_on_text.replace("September 31", "September 30")  # See note to IG Web team
  published_on = datetime.datetime.strptime(published_on_text, '%B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % title)
    return

  landing_url = urljoin(SEMIANNUAL_REPORTS_URL, link.get('href'))
  landing_page = utils.beautifulsoup_from_url(landing_url)

  report_url = landing_page.select("div.field-type-file a")[0].get('href')
  report_filename = report_url.split("/")[-1]
  report_id, _ = os.path.splitext(report_filename)

  report = {
    'inspector': "usaid",
    'inspector_url': "https://oig.usaid.gov",
    'agency': "usaid",
    'agency_name': "Agency For International Development",
    'type': 'semiannual_report',
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
  }
  return report
Example #26
def report_from_landing_url(report_url):
    doc = utils.beautifulsoup_from_url(report_url)
    if not doc:
        raise Exception("Failure fetching report landing URL: %s" % report_url)

    # Throw away the "Related Content" box, if there is one
    related = doc.find(id="related")
    if related:
        related.extract()
    captioned = doc.find(class_="captioned-image")
    if captioned:
        h3 = captioned.h3
        if h3 and h3.text.strip() == "Related Reports":
            captioned.extract()

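    # The published date can appear under several different markup patterns,
    # so try each candidate tag until one yields a date.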
    possible_tags = (doc.select("h1") + doc.select("h2") + doc.select("h3") +
                     doc.select("body font p b") + doc.select("body center") +
                     doc.select("body blockquote p"))
    for possible_tag in possible_tags:
        published_on = get_published_date_from_tag(possible_tag)
        if published_on:
            break

    url_list = filter_links(doc.select("#leftContentInterior p.download a"),
                            report_url)
    if not url_list:
        url_list = filter_links(doc.select("#leftContentInterior p a"),
                                report_url)
    if len(url_list) > 1:
        raise Exception("Found multiple links on %s:\n%s" %
                        (report_url, url_list))
    elif len(url_list) == 1:
        report_url = url_list[0]

    return report_url, published_on
Example #27
def run(options):
    year_range = inspector.year_range(options, archive)
    min_year = min(year_range)
    page = 0
    last_page = 0

    while page <= last_page:
        doc = utils.beautifulsoup_from_url(
            REPORT_SEARCH_URL.format(min_year, page))
        last_page_link = doc.find("a", title="Go to last page")
        if last_page_link:
            href = last_page_link["href"]
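            # Extract the page number from the link's query string, e.g. "?page=42".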
            page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
            if page_match:
                last_page = int(page_match.group(1))

        results = doc.select(".view-reports-advanced-search .views-row")
        if not results:
            raise inspector.NoReportsFoundError("Department of the Interior")
        for result in results:
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
        page += 1
    if last_page == 0:
        raise Exception("Did not find last page link")
Example #28
    def run(self, options):
        self.options = options
        self.year_range = inspector.year_range(self.options, archive)
        self.first_date = datetime.datetime(self.year_range[0], 1, 1)
        self.last_date = datetime.datetime(self.year_range[-1], 12, 31)

        for url in self.urls_for():
            page = utils.beautifulsoup_from_url(url)

            nodes = page.select('.energy-listing__results .node')
            if not nodes:
                nodes = page.select('.field-items .node')
            if not nodes:
                nodes = page.select('.node')
            if not nodes:
                raise inspector.NoReportsFoundError(
                    "Department of Energy (%s)" % url)

            for node in nodes:
                report = self.report_from(node)
                if report:
                    inspector.save_report(report)
                else:
                    # An empty report means it fell outside the date range or
                    # didn't match the requested ID.
                    continue
Example #29
def scrape_restricted_reports(options):
  """Restricted Products.

  A single HTML page lists unreleased reports since 2014, with no links."""

  # These reports are unreleased -- we could make this the text?
  """The following products have been determined to contain either
classified information or controlled unclassified information by the audited
agencies and cannot be publicly released.

Members of Congress or congressional staff who wish to obtain one or more of
these products should call or e-mail the Congressional Relations Office.
All others who wish to obtain one or more of these products should follow the
instructions found on Requesting Restricted Products."""

  REPORTS_URL = 'http://www.gao.gov/restricted/restricted_reports'
  archive = 2014

  year_range = inspector.year_range(options, archive)
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  results = doc.select("div.listing")
  for result in results:
    report = process_restricted_report(result, year_range, REPORTS_URL)
    if report:
      inspector.save_report(report)
Example #30
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)

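  # Year headers appear either as anchors with a four-digit id or as
  # "Ptitle1" paragraphs.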
  headers = set([a.parent for a in
                 doc.find_all("a", id=re.compile("^[0-9]{4}$"))])
  headers.update(doc.find_all("p", class_="Ptitle1"))
  headers = sorted(headers, key=lambda p: int(p.text.strip()), reverse=True)
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")

    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)
Exemplo n.º 44
0
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 1998:  # The earliest year for audit reports
      continue
    year_url = AUDIT_REPORTS_URL.format(year=year)
    doc = utils.beautifulsoup_from_url(year_url)
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("Pension Benefit Guaranty Corporation (audit reports)")
    for result in results:
      report = report_from(result, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)

  # Pull the congressional requests
  doc = utils.beautifulsoup_from_url(CONGRESSIONAL_REQUESTS_URL)
  results = doc.select("tr")
  if not results:
    raise inspector.NoReportsFoundError("Pension Benefit Guaranty Corporation (congressional requests)")
  for result in results:
    report = report_from(result, report_type='congress', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results =  doc.select("div.holder a")
  if not results:
    raise inspector.NoReportsFoundError("Pension Benefit Guaranty Corporation (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the congressional testimony
  doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_URL)
  results = doc.select("div.holder a")
  if not results:
    raise inspector.NoReportsFoundError("Pension Benefit Guaranty Corporation (congressional testimony)")
  for result in results:
    report = testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)
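
For context, the four listing URLs this scraper pulls from are module constants; they presumably look something like this (addresses are illustrative; only the {year} placeholder in the audit URL matters to the code above):

# Hypothetical URL constants; the real module defines the actual addresses.
AUDIT_REPORTS_URL = "https://oig.pbgc.gov/reports/audit/{year}"
CONGRESSIONAL_REQUESTS_URL = "https://oig.pbgc.gov/reports/requests"
SEMIANNUAL_REPORTS_URL = "https://oig.pbgc.gov/reports/semiannual"
CONGRESSIONAL_TESTIMONY_URL = "https://oig.pbgc.gov/reports/testimony"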
Example #45
  def get_listed_reports(self, url):
    doc = utils.beautifulsoup_from_url(url)
    article = doc.select('.article')[0]
    results = article.find_all('ul')
    if not results:
      raise inspector.NoReportsFoundError("Library of Congress (%s)" % url)
    for ul in results:
      self.get_bare_reports(ul)
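
A hypothetical call site, assuming the class's run logic walks one listing page at a time:

# Illustrative only; the real URL comes from the module's own constants.
scraper.get_listed_reports("https://www.loc.gov/about/office-of-the-inspector-general/reports/")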
Example #46
def run(options):
  year_range = inspector.year_range(options, archive)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')

  limit = int(options.get('limit', 0))

  all_audit_reports = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    doc = utils.beautifulsoup_from_url(url)

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]
    if not results:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for result in results:
      report = report_from(result, component, url)
      if not report:
        continue

      if report_id and (report_id != report['report_id']):
        continue

      if inspector.year_from(report) not in year_range:
        # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
        continue

      key = (report["report_id"], report["title"])
      if key in all_audit_reports:
        all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
                ", " + report["agency"]
        all_audit_reports[key]["agency_name"] = \
                all_audit_reports[key]["agency_name"] + ", " + \
                report["agency_name"]
      else:
        all_audit_reports[key] = report

      count += 1
      if limit and (count >= limit):
        break

    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in all_audit_reports.values():
    inspector.save_report(report)
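
A minimal sketch of the component table and URL helper this DHS scraper assumes (the slugs, names, and URL pattern are illustrative):

# Hypothetical component mapping; the real module lists many more entries.
COMPONENTS = {
  'secret_service': 'United States Secret Service',
  'coast_guard': 'United States Coast Guard',
}

def url_for(options, component):
  # One listing page per component (illustrative URL pattern).
  return "https://www.oig.dhs.gov/reports/%s" % component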
Example #47
def report_from(result, year_range):
    title = result.select("a")[0].text
    agency = result.select("td.Col_Agency")[0].text.strip()
    topic = result.get('class')[0]
    landing_url = urljoin(BASE_PAGE_URL, result.select("a")[0].get('href'))
    published_on_text = result.select("td.Col_Date")[0].text
    published_on = datetime.datetime.strptime(published_on_text.strip(),
                                              '%m-%d-%Y')

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % landing_url)
        return

    logging.debug("Scraping landing url: %s", landing_url)
    landing_page = utils.beautifulsoup_from_url(landing_url)

    landing_page_text = landing_page.select("div.style-report-text")[0].text

    # Some pages don't have any reports as a result
    if "did not issue any formal recommendations" in landing_page_text:
        return

    unreleased = any(unreleased_text in landing_page_text
                     for unreleased_text in UNRELEASED_TEXTS)
    if landing_url in UNRELEASED_LANDING_URLS:
        unreleased = True

    if unreleased:
        report_id = None
        report_url = landing_url
    else:
        relative_report_url = landing_page.select(
            "div.report-header-container-aside a")[-1].get('href')
        report_url = urljoin(BASE_PAGE_URL, relative_report_url)
        report_id = landing_page.select("span.report-number")[0].text.strip()

    if not report_id:
        # Fallback to the report filename
        report_filename = report_url.split("/")[-1]
        report_id, extension = os.path.splitext(report_filename)

    report = {
        'inspector': 'fed',
        'inspector_url': 'https://oig.federalreserve.gov/',
        'agency': AGENCY_SLUGS[agency],
        'agency_name': AGENCY_NAMES[agency],
        'type': 'audit',
        'report_id': report_id,
        'url': report_url,
        'landing_url': landing_url,
        'topic': topic,
        'title': title,
        'summary': landing_page_text.strip(),
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    if unreleased:
        report['unreleased'] = unreleased
    return report
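
For context, the module-level lookup tables this function consults might be shaped like this (entries are illustrative, not the full lists):

# Hypothetical constants; the real module defines the complete values.
UNRELEASED_TEXTS = [
    "will not be made publicly available",
    "not available for public release",
]
UNRELEASED_LANDING_URLS = []
AGENCY_SLUGS = {"Board": "board", "CFPB": "cfpb"}
AGENCY_NAMES = {"Board": "Board of Governors of the Federal Reserve System",
                "CFPB": "Consumer Financial Protection Bureau"}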
Example #48
def run(options):
    year_range = inspector.year_range(options, archive)

    doc = utils.beautifulsoup_from_url(REPORTS_URL)

    # Pull the semiannual reports
    semiannual_results = doc.select("#AnnualManagementReports select")[0]
    for result in semiannual_results.select("option"):
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

    # Pull the special reports
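    # The special reports table is identified by its distinctive
    # bordercolor attribute.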
    special_report_table = doc.find("table", attrs={"bordercolor": "#808080"})
    for index, result in enumerate(special_report_table.select("tr")):
        if not index:
            # Skip the header row
            continue
        report = report_from(result,
                             REPORTS_URL,
                             report_type='other',
                             year_range=year_range)
        if report:
            inspector.save_report(report)

    # Pull the audit reports
    for year in year_range:
        if year < 2001:  # The oldest fiscal year page available
            continue
        year_url = AUDIT_REPORTS_URL.format(year=year)
        doc = utils.beautifulsoup_from_url(year_url)
        results = doc.select("#main table tr")
        if not results:
            raise inspector.NoReportsFoundError(
                "Railroad Retirement Board (%d)" % year)
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result,
                                 year_url,
                                 report_type='audit',
                                 year_range=year_range)
            if report:
                inspector.save_report(report)
Example #49
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for url, report_type, parse_func in REPORT_PAGES_INFO:
        doc = utils.beautifulsoup_from_url(url)

        content = doc.select("section.article-content")[0]
        parse_func(content, url, report_type, year_range)
Example #50
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for url, report_type, parse_func in REPORT_PAGES_INFO:
    doc = utils.beautifulsoup_from_url(url)

    content = doc.select("section.article-content")[0]
    parse_func(content, url, report_type, year_range)
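
Examples #49 and #50 iterate the same kind of dispatch table: a list of (url, report_type, parse_func) tuples, roughly of this shape (the entry below is illustrative):

# Hypothetical dispatch table; the real modules point at actual report pages.
def parse_audit_list(content, url, report_type, year_range):
  ...  # page-specific parsing lives in the real module

REPORT_PAGES_INFO = [
  ("https://example.gov/oig/audit-reports", "audit", parse_audit_list),
]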
Example #51
def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for page in range(1, 1000):
        # Intermittent errors are indistinguishable from reaching the end of the
        # reports. In both cases, the "content" div only contains an empty div with
        # class "search-results-pagination". Thus, we will always retry pages that
        # look like this.
        url = "{}?RS={}".format(REPORTS_URL, page)
        for attempt in range(MAX_ATTEMPTS):
            doc = utils.beautifulsoup_from_url(url)
            if doc.select(".layout-content_area")[0].text.strip():
                break
            time.sleep(30)

        results = doc.select(".report")
        if not results:
            if page == 1:
                raise inspector.NoReportsFoundError("VA (audit reports)")
            else:
                break
        for result in results:
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    for attempt in range(MAX_ATTEMPTS):
        doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
        page_text = doc.select("div.single-column-report-list")[0].text.strip()
        if page_text != ERROR_TEXT_LIST:
            break
        time.sleep(30)
    if page_text == ERROR_TEXT_LIST:
        raise Exception("Could not retrieve semiannual reports list")

    results = doc.select(".sar")
    if not results:
        raise inspector.NoReportsFoundError("VA (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)
Example #52
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for page in range(1, 1000):
    # Intermittent errors are indistinguishable from reaching the end of the
    # reports. In both cases, the "content" div only contains an empty div with
    # class "search-results-pagination". Thus, we will always retry pages that
    # look like this.
    url = "{}?RS={}".format(REPORTS_URL, page)
    for attempt in range(MAX_ATTEMPTS):
      doc = utils.beautifulsoup_from_url(url)
      if doc.select(".layout-content_area")[0].text.strip():
        break
      time.sleep(30)

    results = doc.select(".report")
    if not results:
      if page == 1:
        raise inspector.NoReportsFoundError("VA (audit reports)")
      else:
        break
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  for attempt in range(MAX_ATTEMPTS):
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    page_text = doc.select("div.single-column-report-list")[0].text.strip()
    if page_text != ERROR_TEXT_LIST:
      break
    time.sleep(30)
  if page_text == ERROR_TEXT_LIST:
    raise Exception("Could not retrieve semiannual reports list")

  results = doc.select(".sar")
  if not results:
    raise inspector.NoReportsFoundError("VA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)
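
The retry machinery in both VA variants presumably rests on module constants along these lines (values are illustrative):

# Hypothetical retry settings; the real module defines the actual values.
MAX_ATTEMPTS = 3
ERROR_TEXT_LIST = "The list of reports is temporarily unavailable."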
Example #53
def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)
  results = doc.select("table tr")
  if not results:
    raise inspector.NoReportsFoundError("Federal Maritime Commission (audits)")
  for result in results:
    if result.th:
      # Skip the header row
      continue
    report = report_from(result, AUDIT_REPORTS_URL, report_type='audit', year_range=year_range)
    if report:
      inspector.save_report(report)

  # Pull historical audits
  audit_year_links = doc.select("div.col-2-3 ul li a")
  for year_link in audit_year_links:
    audit_year_url = urljoin(AUDIT_REPORTS_URL, year_link.get('href'))
    doc = utils.beautifulsoup_from_url(audit_year_url)
    results = doc.select("table tr")
    if not results:
      # Grab results other than first and last (header and extra links)
      results = doc.select("div.col-2-2 ul")[1:-1]
    if not results:
      raise inspector.NoReportsFoundError("Federal Maritime Commission (%s)" % audit_year_url)
    for result in results:
      if result.th:
        # Skip the header row
        continue
      report = report_from(result, audit_year_url, report_type='audit', year_range=year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div.col-2-2 p a") + doc.select("div.col-2-2 li a")
  if not results:
    raise inspector.NoReportsFoundError("Federal Maritime Commission (semiannual reports)")
  for result in results:
    report = report_from(result.parent, SEMIANNUAL_REPORTS_URL, report_type='semiannual_report', year_range=year_range)
    if report:
      inspector.save_report(report)
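
Finally, a note on invocation: scraper modules in this project are conventionally run standalone, ending with a guard like

utils.run(run) if (__name__ == "__main__") else None

so that each run(options) entry point above can be executed directly.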