Python save_report 예제들, utils.inspector.save_report Python 예제들

예제 #1

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)
  keys = set()

  # Pull the reports
  for report_type, url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select("section#content ul li")
    if results:
      for result in results:
        report = report_from_list(result, url, report_type, year_range)
        if report:
          if report["url"]:
            key = (report["report_id"], unquote(report["url"]))
          else:
            key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)
    else:
      results = doc.select("section#content p")
      if not results:
        raise inspector.NoReportsFoundError("Federal Labor Relations Authority (%s)" % report_type)
      for result in results:
        report = report_from_paragraph(result, url, report_type, year_range)
        if report:
          key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)

예제 #2

0

파일 보기

파일: energy.py 프로젝트: harrisj/inspectors-general

    def run(self, options):
        self.options = options
        self.year_range = inspector.year_range(self.options, archive)
        self.first_date = datetime.datetime(self.year_range[0], 1, 1)
        self.last_date = datetime.datetime(self.year_range[-1], 12, 31)

        for url in self.urls_for():
            page = BeautifulSoup(utils.download(url))

            nodes = page.select('.energy-listing__results .node')
            if not nodes:
                nodes = page.select('.field-items .node')
            if not nodes:
                nodes = page.select('.node')
            if not nodes:
                raise inspector.NoReportsFoundException(
                    "Department of Energy (%s)" % url)

            for node in nodes:
                report = self.report_from(node)
                if report:
                    inspector.save_report(report)
                else:
                    # Empty report indicates a report out of the date range, or not the ID.
                    continue

예제 #3

0

파일 보기

def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit reports
    for year in year_range:
        if year < 2005:  # This is the earliest audits go back
            continue
        url = AUDIT_REPORTS_URL.format(year=year)
        doc = BeautifulSoup(utils.download(url))
        results = doc.select("div.content")
        if not results:
            raise inspector.NoReportsFoundError(
                "Tennessee Valley Authority (%d)" % year)
        for result in results:
            report = audit_report_from(result, url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("report")
    if not results:
        raise inspector.NoReportsFoundError(
            "Tennessee Valley Authority (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

예제 #4

0

파일 보기

파일: usaid.py 프로젝트: harrisj/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS.items():
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = BeautifulSoup(utils.download(url))
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break

      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #5

0

파일 보기

파일: state.py 프로젝트: BunsenMcDubbs/inspectors-general

def extract_reports_for_subtopic(subtopic_url, year_range, topic, subtopic=None):
  if subtopic_url.startswith("http://httphttp://"):
    # See notes to IG's web team
    subtopic_url = subtopic_url.replace("http://http", "")

  body = utils.download(subtopic_url)
  doc = BeautifulSoup(body)
  results = doc.select("#body-row02-col02andcol03 a")

  if not results:
    results = doc.select("#body-row02-col01andcol02andcol03 a")
  if not results and "There are currently no reports in this category" not in doc.text:
    raise AssertionError("No report links found for %s" % subtopic_url)

  topic_name = TOPIC_NAMES[topic]
  # Broadcasting Board of Governors is a fully independent agency
  if topic == 'BBG' or subtopic == 'Broadcasting Board of Governors':
    agency = 'bbg'
  else:
    agency = 'state'

  for result in results:
    report = report_from(result, year_range, agency, topic_name, subtopic)
    if report:
      inspector.save_report(report)

예제 #6

0

파일 보기

def parse_mapping(content, landing_url, report_type, year_range):
    links = content.find_all("a")
    if not links:
        raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                            landing_url)
    for link in links:
        href = link.get("href")
        href = urljoin(landing_url, href)
        result = None
        if href == "https://www.oig.lsc.gov/images/mapping/mapping.zip":
            continue
        elif href == MAPPING_PROJECT_ARCHIVE_GRANTEE_URL:
            continue
        elif href.startswith("mailto:"):
            continue
        elif href == "https://www.oig.lsc.gov/evaluation-of-legal-services-mapping-prsentation":
            link["href"] = "https://oig.lsc.gov/mapping/phaseIIbriefing.pdf"
            result = link.parent
        elif href in (
                "https://www.oig.lsc.gov/images/pdfs/mapping/MeekerOIGMappingReport.pdf",
                "https://www.oig.lsc.gov/core-legal-services",
        ):
            result = link.parent
        elif href == "https://www.oig.lsc.gov/images/mapping/Mapping_Evaluation_Phase_I_Volume_I_Final_Report.pdf":
            result = link.parent.parent
        elif (href.startswith("https://oig.lsc.gov/mapping/references/eval")
              and href.endswith(".pdf")):
            result = link
        else:
            raise Exception(
                "Unexpected link found on a mapping project page: %s" % href)

        report = report_from(result, landing_url, report_type, year_range)
        if report:
            inspector.save_report(report)

예제 #7

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)
  keys = set()

  # Pull the reports
  for report_type, url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(url)
    results = doc.select("section#content ul li")
    if results:
      for result in results:
        report = report_from_list(result, url, report_type, year_range)
        if report:
          if report["url"]:
            key = (report["report_id"], unquote(report["url"]))
          else:
            key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)
    else:
      results = doc.select("section#content p")
      if not results:
        raise inspector.NoReportsFoundError("Federal Labor Relations Authority (%s)" % report_type)
      for result in results:
        report = report_from_paragraph(result, url, report_type, year_range)
        if report:
          key = (report["report_id"], report["url"])
          if key not in keys:
            inspector.save_report(report)
            keys.add(key)

예제 #8

0

파일 보기

def parse_investigation(content, landing_url, report_type, year_range):
    doj_flag = True
    doj_report_counter = 0
    other_report_counter = 0
    for child in content.children:
        if isinstance(child, Tag) and child.name == 'hr':
            doj_flag = False
            continue
        if doj_flag:
            if isinstance(child, Tag) and child.name == 'ul':
                report = report_from(child.li, landing_url, report_type,
                                     year_range)
                if report:
                    inspector.save_report(report)
                    doj_report_counter = doj_report_counter + 1
        else:
            if isinstance(child, Tag):
                if child.name != 'h3' and child.text.strip():
                    report = report_from(child, landing_url, report_type,
                                         year_range)
                    if report:
                        inspector.save_report(report)
                        other_report_counter = other_report_counter + 1
            elif isinstance(child, Comment):
                continue
            elif isinstance(child, NavigableString):
                if child.strip():
                    raise Exception("Unexpected text!: " + child)
    if doj_report_counter == 0 or other_report_counter == 0:
        raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                            landing_url)

예제 #9

0

파일 보기

def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports with pagination
    for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
        for page in range(0, 999):
            url = report_url_format.format(page=page)
            doc = utils.beautifulsoup_from_url(url)
            if report_type == "audit" and page == 0 and not doc.select(
                    "div.views-field-field-auditreport-doc-1"):
                raise Exception("Report number CSS class has changed")
            results = doc.select("li.views-row")
            if not results:
                if page == 0:
                    raise inspector.NoReportsFoundError("USAID (%s)" %
                                                        report_type)
                else:
                    break

            for result in results:
                report = report_from(result, url, report_type, year_range)
                if report:
                    inspector.save_report(report)

    # Pull the semiannual reports (no pagination)
    doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
    results = doc.select("li.views-row")
    if not results:
        raise inspector.NoReportsFoundError("USAID (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

예제 #10

0

파일 보기

파일: sigtarp.py 프로젝트: unitedstates/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS:
    doc = utils.beautifulsoup_from_url(report_url)
    results = doc.select("td.mainInner div.ms-WPBody > div > ul > li")

    if not results:
      raise inspector.NoReportsFoundError("SIGTARP ({})".format(report_type))

    for result in results:
      report = report_from(result, report_type, year_range)
      if report:
        inspector.save_report(report)

  doc = utils.beautifulsoup_from_url(QUARTERLY_REPORTS_URL)
  results = doc.select("#MSOZoneCell_WebPartWPQ3 .s4-wpTopTable a")

  if not results:
    raise inspector.NoReportsFoundError("SIGTARP (quarterly reports)")

  for result in results:
    report = quarterly_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #11

0

파일 보기

파일: exim.py 프로젝트: spulec/inspectors-general

def run(options):
  year_range = inspector.year_range(options)

  for page_url in URLS:
    done = False
    body = utils.download(page_url)
    doc = BeautifulSoup(body)

    maincontent = doc.select("div#CS_Element_eximpagemaincontent")[0]
    all_p = maincontent.find_all("p")

    for p in all_p:
      for all_text, link_text, link_url in recurse_tree(p, False):
        if link_url == None:
          continue
        if link_url.startswith("mailto:"):
          continue
        if page_url == WHATS_NEW_URL and link_url == "/oig/whats-new-archive.cfm":
          # end of page
          done = True
          break
        if link_url.startswith("https://public.govdelivery.com/"):
          continue
        for index_url in URLS:
          if index_url.find(link_url) != -1:
            continue

        year = DATE_RE.search(all_text).group(3)
        if int(year) not in year_range:
          continue

        report = report_from(all_text, link_text, link_url, page_url)
        inspector.save_report(report)
      if done: break

예제 #12

0

파일 보기

파일: sec.py 프로젝트: stvnrlly/inspectors-general

def run(options):
  year_range = inspector.year_range(options)
  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  for topic in topics:
    topic_url = TOPIC_TO_URL[topic]
    body = utils.download(topic_url)
    doc = BeautifulSoup(body)

    try:
      year_results = doc.select("#Listing")[0]
      results = [x for x in year_results.select("ul li ul li")]
    except IndexError:
      try:
        all_results = doc.select("#bodyholder")[0]
        results = [x for x in all_results.select("ul li")]
      except IndexError:
        results = doc.select("table ul li")

    # Sometimes multiple reports are listed under the same datetime element.
    # We store which published datetime we saw last so that the next report
    # can use if if we are unable to find another published time.
    last_published_on = None
    for result in results:
      report, last_published_on = report_from(result, topic_url, topic, year_range, last_published_on)
      if report:
        inspector.save_report(report)

예제 #13

0

파일 보기

파일: lsc.py 프로젝트: harrisj/inspectors-general

def parse_mapping(content, landing_url, report_type, year_range):
  links = content.find_all("a")
  if not links:
    raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % landing_url)
  for link in links:
    href = link.get("href")
    href = urljoin(landing_url, href)
    result = None
    if href == "https://www.oig.lsc.gov/images/mapping/mapping.zip":
      continue
    elif href == MAPPING_PROJECT_ARCHIVE_GRANTEE_URL:
      continue
    elif href.startswith("mailto:"):
      continue
    elif href == "https://www.oig.lsc.gov/evaluation-of-legal-services-mapping-prsentation":
      link["href"] = "https://oig.lsc.gov/mapping/phaseIIbriefing.pdf"
      result = link.parent
    elif href in ("https://www.oig.lsc.gov/images/pdfs/mapping/MeekerOIGMappingReport.pdf",
                  "https://www.oig.lsc.gov/core-legal-services",):
      result = link.parent
    elif href == "https://www.oig.lsc.gov/images/mapping/Mapping_Evaluation_Phase_I_Volume_I_Final_Report.pdf":
      result = link.parent.parent
    elif (href.startswith("https://oig.lsc.gov/mapping/references/eval") and
          href.endswith(".pdf")):
      result = link
    else:
      raise Exception("Unexpected link found on a mapping project page: %s"
                      % href)

    report = report_from(result, landing_url, report_type, year_range)
    if report:
      inspector.save_report(report)

예제 #14

0

파일 보기

파일: interior.py 프로젝트: isabelhgarcia33/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)
    min_year = min(year_range)
    page = 0
    last_page = 0

    while page <= last_page:
        doc = utils.beautifulsoup_from_url(
            REPORT_SEARCH_URL.format(min_year, page))
        last_page_link = doc.find("a", title="Go to last page")
        if last_page_link:
            href = last_page_link["href"]
            page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
            if page_match:
                last_page = int(page_match.group(1))

        results = doc.select(".view-reports-advanced-search .views-row")
        if not results:
            raise inspector.NoReportsFoundError("Department of the Interior")
        for result in results:
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)
        page += 1
    if last_page == 0:
        raise Exception("Did not find last page link")

예제 #15

0

파일 보기

파일: lsc.py 프로젝트: unitedstates/inspectors-general

def parse_investigation(content, landing_url, report_type, year_range):
  doj_flag = True
  doj_report_counter = 0
  other_report_counter = 0
  for child in content.children:
    if (isinstance(child, Tag) and
            child.name == 'h3' and
            child.text == 'Reports'):
      doj_flag = False
      continue
    if doj_flag:
      if isinstance(child, Tag) and child.name == 'ul':
        doj_report_counter = doj_report_counter + 1
        report = report_from(child.li, landing_url, report_type, year_range)
        if report:
          inspector.save_report(report)
    else:
      if isinstance(child, Tag):
        if child.name != 'h3' and child.text.strip():
          other_report_counter = other_report_counter + 1
          report = report_from(child, landing_url, report_type, year_range)
          if report:
            inspector.save_report(report)
      elif isinstance(child, Comment):
        continue
      elif isinstance(child, NavigableString):
        if child.strip():
          raise Exception("Unexpected text!: " + child)
  if doj_report_counter == 0 or other_report_counter == 0:
    raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % landing_url)

예제 #16

0

파일 보기

파일: nlrb.py 프로젝트: harrisj/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit and inspections reports
    for report_type, reports_url in REPORT_URLS:
        doc = BeautifulSoup(utils.download(reports_url))
        results = doc.select("div.field-item")
        if not results:
            raise inspector.NoReportsFoundError(
                "National Labor Relations Board (%s)" % report_type)
        for result in results:
            report = report_from(result, report_type, reports_url, year_range)
            if report:
                inspector.save_report(report)

    # Pull the semiannual reports
    doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
    results = doc.select("div.field-item")
    if not results:
        raise inspector.NoReportsFoundError(
            "National Labor Relations Board (semiannual reports)")
    for result in results:
        report = semiannual_report_from(result, year_range)
        if report:
            inspector.save_report(report)

예제 #17

0

파일 보기

파일: hhs.py 프로젝트: Cloudxtreme/inspectors-general

def extract_reports_for_subtopic(subtopic_url, year_range, topic_name, subtopic_name):
  doc = beautifulsoup_from_url(subtopic_url)
  if not doc:
    raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

  results = None

  # This URL is different than the rest and needs to find the "p > a"s first.
  if subtopic_url == TOPIC_TO_URL['TMPC']:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    results = doc.select("#leftContentInterior dl dd")
  if not results:
    results = doc.select("#leftContentInterior ul li")
  if not results:
    results = doc.select("#leftContentInterior > p > a")
  if not results:
    raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)
  for result in results:
    if 'crossref' in result.parent.parent.attrs.get('class', []):
      continue
    if result.parent.parent.attrs.get('id') == 'related':
      continue
    report = report_from(result, year_range, topic_name, subtopic_url, subtopic_name)
    if report:
      inspector.save_report(report)

예제 #18

0

파일 보기

파일: opm.py 프로젝트: spulec/inspectors-general

def run(options):
  year_range = inspector.year_range(options)
  only_id = options.get('report_id')

  print("## Downloading reports from %i to %i" % (year_range[0], year_range[-1]))

  url = url_for()
  body = utils.download(url)

  doc = BeautifulSoup(body)
  results = doc.select("section")

  for result in results:
    try:
      year = int(result.get("title"))
      # check that the fetched year is in the range
      if year not in year_range:
        continue
      print("## Downloading year %i " % year)
    except ValueError:
      continue

    # gets each table entry and sends generates a report from it
    listings = result.div.table.tbody.contents
    for item in listings:
      if type(item) is not bs4.element.Tag:
        continue
      report = report_from(item)

      # can limit it to just one report, for debugging convenience
      if only_id and only_id != report['report_id']:
        continue

      inspector.save_report(report)

예제 #19

0

파일 보기

파일: interior.py 프로젝트: unitedstates/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)
  min_year = min(year_range)
  page = 0
  last_page = 0

  while page <= last_page:
    doc = utils.beautifulsoup_from_url(REPORT_SEARCH_URL.format(min_year, page))
    last_page_link = doc.find("a", title="Go to last page")
    if last_page_link:
      href = last_page_link["href"]
      page_match = re.search("[?&]page=([0-9]+)(?:&|$)", href)
      if page_match:
        last_page = int(page_match.group(1))

    results = doc.select(".view-reports-advanced-search .views-row")
    if not results:
      raise inspector.NoReportsFoundError("Department of the Interior")
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)
    page += 1
  if last_page == 0:
    raise Exception("Did not find last page link")

예제 #20

0

파일 보기

파일: osc.py 프로젝트: unitedstates/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)
  report_flag = False

  # Pull the table of reports for each year
  for year in year_range:
    url = url_for_year(year)
    html = utils.download(url, scraper_slug="osc")

    if html is None:
      if year == max(year_range):
        continue
      else:
        raise Exception("Couldn't fetch reports page {}".format(url))

    #  spaces appear as &#160; and \u200b .... fix that now
    html = html.replace('&#160;', ' ').replace('\u200b', ' ').replace('\u00a0', ' ').replace('\r', '').replace('\n', '')
    doc = BeautifulSoup(html, "lxml")

    OUTCOME_CODES = generate_outcome_codes(doc)

    keys_used = []  # a few reports appear multiple times... ignore them the second time if they appear more than once

    results = doc.findAll("table")[1].tbody.findAll('tr')  # no ids on the tables, but it's the second one
    for result in results:
      reports = report_from(result, year, year_range, url, OUTCOME_CODES)
      for report in reports:
        if report['report_id'] not in keys_used:
          inspector.save_report(report)
          keys_used.append(report['report_id'])
          report_flag = True

  if not report_flag:
    raise inspector.NoReportsFoundError("OSC")

예제 #21

0

파일 보기

파일: dot.py 프로젝트: harrisj/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      body = utils.download(year_url)

      doc = BeautifulSoup(body)

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
          if report_id in all_reports:
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)

예제 #22

0

파일 보기

파일: tigta.py 프로젝트: BunsenMcDubbs/inspectors-general

def run(options):
  year_range = inspector.year_range(options)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range)
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range)

  # Pull the congressional testimony
  doc = BeautifulSoup(utils.download(CONGRESSIONAL_TESTIMONY_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #23

0

파일 보기

파일: eeoc.py 프로젝트: harrisj/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the reports
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    semiannual_report_results, other_results = doc.select(
        "table tr")[1].select("td")

    if not semiannual_report_results:
        raise inspector.NoReportsFoundException("EEOC (semiannual reports)")
    if not other_results:
        raise inspector.NoReportsFoundException("EEOC (other reports)")

    merge_items(semiannual_report_results)
    merge_items(other_results)

    for result in semiannual_report_results.select("li"):
        report = semiannual_report_from(result,
                                        year_range,
                                        title_prefix="Semiannual Report - ")
        if report:
            inspector.save_report(report)

    for result in other_results.select("li"):
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)

예제 #24

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)

  topics = options.get('topics')
  if topics:
    topics = topics.split(",")
  else:
    topics = TOPIC_TO_URL.keys()

  all_reports = {}

  for topic in topics:
    year_urls = urls_for(year_range, topic)
    for year_url in year_urls:
      logging.debug("Scraping %s" % year_url)
      doc = utils.beautifulsoup_from_url(year_url)

      if not doc.select(".view-business-areas"):
        raise inspector.NoReportsFoundError("DOT (%s)" % topic)

      results = doc.select(".view-business-areas .views-row")
      for result in results:
        report = report_from(result, year_range, topic, options)
        if report:
          report_id = report["report_id"]
          if report_id in all_reports:
            all_reports[report_id]["topic"] = all_reports[report_id]["topic"] \
                + ", " + topic
          else:
            all_reports[report_id] = report

  for report in all_reports.values():
    inspector.save_report(report)

예제 #25

0

파일 보기

파일: prc.py 프로젝트: slobdell/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    # Find the number of pages to iterate
    doc = BeautifulSoup(utils.download(REPORTS_URL))
    page_count_text = doc.select("div.AspNet-GridView-Pagination")[0].text
    page_count = int(re.search("Page 1 of (\d+)", page_count_text).groups()[0])

    # Iterate over those pages
    for page in range(1, page_count + 1):
        response = utils.scraper.post(
            REPORTS_URL,
            data={
                "__EVENTTARGET": "ctl00$ctl00$MainContent$NavTreeSubContent$sv$GridViewSummary",
                "__EVENTARGUMENT": "Page${page_number}".format(page_number=page),
            },
            cookies=COOKIES,
        )
        doc = BeautifulSoup(response.content)
        results = doc.select("div.AspNet-GridView table tr")
        if not results:
            break
        for index, result in enumerate(results):
            if not index:
                # Skip the header row
                continue
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)

예제 #26

0

파일 보기

파일: labor.py 프로젝트: Cloudxtreme/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  pre_1998_done = False

  # Pull the audit reports
  for year in year_range:
    if year < 1998:
      if pre_1998_done:
        continue
      else:
        pre_1998_done = True
    for page_number in range(0, 10000):
      year_url = url_for(year, page_number)
      doc = beautifulsoup_from_url(year_url)
      results = doc.select("ol li")
      if not results:
        if page_number == 0:
          raise inspector.NoReportsFoundError("Department of Labor (%s)" % year_url)
        else:
          break
      for result in results:
        report = report_from(result, year_url)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("p > a:nth-of-type(1)")
  if not results:
    raise inspector.NoReportsFoundError("Department of Labor (semiannal reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #27

0

파일 보기

파일: tigta.py 프로젝트: unitedstates/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)
  if datetime.datetime.now().month >= 10:
    # October, November, and December fall into the next fiscal year
    # Add next year to year_range to compensate
    year_range.append(max(year_range) + 1)

  # Pull the audit reports
  for year in year_range:
    url = audit_report_url(year)
    if url:
      parse_result_from_js_url(url, "auditreports", year, year_range, report_type='audit')
    url = inspection_report_url(year)
    if url:
      parse_result_from_js_url(url, "iereports", year, year_range, report_type='inspection')

  # Pull the congressional testimony
  doc = utils.beautifulsoup_from_url(CONGRESSIONAL_TESTIMONY_REPORTS_URL)
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = congressional_testimony_report_from(result, year_range)
    if report:
      inspector.save_report(report)

  # Pull the semiannual reports
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.findAll("ul", type='disc')[0].select("li")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #28

0

파일 보기

파일: usaid.py 프로젝트: unitedstates/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports with pagination
  for report_type, report_url_format in PAGINATED_REPORT_FORMATS:
    for page in range(0, 999):
      url = report_url_format.format(page=page)
      doc = utils.beautifulsoup_from_url(url)
      if report_type == "audit" and page == 0 and not doc.select("div.views-field-field-auditreport-doc-1"):
        raise Exception("Report number CSS class has changed")
      results = doc.select("li.views-row")
      if not results:
        if page == 0:
          raise inspector.NoReportsFoundError("USAID (%s)" % report_type)
        else:
          break

      for result in results:
        report = report_from(result, url, report_type, year_range)
        if report:
          inspector.save_report(report)

  # Pull the semiannual reports (no pagination)
  doc = utils.beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("li.views-row")
  if not results:
    raise inspector.NoReportsFoundError("USAID (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #29

0

파일 보기

파일: tva.py 프로젝트: Cloudxtreme/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    if year < 2005:  # This is the earliest audits go back
      continue
    url = AUDIT_REPORTS_URL.format(year=year)
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("div.content")
    if not results:
      raise inspector.NoReportsFoundError("Tennessee Valley Authority (%d)" % year)
    for result in results:
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = BeautifulSoup(utils.download(SEMIANNUAL_REPORTS_URL))
  results = doc.select("report")
  if not results:
    raise inspector.NoReportsFoundError("Tennessee Valley Authority (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #30

0

파일 보기

파일: amtrak.py 프로젝트: spulec/inspectors-general

def run(options):
  year_range = inspector.year_range(options)
  max_pages = int(options.get('pages', 1))
  for year in year_range:
    page = 1
    done = False
    while not done:
      url = url_for(options, page, year)
      body = utils.download(url)

      doc = BeautifulSoup(body)

      next_page = page + 1
      found_next_page = False
      page_links = doc.select("li.pager-item a.active")
      for page_link in page_links:
        if page_link.text == str(next_page):
          found_next_page = True
          break
      if not found_next_page:
        done = True
      if next_page > max_pages:
        done = True

      results = doc.select("table.views-table > tbody > tr")
      for result in results:
        report = report_from(result)
        inspector.save_report(report)

      page = next_page
      if not done:
        print('Moving to next page (%d)' % page)

예제 #31

0

파일 보기

파일: doj.py 프로젝트: Cloudxtreme/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    # Can limit search to any of the components listed at the top of this script
    component = options.get('component')
    if component and component in components:
        source_links = {}
        link = "%s/oig/reports/%s.htm" % (base_url, component)
        source_links[link] = components[component]

    # Otherwise, get links to each component's landing page from main page.
    else:
        starting_point = "http://www.justice.gov/oig/reports/components.htm"
        content = get_content(starting_point)
        source_links = {}
        for c in content:
            links = c.find_all("a")
            for l in links:
                name = l.string
                link = base_url + l.get("href")
                source_links[link] = name

    # For each component's landing page, run the processor over it
    keys = list(source_links.keys())
    keys.sort()

    for link in keys:
        content = get_content(link)
        extract_info(content, source_links[link], year_range)

    logging.info("Found %i reports, for year %i to %i" %
                 (len(list(report.keys())), year_range[0], year_range[-1]))

    for key in list(report.keys()):
        inspector.save_report(report[key])

예제 #32

0

파일 보기

def scrape_restricted_reports(options):
  """Restricted Products.

  A single HTML page lists unreleased reports since 2014, with no links."""

  # These reports are unreleased -- we could make this the text?
  """The following products have been determined to contain either
classified information or controlled unclassified information by the audited
agencies and cannot be publicly released.

Members of Congress or congressional staff who wish to obtain one or more of
these products should call or e-mail the Congressional Relations Office.
All others who wish to obtain one or more of these products should follow the
instructions found on Requesting Restricted Products."""

  REPORTS_URL = 'http://www.gao.gov/restricted/restricted_reports'
  archive = 2014

  year_range = inspector.year_range(options, archive)
  doc = utils.beautifulsoup_from_url(REPORTS_URL)
  results = doc.select("div.listing")
  for result in results:
    report = process_restricted_report(result, year_range, REPORTS_URL)
    if report:
      inspector.save_report(report)

예제 #33

0

파일 보기

파일: lsc.py 프로젝트: Cloudxtreme/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for url, report_type in REPORT_URLS.items():
    page_content = utils.download(url)

    # This typo confuses BS4 and interferes with our selectors
    page_content = page_content.replace('<h4>2015</h3>', '<h4>2015</h4>')

    doc = BeautifulSoup(page_content)

    results = doc.select("blockquote > ul > a")
    if not results:
      results = doc.select("blockquote > ul > li > a")
    if not results:
      results = doc.select("blockquote > font > ul > a")
    if not results:
      results = doc.select("blockquote > a")
    if not results:
      raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" % url)
    for result in results:
      report = report_from(result, url, report_type, year_range)
      if report:
        inspector.save_report(report)

예제 #34

0

파일 보기

파일: usps.py 프로젝트: ericalthatcher/inspectors-general

def run(options):
  year_range = inspector.year_range(options)
  pages = options.get('pages', ALL_PAGES)

  max_page = None
  for page in range(1, (int(pages) + 1)):
    if max_page and (page > max_page):
      print("End of pages!")
      break

    print("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        print("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)

예제 #35

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for year in year_range:
    url = AUDITS_REPORTS_URL.format(str(year)[2:4])
    doc = BeautifulSoup(utils.download(url))
    results = doc.select("tr")
    if not results:
      raise inspector.NoReportsFoundError("NASA (%d)" % year)
    for index, result in enumerate(results):
      if not index or not result.text.strip():
        # Skip the header row and any empty rows
        continue
      report = audit_report_from(result, url, year_range)
      if report:
        inspector.save_report(report)

  # Pull the other reports
  doc = BeautifulSoup(utils.download(OTHER_REPORT_URL))
  results = doc.select("#subContainer ul li")
  if not results:
    raise inspector.NoReportsFoundError("NASA (other)")
  for result in results:
    report = other_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #36

0

파일 보기

파일: itc.py 프로젝트: isabelhgarcia33/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL)

  headers = set([a.parent for a in
                 doc.find_all("a", id=re.compile("^[0-9]{4}$"))])
  headers.update(doc.find_all("p", class_="Ptitle1"))
  headers = sorted(headers, key=lambda p: int(p.text.strip()), reverse=True)
  if not headers:
    raise inspector.NoReportsFoundError("ITC")

  for header in headers:
    year = int(header.text.strip())
    results = header.findNextSibling("ul").select("li")

    for result in results:
      if not inspector.sanitize(result.text):
        logging.debug("Skipping empty list item.")
        continue

      report = audit_report_from(year, result, AUDIT_REPORTS_URL, year_range)
      if report:
        inspector.save_report(report)

예제 #37

0

파일 보기

파일: dhs.py 프로젝트: harrisj/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')

  limit = int(options.get('limit', 0))

  all_audit_reports = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    body = utils.download(url)

    doc = BeautifulSoup(body)

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]
    if not results:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for result in results:
      report = report_from(result, component, url)
      if not report:
        continue

      if report_id and (report_id != report['report_id']):
        continue

      if inspector.year_from(report) not in year_range:
        # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
        continue

      key = (report["report_id"], report["title"])
      if key in all_audit_reports:
        all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
                ", " + report["agency"]
        all_audit_reports[key]["agency_name"] = \
                all_audit_reports[key]["agency_name"] + ", " + \
                report["agency_name"]
      else:
        all_audit_reports[key] = report

      count += 1
      if limit and (count >= limit):
        break

    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in all_audit_reports.values():
    inspector.save_report(report)

예제 #38

0

파일 보기

파일: fhfa.py 프로젝트: slobdell/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # Pull the audit reports. Pages are 0-indexed.
  for page in range(0, int(pages) - 1):
    doc = BeautifulSoup(utils.download(AUDIT_REPORTS_URL.format(page=page)))
    results = doc.select("span.field-content")
    if not results:
      # No more results, we must have hit the last page
      break

    for result in results:
      report = report_from(result, year_range, report_type='audit')
      if report:
        inspector.save_report(report)

  # Grab the other reports
  for report_type, url in OTHER_REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(url))
    results = doc.select(".views-field")
    if not results:
      results = doc.select(".views-row")
    for result in results:
      report = report_from(result, year_range, report_type)
      if report:
        inspector.save_report(report)

예제 #39

0

파일 보기

def extract_reports_for_subtopic(subtopic_url, year_range, topic_name,
                                 subtopic_name):
    doc = beautifulsoup_from_url(subtopic_url)
    if not doc:
        raise Exception("Failure fetching subtopic URL: %s" % subtopic_url)

    results = None

    # This URL is different than the rest and needs to find the "p > a"s first.
    if subtopic_url == TOPIC_TO_URL['TMPC']:
        results = doc.select("#leftContentInterior > p > a")
    if not results:
        results = doc.select("#leftContentInterior dl dd")
    if not results:
        results = doc.select("#leftContentInterior ul li")
    if not results:
        results = doc.select("#leftContentInterior > p > a")
    if not results:
        raise inspector.NoReportsFoundError("HHS (%s)" % subtopic_name)
    for result in results:
        if 'crossref' in result.parent.parent.attrs.get('class', []):
            continue
        if result.parent.parent.attrs.get('id') == 'related':
            continue
        report = report_from(result, year_range, topic_name, subtopic_url,
                             subtopic_name)
        if report:
            inspector.save_report(report)

예제 #40

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)

  # Can limit search to any of the components listed at the top of this script
  component = options.get('component')
  if component and component in components:
    source_links = {}
    link = urljoin(base_url, "%s.htm" % component)
    source_links[link] = components[component]

  # Otherwise, get links to each component's landing page from main page.
  else:
    starting_point = "https://oig.justice.gov/reports/components.htm"
    content = get_content(starting_point)
    source_links = {}
    for c in content:
      links = c.find_all("a")
      for l in links:
        name = l.string
        link = urljoin(base_url, l.get("href"))
        source_links[link] = name

  # For each component's landing page, run the processor over it
  keys = list(source_links.keys())
  keys.sort()

  for link in keys:
    content = get_content(link)
    extract_info(content, source_links[link], year_range)


  logging.info("Found %i reports, for year %i to %i" % (len(list(report.keys())), year_range[0], year_range[-1]))

  for key in list(report.keys()):
    inspector.save_report(report[key])

예제 #41

0

파일 보기

파일: usps.py 프로젝트: slobdell/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # default to starting at page 1
  begin = int(options.get('begin', 1))

  max_page = None
  for page in range(begin, (int(pages) + 1)):
    if max_page and (page > max_page):
      logging.debug("End of pages!")
      break

    logging.debug("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body)

    # When the USPS restores their page controls, we can use this again,
    # which saves one network call each time.
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        logging.warn("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)

예제 #42

0

파일 보기

def scrape_reports(options):
  """Pull reports from "Reports and Testimonies - Browse by date" web page."""

  REPORTS_URL = 'http://www.gao.gov/browse/date/custom?adv_begin_date=01/01/' +\
    '%s&adv_end_date=12/31/%s&rows=50&o=%s' # % (year, year, offset)
  archive = 1970
  # Amazingly, reports go back to 1940, though those are unlikely to be
  # legible enough to OCR. Also very cool, even 1950s-era reports seem to have
  # a highlightable embedded text layer in them. Of course, it was the
  # General Accounting Office back then and less oversighty.

  year_range = inspector.year_range(options, archive)
  for year in year_range:
    is_next_page = True
    offset = 0
    while is_next_page:
      doc = utils.beautifulsoup_from_url(
        REPORTS_URL % (year, year, offset))
      results = doc.select("div.listing")
      for result in results:
        report = process_report(result, year_range)
        if report:
          inspector.save_report(report)
      page_links = doc.select("a.non-current_page")
      if len(page_links) and page_links[-1].text.startswith('Next'):
        offset += 50
      else:
        is_next_page = False

예제 #43

0

파일 보기

파일: va.py 프로젝트: Cloudxtreme/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the audit reports
  for page in range(1, 1000):
    doc = beautifulsoup_from_url("{}?RS={}".format(REPORTS_URL, page))
    results = doc.select("div.leadin")
    if not results:
      if page == 1:
        raise inspector.NoReportsFoundError("VA (audit reports)")
      else:
        break
    for result in results:
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)

  # Pull the semiannual reports
  doc = beautifulsoup_from_url(SEMIANNUAL_REPORTS_URL)
  results = doc.select("div.leadin")
  if not results:
    raise inspector.NoReportsFoundError("VA (semiannual reports)")
  for result in results:
    report = semiannual_report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #44

0

파일 보기

파일: sba.py 프로젝트: slobdell/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Suggested flow, for an IG which paginates results.
  pages = options.get('pages', ALL_PAGES)
  for page in range(1, (int(pages) + 1)):
    data = {
      'view_name': 'oig_nodes',
      'view_display_id': 'block_search_oig_reports',
    }
    if page:
      # Only add page= if page > 0
      data['page'] = page

    response = utils.scraper.post(REPORTS_AJAX_URL,
        data=data,
        headers={
            "Content-Type": "application/x-www-form-urlencoded",
        },
    )
    page_html = response.json()[1]['data']
    doc = BeautifulSoup(page_html)
    results = doc.select("tr")
    if not results:
      break

    for index, result in enumerate(results):
      if not index:
        # Skip the header row
        continue
      report = report_from(result, year_range)
      if report:
        inspector.save_report(report)

예제 #45

0

파일 보기

파일: dhs.py 프로젝트: Cloudxtreme/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')

  limit = int(options.get('limit', 0))

  all_audit_reports = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    body = utils.download(url)

    doc = BeautifulSoup(body)

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]
    if not results:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for result in results:
      report = report_from(result, component, url)
      if not report:
        continue

      if report_id and (report_id != report['report_id']):
        continue

      if inspector.year_from(report) not in year_range:
        # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
        continue

      key = (report["report_id"], report["title"])
      if key in all_audit_reports:
        all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
                ", " + report["agency"]
        all_audit_reports[key]["agency_name"] = \
                all_audit_reports[key]["agency_name"] + ", " + \
                report["agency_name"]
      else:
        all_audit_reports[key] = report

      count += 1
      if limit and (count >= limit):
        break

    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in all_audit_reports.values():
    inspector.save_report(report)

예제 #46

0

파일 보기

파일: fhfa.py 프로젝트: isabelhgarcia33/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # Pull the audit reports. Pages are 0-indexed.
    for page in range(0, int(pages) - 1):
        doc = utils.beautifulsoup_from_url(AUDIT_REPORTS_URL.format(page=page))
        results = doc.select("span.field-content")
        if not results:
            if page == 0:
                raise inspector.NoReportsFound("FHFA (audit reports)")
            else:
                # No more results, we must have hit the last page
                break

        for result in results:
            report = report_from(result, year_range, report_type='audit')
            if report:
                inspector.save_report(report)

    # Grab the other reports
    for report_type, url in OTHER_REPORT_URLS.items():
        doc = utils.beautifulsoup_from_url(url)
        results = doc.select(".views-field")
        if not results:
            results = doc.select(".views-row")
        if not results:
            raise inspector.NoReportsFound("FHFA (%s)" % report_type)
        for result in results:
            report = report_from(result, year_range, report_type)
            if report:
                inspector.save_report(report)

예제 #47

0

파일 보기

파일: dhs.py 프로젝트: isabelhgarcia33/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    component = options.get('component')
    if component:
        components = [component]
    else:
        components = sorted(COMPONENTS.keys())

    report_id = options.get('report_id')

    limit = int(options.get('limit', 0))

    all_audit_reports = {}

    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        doc = utils.beautifulsoup_from_url(url)

        results = doc.select("#content-area tbody tr")
        if not results:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for result in results:
            report = report_from(result, component, url)
            if not report:
                continue

            if report_id and (report_id != report['report_id']):
                continue

            if inspector.year_from(report) not in year_range:
                # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
                continue

            key = (report["report_id"], report["title"])
            if key in all_audit_reports:
                all_audit_reports[key]["agency"] = "{}, {}".format(
                    all_audit_reports[key]["agency"], report["agency"])
                all_audit_reports[key]["agency_name"] = "{}, {}".format(
                    all_audit_reports[key]["agency_name"],
                    report["agency_name"])
            else:
                all_audit_reports[key] = report

            count += 1
            if limit and (count >= limit):
                break

        logging.info("## Fetched %i reports for component %s\n\n" %
                     (count, component))

    for report in all_audit_reports.values():
        inspector.save_report(report)

예제 #48

0

파일 보기

def crawl_index(base_url, options, is_meta_index=False):
    year_range = inspector.year_range(options, archive)
    max_pages = options.get('pages')
    if max_pages:
        max_pages = int(max_pages)
    page = 1

    only_id = options.get('report_id')

    done = False
    while not done:
        url = url_for(base_url, page)
        body = utils.download(url)

        doc = BeautifulSoup(body)

        next_page = page + 1
        found_next_page = False
        page_links = doc.select("dl.moreResults a")
        for page_link in page_links:
            if page_link.text == str(next_page):
                found_next_page = True
                break
        if not found_next_page:
            done = True
        if max_pages and next_page > max_pages:
            done = True

        results = doc.select("div#svPortal dl")
        if not results and page == 1:
            temp_text = doc.select("div#svPortal")[0].text.strip()
            if temp_text != "There is currently no content available.":
                raise inspector.NoReportsFoundError(
                    "Government Services Administration (%s)" % url)
        for result in results:
            if "moreResults" in result.get("class"):
                continue
            if is_meta_index:
                url = "http://www.gsaig.gov" + result.a.get("href")
                crawl_index(url, options, False)
            else:
                report = report_from(result, base_url)
                year = int(report['published_on'][:4])

                if only_id and (report['report_id'] != only_id):
                    continue

                if year not in year_range:
                    continue

                inspector.save_report(report)

        page = next_page
        if not done:
            logging.info('Moving to next page (%d)' % page)

예제 #49

0

파일 보기

파일: cpsc.py 프로젝트: harrisj/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    doc = BeautifulSoup(utils.download(REPORTS_URL))
    results = doc.select("ul.summary-list li")
    if not results:
        raise inspector.NoReportsFoundError("CPSC")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)

예제 #50

0

파일 보기

def run(options):
    year_range = inspector.year_range(options, archive)

    doc = utils.beautifulsoup_from_url(REPORTS_URL)
    results = doc.select(".table-responsive tbody tr")
    if not results:
        raise inspector.NoReportsFoundError("CPSC")
    for result in results:
        report = report_from(result, year_range)
        if report:
            inspector.save_report(report)

예제 #51

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS:
      doc = utils.beautifulsoup_from_url(report_url)
      results = doc.select("tbody tr > td:nth-of-type(1) a") or doc.select(".views-more-link")
      for result in results:
        report = report_from(result, year_range, report_type)
        if report:
          inspector.save_report(report)

예제 #52

0

파일 보기

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  doc = BeautifulSoup(utils.download(REPORTS_URL))
  results = doc.select("li div li")
  if not results:
    raise inspector.NoReportsFoundError("Peace Corps")
  for result in results:
    report = report_from(result, year_range)
    if report:
      inspector.save_report(report)

예제 #53

0

파일 보기

def parse_peer_reviews(content, landing_url, report_type, year_range):
    links = content.find_all("a")
    if len(links) <= 1:
        raise inspector.NoReportsFoundError("Legal Services Corporation (%s)" %
                                            landing_url)
    for link in links:
        if link.text.find("Government Auditing Standards") != -1:
            continue
        result = link.parent
        report = report_from(result, landing_url, report_type, year_range)
        if report:
            inspector.save_report(report)

예제 #54

0

파일 보기

파일: gao.py 프로젝트: isabelhgarcia33/inspectors-general

def run(options):
    year_range = inspector.year_range(options, archive)

    # Pull the audit and semiannual reports
    for reports_url in [REPORTS_URL, SEMIANNUAL_REPORTS_URL]:
        doc = utils.beautifulsoup_from_url(reports_url)
        results = doc.select("div.listing")
        if not results:
            raise inspector.NoReportsFoundError("GAO (%s)" % reports_url)
        for result in results:
            report = report_from(result, year_range)
            if report:
                inspector.save_report(report)

예제 #55

0

파일 보기

파일: sigar.py 프로젝트: harrisj/inspectors-general

def run(options):
  year_range = inspector.year_range(options, archive)

  # Pull the reports
  for report_type, report_url in REPORT_URLS.items():
    doc = BeautifulSoup(utils.download(report_url))
    results = doc.select("item")
    if not results:
      raise inspector.NoReportsFoundError("SIGAR (%s)" % report_type)
    for result in results:
      report = report_from(result, report_url, report_type, year_range)
      if report:
        inspector.save_report(report)