Example #1
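Scrapes audit reports for every DHS OIG component (or a single one passed via the component option), merging reports that appear under more than one component by concatenating their agency and agency_name fields.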
def run(options):
  year_range = inspector.year_range(options, archive)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')

  limit = int(options.get('limit', 0))

  all_audit_reports = {}

  for component in components:
    logging.info("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    body = utils.download(url)

    doc = BeautifulSoup(body, "html.parser")

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]
    if not results:
      raise inspector.NoReportsFoundError("DHS (%s)" % component)

    count = 0
    for result in results:
      report = report_from(result, component, url)
      if not report:
        continue

      if report_id and (report_id != report['report_id']):
        continue

      if inspector.year_from(report) not in year_range:
        # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
        continue

      key = (report["report_id"], report["title"])
      if key in all_audit_reports:
        all_audit_reports[key]["agency"] = all_audit_reports[key]["agency"] + \
                ", " + report["agency"]
        all_audit_reports[key]["agency_name"] = \
                all_audit_reports[key]["agency_name"] + ", " + \
                report["agency_name"]
      else:
        all_audit_reports[key] = report

      count += 1
      if limit and (count >= limit):
        break

    logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

  for report in all_audit_reports.values():
    inspector.save_report(report)
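A minimal invocation sketch, assuming the module's helpers and COMPONENTS mapping are loaded. run() takes a plain dict of options; only the key names (component, report_id, limit) come from the code above, and the component value is hypothetical.

# Hypothetical options; 'mgmt' is an illustrative component key,
# not necessarily one of the scraper's real COMPONENTS entries.
options = {
  'component': 'mgmt',  # omit to scrape every component in COMPONENTS
  'limit': '5',         # stop after five reports per component
}
run(options)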
Example #2
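Pages through the USPS OIG report listing, stopping once the last page is reached, and saves each report that falls within the requested year range.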
def run(options):
  year_range = inspector.year_range(options)
  pages = options.get('pages', ALL_PAGES)

  max_page = None
  for page in range(1, (int(pages) + 1)):
    if max_page and (page > max_page):
      print("End of pages!")
      break

    print("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body, "html.parser")
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        print("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)
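last_page_for() is not shown in these examples. A rough guess at its shape, assuming a Drupal-style pager whose last-page link carries a page query parameter; both the selector and the parameter name are assumptions, not taken from the scraper.

import re

def last_page_for(doc):
  # Hypothetical: find the pager's "last page" link and pull the page
  # number out of its href. Returns None if the pager is missing.
  link = doc.select_one('.pager-last a')
  if link is None:
    return None
  match = re.search(r'page=(\d+)', link.get('href', ''))
  # Drupal-style pagers are typically 0-indexed, hence the +1 (assumed).
  return int(match.group(1)) + 1 if match else None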
Example #3
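A later USPS variant of the previous example: it adds a begin option for the starting page and reports progress through logging rather than print.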
def run(options):
  year_range = inspector.year_range(options, archive)
  pages = options.get('pages', ALL_PAGES)

  # default to starting at page 1
  begin = int(options.get('begin', 1))

  max_page = None
  for page in range(begin, (int(pages) + 1)):
    if max_page and (page > max_page):
      logging.debug("End of pages!")
      break

    logging.debug("## Downloading page %i" % page)
    url = url_for(options, page)
    body = utils.download(url)
    doc = BeautifulSoup(body, "html.parser")

    # When the USPS restores their page controls, we can use this again,
    # which saves one network call each time.
    max_page = last_page_for(doc)

    results = doc.select(".views-row")

    for result in results:
      report = report_from(result)

      # inefficient enforcement of --year arg, USPS doesn't support it server-side
      # TODO: change to published_on.year once it's a datetime
      if inspector.year_from(report) not in year_range:
        logging.warning("[%s] Skipping report, not in requested range." % report['report_id'])
        continue

      inspector.save_report(report)
Example #4
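A newer DHS variant of Example #1 that fetches and parses each page in one step with utils.beautifulsoup_from_url and reads report rows from "#content-area tbody tr".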
def run(options):
    year_range = inspector.year_range(options, archive)

    component = options.get('component')
    if component:
        components = [component]
    else:
        components = sorted(COMPONENTS.keys())

    report_id = options.get('report_id')

    limit = int(options.get('limit', 0))

    all_audit_reports = {}

    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        doc = utils.beautifulsoup_from_url(url)

        results = doc.select("#content-area tbody tr")
        if not results:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for result in results:
            report = report_from(result, component, url)
            if not report:
                continue

            if report_id and (report_id != report['report_id']):
                continue

            if inspector.year_from(report) not in year_range:
                # logging.info("[%s] Skipping, not in requested range." % report['report_id'])
                continue

            key = (report["report_id"], report["title"])
            if key in all_audit_reports:
                all_audit_reports[key]["agency"] = "{}, {}".format(
                    all_audit_reports[key]["agency"], report["agency"])
                all_audit_reports[key]["agency_name"] = "{}, {}".format(
                    all_audit_reports[key]["agency_name"],
                    report["agency_name"])
            else:
                all_audit_reports[key] = report

            count += 1
            if limit and (count >= limit):
                break

        logging.info("## Fetched %i reports for component %s\n\n" %
                     (count, component))

    for report in all_audit_reports.values():
        inspector.save_report(report)
Example #5
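An earlier DHS variant that saves each report immediately instead of deduplicating across components, and prints progress instead of logging it.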
def run(options):
  year_range = inspector.year_range(options)

  component = options.get('component')
  if component:
    components = [component]
  else:
    components = list(COMPONENTS.keys())

  report_id = options.get('report_id')

  limit = int(options.get('limit', 0))

  for component in components:
    print("## Fetching reports for component %s" % component)
    url = url_for(options, component)
    body = utils.download(url)

    doc = BeautifulSoup(body, "html.parser")

    results = doc.select("table.contentpaneopen table[border=1] tr")
    # accept only trs that look like body tr's (no 'align' attribute)
    #   note: HTML is very inconsistent. cannot rely on thead or tbody
    results = [x for x in results if x.get('align') is None]

    count = 0
    for result in results:
      report = report_from(result, component, url)
      # guard against rows that don't parse into a report
      if not report:
        continue
      if report_id and (report_id != report['report_id']):
        continue

      if inspector.year_from(report) not in year_range:
        # print "[%s] Skipping, not in requested range." % report['report_id']
        continue

      inspector.save_report(report)

      count += 1
      if limit and (count >= limit):
        break

    print("## Fetched %i reports for component %s\n\n" % (count, component))
Example #6
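A USPS variant that raises NoReportsFoundError when a page unexpectedly yields no report rows.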
def run(options):
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)

    # default to starting at page 1
    begin = int(options.get('begin', 1))

    max_page = None
    for page in range(begin, (int(pages) + 1)):
        if max_page and (page > max_page):
            logging.debug("End of pages!")
            break

        logging.debug("## Downloading page %i" % page)
        url = url_for(options, page)
        body = utils.download(url)
        doc = BeautifulSoup(body, "html.parser")

        # When the USPS restores their page controls, we can use this again,
        # which saves one network call each time.
        max_page = last_page_for(doc)

        results = doc.select(".views-row")
        if not results:
            raise inspector.NoReportsFoundError("USPS")
        for result in results:
            report = report_from(result)

            # inefficient enforcement of --year arg, USPS doesn't support it server-side
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                logging.warning("[%s] Skipping report, not in requested range." %
                                report['report_id'])
                continue

            inspector.save_report(report)
Example #7
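The most involved USPS variant: because reports can shuffle between pages while scraping, it counts how many report "slots" each publication date has, tracks how many unique reports it has actually seen per date, and refetches only the affected pages (up to MAX_RETRIES passes) until every slot is accounted for.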
def run(options):
    year_range = inspector.year_range(options, archive)

    report_types = options.get('types')
    if not report_types:
        report_types = "audit,congress,research"
    report_types = report_types.split(",")
    categories = [tup for tup in CATEGORIES if (tup[0] in report_types)]
    for category_name, category_id in categories:
        pages = get_last_page(options, category_id)

        rows_seen = set()
        pages_to_fetch = range(1, pages + 1)

        # While the reports themselves may shuffle around, the order of the dates
        # of the reports and how many of each date we see on each page will stay
        # constant. This dictionary will hold how many times we see each date.
        # We can stop retrying pages once we have as many unique reports for each
        # date as there are slots for that date.
        date_slot_counts = {}

        # This keeps track of how many unique reports we have found on each date,
        # for comparison with the numbers above.
        date_unique_report_counts = {}

        # This dict maps from a report date to a list of pages on which that date
        # was seen.
        date_to_pages = {}

        for retry in range(MAX_RETRIES):
            for page in pages_to_fetch:
                logging.debug("## Downloading %s, page %i, attempt %i" %
                              (category_name, page, retry))
                url = url_for(options, page, category_id)
                body = utils.download(url)
                doc = BeautifulSoup(body, "html.parser")

                results = doc.select(".views-row")
                if not results:
                    if len(doc.select(".view")[0].contents) == 3 and \
                        len(doc.select(".view > .view-filters")) == 1:
                        # If we only have the filter box, and no content box or "pagerer,"
                        # then that just means this search returned 0 results.
                        pass
                    else:
                        # Otherwise, there's probably something wrong with the scraper.
                        raise inspector.NoReportsFoundError("USPS %s" %
                                                            category_name)
                for result in results:
                    if retry == 0:
                        timestamp = get_timestamp(result)
                        if timestamp in date_slot_counts:
                            date_slot_counts[timestamp] += 1
                        else:
                            date_slot_counts[timestamp] = 1
                            date_unique_report_counts[timestamp] = 0

                        if timestamp in date_to_pages:
                            if page not in date_to_pages[timestamp]:
                                date_to_pages[timestamp].append(page)
                        else:
                            date_to_pages[timestamp] = [page]

                    row_key = (str(result.text), result.a['href'])
                    if row_key not in rows_seen:
                        rows_seen.add(row_key)
                        timestamp = get_timestamp(result)
                        date_unique_report_counts[timestamp] += 1

                        report = report_from(result)
                        # inefficient enforcement of --year arg, USPS doesn't support it server-side
                        # TODO: change to published_on.year once it's a datetime
                        if inspector.year_from(report) not in year_range:
                            logging.warning(
                                "[%s] Skipping report, not in requested range."
                                % report['report_id'])
                            continue

                        inspector.save_report(report)

            pages_to_fetch = set()
            for date, report_count in date_unique_report_counts.items():
                if report_count < date_slot_counts[date]:
                    for page in date_to_pages[date]:
                        pages_to_fetch.add(page)
            if not pages_to_fetch:
                break
            pages_to_fetch = sorted(pages_to_fetch)
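To make the retry bookkeeping concrete, here is a standalone toy run of the same slot-counting idea; the data is made up, not from USPS.

# Page 1 showed two rows dated 2014-01-02 and one dated 2014-01-01,
# but only one unique 2014-01-02 report has been seen so far.
date_slot_counts = {'2014-01-02': 2, '2014-01-01': 1}
date_unique_report_counts = {'2014-01-02': 1, '2014-01-01': 1}
date_to_pages = {'2014-01-02': [1], '2014-01-01': [1, 2]}

pages_to_fetch = set()
for date, found in date_unique_report_counts.items():
    if found < date_slot_counts[date]:
        # A 2014-01-02 slot is unfilled, so every page that date
        # appeared on gets refetched on the next pass.
        pages_to_fetch.update(date_to_pages[date])

print(sorted(pages_to_fetch))  # [1]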