def run(options):
    """Scrape DHS OIG reports for each requested component and save them.

    Reports that appear under several components (same report_id and title)
    are saved once, with their agency fields concatenated.
    """
    year_range = inspector.year_range(options, archive)

    # --component limits the scrape to a single component; default is all.
    requested = options.get('component')
    components = [requested] if requested else list(COMPONENTS.keys())

    report_id = options.get('report_id')
    limit = int(options.get('limit', 0))

    all_audit_reports = {}
    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        body = utils.download(url)
        doc = BeautifulSoup(body)

        rows = doc.select("table.contentpaneopen table[border=1] tr")
        # The HTML is very inconsistent (no reliable thead/tbody), so accept
        # only rows that look like body rows: those without an 'align' attr.
        rows = [row for row in rows if row.get('align') is None]
        if not rows:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for row in rows:
            report = report_from(row, component, url)
            if not report:
                continue
            if report_id and (report_id != report['report_id']):
                continue
            if inspector.year_from(report) not in year_range:
                # not in the requested year range; skip quietly
                continue

            key = (report["report_id"], report["title"])
            if key in all_audit_reports:
                # Already seen under another component: merge agency fields.
                existing = all_audit_reports[key]
                existing["agency"] = existing["agency"] + ", " + report["agency"]
                existing["agency_name"] = (existing["agency_name"] + ", " +
                                           report["agency_name"])
            else:
                all_audit_reports[key] = report
            count += 1
            if limit and (count >= limit):
                break
        logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

    for report in all_audit_reports.values():
        inspector.save_report(report)
def run(options):
    """Walk USPS OIG listing pages in order, saving each in-range report."""
    year_range = inspector.year_range(options)
    pages = options.get('pages', ALL_PAGES)

    max_page = None
    for page in range(1, int(pages) + 1):
        # Stop as soon as we've walked past the last known page.
        if max_page and (page > max_page):
            print("End of pages!")
            break

        print("## Downloading page %i" % page)
        body = utils.download(url_for(options, page))
        doc = BeautifulSoup(body)
        max_page = last_page_for(doc)

        for result in doc.select(".views-row"):
            report = report_from(result)
            # Inefficient enforcement of --year: USPS doesn't support it
            # server-side, so out-of-range reports are skipped client-side.
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                print("[%s] Skipping report, not in requested range." % report['report_id'])
                continue
            inspector.save_report(report)
def run(options):
    """Scrape USPS OIG report listing pages and save in-range reports.

    Options:
      pages -- highest page number to try (defaults to ALL_PAGES)
      begin -- page number to start from (defaults to 1)
    """
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)
    # default to starting at page 1
    begin = int(options.get('begin', 1))

    max_page = None
    for page in range(begin, (int(pages) + 1)):
        if max_page and (page > max_page):
            logging.debug("End of pages!")
            break
        logging.debug("## Downloading page %i" % page)
        url = url_for(options, page)
        body = utils.download(url)
        doc = BeautifulSoup(body)

        # When the USPS restores their page controls, we can use this again,
        # which saves one network call each time.
        max_page = last_page_for(doc)

        results = doc.select(".views-row")
        for result in results:
            report = report_from(result)
            # inefficient enforcement of --year arg, USPS doesn't support it
            # server-side
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                # logging.warn is a deprecated alias (removed in Python 3.13);
                # use logging.warning instead.
                logging.warning("[%s] Skipping report, not in requested range." % report['report_id'])
                continue
            inspector.save_report(report)
def run(options):
    """Scrape DHS OIG reports for every requested component.

    Reports listed under multiple components (same report_id and title) are
    saved once, with their agency fields concatenated.
    """
    year_range = inspector.year_range(options, archive)

    # --component limits the scrape to a single component; default is all,
    # in sorted order for deterministic iteration.
    picked = options.get('component')
    components = [picked] if picked else sorted(COMPONENTS.keys())

    report_id = options.get('report_id')
    limit = int(options.get('limit', 0))

    all_audit_reports = {}
    for component in components:
        logging.info("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        doc = utils.beautifulsoup_from_url(url)

        rows = doc.select("#content-area tbody tr")
        if not rows:
            raise inspector.NoReportsFoundError("DHS (%s)" % component)

        count = 0
        for row in rows:
            report = report_from(row, component, url)
            if not report:
                continue
            if report_id and (report_id != report['report_id']):
                continue
            if inspector.year_from(report) not in year_range:
                # not in the requested year range; skip quietly
                continue

            key = (report["report_id"], report["title"])
            if key in all_audit_reports:
                # Duplicate across components: merge the agency fields.
                existing = all_audit_reports[key]
                existing["agency"] = "{}, {}".format(
                    existing["agency"], report["agency"])
                existing["agency_name"] = "{}, {}".format(
                    existing["agency_name"], report["agency_name"])
            else:
                all_audit_reports[key] = report
            count += 1
            if limit and (count >= limit):
                break
        logging.info("## Fetched %i reports for component %s\n\n" % (count, component))

    for report in all_audit_reports.values():
        inspector.save_report(report)
def run(options):
    """Fetch and save reports for each requested DHS component."""
    year_range = inspector.year_range(options)

    # --component limits the scrape to a single component; default is all.
    wanted = options.get('component')
    components = [wanted] if wanted else list(COMPONENTS.keys())

    report_id = options.get('report_id')
    limit = int(options.get('limit', 0))

    for component in components:
        print("## Fetching reports for component %s" % component)
        url = url_for(options, component)
        doc = BeautifulSoup(utils.download(url))

        rows = doc.select("table.contentpaneopen table[border=1] tr")
        # The HTML is very inconsistent (no reliable thead/tbody), so accept
        # only rows that look like body rows: those without an 'align' attr.
        rows = [row for row in rows if row.get('align') is None]

        count = 0
        for row in rows:
            report = report_from(row, component, url)
            if report_id and (report_id != report['report_id']):
                continue
            if inspector.year_from(report) not in year_range:
                # print "[%s] Skipping, not in requested range." % report['report_id']
                continue
            inspector.save_report(report)
            count += 1
            if limit and (count >= limit):
                break
        print("## Fetched %i reports for component %s\n\n" % (count, component))
def run(options):
    """Scrape USPS OIG report listing pages and save in-range reports.

    Options:
      pages -- highest page number to try (defaults to ALL_PAGES)
      begin -- page number to start from (defaults to 1)

    Raises inspector.NoReportsFoundError when a fetched page has no results,
    which most likely means the scraper's selectors are broken.
    """
    year_range = inspector.year_range(options, archive)
    pages = options.get('pages', ALL_PAGES)
    # default to starting at page 1
    begin = int(options.get('begin', 1))

    max_page = None
    for page in range(begin, (int(pages) + 1)):
        if max_page and (page > max_page):
            logging.debug("End of pages!")
            break
        logging.debug("## Downloading page %i" % page)
        url = url_for(options, page)
        body = utils.download(url)
        doc = BeautifulSoup(body)

        # When the USPS restores their page controls, we can use this again,
        # which saves one network call each time.
        max_page = last_page_for(doc)

        results = doc.select(".views-row")
        if not results:
            raise inspector.NoReportsFoundError("USPS")

        for result in results:
            report = report_from(result)
            # inefficient enforcement of --year arg, USPS doesn't support it
            # server-side
            # TODO: change to published_on.year once it's a datetime
            if inspector.year_from(report) not in year_range:
                # logging.warn is a deprecated alias (removed in Python 3.13);
                # use logging.warning instead.
                logging.warning("[%s] Skipping report, not in requested range." % report['report_id'])
                continue
            inspector.save_report(report)
def run(options):
    """Scrape USPS OIG reports by category, retrying shuffled result pages.

    The USPS site shuffles reports across pages between requests, but the
    order of report *dates* — and how many result slots each date occupies —
    stays constant. So the slot layout is recorded on the first pass, and
    subsequent passes re-fetch only the pages that still contain dates with
    unseen reports, until every date's slots are filled or MAX_RETRIES
    attempts are exhausted.

    Options:
      types -- comma-separated report categories (default
               "audit,congress,research")
    """
    year_range = inspector.year_range(options, archive)

    report_types = options.get('types')
    if not report_types:
        report_types = "audit,congress,research"
    report_types = report_types.split(",")
    categories = [tup for tup in CATEGORIES if (tup[0] in report_types)]

    for category_name, category_id in categories:
        pages = get_last_page(options, category_id)

        rows_seen = set()
        pages_to_fetch = range(1, pages + 1)

        # How many result slots each date occupies across all pages
        # (filled in on the first pass only, since the layout is constant).
        date_slot_counts = {}
        # How many unique reports we have actually collected for each date,
        # for comparison with the slot counts above.
        date_unique_report_counts = {}
        # Which pages each date was seen on, so retries can be targeted.
        date_to_pages = {}

        for retry in range(MAX_RETRIES):
            for page in pages_to_fetch:
                logging.debug("## Downloading %s, page %i, attempt %i" %
                              (category_name, page, retry))
                url = url_for(options, page, category_id)
                body = utils.download(url)
                doc = BeautifulSoup(body)

                results = doc.select(".views-row")
                if not results:
                    if len(doc.select(".view")[0].contents) == 3 and \
                            len(doc.select(".view > .view-filters")) == 1:
                        # If we only have the filter box, and no content box or
                        # "pagerer," then that just means this search returned
                        # 0 results.
                        pass
                    else:
                        # Otherwise, there's probably something wrong with the
                        # scraper.
                        raise inspector.NoReportsFoundError("USPS %s" % category_name)

                for result in results:
                    if retry == 0:
                        # First pass: record the fixed date/slot layout.
                        timestamp = get_timestamp(result)
                        if timestamp in date_slot_counts:
                            date_slot_counts[timestamp] += 1
                        else:
                            date_slot_counts[timestamp] = 1
                            date_unique_report_counts[timestamp] = 0
                        if timestamp in date_to_pages:
                            if page not in date_to_pages[timestamp]:
                                date_to_pages[timestamp].append(page)
                        else:
                            date_to_pages[timestamp] = [page]

                    row_key = (str(result.text), result.a['href'])
                    if row_key not in rows_seen:
                        rows_seen.add(row_key)
                        timestamp = get_timestamp(result)
                        # Count the report as seen even if it's out of the
                        # year range, so the retry loop can terminate.
                        date_unique_report_counts[timestamp] += 1

                        report = report_from(result)
                        # inefficient enforcement of --year arg, USPS doesn't
                        # support it server-side
                        # TODO: change to published_on.year once it's a datetime
                        if inspector.year_from(report) not in year_range:
                            # logging.warn is a deprecated alias (removed in
                            # Python 3.13); use logging.warning instead.
                            logging.warning(
                                "[%s] Skipping report, not in requested range."
                                % report['report_id'])
                            continue
                        inspector.save_report(report)

            # Retry only pages that still have dates with unfilled slots.
            pages_to_fetch = set()
            for date, report_count in date_unique_report_counts.items():
                if report_count < date_slot_counts[date]:
                    for page in date_to_pages[date]:
                        pages_to_fetch.add(page)
            if not pages_to_fetch:
                break
            pages_to_fetch = sorted(pages_to_fetch)