Пример #1
0
def filter_links(link_list, base_url):
    """Return unique, absolute report URLs extracted from *link_list*.

    Resolves go.usa.gov short links, makes each href absolute against
    *base_url*, strips fragments, and drops empty, blacklisted, and
    ``mailto:`` links.

    Fix: ``element.get('href')`` yields None for anchors without an href
    attribute; the original crashed on those (AttributeError on
    ``.startswith`` / TypeError in ``urljoin``) — they are now skipped.

    Args:
        link_list: iterable of anchor elements exposing ``.get('href')``.
        base_url: base URL used to absolutize relative hrefs.

    Returns:
        De-duplicated list of URL strings (order not preserved).
    """
    href_list = [element.get('href') for element in link_list]
    for i, href in enumerate(href_list):
        # Guard: href is None for <a> tags that lack an href attribute.
        if href and href.startswith("http://go.usa.gov/"):
            href_list[i] = utils.resolve_redirect(href)
    # Skip None entries before urljoin (it would raise on None), then
    # absolutize and discard any #fragment.
    href_list = [urldefrag(urljoin(base_url, href))[0]
                 for href in href_list if href is not None]
    filtered_list = [href for href in href_list
                     if href and href not in BLACKLIST_REPORT_URLS and
                     not href.startswith("mailto:")]
    # De-duplicate via a set.
    return list(set(filtered_list))
Пример #2
0
def filter_links(link_list, base_url):
    """Collect unique absolute report URLs from a list of anchor elements.

    go.usa.gov short links are resolved to their targets, every href is
    absolutized against *base_url* with its fragment removed, and empty,
    blacklisted, and mailto: entries are discarded.
    """
    hrefs = [tag.get("href") for tag in link_list]
    for index, href in enumerate(hrefs):
        if href.startswith("http://go.usa.gov/"):
            hrefs[index] = utils.resolve_redirect(href)
    absolute = [urldefrag(urljoin(base_url, href))[0] for href in hrefs]
    keep = []
    for href in absolute:
        if not href:
            continue
        if href in BLACKLIST_REPORT_URLS:
            continue
        if href.startswith("mailto:"):
            continue
        keep.append(href)
    return list(set(keep))
Пример #3
0
def report_from(result, year_range):
    """Build a report dict for one scraped CPSC search-result element.

    Args:
        result: parsed HTML element (BeautifulSoup-style) containing the
            report's anchor and, optionally, a ``.date-display-single``
            date span.
        year_range: iterable of publication years to include.

    Returns:
        A dict describing the report, or ``None`` when the URL is
        blacklisted, no publication date can be determined, or the date
        falls outside *year_range*.
    """
    link = result.find("a")
    report_url = urljoin(REPORTS_URL, link.get('href'))
    # Keep scheme/netloc/path/params; drop the query string and fragment.
    report_url = urlunparse(list(urlparse(report_url)[:4]) + ["", ""])
    if report_url in BLACKLIST_REPORT_URLS:
        return

    # Follow redirects to get real file names
    if report_url.startswith("https://www.cpsc.gov/Media/"):
        report_url = utils.resolve_redirect(report_url)

    # URLs with /PageFiles in them need to use the filename and its
    # directory to be unique. Other URLs can just use the filename.
    if "PageFiles" in report_url:
        # e.g. /../132643/fy11fisma.pdf -> 132643-fy11fisma.pdf
        report_filename = str.join("-", report_url.split("/")[-2:])
    else:
        report_filename = report_url.split("/")[-1]

    # The report id is the filename without its extension.
    report_id, _ = os.path.splitext(report_filename)

    title = link.text
    # Prefer a known hard-coded date for this id; otherwise parse the
    # page's date span, formatted like "Monday, July 4, 2016".
    if report_id in REPORT_PUBLISHED_MAP:
        published_on = REPORT_PUBLISHED_MAP[report_id]
    else:
        date_spans = result.select(".date-display-single")
        if date_spans:
            published_on_text = date_spans[0].text
            published_on = datetime.datetime.strptime(published_on_text,
                                                      '%A, %B %d, %Y')
        else:
            # No date anywhere: record the miss and skip this report.
            admin.log_no_date("cpsc", report_id, title, report_url)
            return

    if published_on.year not in year_range:
        logging.debug("[%s] Skipping, not in requested range." % report_url)
        return

    report_type = report_type_from_title(title)

    report = {
        'inspector': 'cpsc',
        'inspector_url': 'https://www.cpsc.gov/About-CPSC/Inspector-General/',
        'agency': 'cpsc',
        'agency_name': 'Consumer Product Safety Commission',
        'type': report_type,
        'report_id': report_id,
        'url': report_url,
        'title': title,
        'published_on': datetime.datetime.strftime(published_on, "%Y-%m-%d"),
    }
    return report
Пример #4
0
def report_from(result, year_range):
  """Turn one scraped search result into a report dict.

  Returns None when the report URL is blacklisted, no publication date
  can be determined, or the date falls outside year_range.
  """
  anchor = result.find("a")
  raw_url = urljoin(REPORTS_URL, anchor.get('href'))
  # Keep scheme/netloc/path/params; discard the query string and fragment.
  report_url = urlunparse(list(urlparse(raw_url)[:4]) + ["", ""])
  if report_url in BLACKLIST_REPORT_URLS:
    return None

  # Media links redirect; follow them so the real file name is visible.
  if report_url.startswith("https://www.cpsc.gov/Media/"):
    report_url = utils.resolve_redirect(report_url)

  path_parts = report_url.split("/")
  if "PageFiles" in report_url:
    # /PageFiles URLs are only unique together with their parent directory,
    # e.g. /../132643/fy11fisma.pdf -> 132643-fy11fisma.pdf
    report_filename = "-".join(path_parts[-2:])
  else:
    report_filename = path_parts[-1]

  # Report id is the filename stripped of its extension.
  report_id = os.path.splitext(report_filename)[0]
  title = anchor.text

  # Hard-coded dates win; otherwise parse the page's date span, which is
  # formatted like "Monday, July 4, 2016".
  if report_id in REPORT_PUBLISHED_MAP:
    published_on = REPORT_PUBLISHED_MAP[report_id]
  else:
    date_spans = result.select(".date-display-single")
    if not date_spans:
      admin.log_no_date("cpsc", report_id, title, report_url)
      return None
    published_on = datetime.datetime.strptime(date_spans[0].text,
                                              '%A, %B %d, %Y')

  if published_on.year not in year_range:
    logging.debug("[%s] Skipping, not in requested range." % report_url)
    return None

  return {
    'inspector': 'cpsc',
    'inspector_url': 'https://www.cpsc.gov/About-CPSC/Inspector-General/',
    'agency': 'cpsc',
    'agency_name': 'Consumer Product Safety Commission',
    'type': report_type_from_title(title),
    'report_id': report_id,
    'url': report_url,
    'title': title,
    'published_on': published_on.strftime("%Y-%m-%d"),
  }
Пример #5
0
def filter_links(link_list, base_url):
    """Return unique, absolute report URLs from *link_list*.

    Skips "Report in Brief" links, follows chained go.usa.gov short-link
    redirects, absolutizes each href against *base_url*, strips
    fragments, and drops empty, blacklisted, ``mailto:``, and ``.jpg``
    links.

    Fix: anchors without an href attribute produce None from
    ``.get('href')``; the original crashed on those (AttributeError on
    ``.startswith`` / TypeError in ``urljoin``) — they are now skipped.

    Args:
        link_list: iterable of anchor elements with ``.text`` and
            ``.get('href')``.
        base_url: base URL used to absolutize relative hrefs.

    Returns:
        De-duplicated list of URL strings (order not preserved).
    """
    link_list = [
        link for link in link_list if "Report in Brief" not in link.text
    ]
    href_list = [element.get('href') for element in link_list]
    for i, href in enumerate(href_list):
        # Guard against None hrefs; short links may redirect to another
        # short link, so keep resolving until a final URL is reached.
        while href and href.startswith(
                ("http://go.usa.gov/", "https://go.usa.gov/")):
            href = utils.resolve_redirect(href)
        href_list[i] = href
    # Skip None entries before urljoin (it would raise on None), then
    # absolutize and discard any #fragment.
    href_list = [urldefrag(urljoin(base_url, href))[0]
                 for href in href_list if href is not None]
    filtered_list = [
        href for href in href_list
        if href and href not in BLACKLIST_REPORT_URLS
        and not href.startswith("mailto:") and not href.endswith(".jpg")
    ]
    # De-duplicate via a set.
    return list(set(filtered_list))
Пример #6
0
def filter_links(link_list, base_url):
  """Collect unique absolute report URLs from scraped anchor elements.

  Anchors whose text matches any known non-report phrase are skipped,
  go.usa.gov short links are resolved (repeatedly, in case they chain),
  hrefs are absolutized against base_url with fragments removed, and
  empty, blacklisted, mailto:, .jpg, and .xlsx entries are discarded.
  """
  skip_phrases = ("Report in Brief", "related audit",
                  "(OEI-04-15-00430)", "this graphic")
  anchors = [a for a in link_list
             if not any(phrase in a.text for phrase in skip_phrases)]
  hrefs = [a.get('href') for a in anchors]
  for idx in range(len(hrefs)):
    while hrefs[idx].startswith(("http://go.usa.gov/",
                                 "https://go.usa.gov/")):
      hrefs[idx] = utils.resolve_redirect(hrefs[idx])
  resolved = [urldefrag(urljoin(base_url, h))[0] for h in hrefs]
  unique = {h for h in resolved
            if h
            and h not in BLACKLIST_REPORT_URLS
            and not h.startswith("mailto:")
            and not h.endswith((".jpg", ".xlsx"))}
  return list(unique)