Example #1
0
def bill_version_ids_for(only_congress, options):
    years = utils.get_congress_years(only_congress)
    only_bill_id = options.get('bill_id', None)

    version_ids = []

    for year in years:

        # don't bother fetching future years
        if year > datetime.datetime.now().year:
            continue

        # ensure BILLS sitemap for this year is present
        entries = fdsys.entries_from_collection(year, "BILLS", None, options)

        # some future years may not be ready yet
        if not entries:
            continue

        for entry in entries:
            url, lastmod = entry
            congress, bill_id, bill_version_id = split_url(url)

            # a year may have other congresses in it
            if int(congress) != int(only_congress):
                continue

            # we may be focused on a single bill ID
            if only_bill_id and (bill_id != only_bill_id):
                continue

            version_ids.append(bill_version_id)

    return version_ids
Example #2
0
def bill_version_ids_for(only_congress, options):
  years = utils.get_congress_years(only_congress)
  only_bill_id = options.get('bill_id', None)

  version_ids = []

  for year in years:

    # don't bother fetching future years
    if year > datetime.datetime.now().year:
      continue
    
    # ensure BILLS sitemap for this year is present
    entries = fdsys.entries_from_collection(year, "BILLS", None, options)

    # some future years may not be ready yet
    if not entries:
      continue

    for entry in entries:
      url, lastmod = entry
      congress, bill_id, bill_version_id = split_url(url)

      # a year may have other congresses in it
      if int(congress) != int(only_congress):
        continue

      # we may be focused on a single bill ID
      if only_bill_id and (bill_id != only_bill_id):
        continue

      version_ids.append(bill_version_id)

  return version_ids
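
Examples #1 and #2 are the same function in two indentation styles; both depend on a split_url helper that is not shown in this section. A minimal sketch of what such a helper could look like, inferred from the BILLS URL regex used in Examples #3 and #6 (the bill-ID and version-ID formats in the comments are assumptions, not taken from the original source):

import re

def split_url(url):
    # Hypothetical sketch: pull (congress, bill_id, bill_version_id) out of a
    # BILLS content-detail URL, reusing the URL pattern seen elsewhere in this section.
    m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
    if not m:
        raise Exception("Unmatched bill document URL: " + url)
    congress, bill_type, bill_number, version_code = m.groups()
    bill_id = "%s%s-%s" % (bill_type, bill_number, congress)    # e.g. "hr1-113" (assumed format)
    bill_version_id = "%s-%s" % (bill_id, version_code)         # e.g. "hr1-113-ih" (assumed format)
    return congress, bill_id, bill_version_id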
Example #3
0
def update_bill_version_list(only_congress):
    bill_versions = {}

    # Which sitemap years should we look at?
    if not only_congress:
        sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
    else:
        # If --congress=X is specified, only look at the relevant years.
        sitemap_files = [
            utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml"
            for year in utils.get_congress_years(only_congress)
        ]
        sitemap_files = [f for f in sitemap_files if os.path.exists(f)]

    # For each year-by-year BILLS sitemap...
    for year_sitemap in sitemap_files:
        dom = etree.parse(year_sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each bill text version...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # get URL and last modified date
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))

            # extract bill congress, type, number, and version from the URL
            m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched bill document URL: " + url)
            congress, bill_type, bill_number, version_code = m.groups()
            congress = int(congress)
            if bill_type not in utils.thomas_types:
                raise Exception("Invalid bill type: " + url)

            # If --congress=XXX is specified, only look at those bills.
            if only_congress and congress != only_congress:
                continue

            # Track the documents by congress, bill type, etc.
            bill_versions.setdefault(congress, {}).setdefault(bill_type, {}).setdefault(bill_number, {})[
                version_code
            ] = {"url": url, "lastmod": lastmod}

    # Output the bill version info. We can't do this until the end because we need to get
    # the complete list of versions for a bill before we write the file, and the versions
    # may be split across multiple sitemap files.

    for congress in bill_versions:
        for bill_type in bill_versions[congress]:
            for bill_number in bill_versions[congress][bill_type]:
                utils.write(
                    json.dumps(
                        bill_versions[congress][bill_type][bill_number],
                        sort_keys=True,
                        indent=2,
                        default=utils.format_datetime,
                    ),
                    output_for_bill(congress, bill_type, bill_number, "text-versions.json"),
                )
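
For orientation, the nested setdefault calls above build a mapping keyed by congress, then bill type, then bill number, and the innermost per-bill dict (keyed by version code) is what gets serialized to text-versions.json. A sketch of the resulting shape, with an invented URL and lastmod purely for illustration:

bill_versions = {
    113: {                                       # congress (stored as int)
        "hr": {                                  # bill type
            "1": {                               # bill number
                "ih": {                          # version code
                    "url": "http://www.gpo.gov/fdsys/pkg/BILLS-113hr1ih/content-detail.html",
                    "lastmod": "2013-01-15T10:00:00Z",
                },
            },
        },
    },
}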
Example #4
0
def update_sitemap_cache(fetch_collections, options):
    seen_collections = set()

    # Load the root sitemap.
    master_sitemap = get_sitemap(None, None, None, options)
    if master_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
        raise Exception("Mismatched sitemap type at the root sitemap.")

    # Process the year-by-year sitemaps.
    for year_node in master_sitemap.xpath("x:sitemap", namespaces=ns):
        # Get year and lastmod date.
        url = str(year_node.xpath("string(x:loc)", namespaces=ns))
        lastmod = str(year_node.xpath("string(x:lastmod)", namespaces=ns))
        m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/sitemap_(\d+).xml", url)
        if not m or m.group(1) != m.group(2):
            raise ValueError("Unmatched sitemap URL: %s" % url)
        year = m.group(1)

        # Should we process this year's sitemaps?
        if options.get("congress", None) and int(year) not in utils.get_congress_years(int(options.get("congress"))):
            continue
        if options.get("year", None) and int(year) != int(options.get("year")):
            continue

        # Get the sitemap.
        year_sitemap = get_sitemap(year, None, lastmod, options)
        if year_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
            raise Exception("Mismatched sitemap type in %s sitemap." % year)

        # Process the collection sitemaps.
        for collection_node in year_sitemap.xpath("x:sitemap", namespaces=ns):
            # Get collection and lastmod date.
            url = str(collection_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(collection_node.xpath("string(x:lastmod)", namespaces=ns))
            m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/(\d+)_(.*)_sitemap.xml", url)
            if not m or m.group(1) != year or m.group(2) != year:
                raise ValueError("Unmatched sitemap URL: %s" % url)
            collection = m.group(3)

            # To help the user find a collection name, record this collection but don't download it.
            if options.get("list-collections", False):
                seen_collections.add(collection)
                continue

            # Should we download the sitemap?
            if fetch_collections and collection not in fetch_collections:
                continue

            # Get the sitemap.
            collection_sitemap = get_sitemap(year, collection, lastmod, options)
            if collection_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
                raise Exception("Mismatched sitemap type in %s_%s sitemap." % (year, collection))

    if options.get("list-collections", False):
        print "\n".join(sorted(seen_collections))
Example #5
0
def update_sitemap_cache(fetch_collections, options):
  """Updates a local cache of the complete FDSys sitemap tree.
  Pass fetch_collections as None, or, to restrict the update to
  particular FDSys collections, a set of collection names. Only
  downloads changed sitemap files."""
  
  seen_collections = dict() # maps collection name to a set() of sitemap years in which the collection is present
  
  # Load the root sitemap.
  master_sitemap = get_sitemap(None, None, None, options)
  if master_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex": raise Exception("Mismatched sitemap type at the root sitemap.")
  
  # Process the year-by-year sitemaps.
  for year_node in master_sitemap.xpath("x:sitemap", namespaces=ns):
    # Get year and lastmod date.
    url = str(year_node.xpath("string(x:loc)", namespaces=ns))
    lastmod = str(year_node.xpath("string(x:lastmod)", namespaces=ns))
    m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/sitemap_(\d+).xml", url)
    if not m or m.group(1) != m.group(2): raise ValueError("Unmatched sitemap URL: %s" % url)
    year = m.group(1)
    
    # Should we process this year's sitemaps?
    if options.get("congress", None) and int(year) not in utils.get_congress_years(int(options.get("congress"))): continue
    if options.get("year", None) and int(year) != int(options.get("year")): continue

    # Get the sitemap.
    year_sitemap = get_sitemap(year, None, lastmod, options)
    if year_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex": raise Exception("Mismatched sitemap type in %s sitemap." % year)
    
    # Process the collection sitemaps.
    for collection_node in year_sitemap.xpath("x:sitemap", namespaces=ns):
      # Get collection and lastmod date.
      url = str(collection_node.xpath("string(x:loc)", namespaces=ns))
      lastmod = str(collection_node.xpath("string(x:lastmod)", namespaces=ns))
      m = re.match(r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/(\d+)_(.*)_sitemap.xml", url)
      if not m or m.group(1) != year or m.group(2) != year: raise ValueError("Unmatched sitemap URL: %s" % url)
      collection = m.group(3)
      
      # To help the user find a collection name, record this collection but don't download it.
      if options.get("list-collections", False):
        seen_collections.setdefault(collection, set()).add(int(year))
        continue

      # Should we download the sitemap?
      if fetch_collections and collection not in fetch_collections:
        continue

      # Get the sitemap.
      collection_sitemap = get_sitemap(year, collection, lastmod, options)
      if collection_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type in %s_%s sitemap." % (year, collection))
      
  if options.get("list-collections", False):
    max_collection_name_len = max(len(n) for n in seen_collections)
    def make_nice_year_range(years):
      ranges = []
      for y in sorted(years):
        if len(ranges) > 0 and ranges[-1][1] == y-1:
          # extend the previous range
          ranges[-1][1] = y
        else:
          # append a new range
          ranges.append( [y, y] )
      ranges = [(("%d" % r[0]) if r[0] == r[1] else "%d-%d" % tuple(r)) for r in ranges]
      return ", ".join(ranges)

    for collection in sorted(seen_collections):
      print collection.ljust(max_collection_name_len), " ", make_nice_year_range(seen_collections[collection])
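
The inner make_nice_year_range helper collapses consecutive years into dash-separated ranges before printing. A quick illustration of its behavior (the input years are made up):

make_nice_year_range({1998, 1999, 2000, 2003})  # -> "1998-2000, 2003"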
Example #6
0
def update_bill_version_list(only_congress):
  bill_versions = { }
  
  # Which sitemap years should we look at?
  if not only_congress:
    sitemap_files = glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/BILLS.xml")
  else:
    # If --congress=X is specified, only look at the relevant years.
    sitemap_files = [utils.cache_dir() + "/fdsys/sitemap/" + str(year) + "/BILLS.xml" for year in utils.get_congress_years(only_congress)]
    sitemap_files = [f for f in sitemap_files if os.path.exists(f)]
  
  # For each year-by-year BILLS sitemap...
  for year_sitemap in sitemap_files:
    dom = etree.parse(year_sitemap).getroot()
    if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset": raise Exception("Mismatched sitemap type.")
    
    # Loop through each bill text version...
    for file_node in dom.xpath("x:url", namespaces=ns):
      # get URL and last modified date
      url = str(file_node.xpath("string(x:loc)", namespaces=ns))
      lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
      
      # extract bill congress, type, number, and version from the URL
      m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
      if not m: raise Exception("Unmatched bill document URL: " + url)
      congress, bill_type, bill_number, version_code = m.groups()
      congress = int(congress)
      if bill_type not in utils.thomas_types: raise Exception("Invalid bill type: " + url)
      
      # If --congress=XXX is specified, only look at those bills. 
      if only_congress and congress != only_congress:
        continue
      
      # Track the documents by congress, bill type, etc.
      bill_versions\
        .setdefault(congress, { })\
        .setdefault(bill_type, { })\
        .setdefault(bill_number, { })\
        [version_code] = {
          "url": url,
          "lastmod": lastmod,
        }
        
  # Output the bill version info. We can't do this until the end because we need to get
  # the complete list of versions for a bill before we write the file, and the versions
  # may be split across multiple sitemap files.
  
  for congress in bill_versions:
    for bill_type in bill_versions[congress]:
      for bill_number in bill_versions[congress][bill_type]:
        utils.write(
          json.dumps(bill_versions[congress][bill_type][bill_number],
            sort_keys=True, indent=2, default=utils.format_datetime), 
          output_for_bill(congress, bill_type, bill_number, "text-versions.json")
        )
Example #7
0
def mirror_packages(fetch_collections, options):
  """Create a local mirror of FDSys document files. Only downloads
  changed files, according to the sitemap. Run update_sitemap_cache first.
  
  Pass fetch_collections as None, or, to restrict the update to
  particular FDSys collections, a set of collection names.
  
  Set options["store"] to a comma-separated list of file types (pdf,
  mods, text, xml, zip).
  """
  
  # For determining whether we need to process a sitemap file again on a later
  # run, we need to make a key out of the command line arguments that affect
  # which files we are downloading.
  cache_options_key = repr(tuple(sorted(kv for kv in options.items() if kv[0] in ("store", "year", "congress", "granules", "cached"))))
  
  file_types = options["store"].split(",")

  # Process each FDSys sitemap...
  for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
    # Should we process this file?
    year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
    if "year" in options and year != options["year"]: continue
    if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])): continue 
    if fetch_collections and collection not in fetch_collections: continue
    
    # Has this sitemap changed since the last successful mirror?
    #
    # The sitemap's last modification time is stored in ...-lastmod.txt,
    # which comes from the sitemap's parent sitemap's lastmod listing for
    # the file.
    #
    # Compare that to the lastmod value of when we last did a successful mirror.
    # This function can be run to fetch different sets of files, so get the
    # lastmod value corresponding to the current run arguments.
    sitemap_store_state_file = re.sub(r"\.xml$", "-store-state.json", sitemap)
    sitemap_last_mod = open(re.sub(r"\.xml$", "-lastmod.txt", sitemap)).read()
    if os.path.exists(sitemap_store_state_file):
      sitemap_store_state = json.load(open(sitemap_store_state_file))
      if sitemap_store_state.get(cache_options_key) == sitemap_last_mod:
        # sitemap hasn't changed since the last time
        continue
    
    logging.info("scanning " + sitemap + "...")
    
    # Load the sitemap for this year & collection, and loop through each document.
    for package_name, lastmod in get_sitemap_entries(sitemap):
      # Add this package to the download list.
      file_list = []
      
      if not options.get("granules", False):
        # Doing top-level package files (granule==None).
        file_list.append(None)

      else:
        # In some collections, like STATUTE, each document has subparts which are not
        # described in the sitemap. Load the main HTML page and scrape for the sub-files.
        # In the STATUTE collection, the MODS information in granules is redundant with
        # information in the top-level package MODS file. But the only way to get granule-
        # level PDFs is to go through the granules.
        content_detail_url = "http://www.gpo.gov/fdsys/pkg/%s/content-detail.html" % package_name
        content_index = utils.download(content_detail_url,
            "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
            utils.merge(options, {
            'binary': True, 
          }))
        if not content_index: raise Exception("Failed to download %s" % content_detail_url)
        for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
          if link.text == "More":
            m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
            if not m or m.group(1) != package_name: raise Exception("Unmatched granule URL %s" % link.get("href"))
            granule_name = m.group(2)
            file_list.append(granule_name)
        
      # Download the files of the desired types.
      for granule_name in file_list:
        mirror_package(year, collection, package_name, lastmod, granule_name, file_types, options)
        
    # If we got this far, we successfully downloaded all of the files in this year/collection.
    # To speed up future updates, save the lastmod time of this sitemap in a file indicating
    # what we downloaded. The store-state file contains a JSON mapping of command line options
    # to the most recent lastmod value for this sitemap.
    sitemap_store_state = { }
    if os.path.exists(sitemap_store_state_file):
      sitemap_store_state = json.load(open(sitemap_store_state_file))
    sitemap_store_state[cache_options_key] = sitemap_last_mod
    json.dump(sitemap_store_state, open(sitemap_store_state_file, "w"))
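
To make the bookkeeping above concrete, cache_options_key is just the repr of the sorted, filtered option pairs, and the -store-state.json file maps that key to the sitemap lastmod in effect when a run with those options last completed. A worked example with hypothetical run arguments:

options = {"store": "mods,text", "congress": "113"}  # hypothetical run arguments
cache_options_key = repr(tuple(sorted(kv for kv in options.items()
                                      if kv[0] in ("store", "year", "congress", "granules", "cached"))))
# cache_options_key == "(('congress', '113'), ('store', 'mods,text'))"
# The store-state file would then contain something like (lastmod value invented):
#   {"(('congress', '113'), ('store', 'mods,text'))": "2013-06-01T04:12:00Z"}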
Example #8
0
def update_sitemap_cache(fetch_collections, options):
    """Updates a local cache of the complete FDSys sitemap tree.
  Pass fetch_collections as None, or, to restrict the update to
  particular FDSys collections, a set of collection names. Only
  downloads changed sitemap files."""

    seen_collections = set()

    # Load the root sitemap.
    master_sitemap = get_sitemap(None, None, None, options)
    if master_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
        raise Exception("Mismatched sitemap type at the root sitemap.")

    # Process the year-by-year sitemaps.
    for year_node in master_sitemap.xpath("x:sitemap", namespaces=ns):
        # Get year and lastmod date.
        url = str(year_node.xpath("string(x:loc)", namespaces=ns))
        lastmod = str(year_node.xpath("string(x:lastmod)", namespaces=ns))
        m = re.match(
            r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/sitemap_(\d+).xml",
            url)
        if not m or m.group(1) != m.group(2):
            raise ValueError("Unmatched sitemap URL: %s" % url)
        year = m.group(1)

        # Should we process this year's sitemaps?
        if options.get("congress",
                       None) and int(year) not in utils.get_congress_years(
                           int(options.get("congress"))):
            continue
        if options.get("year", None) and int(year) != int(options.get("year")):
            continue

        # Get the sitemap.
        year_sitemap = get_sitemap(year, None, lastmod, options)
        if year_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex":
            raise Exception("Mismatched sitemap type in %s sitemap." % year)

        # Process the collection sitemaps.
        for collection_node in year_sitemap.xpath("x:sitemap", namespaces=ns):
            # Get collection and lastmod date.
            url = str(collection_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(
                collection_node.xpath("string(x:lastmod)", namespaces=ns))
            m = re.match(
                r"http://www.gpo.gov/smap/fdsys/sitemap_(\d+)/(\d+)_(.*)_sitemap.xml",
                url)
            if not m or m.group(1) != year or m.group(2) != year:
                raise ValueError("Unmatched sitemap URL: %s" % url)
            collection = m.group(3)

            # To help the user find a collection name, record this collection but don't download it.
            if options.get("list-collections", False):
                seen_collections.add(collection)
                continue

            # Should we download the sitemap?
            if fetch_collections and collection not in fetch_collections:
                continue

            # Get the sitemap.
            collection_sitemap = get_sitemap(year, collection, lastmod,
                                             options)
            if collection_sitemap.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
                raise Exception("Mismatched sitemap type in %s_%s sitemap." %
                                (year, collection))

    if options.get("list-collections", False):
        print "\n".join(sorted(seen_collections))
Example #9
0
def mirror_files(fetch_collections, options):
    # Locally mirror certain file types for the specified collections.

    file_types = options["store"].split(",")

    for sitemap in sorted(glob.glob(utils.cache_dir() + "/fdsys/sitemap/*/*.xml")):
        # Should we process this file?
        year, collection = re.search(r"/(\d+)/([^/]+).xml$", sitemap).groups()
        if "year" in options and year != options["year"]:
            continue
        if "congress" in options and int(year) not in utils.get_congress_years(int(options["congress"])):
            continue
        if fetch_collections and collection not in fetch_collections:
            continue

        logging.warn(sitemap + "...")

        # Load the sitemap for this year & collection.
        dom = etree.parse(sitemap).getroot()
        if dom.tag != "{http://www.sitemaps.org/schemas/sitemap/0.9}urlset":
            raise Exception("Mismatched sitemap type.")

        # Loop through each document in the collection in this year...
        for file_node in dom.xpath("x:url", namespaces=ns):
            # Get URL and last modified timestamp.
            url = str(file_node.xpath("string(x:loc)", namespaces=ns))
            lastmod = str(file_node.xpath("string(x:lastmod)", namespaces=ns))
            if not url.endswith("/content-detail.html"):
                raise Exception("Unrecognized file pattern.")

            # Get the package name.
            m = re.match("http://www.gpo.gov/fdsys/pkg/(.*)/content-detail.html", url)
            if not m:
                raise Exception("Unmatched document URL")
            package_name = m.group(1)

            # Where to store the document files?
            # The path will depend a bit on the collection.
            if collection == "BILLS":
                # Store with the other bill data.
                m = re.match(r"http://www.gpo.gov/fdsys/pkg/BILLS-(\d+)([a-z]+)(\d+)(\D.*)/content-detail.html", url)
                if not m:
                    raise Exception("Unmatched bill document URL: " + url)
                congress, bill_type, bill_number, version_code = m.groups()
                congress = int(congress)
                if "congress" in options and congress != int(options["congress"]):
                    continue
                path = output_for_bill(congress, bill_type, bill_number, "text-versions/" + version_code)
            else:
                # Store in fdsys/COLLECTION/YEAR/PKGNAME.
                path = "%s/fdsys/%s/%s/%s" % (utils.data_dir(), collection, year, package_name)

            # Do we need to update this record?
            lastmod_cache_file = path + "/lastmod.txt"
            cache_lastmod = utils.read(lastmod_cache_file)
            force = ((lastmod != cache_lastmod) or options.get("force", False)) and not options.get("cached", False)

            # Add this package to the download list.
            file_list = []
            file_list.append((None, path))

            if options.get("granules", False):
                # In some collections, like STATUTE, each document has subparts which are not
                # described in the sitemap. Load the main HTML page and scrape for the sub-files.
                # Josh originally thought the STATUTE granule files (individual statutes) were
                # useful, but then it turned out the information is redundant with information
                # in the top-level package MODS file.
                content_index = utils.download(
                    url,
                    "fdsys/package/%s/%s/%s.html" % (year, collection, package_name),
                    utils.merge(
                        options,
                        {
                            "xml": True,  # it's not XML but this avoid unescaping HTML which fails if there are unicode characters
                            "force": force,
                        },
                    ),
                )
                if not content_index:
                    raise Exception("Failed to download %s" % url)
                for link in html.fromstring(content_index).cssselect(
                    "table.page-details-data-table td.rightLinkCell a"
                ):
                    if link.text == "More":
                        m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                        if not m or m.group(1) != package_name:
                            raise Exception("Unmatched granule URL %s" % link.get("href"))
                        granule_name = m.group(2)
                        file_list.append((granule_name, path + "/" + granule_name))

            # Download the files of the desired types.
            for granule_name, path in file_list:
                targets = get_package_files(package_name, granule_name, path)
                for file_type in file_types:
                    if file_type not in targets:
                        raise Exception("Invalid file type: %s" % file_type)
                    f_url, f_path = targets[file_type]

                    if force:
                        logging.warn(f_path)
                    data = utils.download(
                        f_url, f_path, utils.merge(options, {"xml": True, "force": force, "to_cache": False})
                    )

                    if not data:
                        raise Exception("Failed to download %s" % url)

            # Write the current last modified date to disk so we know the next time whether
            # we need to fetch the file.
            if lastmod and not options.get("cached", False):
                utils.write(lastmod, lastmod_cache_file)
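
Finally, a hypothetical invocation of the mirrorer above, restricted to the BILLS collection of a single congress. The option keys are the ones the function actually reads; the specific values and the call itself are illustrative, not taken from the original driver code:

options = {
    "store": "mods,pdf",  # comma-separated file types, split by mirror_files()
    "congress": "113",    # limit to the sitemap years covered by this congress
}
mirror_files({"BILLS"}, options)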