Пример #1
def fetch_nomination(nomination_id, options=None):
	"""Download, parse and save a single nomination.

	Args:
		nomination_id: ID string understood by utils.split_nomination_id.
		options: optional dict of run options. "download_only" is read here;
			the dict is also passed on to the download, parse and output
			helpers.

	Returns:
		A dict with boolean 'saved' and 'ok' keys, plus a 'reason' string
		whenever nothing was saved.
	"""
	# Build a fresh dict per call: a `{}` default is created once and shared
	# by every call, so any mutation by a helper would leak between calls.
	if options is None:
		options = {}

	logging.info("\n[%s] Fetching...", nomination_id)

	nomination_type, number, congress = utils.split_nomination_id(nomination_id)
	if not number:
		return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

	# fetch committee name map, if it doesn't already exist
	if not utils.committee_names:
		utils.fetch_committee_names(congress, options)

	# fetch the nomination's information page
	body = utils.download(
		nomination_cache_for(nomination_id, "information.html"), options)

	if not body:
		return {'saved': False, 'ok': False, 'reason': "failed to download"}

	if options.get("download_only", False):
		return {'saved': False, 'ok': True, 'reason': "requested download only"}

	# TODO:
	#   detect group nominations, particularly for military promotions
	#   detect when a group nomination is split into subnominations
	# Also, the splitting process is nonsense:
	# http://thomas.loc.gov/home/PN/split.htm

	# split nominations are not parsed; report them as handled but unsaved
	if "split into two or more parts" in body:
		return {'saved': False, 'ok': True, 'reason': 'was split'}

	nomination = parse_nomination(nomination_id, body, options)
	output_nomination(nomination, options)
	return {'ok': True, 'saved': True}
Пример #2
def fetch_nomination(nomination_id, options=None):
    """Download, parse and save a single nomination.

    Args:
        nomination_id: ID string understood by utils.split_nomination_id.
        options: optional dict of run options. "download_only" is read here;
            the dict is also passed on to the download, parse and output
            helpers.

    Returns:
        A dict with boolean 'saved' and 'ok' keys, plus a 'reason' string
        whenever nothing was saved.
    """
    # Build a fresh dict per call: a `{}` default is created once and shared
    # by every call, so any mutation by a helper would leak between calls.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch the nomination's information page
    body = utils.download(
        nomination_cache_for(nomination_id, "information.html"), options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into sub nominations because
    #   of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
Пример #3
def run(options):
    """Fetch one nomination, or every nomination found for a congress.

    Args:
        options: dict of run options. Keys read here: 'nomination_id',
            'congress', 'fast' and 'limit'. The dict is also passed on to
            nomination_ids_for and utils.process_set.

    Returns:
        None. Returns early when no nomination IDs could be determined.
    """
    nomination_id = options.get('nomination_id', None)

    if nomination_id:
        # a single nomination was requested; its congress comes from the ID
        nomination_type, number, congress = utils.split_nomination_id(nomination_id)
        to_fetch = [nomination_id]
    else:
        congress = options.get('congress', utils.current_congress())
        to_fetch = nomination_ids_for(congress, options)
        if not to_fetch:
            if options.get("fast", False):
                logging.warning("No nominations changed.")
            else:
                logging.error("Error figuring out which nominations to download, aborting.")
            return None

        limit = options.get('limit', None)
        if limit:
            to_fetch = to_fetch[:int(limit)]

    logging.warning("Going to fetch %i nominations from congress #%s",
                    len(to_fetch), congress)

    saved_nominations = utils.process_set(to_fetch,
                                          nomination_info.fetch_nomination,
                                          options)
Пример #4
def nomination_url_for(nomination_id):
	"""Return the thomas.loc.gov query URL for a nomination.

	Args:
		nomination_id: ID string understood by utils.split_nomination_id.

	Returns:
		The URL string, built from the zero-padded congress, the upper-cased
		nomination type and the zero-padded number followed by a two-character
		part suffix.
	"""
	nomination_type, number, congress = utils.split_nomination_id(nomination_id)

	# numbers can be either of the form "63" or "64-01"
	number_pieces = number.split("-")
	if len(number_pieces) == 1:
		# a plain number has no part suffix; use "00" so the lookup below
		# does not fail with an IndexError
		number_pieces.append("00")
	url_number = "%05d%s" % (int(number_pieces[0]), number_pieces[1])

	return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (int(congress), nomination_type.upper(), url_number)
Пример #5
def nomination_url_for(nomination_id):
    """Return the thomas.loc.gov query URL for a nomination.

    Args:
        nomination_id: ID string understood by utils.split_nomination_id.

    Returns:
        The URL string, built from the zero-padded congress, the upper-cased
        nomination type and the zero-padded number followed by a two-character
        part suffix.
    """
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # numbers can be either of the form "63" or "64-01"
    number_pieces = number.split("-")
    if len(number_pieces) == 1:
        # a plain number has no part suffix; use "00" so the lookup below
        # does not fail with an IndexError
        number_pieces.append("00")
    url_number = "%05d%s" % (int(number_pieces[0]), number_pieces[1])

    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%s:/" % (int(congress), nomination_type.upper(), url_number)
Пример #6
def parse_nomination(nomination_id, body, options):
    # Parse the HTML `body` of a nomination page into the `info` dict, with
    # "Action" rows collected as tuples under info['actions'].
    # NOTE(review): this excerpt is not valid Python as shown -- the call on
    # the next line is never closed, the two prose lines further down are bare
    # text, the `except` near the end has no matching `try`, and nothing is
    # returned. Lines appear to have been lost; confirm against the upstream
    # source before relying on it.
    nomination_type, number, congress = utils.split_nomination_id(

    # remove (and store) comments, which contain some info for the nomination but also mess up the parser
    facts = re.findall("<!--(.+?)-->", body)
    body = re.sub("<!--.+?-->", "", body)

    doc = fromstring(body)
    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if pair.tail:
            # label is the element's own text without colons; data is the text that follows the element
            label, data = pair.text.replace(':', '').strip(), pair.tail.strip()
            if label.split(" ")[-1] == "Action":
                # split on the " - " separator; the tuple below stores the first two pieces
                data = re.split("\s+\-\s+", data)
                info['actions'].append((label, data[0], data[1]))
                # NOTE(review): presumably this assignment belonged to a dropped `else:` branch -- TODO confirm
                info[label.lower()] = data
  Some of the data is structured fine as is (e.g. Organization, Referred to, Reported by)
  Some needs processing, like date and nominee

    # Doc format is: "January 04, 1995 (104th Congress)"
    info["date"] = datetime.strptime(info["date received"].split(" (")[0],
                                     "%B %d, %Y").strftime("%Y-%m-%d")
    # Note: Will break with the 1000th congress in year 3789
    info["congress"] = int(
        re.search("(\d{2,3})[stndhr]{2}", info["date received"]).group(1))

    # remove final clause if there
    info["nominee"] = info["nominee"].split(", vice")[0]

    # get overview from the text of the nomination
        (name, state, position) = re.search("(.+?), of (.+?), to be (.+?)",
    except Exception, e:
        # fall back to empty fields when the nominee text cannot be parsed
        logging.error("Couldn't parse %s" % info["nominee"])
        (name, state, position) = ("", "", "")
Пример #7
def fetch_nomination(nomination_id, options=None):
    """Download, parse and save a single nomination.

    Args:
        nomination_id: ID string understood by utils.split_nomination_id.
        options: optional dict of run options. "download_only" is read here;
            the dict is also passed on to the download, parse and output
            helpers.

    Returns:
        A dict with boolean 'saved' and 'ok' keys, plus a 'reason' string
        whenever nothing was saved.
    """
    # Build a fresh dict per call: a `{}` default is created once and shared
    # by every call, so any mutation by a helper would leak between calls.
    if options is None:
        options = {}

    logging.info("\n[%s] Fetching...", nomination_id)

    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)
    if not number:
        return {
            'saved': False,
            'ok': False,
            'reason': "Couldn't parse %s" % nomination_id
        }

    # fetch committee name map, if it doesn't already exist
    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch the nomination's information page
    body = utils.download(
        nomination_cache_for(nomination_id, "information.html"), options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {
            'saved': False,
            'ok': True,
            'reason': "requested download only"
        }

    # TODO:
    #   detect group nominations, particularly for military promotions
    #   detect when a group nomination is split into sub nominations because
    #   of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
Пример #8
def parse_nomination(nomination_id, body, options):
  # Parse the HTML `body` of a nomination page into the `info` dict, with
  # "Action" rows collected as tuples under info['actions'].
  # NOTE(review): this excerpt is not valid Python as shown -- the two prose
  # lines further down are bare text, the `except` near the end has no
  # matching `try`, and nothing is returned. Lines appear to have been lost;
  # confirm against the upstream source before relying on it.
  nomination_type, number, congress = utils.split_nomination_id(nomination_id)
  # remove (and store) comments, which contain some info for the nomination but also mess up the parser
  facts = re.findall("<!--(.+?)-->", body)
  body = re.sub("<!--.+?-->", "", body)
  doc = fromstring(body)
  info = { 'nomination_id': nomination_id, 'actions': [] }

  # the markup on these pages is a disaster, so we're going to use a heuristic based on boldface, inline tags followed by text
  for pair in doc.xpath('//span[@class="elabel"]|//strong'):
    if pair.tail:
        # label is the element's own text without colons; data is the text that follows the element
        label, data = pair.text.replace(':', '').strip(), pair.tail.strip()
        if label.split(" ")[-1] == "Action":
            # split on the " - " separator; the tuple below stores the first two pieces
            data = re.split("\s+\-\s+", data)
            info['actions'].append((label, data[0], data[1]))
            # NOTE(review): presumably this assignment belonged to a dropped `else:` branch -- TODO confirm
            info[label.lower()] = data

  Some of the data is structured fine as is (e.g. Organization, Referred to, Reported by)
  Some needs processing, like date and nominee
  # Doc format is: "January 04, 1995 (104th Congress)"
  info["date"] = datetime.strptime(info["date received"].split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")
  # Note: Will break with the 1000th congress in year 3789
  info["congress"] = int(re.search("(\d{2,3})[stndhr]{2}", info["date received"]).group(1))
  # remove final clause if there
  info["nominee"] = info["nominee"].split(", vice")[0]
  # get overview from the text of the nomination
    (name, state, position) = re.search("(.+?), of (.+?), to be (.+?)", info["nominee"]).groups()
  except Exception, e:
    # fall back to empty fields when the nominee text cannot be parsed
    logging.error("Couldn't parse %s" % info["nominee"])
    (name, state, position) = ("", "", "")    
Пример #9
def run(options):
  """Fetch one nomination, or every nomination found for a congress.

  Args:
    options: dict of run options. Keys read here: 'nomination_id',
      'congress', 'fast' and 'limit'. The dict is also passed on to
      nomination_ids_for and utils.process_set.

  Returns:
    None. Returns early when no nomination IDs could be determined.
  """
  nomination_id = options.get('nomination_id', None)
  if nomination_id:
    # a single nomination was requested; its congress comes from the ID
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    to_fetch = [nomination_id]
  else:
    # without this branch the explicit nomination_id above was always
    # overwritten by the full per-congress list
    congress = options.get('congress', utils.current_congress())
    to_fetch = nomination_ids_for(congress, options)
    if not to_fetch:
      if options.get("fast", False):
        logging.warning("No nominations changed.")
      else:
        logging.error("Error figuring out which nominations to download, aborting.")
      return None

    limit = options.get('limit', None)
    if limit:
      to_fetch = to_fetch[:int(limit)]

  logging.warning("Going to fetch %i nominations from congress #%s", len(to_fetch), congress)
  saved_nominations = utils.process_set(to_fetch, nomination_info.fetch_nomination, options)
Пример #10
def nomination_cache_for(nomination_id, file):
    """Return the cache-relative path of `file` for the given nomination.

    Args:
        nomination_id: ID string understood by utils.split_nomination_id.
        file: file name to place under the nomination's cache directory.

    Returns:
        A string of the form "<congress>/nominations/<number>/<file>".
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)
    return "%s/nominations/%s/%s" % (congress, number, file)
Пример #11
def parse_nomination(nomination_id, body, options):
    # Parse the HTML `body` of a nomination page, collecting labelled fields,
    # actions and nominees into the `info` dict.
    # NOTE(review): this excerpt is not valid Python as shown -- several
    # statements are visibly incomplete (the `info = {` literal below is never
    # closed, `except` appears without `try`, some `elif` branches have no
    # body) and nothing is returned. It looks like short lines were dropped;
    # confirm against the upstream source before relying on it.
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)

    # remove (and store) comments, which contain some info for the nomination but also mess up the parser
    facts = re.findall("<!--(.+?)-->", body)
    body = re.sub("<!--.+?-->", "", body)

    committee_names = []
    committees = []

    doc = fromstring(body)
    info = {
        'nomination_id': nomination_id, 'actions': []

    # the markup on these pages is a disaster, so we're going to use a heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if pair.tail:
            # fall back to the full text content when the element has no direct text
            text = pair.text or pair.text_content()
            label, data = text.replace(':', '').strip(), pair.tail.strip()

            # handle actions separately
            if label.split(" ")[-1] == "Action":
                pieces = re.split("\s+\-\s+", data)

                # first word of the label, lower-cased
                location = label.split(" ")[0].lower()

                # use 'acted_at', even though it's always a date, to be consistent
                # with acted_at field on bills and amendments
                acted_at = datetime.strptime(pieces[0], "%B %d, %Y").strftime("%Y-%m-%d")

                # join rest back together (in case action itself has a hyphen)
                text = str.join(" - ", pieces[1:len(pieces)])

                # NOTE(review): the statement that opened this dict is missing from the excerpt
                    "type": "action",
                    "location": location,
                    "acted_at": acted_at,
                    "text": text

                # let's handle these cases one by one
                if label == "Organization":
                    info["organization"] = data

                elif label == "Control Number":
                    # this doesn't seem useful

                elif label.lower() == "referred to":

                elif label == "Reported by":
                    info["reported_by"] = data

                elif label == "Nomination":
                    # sanity check - verify nomination_id matches
                    if nomination_id != data:
                        raise Exception("Whoa! Mismatched nomination ID.")

                elif label == "Date Received":
                    # Note: Will break with the 1000th congress in year 3789
                    match = re.search("(\d{2,3})[stndhr]{2}", data)
                    if match:
                        info["congress"] = int(match.group(1))
                        # NOTE(review): presumably this raise sat under a dropped `else:` -- TODO confirm
                        raise Exception("Choked, couldn't find Congress in \"%s\"" % data)

                    # Doc format is: "January 04, 1995 (104th Congress)"
                    info["received_on"] = datetime.strptime(data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

                elif label == "Nominee":

                    # keep only the text before any ", vice" suffix
                    name = data.split(", vice")[0]

                        name = re.search("(.+?),", name).groups()[0]
                    except Exception, e:
                        raise Exception("Couldn't parse nominee entry: %s" % name)

                    # and grab the state and position out of the comment facts
                    # NOTE(review): relies on fixed positions (-5, -6) in the list of
                    # HTML comments; presumably the page layout is stable -- verify
                    if facts[-5]:
                        position = facts[-5]
                        raise Exception("Couldn't find the position in the comments.")

                    info["nominees"] = [{
                        "name": name,
                        "position": position,
                        "state": facts[-6][2:]

                elif label.lower() == "nominees":

                elif label.lower() == "list of nominees":
                    # step through each sibling, collecting each br's stripped tail for names as we go
                    # stop when we get to a strong or span (next label)
                    nominees = []

                    current_position = None
                    for sibling in pair.itersiblings():
                        if sibling.tag == "br":
                            if sibling.tail:
                                name = sibling.tail.strip()
                                # a "to be ..." line sets the position for the names that follow
                                if (name[0:5].lower() == "to be"):
                                    current_position = name[6:].strip()
                                elif name:
                                        "name": sibling.tail.strip(),
                                        "position": current_position
                        elif (sibling.tag == "strong") or (sibling.tag == "span"):

                    info["nominees"] = nominees

                    # choke, I think we handle all of them now
                    raise Exception("Unrecognized label: %s" % label)
Пример #12
def nomination_cache_for(nomination_id, file):
    """Return the cache-relative path of `file` for the given nomination.

    The result has the form "<congress>/nominations/<number>/<file>", where
    congress and number are taken from utils.split_nomination_id.
    """
    id_parts = utils.split_nomination_id(nomination_id)
    _kind, number, congress = id_parts
    path_fields = (congress, number, file)
    return "%s/nominations/%s/%s" % path_fields
Пример #13
def output_for_nomination(nomination_id, format):
    """Return the output path of a nomination's data file in `format`.

    The result has the form
    "<data_dir>/<congress>/nominations/<number>/data.<format>", where data_dir
    comes from utils.data_dir() and congress/number from
    utils.split_nomination_id.
    """
    _kind, number, congress = utils.split_nomination_id(nomination_id)
    root = utils.data_dir()
    data_file = "data.%s" % format
    return "%s/%s/nominations/%s/%s" % (root, congress, number, data_file)
Пример #14
def parse_nomination(nomination_id, body, options):
    # Parse the HTML `body` of a nomination page, collecting labelled fields,
    # actions and nominees into the `info` dict.
    # NOTE(review): this excerpt is not valid Python as shown -- several
    # statements are visibly incomplete (the call on the next line is never
    # closed, `except` appears without `try`, some `elif` branches have no
    # body) and nothing is returned. It looks like short lines were dropped;
    # confirm against the upstream source before relying on it.
    nomination_type, number, congress = utils.split_nomination_id(

    # remove (and store) comments, which contain some info for the nomination but also mess up the parser
    facts = re.findall("<!--(.+?)-->", body)
    body = re.sub("<!--.+?-->", "", body)

    committee_names = []
    committees = []

    doc = fromstring(body)
    info = {'nomination_id': nomination_id, 'actions': []}

    # the markup on these pages is a disaster, so we're going to use a heuristic based on boldface, inline tags followed by text
    for pair in doc.xpath('//span[@class="elabel"]|//strong'):
        if pair.tail:
            # fall back to the full text content when the element has no direct text
            text = pair.text or pair.text_content()
            label, data = text.replace(':', '').strip(), pair.tail.strip()

            # handle actions separately
            if label.split(" ")[-1] == "Action":
                pieces = re.split("\s+\-\s+", data)

                # first word of the label, lower-cased
                location = label.split(" ")[0].lower()

                # use 'acted_at', even though it's always a date, to be consistent
                # with acted_at field on bills and amendments
                acted_at = datetime.strptime(pieces[0],
                                             "%B %d, %Y").strftime("%Y-%m-%d")

                # join rest back together (in case action itself has a hyphen)
                text = str.join(" - ", pieces[1:len(pieces)])

                # NOTE(review): the statement that opened this dict is missing from the excerpt
                    "type": "action",
                    "location": location,
                    "acted_at": acted_at,
                    "text": text

                # let's handle these cases one by one
                if label == "Organization":
                    info["organization"] = data

                elif label == "Control Number":
                    # this doesn't seem useful

                elif label.lower() == "referred to":

                elif label == "Reported by":
                    info["reported_by"] = data

                elif label == "Nomination":
                    # sanity check - verify nomination_id matches
                    if nomination_id != data:
                        raise Exception("Whoa! Mismatched nomination ID.")

                elif label == "Date Received":
                    # Note: Will break with the 1000th congress in year 3789
                    match = re.search("(\d{2,3})[stndhr]{2}", data)
                    if match:
                        info["congress"] = int(match.group(1))
                        # NOTE(review): presumably this raise sat under a dropped `else:` -- TODO confirm
                        raise Exception(
                            "Choked, couldn't find Congress in \"%s\"" % data)

                    # Doc format is: "January 04, 1995 (104th Congress)"
                    info["received_on"] = datetime.strptime(
                        data.split(" (")[0], "%B %d, %Y").strftime("%Y-%m-%d")

                elif label == "Nominee":

                    # keep only the text before any ", vice" suffix
                    name = data.split(", vice")[0]

                        name = re.search("(.+?),", name).groups()[0]
                    except Exception, e:
                        raise Exception("Couldn't parse nominee entry: %s" %

                    # and grab the state and position out of the comment facts
                    # NOTE(review): relies on fixed positions (-5, -6) in the list of
                    # HTML comments; presumably the page layout is stable -- verify
                    if facts[-5]:
                        position = facts[-5]
                        raise Exception(
                            "Couldn't find the position in the comments.")

                    info["nominees"] = [{
                        "name": name,
                        "position": position,
                        "state": facts[-6][2:]

                elif label.lower() == "nominees":

                elif label.lower() == "list of nominees":
                    # step through each sibling, collecting each br's stripped tail for names as we go
                    # stop when we get to a strong or span (next label)
                    nominees = []

                    current_position = None
                    for sibling in pair.itersiblings():
                        if sibling.tag == "br":
                            if sibling.tail:
                                name = sibling.tail.strip()
                                # a "to be ..." line sets the position for the names that follow
                                if (name[0:5].lower() == "to be"):
                                    current_position = name[6:].strip()
                                elif name:
                        elif (sibling.tag == "strong") or (sibling.tag
                                                           == "span"):

                    info["nominees"] = nominees

                    # choke, I think we handle all of them now
                    raise Exception("Unrecognized label: %s" % label)
Пример #15
def nomination_url_for(nomination_id):
    """Return the thomas.loc.gov query URL for a nomination.

    Args:
        nomination_id: ID string understood by utils.split_nomination_id.

    Returns:
        The URL string, built from the congress zero-padded to 3 digits, the
        upper-cased nomination type, and the number zero-padded to 5 digits
        followed by a literal "00".
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)
    return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%05d00:/" % (
        int(congress), nomination_type.upper(), int(number))
Пример #16
def output_for_nomination(nomination_id, format):
    """Return the output path of a nomination's data file in `format`.

    Args:
        nomination_id: ID string understood by utils.split_nomination_id.
        format: file extension for the data file (used as "data.<format>").

    Returns:
        A string of the form
        "<data_dir>/<congress>/nominations/<number>/data.<format>", where
        data_dir comes from utils.data_dir().
    """
    nomination_type, number, congress = utils.split_nomination_id(
        nomination_id)
    return "%s/%s/nominations/%s/%s" % (utils.data_dir(), congress, number,
                                        "data.%s" % format)
Пример #17
def nomination_url_for(nomination_id):
  """Return the thomas.loc.gov query URL for a nomination.

  The URL embeds the congress zero-padded to 3 digits, the upper-cased
  nomination type, and the number zero-padded to 5 digits followed by a
  literal "00".
  """
  kind, number, congress = utils.split_nomination_id(nomination_id)
  congress_no = int(congress)
  type_code = kind.upper()
  serial = int(number)
  url_fields = (congress_no, type_code, serial)
  return "http://thomas.loc.gov/cgi-bin/ntquery/z?nomis:%03d%s%05d00:/" % url_fields