Example #1
def fetch_senate_committee_meetings(committees, options):
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("senate")
    if os.path.exists(output_file):
        with open(output_file) as f:
            existing_meetings = json.load(f)

    options = dict(options)  # clone
    options["binary"] = True  #
    options["force"] = True

    meetings = []

    dom = lxml.etree.fromstring(
        utils.download(
            "http://www.senate.gov/general/committee_schedules/hearings.xml",
            "committee_schedule/senate.xml", options))

    for node in dom.xpath("meeting"):
        committee_id = str(node.xpath('string(cmte_code)'))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder
        occurs_at = str(node.xpath('string(date)'))
        room = str(node.xpath('string(room)'))
        topic = str(node.xpath('string(matter)'))

        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        topic = re.sub(r"\s+", " ", topic).strip()

        # Validate committee code.
        try:
            committee_code, subcommittee_code = re.match(
                r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees:
                raise ValueError(committee_code)
            if subcommittee_code == "00":
                subcommittee_code = None
            if subcommittee_code and subcommittee_code not in committees[
                    committee_code]["subcommittees"]:
                raise ValueError(subcommittee_code)
        except:
            print("Invalid committee code", committee_id)
            continue

        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same committee/subcommittee and
        # at the same time.
        for mtg in existing_meetings:
            if mtg["committee"] == committee_code and mtg.get(
                    "subcommittee", None) == subcommittee_code and mtg[
                        "occurs_at"] == occurs_at.isoformat():
                if options.get("debug", False):
                    print("[%s] Reusing gUID." % mtg["guid"])
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            # TODO: Can we make this a human-readable ID?
            guid = str(uuid.uuid4())

        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(
            utils.current_legislative_year(occurs_at))
        bills = []
        bill_number_re = re.compile(
            r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            bills.append(bill_match[0].lower() + bill_match[1] + "-" +
                         str(congress))

        # Create the meeting event.
        if options.get("debug", False):
            print("[senate][%s][%s] Found meeting in room %s at %s." %
                  (committee_code, subcommittee_code, room,
                   occurs_at.isoformat()))

        meetings.append({
            "chamber": "senate",
            "congress": congress,
            "guid": guid,
            "committee": committee_code,
            "subcommittee": subcommittee_code,
            "occurs_at": occurs_at.isoformat(),
            "room": room,
            "topic": topic,
            "bill_ids": bills,
        })

    print("[senate] Found %i meetings." % len(meetings))
    return meetings
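The bill_ids built above take the form <type><number>-<congress> (e.g. s744-113). A minimal, standalone sketch of the same extraction, using an illustrative topic string and congress number rather than values from the feed:

import re

bill_number_re = re.compile(
    r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)

congress = 113  # illustrative value only
topic = "Hearings to examine S. 744 and H.R. 15."

bills = []
for bill_type, bill_number in bill_number_re.findall(topic.replace(".", "")):
    bills.append(bill_type.lower() + bill_number + "-" + str(congress))

print(bills)  # ['s744-113', 'hr15-113']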
Example #2
def run():

    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    only_bioguide = utils.flags().get('bioguide', None)
    congress = utils.flags().get('congress', None)

    data_files = []

    print("Loading %s..." % "legislators-current.yaml")
    legislators = load_data("legislators-current.yaml")
    data_files.append((legislators, "legislators-current.yaml"))
    print("Loading %s..." % "legislators-historical.yaml")
    legislators = load_data("legislators-historical.yaml")
    data_files.append((legislators, "legislators-historical.yaml"))

    # Load roll call data. This may need updating for the 114th+ Congresses, since the URL format for those is unclear.
    if congress is None:
        raise Exception("the --congress flag is required")
    elif congress == "113":
        url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
        url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
    elif int(congress) < 10 and int(congress) > 0:
        url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
    elif int(congress) < 113 and int(congress) >= 10:
        url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
    else:
        raise Exception("no data for congress " + congress)

    senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
    senate_data = utils.download(url_senate, senate_destination, force)

    house_destination = "icpsr/source/house_rollcall%s.txt" % congress
    house_data = utils.download(url_house, house_destination, force)

    error_log = csv.writer(
        open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w", newline=""))
    error_log.writerow([
        "error_type", "matches", "icpsr_name", "icpsr_state", "is_territory",
        "old_id", "new_id"
    ])

    read_files = [(senate_data, "sen"), (house_data, "rep")]
    print("Running for congress " + congress)
    for read_file in read_files:
        for data_file in data_files:
            for legislator in data_file[0]:
                num_matches = 0
                # this can't run unless we've already collected a bioguide for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue
                #if not in currently read chamber, skip
                chamber = legislator['terms'][len(legislator['terms']) -
                                              1]['type']
                if chamber != read_file[1]:
                    continue

                #only run for selected congress
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(
                        parse_date(
                            legislator['terms'][len(legislator['terms']) -
                                                1]['start'])))
                if chamber == "sen":
                    congresses = [
                        latest_congress, latest_congress + 1,
                        latest_congress + 2
                    ]
                else:
                    congresses = [latest_congress]

                if int(congress) not in congresses:
                    continue

                # pull data to match from yaml

                last_name_unicode = legislator['name']['last'].upper().strip().replace('\'', '')
                # Strip accents so the name can be compared against the ASCII ICPSR source data;
                # decode back to str so the comparison below isn't str-vs-bytes.
                last_name = unicodedata.normalize(
                    'NFD', last_name_unicode).encode('ascii', 'ignore').decode('ascii')
                state = utils.states[legislator['terms']
                                     [len(legislator['terms']) -
                                      1]['state']].upper()[:7].strip()
                # select icpsr source data based on more recent chamber

                write_id = ""
                lines = read_file[0].split('\n')
                for line in lines:
                    # parse source data
                    icpsr_state = line[12:20].strip()
                    icpsr_name = line[21:].strip().strip(string.digits).strip()
                    icpsr_id = line[3:8].strip()

                    #ensure unique match
                    if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
                        num_matches += 1
                        write_id = icpsr_id
                # Skip if an ICPSR id is already in the data and matches.
                if "icpsr" in legislator["id"]:
                    # The stored icpsr id is an int, so compare numerically.
                    if write_id == "" or int(write_id) == legislator["id"]["icpsr"]:
                        continue
                    else:
                        error_log.writerow([
                            "Incorrect_ID", "NA", last_name[:8], state, "NA",
                            legislator["id"]["icpsr"], write_id
                        ])
                        print("ID updated for %s" % last_name)
                if num_matches == 1:
                    legislator['id']['icpsr'] = int(write_id)
                else:
                    # Territories and D.C. (state names truncated to 7 characters above)
                    # are logged quietly as expected non-matches.
                    if state in ('GUAM', 'PUERTO', 'VIRGIN', 'DISTRIC', 'AMERICA', 'NORTHER', 'PHILIPP'):
                        error_log.writerow([
                            "Non_1_match_number",
                            str(num_matches), last_name[:8], state, "Y", "NA",
                            "NA"
                        ])
                    else:
                        print(
                            str(num_matches) + " matches found for " +
                            last_name[:8] + ", " + state + " in congress " +
                            str(congress))
                        error_log.writerow([
                            "Non_1_match_number",
                            str(num_matches), last_name, state, "N", "NA", "NA"
                        ])

            save_data(data_file[0], data_file[1])
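The string slices above imply a fixed-width layout for each voteview .ord record: roughly columns 4-8 hold the ICPSR id, columns 13-20 the state name, and everything from column 22 onward the member name followed by the per-roll-call vote digits. A sketch that pulls that parse into a helper; the offsets are inferred from the slicing in run(), not from a documented spec:

import string

def parse_ord_record(line):
    # Field offsets mirror the slices used in run() above.
    return {
        "icpsr_id": line[3:8].strip(),
        "state": line[12:20].strip(),
        # The name runs up to the vote digits, which are stripped off.
        "name": line[21:].strip().strip(string.digits).strip(),
    }

With this helper, the inner loop could compare parse_ord_record(line)["name"][:8] and parse_ord_record(line)["state"] exactly as the original does.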
Example #3
    for data_file in data_files:
        for legislator in data_file[0]:
            num_matches = 0
            # this can't run unless we've already collected a bioguide for this person
            bioguide = legislator["id"].get("bioguide", None)
            # if we've limited this to just one bioguide, skip over everyone else
            if only_bioguide and (bioguide != only_bioguide):
                continue
            # if not in currently read chamber, skip
            chamber = legislator["terms"][len(legislator["terms"]) - 1]["type"]
            if chamber != read_file[1]:
                continue

            # only run for selected congress
            latest_congress = utils.congress_from_legislative_year(
                utils.legislative_year(parse_date(legislator["terms"][len(legislator["terms"]) - 1]["start"]))
            )
            if chamber == "sen":
                congresses = [latest_congress, latest_congress + 1, latest_congress + 2]
            else:
                congresses = [latest_congress]

            if int(congress) not in congresses:
                continue

            # pull data to match from yaml

            last_name_unicode = legislator["name"]["last"].upper().strip().replace("'", "")
            last_name = unicodedata.normalize("NFD", unicode(last_name_unicode)).encode("ascii", "ignore")
            state = utils.states[legislator["terms"][len(legislator["terms"]) - 1]["state"]].upper()[:7].strip()
            # select icpsr source data based on more recent chamber
Example #4
def run():

    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache

    only_bioguide = utils.flags().get('bioguide', None)
    congress = utils.flags().get('congress', None)

    data_files = []

    print("Loading %s..." % "legislators-current.yaml")
    legislators = load_data("legislators-current.yaml")
    data_files.append((legislators, "legislators-current.yaml"))
    print("Loading %s..." % "legislators-historical.yaml")
    legislators = load_data("legislators-historical.yaml")
    data_files.append((legislators, "legislators-historical.yaml"))

    # Load member data from Voteview.
    if congress is None:
        raise Exception("the --congress flag is required")
    elif int(congress) < 10 and int(congress) > 0:
        url_senate = "https://voteview.com/static/data/out/members/S00%s_members.csv" % congress
        url_house = "https://voteview.com/static/data/out/members/H00%s_members.csv" % congress
    elif int(congress) < 100 and int(congress) >= 10:
        url_senate = "https://voteview.com/static/data/out/members/S0%s_members.csv" % congress
        url_house = "https://voteview.com/static/data/out/members/H0%s_members.csv" % congress
    elif int(congress) >= 100:
        url_senate = "https://voteview.com/static/data/out/members/S%s_members.csv" % congress
        url_house = "https://voteview.com/static/data/out/members/H%s_members.csv" % congress
    else:
        raise Exception("no data for congress " + congress)

    senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
    senate_data = utils.download(url_senate, senate_destination, force)

    house_destination = "icpsr/source/house_rollcall%s.txt" % congress
    house_data = utils.download(url_house, house_destination, force)

    error_log = csv.writer(
        open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w", newline=""))
    error_log.writerow([
        "error_type", "matches", "icpsr_name", "icpsr_state", "is_territory",
        "old_id", "new_id"
    ])

    read_files = [("sen", senate_data), ("rep", house_data)]
    print("Running for congress " + congress)
    for read_file_chamber, read_file_content in read_files:
        for data_file in data_files:
            for legislator in data_file[0]:
                num_matches = 0
                write_id = ""
                # this can't run unless we've already collected a bioguide for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue
                #if not in currently read chamber, skip
                chamber = legislator['terms'][len(legislator['terms']) -
                                              1]['type']
                if chamber != read_file_chamber:
                    continue

                #only run for selected congress
                latest_congress = utils.congress_from_legislative_year(
                    utils.legislative_year(
                        parse_date(
                            legislator['terms'][len(legislator['terms']) -
                                                1]['start'])))
                if chamber == "sen":
                    congresses = [
                        latest_congress, latest_congress + 1,
                        latest_congress + 2
                    ]
                else:
                    congresses = [latest_congress]

                if int(congress) not in congresses:
                    continue

                # pull data to match from yaml

                last_name = legislator['name']['last'].upper()
                state = utils.states[legislator['terms']
                                     [len(legislator['terms']) -
                                      1]['state']].upper()[:7].strip()

                # convert read_file_content str to file object, then parse as csv file
                content_as_file = StringIO(read_file_content)
                content_parsed = csv.reader(content_as_file, delimiter=',')

                # loop through congress members in read file, see if one matches the current legislator
                for icpsr_member in content_parsed:
                    # ensure a unique match based on bioguide id
                    if bioguide == icpsr_member[10]:
                        num_matches += 1
                        write_id = int(icpsr_member[2])

                # skip if icpsr id is currently in data
                if "icpsr" in legislator["id"]:
                    if write_id == legislator["id"]["icpsr"] or write_id == "":
                        continue
                    elif write_id != legislator["id"][
                            "icpsr"] and write_id != "":
                        error_log.writerow([
                            "Incorrect_ID", "NA", last_name[:8], state, "NA",
                            legislator["id"]["icpsr"], write_id
                        ])
                        print("ID updated for %s" % last_name)

                if num_matches == 1:
                    legislator['id']['icpsr'] = int(write_id)
                else:
                    # Territories and D.C. (state names truncated to 7 characters above)
                    # are logged as expected non-matches.
                    if state in ('GUAM', 'PUERTO', 'VIRGIN', 'DISTRIC', 'AMERICA', 'NORTHER', 'PHILIPP'):
                        print('error: non 1 match')
                        error_log.writerow([
                            "Non_1_match_number",
                            str(num_matches), last_name[:8], state, "Y", "NA",
                            "NA"
                        ])
                    else:
                        print(
                            str(num_matches) + " matches found for " +
                            last_name[:8] + ", " + state + " in congress " +
                            str(congress))
                        error_log.writerow([
                            "Non_1_match_number",
                            str(num_matches), last_name, state, "N", "NA", "NA"
                        ])

            save_data(data_file[0], data_file[1])
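Example #4 matches on positional CSV columns: icpsr_member[2] for the ICPSR id and icpsr_member[10] for the bioguide id. The same lookup reads more clearly with csv.DictReader, assuming the members file carries icpsr and bioguide_id column headers (an assumption worth verifying against the downloaded file):

import csv
from io import StringIO

def icpsr_for_bioguide(csv_text, bioguide):
    # csv_text is the downloaded members CSV as one string (read_file_content above).
    matches = []
    for row in csv.DictReader(StringIO(csv_text)):
        if row.get("bioguide_id") == bioguide:
            matches.append(int(row["icpsr"]))
    # Mirror run(): only accept an unambiguous single match.
    return matches[0] if len(matches) == 1 else None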
Example #5
def fetch_senate_committee_meetings(existing_meetings, committees, options):
  # Parse the Senate committee meeting XML feed for meetings.
  # To aid users of the data, attempt to assign GUIDs to meetings.

  options = dict(options) # clone
  options["binary"] = True

  meetings = []

  dom = lxml.etree.fromstring(utils.download(
    "http://www.senate.gov/general/committee_schedules/hearings.xml",
    "committee_schedule/senate.xml",
    options))

  for node in dom.xpath("meeting"):
    committee_id = unicode(node.xpath('string(cmte_code)'))
    if committee_id.strip() == "": continue # "No committee hearings scheduled" placeholder
    occurs_at = unicode(node.xpath('string(date)'))
    room = unicode(node.xpath('string(room)'))
    topic = unicode(node.xpath('string(matter)'))

    occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
    topic = re.sub(r"\s+", " ", topic).strip()

    # Validate committee code.
    try:
      committee_code, subcommittee_code = re.match(r"(\D+)(\d+)$", committee_id).groups()
      if committee_code not in committees: raise ValueError(committee_code)
      if subcommittee_code == "00": subcommittee_code = None
      if subcommittee_code and subcommittee_code not in committees[committee_code]["subcommittees"]: raise ValueError(subcommittee_code)
    except:
      print "Invalid committee code", committee_id
      continue

    # See if this meeting already exists. If so, take its GUID.
    # Assume meetings are the same if they are for the same committee/subcommittee and
    # at the same time.
    for mtg in existing_meetings:
      if mtg["committee"] == committee_code and mtg.get("subcommittee", None) == subcommittee_code and mtg["occurs_at"] == occurs_at.isoformat():
        guid = mtg["guid"]
        break
    else:
      # Not found, so create a new ID.
      guid = unicode(uuid.uuid4())

    # Scrape the topic text for mentions of bill numbers.
    congress = utils.congress_from_legislative_year(utils.current_legislative_year(occurs_at))
    bills = []
    bill_number_re = re.compile(r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
    for bill_match in bill_number_re.findall(topic.replace(".", "")):
      bills.append( bill_match[0].lower() + bill_match[1] + "-" + str(congress) )

    # Create the meeting event.
    meetings.append({
      "chamber": "senate",
      "congress": congress,
      "guid": guid,
      "committee": committee_code,
      "subcommittee": subcommittee_code,
      "occurs_at": occurs_at.isoformat(),
      "room": room,
      "topic": topic,
      "bills": bills,
    })

  return meetings
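Unlike Example #1, this variant expects the caller to pass in existing_meetings so GUIDs can be reused. A minimal calling sketch, assuming the same output_for helper and the committees/options structures used in Example #1 are already in scope:

import json
import os

existing_meetings = []
output_file = output_for("senate")  # same helper as in Example #1
if os.path.exists(output_file):
    with open(output_file) as f:
        existing_meetings = json.load(f)

meetings = fetch_senate_committee_meetings(existing_meetings, committees, options)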
Example #6
def fetch_senate_committee_meetings(existing_meetings, committees, options):
    # Parse the Senate committee meeting XML feed for meetings.
    # To aid users of the data, attempt to assign GUIDs to meetings.

    options = dict(options)  # clone
    options["binary"] = True

    meetings = []

    dom = lxml.etree.fromstring(
        utils.download(
            "http://www.senate.gov/general/committee_schedules/hearings.xml",
            "committee_schedule/senate.xml", options))

    for node in dom.xpath("meeting"):
        committee_id = unicode(node.xpath('string(cmte_code)'))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder
        occurs_at = unicode(node.xpath('string(date)'))
        room = unicode(node.xpath('string(room)'))
        topic = unicode(node.xpath('string(matter)'))

        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        topic = re.sub(r"\s+", " ", topic).strip()

        # Validate committee code.
        try:
            committee_code, subcommittee_code = re.match(
                r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees:
                raise ValueError(committee_code)
            if subcommittee_code == "00": subcommittee_code = None
            if subcommittee_code and subcommittee_code not in committees[
                    committee_code]["subcommittees"]:
                raise ValueError(subcommittee_code)
        except:
            print "Invalid committee code", committee_id
            continue

        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same committee/subcommittee and
        # at the same time.
        for mtg in existing_meetings:
            if mtg["committee"] == committee_code and mtg.get(
                    "subcommittee", None) == subcommittee_code and mtg[
                        "occurs_at"] == occurs_at.isoformat():
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            guid = unicode(uuid.uuid4())

        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(
            utils.current_legislative_year(occurs_at))
        bills = []
        bill_number_re = re.compile(
            r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            bills.append(bill_match[0].lower() + bill_match[1] + "-" +
                         str(congress))

        # Create the meeting event.
        meetings.append({
            "chamber": "senate",
            "congress": congress,
            "guid": guid,
            "committee": committee_code,
            "subcommittee": subcommittee_code,
            "occurs_at": occurs_at.isoformat(),
            "room": room,
            "topic": topic,
            "bills": bills,
        })

    return meetings
Example #7
def fetch_senate_committee_meetings(committees, options):
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("senate")
    if os.path.exists(output_file):
        existing_meetings = json.load(open(output_file))

    options = dict(options)  # clone
    options["binary"] = True  #
    options["force"] = True

    meetings = []

    dom = lxml.etree.fromstring(
        utils.download(
            "http://www.senate.gov/general/committee_schedules/hearings.xml", "committee_schedule/senate.xml", options
        )
    )

    for node in dom.xpath("meeting"):
        committee_id = unicode(node.xpath("string(cmte_code)"))
        if committee_id.strip() == "":
            continue  # "No committee hearings scheduled" placeholder
        occurs_at = unicode(node.xpath("string(date)"))
        room = unicode(node.xpath("string(room)"))
        topic = unicode(node.xpath("string(matter)"))

        occurs_at = datetime.datetime.strptime(occurs_at, "%d-%b-%Y %I:%M %p")
        topic = re.sub(r"\s+", " ", topic).strip()

        # Validate committee code.
        try:
            committee_code, subcommittee_code = re.match(r"(\D+)(\d+)$", committee_id).groups()
            if committee_code not in committees:
                raise ValueError(committee_code)
            if subcommittee_code == "00":
                subcommittee_code = None
            if subcommittee_code and subcommittee_code not in committees[committee_code]["subcommittees"]:
                raise ValueError(subcommittee_code)
        except:
            print "Invalid committee code", committee_id
            continue

        # See if this meeting already exists. If so, take its GUID.
        # Assume meetings are the same if they are for the same committee/subcommittee and
        # at the same time.
        for mtg in existing_meetings:
            if (
                mtg["committee"] == committee_code
                and mtg.get("subcommittee", None) == subcommittee_code
                and mtg["occurs_at"] == occurs_at.isoformat()
            ):
                if options.get("debug", False):
                    print "[%s] Reusing gUID." % mtg["guid"]
                guid = mtg["guid"]
                break
        else:
            # Not found, so create a new ID.
            # TODO: Can we make this a human-readable ID?
            guid = unicode(uuid.uuid4())

        # Scrape the topic text for mentions of bill numbers.
        congress = utils.congress_from_legislative_year(utils.current_legislative_year(occurs_at))
        bills = []
        bill_number_re = re.compile(r"(hr|s|hconres|sconres|hjres|sjres|hres|sres)\s?(\d+)", re.I)
        for bill_match in bill_number_re.findall(topic.replace(".", "")):
            bills.append(bill_match[0].lower() + bill_match[1] + "-" + str(congress))

        # Create the meeting event.
        if options.get("debug", False):
            print "[senate][%s][%s] Found meeting in room %s at %s." % (
                committee_code,
                subcommittee_code,
                room,
                occurs_at.isoformat(),
            )

        meetings.append(
            {
                "chamber": "senate",
                "congress": congress,
                "guid": guid,
                "committee": committee_code,
                "subcommittee": subcommittee_code,
                "occurs_at": occurs_at.isoformat(),
                "room": room,
                "topic": topic,
                "bill_ids": bills,
            }
        )

    print "[senate] Found %i meetings." % len(meetings)
    return meetings
Example #8
def run():

    # default to caching
    cache = utils.flags().get('cache', True)
    force = not cache


    only_bioguide = utils.flags().get('bioguide', None)
    congress = utils.flags().get('congress',None)


    data_files = []

    print("Loading %s..." % "legislators-current.yaml")
    legislators = load_data("legislators-current.yaml")
    data_files.append((legislators,"legislators-current.yaml"))
    print("Loading %s..." % "legislators-historical.yaml")
    legislators = load_data("legislators-historical.yaml")
    data_files.append((legislators,"legislators-historical.yaml"))

    # Load roll call data. This may need updating for the 114th+ Congresses, since the URL format for those is unclear.
    if congress is None:
        raise Exception("the --congress flag is required")
    elif congress == "113":
        url_senate = "http://amypond.sscnet.ucla.edu/rollcall/static/S113.ord"
        url_house = "http://amypond.sscnet.ucla.edu/rollcall/static/H113.ord"
    elif int(congress) <10 and int(congress) >0:
        url_senate = "ftp://voteview.com/dtaord/sen0%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou0%skh.ord" % congress
    elif int(congress) < 113 and int(congress) >= 10:
        url_senate = "ftp://voteview.com/dtaord/sen%skh.ord" % congress
        url_house = "ftp://voteview.com/dtaord/hou%skh.ord" % congress
    else:
        raise Exception("no data for congress " + congress)

    senate_destination = "icpsr/source/senate_rollcall%s.txt" % congress
    senate_data = utils.download(url_senate, senate_destination, force)

    house_destination = "icpsr/source/house_rollcall%s.txt" % congress
    house_data = utils.download(url_house, house_destination, force)

    error_log = csv.writer(open("cache/errors/mismatch/mismatch_%s.csv" % congress, "w", newline=""))
    error_log.writerow(["error_type","matches","icpsr_name","icpsr_state","is_territory","old_id","new_id"])



    read_files = [(senate_data,"sen"),(house_data,"rep")]
    print("Running for congress " + congress)
    for read_file in read_files:
        for data_file in data_files:
            for legislator in data_file[0]:
                num_matches = 0
                # this can't run unless we've already collected a bioguide for this person
                bioguide = legislator["id"].get("bioguide", None)
                # if we've limited this to just one bioguide, skip over everyone else
                if only_bioguide and (bioguide != only_bioguide):
                    continue
                #if not in currently read chamber, skip
                chamber = legislator['terms'][len(legislator['terms'])-1]['type']
                if chamber != read_file[1]:
                    continue

                #only run for selected congress
                latest_congress = utils.congress_from_legislative_year(utils.legislative_year(parse_date(legislator['terms'][len(legislator['terms'])-1]['start'])))
                if chamber == "sen":
                    congresses = [latest_congress,latest_congress+1,latest_congress+2]
                else:
                    congresses =[latest_congress]

                if int(congress) not in congresses:
                    continue

                # pull data to match from yaml

                last_name_unicode = legislator['name']['last'].upper().strip().replace('\'','')
                # Strip accents and decode back to str so the comparison below isn't str-vs-bytes.
                last_name = unicodedata.normalize('NFD', last_name_unicode).encode('ascii', 'ignore').decode('ascii')
                state = utils.states[legislator['terms'][len(legislator['terms'])-1]['state']].upper()[:7].strip()
                # select icpsr source data based on more recent chamber

                write_id = ""
                lines = read_file[0].split('\n')
                for line in lines:
                    # parse source data
                    icpsr_state = line[12:20].strip()
                    icpsr_name = line[21:].strip().strip(string.digits).strip()
                    icpsr_id = line[3:8].strip()

                    #ensure unique match
                    if icpsr_name[:8] == last_name[:8] and state == icpsr_state:
                        num_matches += 1
                        write_id = icpsr_id
                # Skip if an ICPSR id is already in the data and matches.
                if "icpsr" in legislator["id"]:
                    # The stored icpsr id is an int, so compare numerically.
                    if write_id == "" or int(write_id) == legislator["id"]["icpsr"]:
                        continue
                    else:
                        error_log.writerow(["Incorrect_ID","NA",last_name[:8],state,"NA",legislator["id"]["icpsr"],write_id])
                        print("ID updated for %s" % last_name)
                if num_matches == 1:
                    legislator['id']['icpsr'] = int(write_id)
                else:
                    # Territories and D.C. (state names truncated to 7 characters above) are logged as expected non-matches.
                    if state in ('GUAM', 'PUERTO', 'VIRGIN', 'DISTRIC', 'AMERICA', 'NORTHER', 'PHILIPP'):
                        error_log.writerow(["Non_1_match_number",str(num_matches),last_name[:8],state,"Y","NA","NA"])
                    else:
                        print(str(num_matches) + " matches found for "+ last_name[:8] + ", " + state + " in congress " + str(congress))
                        error_log.writerow(["Non_1_match_number",str(num_matches),last_name,state,"N","NA","NA"])


            save_data(data_file[0], data_file[1])
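run() takes its settings from utils.flags(): --congress is required, while --bioguide and --cache are optional. A sketch of the values the function expects to see; the command line in the comment, the script name, and the bioguide id are illustrative assumptions, not taken from the listing:

# Hypothetical invocation:
#   python icpsr_ids.py --congress=110 --bioguide=A000000 --cache=False
#
# Inside run(), utils.flags() is expected to surface those roughly as:
flags = {
    "congress": "110",      # required; run() raises an Exception without it
    "bioguide": "A000000",  # optional: restrict the update to one legislator (placeholder id)
    "cache": True,          # optional: defaults to True; force is derived as `not cache`
}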