def scrape_senators(self, year):
    """Scrape Texas senators from the Senate roster and save them.

    Each senator is saved with party, district and committee-membership
    roles pulled from their individual detail page.

    NOTE(review): ``year`` is never used -- the term is hard-coded to
    '81'; confirm this is intentional.
    """
    roster_url = 'http://www.senate.state.tx.us/75r/senate/senmem.htm'
    with self.urlopen_context(roster_url) as page:
        doc = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
        for table in doc.xpath('//table[@summary="senator identification"]'):
            link = table.xpath('tr/td[@headers="senator"]/a')[0]
            full_name = link.text
            district = table.xpath('string(tr/td[@headers="district"])')
            party = table.xpath('string(tr/td[@headers="party"])')
            pre, first, last, suffixes = name_tools.split(full_name)
            senator = Legislator('81', 'upper', district, full_name,
                                 first, last, '', party,
                                 suffix=suffixes)
            senator.add_source(roster_url)
            detail_url = ('http://www.senate.state.tx.us/75r/senate/'
                          + link.attrib['href'])
            with self.urlopen_context(detail_url) as detail_page:
                detail_doc = lxml.etree.fromstring(detail_page,
                                                   lxml.etree.HTMLParser())
                # The committee list is the element immediately after the
                # "Committee Membership" heading.
                heading = detail_doc.xpath(
                    "//h2[contains(text(), 'Committee Membership')]")[0]
                committee_list = heading.getnext()
                for anchor in committee_list.xpath('li/a'):
                    committee_name = anchor.text
                    if anchor.tail:
                        committee_name += anchor.tail
                    senator.add_role('committee member', '81',
                                     committee=committee_name.strip())
            self.save_legislator(senator)
def _scrape_lower_special_committees(self):
    """Scrape Louisiana House special committees and their members."""
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)
    accordion = page.xpath('//table[@id="table106"]//div[@class='
                           '"exBody1A"]/div[@class="accordion"]')[0]
    for heading in accordion.xpath('./h3'):
        raw_name = heading.xpath('string()').strip()
        name = self._normalize_committee_name(raw_name)
        # Committees whose names start with "Joint" span both chambers.
        chamber = 'joint' if name.startswith('Joint') else 'lower'
        committee = Committee(chamber, name)
        committee.add_source(url)
        member_rows = heading.xpath('./following-sibling::div['
                                    '@class="pane"]//tr[@class="linkStyle2"]')
        for row in member_rows:
            raw_member = row.xpath('normalize-space(string(./td[1]))')
            # Re-join the name_tools parts to normalize spacing.
            member = ' '.join(filter(None, name_tools.split(raw_member)))
            raw_role = row.xpath('normalize-space(string(./td[2]))')
            committee.add_member(member, self._normalize_member_role(raw_role))
        self.save_committee(committee)
def scrape_reps(self, year):
    """Scrape Texas House members and their committee assignments.

    NOTE(review): ``year`` is unused; the term is hard-coded to '81'.
    """
    rep_url = 'http://www.house.state.tx.us/members/welcome.php'
    with self.urlopen_context(rep_url) as page:
        root = lxml.etree.fromstring(page, lxml.etree.HTMLParser())
        for el in root.xpath('//form[@name="frmMembers"]/table/tr')[1:]:
            full_name = el.xpath('string(td/a/font/span)')
            district = el.xpath('string(td[2]/span)')
            county = el.xpath('string(td[3]/span)')
            if full_name.startswith('District'):
                # Ignore empty seats
                continue
            pre, first, last, suffixes = name_tools.split(full_name)
            party = ''
            leg = Legislator('81', 'lower', district, full_name,
                             first, last, '', party,
                             suffix=suffixes)
            leg.add_source(rep_url)
            # Member pages are reached via a meta-refresh redirect that
            # we have to follow by hand.
            redirect_url = el.xpath('td/a')[0].attrib['href']
            redirect_url = ('http://www.house.state.tx.us/members/'
                            + redirect_url)
            details_url = redirect_url
            with self.urlopen_context(redirect_url) as redirect_page:
                redirect = lxml.etree.fromstring(redirect_page,
                                                 lxml.etree.HTMLParser())
                try:
                    filename = redirect.xpath(
                        "//meta[@http-equiv='refresh']"
                    )[0].attrib['content']
                    filename = filename.split('0;URL=')[1]
                    details_url = details_url.replace('welcome.htm', filename)
                except (IndexError, KeyError):
                    # BUG FIX: was a bare ``except:``, which also swallowed
                    # KeyboardInterrupt/SystemExit.  Only a missing
                    # meta-refresh tag is expected here: the Speaker's page
                    # does not redirect, and the Speaker sits on no
                    # committees, so save and move on to the next member.
                    self.save_legislator(leg)
                    continue
            with self.urlopen_context(details_url) as details_page:
                details = lxml.etree.fromstring(details_page,
                                                lxml.etree.HTMLParser())
                comms = details.xpath(
                    "//b[contains(text(), 'Committee Assignments')]/"
                    "..//a")
                for comm in comms:
                    leg.add_role('committee member', '81',
                                 committee=comm.text.strip())
            self.save_legislator(leg)
def scrape_legislators(self, chamber, year):
    """Scrape Wisconsin legislators for *chamber* in *year*.

    Raises NoDataForYear for even years outside the current biennium.
    """
    year = int(year)
    session = self.internal_sessions[year][0][1]
    # iterating through subsessions would be a better way to do this..
    # BUG FIX: the original joined these comparisons with ``or``, which is
    # always true (today's year cannot equal both ``year`` and
    # ``year + 1``), so *every* even year raised NoDataForYear -- even the
    # current one.  With ``and`` we raise only for even years outside the
    # current biennium.
    today = dt.date.today()
    if year % 2 == 0 and (year != today.year and
                          year + 1 != today.year):
        raise NoDataForYear(year)
    if chamber == "upper":
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=senate"
    else:
        url = "http://legis.wi.gov/w3asp/contact/legislatorslist.aspx?house=assembly"
    # Page is latin-1 encoded (Python 2 ``unicode``).
    body = unicode(self.urlopen(url), "latin-1")
    page = lxml.html.fromstring(body)
    for row in page.cssselect("#ctl00_C_dgLegData tr"):
        if len(row.cssselect("td a")) > 0:
            rep_url = list(row)[0].cssselect("a[href]")[0].get("href")
            # First cell looks like "Lastname, Firstname (P)".
            (full_name, party) = re.findall(
                r"([\w\-\,\s\.]+)\s+\(([\w])\)",
                list(row)[0].text_content())[0]
            pre, first, last, suffixes = name_tools.split(full_name)
            district = str(int(list(row)[2].text_content()))
            leg = Legislator(session, chamber, district, full_name,
                             first, last, "", party, suffix=suffixes)
            leg.add_source(rep_url)
            leg = self.add_committees(leg, rep_url, session)
            self.save_legislator(leg)
def scrape_upper_offices(self, legislator):
    """Attach capitol (and, if listed, district) office info to an
    Oklahoma senator.

    The bio-page URL is guessed from the split last name; on a 404 we
    retry with the other name part (some names come in backwards).
    """
    guessed_url_tmpl = ('http://www.oksenate.gov/Senators/'
                        'biographies/%s_bio.html')
    last_name_parts = name_tools.split(legislator['full_name'])
    last_name = last_name_parts[2].replace(' ', '_')
    guessed_url = guessed_url_tmpl % last_name
    try:
        html = self.urlopen(guessed_url)
    except scrapelib.HTTPError:
        # The name was backwards; retry with first name (i.e., last name)
        last_name = last_name_parts[1].replace(' ', '_').strip(',')
        guessed_url = guessed_url_tmpl % last_name
        html = self.urlopen(guessed_url)
    legislator.add_source(guessed_url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(guessed_url)
    xpath = '//h3[contains(., "Office")]'
    # next() builtin instead of the Python-2-only ``.next()`` method.
    table = next(doc.xpath(xpath)[0].itersiblings())
    col1, col2 = table.xpath('tr[2]/td')
    # Add the capitol office.  list(...) so .pop() below works whether
    # map() returns a list (py2) or an iterator (py3).
    col1 = list(map(scrub, col1.itertext()))
    while True:
        # Throw away anything after the email address.
        last = col1[-1]
        if '@' not in last and not re.search(r'[\d\-\(\) ]{7,}', last):
            # BUG FIX: the discarded line used to be fed through a
            # leftover debug ``print`` statement; drop the print, keep
            # the pop.
            col1.pop()
        else:
            break
    # Set email on the leg object.
    email = col1.pop()
    legislator['email'] = email
    # Next line is the phone number.
    phone = col1.pop()
    office = dict(
        name='Capitol Office', type='capitol',
        address='\n'.join(col1),
        fax=None, email=None, phone=phone)
    legislator.add_office(**office)
    col2 = list(map(scrub, col2.itertext()))
    if len(col2) < 2:
        return
    # NOTE(review): the district office reuses the *capitol* phone
    # number, matching the original behavior -- confirm this is intended.
    office = dict(
        name='District Office', type='district',
        address='\n'.join(col2),
        fax=None, email=None, phone=phone)
    legislator.add_office(**office)
def breakout_names(apps, schema_editor):
    """Data migration: fill first_name/last_name on every User by
    splitting full_name with name_tools (prefix/suffix are discarded)."""
    User = apps.get_model("faceitweb", "User")
    for user in User.objects.all():
        _prefix, first, last, _suffix = name_tools.split(user.full_name)
        user.first_name = first
        user.last_name = last
        user.save()
def split_name(obj):
    """
    If the supplied legislator/person object is missing 'first_name' or
    'last_name' then use name_tools to split.
    """
    if obj['_type'] not in ('person', 'legislator'):
        return obj
    # Split only when either component is absent or empty.
    if any(not obj.get(key) for key in ('first_name', 'last_name')):
        (obj['first_name'],
         obj['last_name'],
         obj['suffixes']) = name_tools.split(obj['full_name'])[1:]
    return obj
def name_forms(name):
    """
    Return a set of normalized lookup forms for *name* (lowercased,
    periods dropped, whitespace collapsed).

    >>> forms = name_forms("Michael Stephens")
    >>> 'michael stephens' in forms
    True
    >>> 'stephens, michael' in forms
    True
    >>> 'm stephens' in forms
    True
    >>> 'stephens' in forms
    True
    """
    # BUG FIX: the doctest asserted the misspelled 'michael stehpens' was
    # in forms, which would fail if doctests ever ran.
    sname = {}
    (sname['pre'], sname['first'],
     sname['last'], sname['post']) = name_tools.split(name)
    forms = set()

    def add_form(fmt):
        # Renamed the parameter from ``str`` (shadowed the builtin).
        form = (fmt % sname).strip(', \t\r\n').lower()
        form = form.replace('.', '')
        # Collapse all whitespace segments into single space characters
        form = ' '.join(form.split())
        forms.add(form)

    add_form("%(first)s %(last)s")
    add_form("%(last)s")
    add_form("%(pre)s %(first)s %(last)s")
    add_form("%(first)s %(last)s %(post)s")
    # (A duplicate "%(pre)s %(first)s %(last)s" call was removed --
    # forms is a set, so it was a no-op.)
    add_form("%(pre)s %(first)s %(last)s %(post)s")
    add_form("%(last)s, %(first)s")
    pre_first = ("%(pre)s %(first)s" % sname).strip(', \t\r\n')
    add_form("%(last)s, " + pre_first)
    add_form("%s %s" % (sname['first'][0], sname['last']))
    add_form("%s. %s" % (sname['first'][0], sname['last']))
    initials = ' '.join([w[0] for w in sname['first'].split()])
    add_form(initials + " %(last)s")
    add_form("%(last)s, " + initials)
    return forms
def handle(self, *args, **options):
    """Management command: import bioguide.congress.gov search results
    into Legislator rows for the given congress (first positional arg,
    default 112)."""
    if args:
        congress = args[0]
    else:
        congress = 112
    # POST body for the bioguide search form; blank fields match everyone.
    data = 'lastname=&firstname=&position=&state=&party=&congress=%s' % str(congress)
    url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req).read()
    soup = BeautifulSoup(response)
    # Skip the header rows; real result rows have exactly 6 cells.
    for row in soup.findAll('tr')[2:]:
        cells = row.findAll('td')
        if len(cells) != 6:
            continue
        try:
            try:
                name = cells[0].find('a').renderContents()
                bioguide_id = cells[0].find('a')['href'].split('=')[-1]
            except AttributeError:
                # No <a> in the first cell: carry name/bioguide_id over
                # from the previous row (additional terms for the same
                # person).  NOTE(review): on the very first such row these
                # names are undefined and the outer except swallows the
                # resulting NameError.
                pass
            birth_death, position, party, state, congress = [x.renderContents() for x in cells[1:]]
            congress = congress.split('<br />')[0]
            data = {'bioguide_id': bioguide_id,
                    'birth_death': birth_death,
                    'position': position,
                    'party': party,
                    'state': state,
                    'congress': congress,
                    }
            data['prefix'], data['first'], data['last'], data['suffix'] = name_tools.split(name)
            print data
        except Exception, e:
            # NOTE(review): broad catch-and-print keeps the import running
            # past malformed rows, but also hides real bugs.
            print Exception, e
        try:
            legislator, created = Legislator.objects.get_or_create(**data)
        except IntegrityError:
            # Row already exists; move on.
            continue
def import_committees(state, data_dir):
    """Attach committee-membership roles to already-imported legislators.

    Reads <data_dir>/<state>/committees/*.json; for each committee member,
    looks up the matching legislator by split first/last name within the
    current term and appends a 'committee member' role (skipping
    duplicates).  Ambiguous or unmatched names are reported and skipped.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')
    for path in glob.iglob(pattern):
        with open(path) as f:
            data = prepare_obj(json.load(f))
        meta = db.metadata.find_one({'_id': state})
        current_term = meta['terms'][-1]['name']
        for member in data['members']:
            if not member['legislator']:
                continue
            (pre, first, last, suff) = name_tools.split(member['legislator'])
            # Match by name within the current term for this state.
            found = db.legislators.find({
                'first_name': first,
                'last_name': last,
                'roles': {'$elemMatch': {'term': current_term,
                                         'state': state}}})
            if found.count() > 1:
                print "Too many matches for %s" % member['legislator']
                continue
            elif found.count() == 0:
                print "No matches for %s" % member['legislator']
                continue
            legislator = found[0]
            # for/else: append the role only if no equivalent role exists.
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    role['term'] == current_term and
                    role['committee'] == data['name']):
                    break
            else:
                legislator['roles'].append({
                    'type': 'committee member',
                    'committee': data['name'],
                    'term': current_term,
                    'chamber': data['chamber']})
            legislator['updated_at'] = datetime.datetime.now()
            db.legislators.save(legislator)
def _scrape_lower_standing_committee(self, committee_name, url):
    """Scrape one lower-chamber standing-committee roster page."""
    doc = self.lxmlize(url)
    panel = Committee('lower', committee_name)
    panel.add_source(url)
    member_rows = doc.xpath('//table[@id="body_ListView1_itemPlaceholder'
                            'Container"]/tr[@class="linkStyle2"]')
    for member_row in member_rows:
        raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
        # Re-join the name_tools parts to normalize spacing.
        member = ' '.join(filter(None, name_tools.split(raw_name)))
        raw_role = member_row.xpath('normalize-space(string(./td[2]))')
        panel.add_member(member, self._normalize_member_role(raw_role))
    self.save_committee(panel)
def getVirtualSets(element, source):
    """Extract normalized author last names from an oai_dc record.

    Returns a list of lowercased, ASCII-transliterated, punctuation-
    stripped last names pulled from the record's dc:creator fields.
    NOTE(review): ``source`` is unused here.
    """
    namespaces = {
        'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
        'dc': 'http://purl.org/dc/elements/1.1/'}
    xpath_ev = etree.XPathEvaluator(element, namespaces=namespaces)
    matches = xpath_ev.evaluate('oai_dc:dc/dc:creator/text()')
    result = []
    for v in matches:
        if v.strip() == "":
            continue
        # Unescape any HTML markup/entities, then transliterate to ASCII.
        name = unicode(html.fromstring(v).text)
        name = unidecode(name)
        pre, first, last, post = name_tools.split(name)
        name = last.lower().strip()
        # Order matters: strip escaping chars, drop trailing non-text,
        # then collapse remaining non-text runs into hyphens.
        name = OAIDCLastnameExtractor.escaping_chars_re.sub('', name)
        name = OAIDCLastnameExtractor.final_nontext_re.sub('', name)
        name = OAIDCLastnameExtractor.nontext_re.sub('-', name)
        result.append(name)
    return result
def prepare_obj(obj):
    """
    Convert timestamps in the scraper output to datetimes so that they
    will be saved as Mongo datetimes, and standardize some other fields.
    """
    for source in obj.get('sources', []):
        source['retrieved'] = timestamp_to_dt(source['retrieved'])
    for action in obj.get('actions', []):
        action['date'] = timestamp_to_dt(action['date'])
    for role in obj.get('roles', []):
        if role['start_date']:
            role['start_date'] = timestamp_to_dt(role['start_date'])
        if role['end_date']:
            role['end_date'] = timestamp_to_dt(role['end_date'])
        role['state'] = obj['state']
    for vote in obj.get('votes', []):
        vote['date'] = timestamp_to_dt(vote['date'])
    if 'date' in obj:
        obj['date'] = timestamp_to_dt(obj['date'])
    # If we are handling a legislator and the scraped data includes both
    # 'first_name' and 'last_name' fields, then use them.  If one or both
    # is missing, run the name_tools splitting code to generate them.
    # (Removed a dead ``split_name = False`` local that was never read
    # and shadowed the module-level split_name helper.)
    if obj['_type'] in ('person', 'legislator'):
        for key in ('first_name', 'last_name'):
            if key not in obj or not obj[key]:
                # Need to split
                (obj['first_name'],
                 obj['last_name'],
                 obj['suffixes']) = name_tools.split(obj['full_name'])[1:]
                break
    return obj
def parse_comma_name(name):
    """
    Parse a name of the form "Last name, First name" to (first name, last name)

    Try to do something reasonable if there is no comma.
    """
    if ',' in name:
        # In this case name_tools does it well
        prefix, first_name, last_name, suffix = name_tools.split(name)
    else:
        words, separators = split_name_words(name)
        if not words:
            return ('', '')
        first_name = None
        last_name = None
        # from_lists: the heuristics below produce word *lists* in
        # first/last; only CASE 6 (name_tools fallback) sets strings.
        from_lists = True

        # Search for initials in the words
        initial = list(map(contains_initials, words))
        capitalized = list(map(is_fully_capitalized, words))

        # CASE 1: the first word is capitalized but not all of them are
        # we assume that it is the first word of the last name
        if not initial[0] and capitalized[0] and not all(capitalized):
            (last, first) = predsplit_forward(
                (lambda i: capitalized[i] and not initial[i]),
                words)
        # CASE 2: the last word is capitalized but not all of them are
        # we assume that it is the last word of the last name
        elif not initial[-1] and capitalized[-1] and not all(capitalized):
            (first, last) = predsplit_forward(
                (lambda i: (not capitalized[i]) or initial[i]),
                words)
        # CASE 3: the first word is an initial
        elif initial[0]:
            (first, last) = predsplit_forward(
                (lambda i: initial[i]),
                words)
        # CASE 4: the last word is an initial
        # this is trickier, we know that the last name comes first
        # but we don't really know where it stops.
        # For simplicity we assume that all the words in the first
        # name are initials
        elif initial[-1]:
            (last, first) = predsplit_backwards(
                (lambda i: initial[i]),
                words)
        # CASE 5: there are initials in the name, but neither
        # at the beginning nor at the end
        elif True in initial:
            # Everything up to and including the final initial is
            # treated as the first name.
            last_initial_idx = None
            for i in range(len(words)):
                if initial[i]:
                    last_initial_idx = i
            first = words[:last_initial_idx+1]
            last = words[last_initial_idx+1:]
        # CASE 6: we have no clue
        # We fall back on name_tools, where wise things are done
        # to parse correctly names such as "Colin de la Higuera"
        else:
            prefix, first_name, last_name, suffix = name_tools.split(name)
            from_lists = False

        if from_lists:
            first_name = ' '.join(first)
            last_name = ' '.join(last)

    first_name = first_name.strip()
    last_name = last_name.strip()
    first_name = normalize_name_words(first_name)
    last_name = normalize_name_words(last_name)

    if not last_name:
        # Ensure the non-empty component ends up in last_name.
        first_name, last_name = last_name, first_name

    return (first_name, last_name)
def import_committees(state, data_dir):
    """Import scraped committee JSON for *state* into Mongo.

    With no standalone committee files, committees are synthesized from
    legislators' existing 'committee member' roles.  Otherwise each file
    is upserted and every named member gets a back-referencing role
    carrying the committee's _id.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, "committees", "*.json")
    meta = db.metadata.find_one({"_id": state})
    current_term = meta["terms"][-1]["name"]
    current_session = meta["terms"][-1]["sessions"][-1]
    paths = glob.glob(pattern)
    if not paths:
        # Not standalone committees
        for legislator in db.legislators.find({"roles": {"$elemMatch": {"term": current_term, "state": state}}}):
            for role in legislator["roles"]:
                if role["type"] == "committee member" and "committee_id" not in role:
                    spec = {"state": role["state"], "chamber": role["chamber"], "committee": role["committee"]}
                    if "subcommittee" in role:
                        spec["subcommittee"] = role["subcommittee"]
                    committee = db.committees.find_one(spec)
                    if not committee:
                        # First sighting: create a stub committee document.
                        committee = spec
                        committee["_type"] = "committee"
                        committee["members"] = []
                        committee["sources"] = []
                        insert_with_id(committee)
                    # for/else: add the member only if not already listed.
                    for member in committee["members"]:
                        if member["leg_id"] == legislator["leg_id"]:
                            break
                    else:
                        committee["members"].append(
                            {"name": legislator["full_name"], "leg_id": legislator["leg_id"], "role": "member"}
                        )
                        db.committees.save(committee, safe=True)
                    role["committee_id"] = committee["_id"]
            db.legislators.save(legislator, safe=True)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        spec = {"state": state, "chamber": data["chamber"], "committee": data["committee"]}
        if "subcommittee" in data:
            spec["subcommittee"] = data["subcommittee"]
        committee = db.committees.find_one(spec)
        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)
        for member in committee["members"]:
            if not member["name"]:
                continue
            (pre, first, last, suff) = name_tools.split(member["name"])
            leg_id = get_legislator_id(state, current_session, data["chamber"], member["name"])
            if not leg_id:
                print "No matches for %s" % member["name"].encode("ascii", "ignore")
                member["leg_id"] = None
                continue
            legislator = db.legislators.find_one({"_id": leg_id})
            member["leg_id"] = leg_id
            # for/else: append the role only if no equivalent role exists.
            for role in legislator["roles"]:
                if (
                    role["type"] == "committee member"
                    and role["term"] == current_term
                    and role["committee_id"] == committee["_id"]
                ):
                    break
            else:
                new_role = {
                    "type": "committee member",
                    "committee": committee["committee"],
                    "term": current_term,
                    "chamber": committee["chamber"],
                    "committee_id": committee["_id"],
                    "state": state,
                }
                if "subcommittee" in committee:
                    new_role["subcommittee"] = committee["subcommittee"]
                legislator["roles"].append(new_role)
            legislator["updated_at"] = datetime.datetime.utcnow()
            db.legislators.save(legislator, safe=True)
        db.committees.save(committee, safe=True)
    print "imported %s committee files" % len(paths)
    link_parents(state)
    ensure_indexes()
def import_committees(state, data_dir):
    """Import scraped committee JSON for *state* into Mongo.

    All existing member lists for the state are cleared first.  With no
    standalone committee files, committees are synthesized from
    legislators' 'committee member' roles; otherwise each file is
    upserted and every named member gets a back-referencing role
    carrying the committee's _id.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')
    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']
    current_session = meta['terms'][-1]['sessions'][-1]
    paths = glob.glob(pattern)
    # Reset membership so removed members don't linger between imports.
    for committee in db.committees.find({'state': state}):
        committee['members'] = []
        db.committees.save(committee)
    if not paths:
        # Not standalone committees
        for legislator in db.legislators.find({
            'roles': {'$elemMatch': {'term': current_term,
                                     'state': state}}}):
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    'committee_id' not in role):
                    spec = {'state': role['state'],
                            'chamber': role['chamber'],
                            'committee': role['committee']}
                    if 'subcommittee' in role:
                        spec['subcommittee'] = role['subcommittee']
                    committee = db.committees.find_one(spec)
                    if not committee:
                        # First sighting: create a stub committee document.
                        committee = spec
                        committee['_type'] = 'committee'
                        committee['members'] = []
                        committee['sources'] = []
                        if 'subcommittee' not in committee:
                            committee['subcommittee'] = None
                        insert_with_id(committee)
                    # for/else: add the member only if not already listed.
                    for member in committee['members']:
                        if member['leg_id'] == legislator['leg_id']:
                            break
                    else:
                        committee['members'].append(
                            {'name': legislator['full_name'],
                             'leg_id': legislator['leg_id'],
                             'role': role.get('position') or 'member'})
                        db.committees.save(committee, safe=True)
                    role['committee_id'] = committee['_id']
            db.legislators.save(legislator, safe=True)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        spec = {'state': state,
                'chamber': data['chamber'],
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']
        committee = db.committees.find_one(spec)
        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)
        for member in committee['members']:
            if not member['name']:
                continue
            (pre, first, last, suff) = name_tools.split(member['name'])
            leg_id = get_legislator_id(state, current_session,
                                       data['chamber'], member['name'])
            if not leg_id:
                print "No matches for %s" % member['name'].encode(
                    'ascii', 'ignore')
                member['leg_id'] = None
                continue
            legislator = db.legislators.find_one({'_id': leg_id})
            member['leg_id'] = leg_id
            # for/else: append the role only if no equivalent role exists.
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    role['term'] == current_term and
                    role['committee_id'] == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}
                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
            legislator['updated_at'] = datetime.datetime.utcnow()
            db.legislators.save(legislator, safe=True)
        db.committees.save(committee, safe=True)
    print 'imported %s committee files' % len(paths)
    link_parents(state)
    ensure_indexes()
def last_first(self):
    """Return this name formatted as "Last Suffix, First"."""
    prefix, first, last, suffix = name_tools.split(self.__unicode__())
    formatted = '%s %s, %s' % (last, suffix, first)
    # Drop whitespace left before punctuation when suffix is empty
    # (e.g. "Smith , John" -> "Smith, John").
    return re.sub(r'\s+([^\w])', r'\1', formatted)
def parse_comma_name(name):
    """
    Parse a name of the form "Last name, First name" to (first name, last name)

    Try to do something reasonable if there is no comma.
    """
    if ',' in name:
        # In this case name_tools does it well
        prefix, first_name, last_name, suffix = name_tools.split(name)
    else:
        words, separators = split_name_words(name)
        if not words:
            return ('', '')
        first_name = None
        last_name = None
        # from_lists: the heuristics below produce word *lists* in
        # first/last; only CASE 6 (name_tools fallback) sets strings.
        from_lists = True

        # Search for initials in the words
        initial = map(contains_initials, words)
        capitalized = map(is_fully_capitalized, words)

        # CASE 1: the first word is capitalized but not all of them are
        # we assume that it is the first word of the last name
        if not initial[0] and capitalized[0] and not all(capitalized):
            (last, first) = predsplit_forward(
                (lambda i: capitalized[i] and not initial[i]),
                words)
        # CASE 2: the last word is capitalized but not all of them are
        # we assume that it is the last word of the last name
        elif not initial[-1] and capitalized[-1] and not all(capitalized):
            (first, last) = predsplit_forward(
                (lambda i: (not capitalized[i]) or initial[i]),
                words)
        # CASE 3: the first word is an initial
        elif initial[0]:
            (first, last) = predsplit_forward(
                (lambda i: initial[i]),
                words)
        # CASE 4: the last word is an initial
        # this is trickier, we know that the last name comes first
        # but we don't really know where it stops.
        # For simplicity we assume that all the words in the first
        # name are initials
        elif initial[-1]:
            (last, first) = predsplit_backwards(
                (lambda i: initial[i]),
                words)
        # CASE 5: there are initials in the name, but neither
        # at the beginning nor at the end
        elif True in initial:
            # Everything up to and including the final initial is
            # treated as the first name.
            last_initial_idx = None
            for i in range(len(words)):
                if initial[i]:
                    last_initial_idx = i
            first = words[:last_initial_idx+1]
            last = words[last_initial_idx+1:]
        # CASE 6: we have no clue
        # We fall back on name_tools, where wise things are done
        # to parse correctly names such as "Colin de la Higuera"
        else:
            prefix, first_name, last_name, suffix = name_tools.split(name)
            from_lists = False

        if from_lists:
            first_name = ' '.join(first)
            last_name = ' '.join(last)

    first_name = first_name.strip()
    last_name = last_name.strip()
    first_name = normalize_name_words(first_name)
    last_name = normalize_name_words(last_name)

    if not last_name:
        # Ensure the non-empty component ends up in last_name.
        first_name, last_name = last_name, first_name

    return (first_name, last_name)
def convert_legislator(leg):
    """Convert an ORM legislator object into the legacy API dict format."""
    # Prefer explicit given/family names; otherwise split the display name.
    if leg.given_name and leg.family_name:
        first_name, last_name, suffixes = leg.given_name, leg.family_name, ""
    else:
        _, first_name, last_name, suffixes = name_tools.split(leg.name)

    legacy_ids = [
        oid.identifier
        for oid in leg.identifiers.all()
        if oid.scheme == "legacy_openstates"
    ] or ["~not available~"]

    cr = leg.current_role
    party = cr["party"]
    chamber = cr["chamber"]
    district = cr["district"]
    state = cr["state"]

    # Collect contact details grouped by office label; first email wins.
    email = None
    offices = defaultdict(dict)
    for cd in leg.contact_details.all():
        offices[cd.note][cd.type] = cd.value
        if cd.type == "email" and not email:
            email = cd.value

    active = bool(chamber and district)

    try:
        url = leg.links.all()[0].url
    except IndexError:
        url = ""

    roles = []
    if active:
        roles.append({
            "term": static.TERMS[state][-1]["name"],
            "district": district,
            "chamber": chamber,
            "state": state,
            "party": party,
            "type": "member",
            "start_date": None,
            "end_date": None,
        })

    office_dicts = []
    for label, details in offices.items():
        office_dicts.append({
            "name": label,
            "fax": details.get("fax"),
            "phone": details.get("voice"),
            "email": details.get("email"),
            "address": details.get("address"),
            "type": "capitol" if "capitol" in label.lower() else "district",
        })

    return {
        "id": legacy_ids[0],
        "leg_id": legacy_ids[0],
        "all_ids": legacy_ids,
        "full_name": leg.name,
        "first_name": first_name,
        "last_name": last_name,
        "suffix": suffixes,
        "photo_url": leg.image,
        "url": url,
        "email": email,
        "party": party,
        "chamber": chamber,
        "district": district,
        "state": state,
        "sources": [{"url": s.url} for s in leg.sources.all()],
        "active": active,
        "roles": roles,
        "offices": office_dicts,
        "old_roles": {},
        "middle_name": "",
        "country": "us",
        "level": "state",
        "created_at": leg.created_at.strftime(DATE_FORMAT),
        "updated_at": leg.updated_at.strftime(DATE_FORMAT),
    }
def import_committees(state, data_dir):
    """Import scraped committee JSON for *state* into Mongo.

    With no standalone committee files, committees are synthesized from
    legislators' 'committee member' roles; otherwise each file is
    upserted and members are matched to legislators by split first/last
    name within the current term.  Ambiguous or unmatched names are
    reported and skipped.
    """
    data_dir = os.path.join(data_dir, state)
    pattern = os.path.join(data_dir, 'committees', '*.json')
    meta = db.metadata.find_one({'_id': state})
    current_term = meta['terms'][-1]['name']
    paths = glob.glob(pattern)
    if not paths:
        # Not standalone committees
        for legislator in db.legislators.find({
            'roles': {'$elemMatch': {'term': current_term,
                                     'state': state}}}):
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    'committee_id' not in role):
                    spec = {'state': role['state'],
                            'chamber': role['chamber'],
                            'committee': role['committee']}
                    if 'subcommittee' in role:
                        spec['subcommittee'] = role['subcommittee']
                    committee = db.committees.find_one(spec)
                    if not committee:
                        # First sighting: create a stub committee document.
                        committee = spec
                        committee['_type'] = 'committee'
                        committee['members'] = []
                        insert_with_id(committee)
                    # for/else: add the member only if not already listed.
                    for member in committee['members']:
                        if member['leg_id'] == legislator['leg_id']:
                            break
                    else:
                        committee['members'].append(
                            {'name': legislator['full_name'],
                             'leg_id': legislator['leg_id'],
                             'role': 'member'})
                        db.committees.save(committee, safe=True)
                    role['committee_id'] = committee['_id']
            db.legislators.save(legislator, safe=True)
    for path in paths:
        with open(path) as f:
            data = prepare_obj(json.load(f))
        spec = {'state': state,
                'committee': data['committee']}
        if 'subcommittee' in data:
            spec['subcommittee'] = data['subcommittee']
        committee = db.committees.find_one(spec)
        if not committee:
            insert_with_id(data)
            committee = data
        else:
            update(committee, data, db.committees)
        for member in committee['members']:
            if not member['legislator']:
                continue
            (pre, first, last, suff) = name_tools.split(member['legislator'])
            # Match by name within the current term for this state.
            found = db.legislators.find({
                'first_name': first,
                'last_name': last,
                'roles': {'$elemMatch': {'term': current_term,
                                         'state': state}}})
            if found.count() > 1:
                print "Too many matches for %s" % member['legislator'].encode(
                    'ascii', 'ignore')
                continue
            elif found.count() == 0:
                print "No matches for %s" % member['legislator'].encode(
                    'ascii', 'ignore')
                continue
            legislator = found[0]
            member['leg_id'] = legislator['_id']
            # for/else: append the role only if no equivalent role exists.
            for role in legislator['roles']:
                if (role['type'] == 'committee member' and
                    role['term'] == current_term and
                    role['committee_id'] == committee['_id']):
                    break
            else:
                new_role = {'type': 'committee member',
                            'committee': committee['committee'],
                            'term': current_term,
                            'chamber': committee['chamber'],
                            'committee_id': committee['_id'],
                            'state': state}
                if 'subcommittee' in committee:
                    new_role['subcommittee'] = committee['subcommittee']
                legislator['roles'].append(new_role)
            legislator['updated_at'] = datetime.datetime.now()
            db.legislators.save(legislator, safe=True)
        db.committees.save(committee, safe=True)
    ensure_indexes()
def convert_legislator(leg):
    """Convert an ORM legislator object into the legacy API dict format."""
    # Prefer explicit given/family names; otherwise split the display name.
    if leg.given_name and leg.family_name:
        first_name = leg.given_name
        last_name = leg.family_name
        suffixes = ''
    else:
        _, first_name, last_name, suffixes = name_tools.split(leg.name)
    legacy_ids = [oid.identifier for oid in leg.identifiers.all()
                  if oid.scheme == 'legacy_openstates']
    if not legacy_ids:
        legacy_ids = ['~not available~']
    party = None
    chamber = None
    district = None
    state = None
    cr = get_current_role(leg)
    party = cr['party']
    chamber = cr['chamber']
    district = cr['district']
    state = cr['state']
    # Collect contact details grouped by office label; first email wins.
    email = None
    offices = defaultdict(dict)
    for cd in leg.contact_details.all():
        offices[cd.note][cd.type] = cd.value
        if cd.type == 'email' and not email:
            email = cd.value
    active = bool(chamber and district)
    try:
        url = leg.links.all()[0].url
    except IndexError:
        url = ""
    return {
        'id': legacy_ids[0],
        'leg_id': legacy_ids[0],
        'all_ids': legacy_ids,
        'full_name': leg.name,
        'first_name': first_name,
        'last_name': last_name,
        'suffix': suffixes,
        'photo_url': leg.image,
        'url': url,
        'email': email,
        'party': party,
        'chamber': chamber,
        'district': district,
        'state': state,
        'sources': [{'url': s.url} for s in leg.sources.all()],
        'active': active,
        # Only active legislators get a current-term role entry.
        'roles': [{
            "term": static.TERMS[state][-1]['name'],
            "district": district,
            "chamber": chamber,
            "state": state,
            "party": party,
            "type": "member",
            "start_date": None,
            "end_date": None,
        }] if active else [],
        'offices': [
            {
                'name': label,
                'fax': details.get('fax'),
                'phone': details.get('voice'),
                'email': details.get('email'),
                'address': details.get('address'),
                'type': 'capitol' if 'capitol' in label.lower() else 'district',
            }
            for label, details in offices.items()
        ],
        'old_roles': {},
        'middle_name': '',
        'country': 'us',
        'level': 'state',
        'created_at': leg.created_at.strftime(DATE_FORMAT),
        'updated_at': leg.updated_at.strftime(DATE_FORMAT),
    }