def select_special_comm(self):
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)

        for comm_names in page.xpath('/html/body/div[@id="wrapper"]/div[@id="content"]/div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"]'):
            name = comm_names.xpath('h2')[0].text
            if name is not None:
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'Chairperson'
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
            else:
                name = comm_names.xpath('h2/a')[0].text
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'chairperson'
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
def scrape_committee(self, name, url, chamber):
    com = Committee(chamber, name)
    com.add_source(url)
    data = self.get(url).text
    doc = lxml.html.fromstring(data)

    for leg in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        leg = leg.replace('Representative ', '')
        leg = leg.replace('Senator ', '')
        leg = leg.strip()
        if ' (' in leg:
            leg, role = leg.split(' (')
            if 'Vice-Chair' in role:
                role = 'vice-chair'
            elif 'Co-Chair' in role:
                role = 'co-chair'
            elif 'Chair' in role:
                role = 'chair'
            else:
                raise Exception('unknown role: %s' % role)
        else:
            role = 'member'
        com.add_member(leg, role)

    self.save_committee(com)
def scrape_reps_comm(self):
    url = 'http://www.maine.gov/legis/house/hsecoms.htm'
    with self.urlopen(url) as page:
        root = lxml.html.fromstring(page)

        count = 0
        for n in range(1, 12, 2):
            path = 'string(//body/center[%s]/h1/a)' % (n)
            comm_name = root.xpath(path)
            committee = Committee('lower', comm_name)
            count = count + 1

            path2 = '/html/body/ul[%s]/li/a' % (count)
            for el in root.xpath(path2):
                rep = el.text
                if rep.find('(') != -1:
                    mark = rep.find('(')
                    rep = rep[15:mark]
                committee.add_member(rep)
            committee.add_source(url)

            self.save_committee(committee)
def scrape_senate_committee(self, term, link):
    with self.urlopen(link) as html:
        doc = lxml.html.fromstring(html)

        # strip first 30 and last 10 characters:
        # "Minnesota Senate Committees - __________ Committee"
        committee_name = doc.xpath('//title/text()')[0][30:-10]

        com = Committee('upper', committee_name)

        # first id=bio table is members
        role = 'member'  # default, in case the first row carries no position
        for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
            row = fix_whitespace(row.text_content())

            # a "Position: Name" row switches the role for following rows
            if ':' in row:
                position, name = row.split(': ')
                role = position.lower().strip()
            else:
                name = row

            # add the member
            com.add_member(name.strip(), role)

        com.add_source(link)
        self.save_committee(com)
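# `fix_whitespace` above is an assumed module-level helper that is not shown
# in this collection. A minimal sketch of the behavior the call site implies
# (collapse runs of whitespace, including non-breaking spaces, to single
# spaces) -- an assumption, not the project's exact implementation:
import re

def fix_whitespace(s):
    # Normalize non-breaking spaces, then collapse whitespace runs.
    return re.sub(r'\s+', ' ', s.replace(u'\xa0', ' ')).strip()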
def scrape_upper_committee(self, name, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        comm = Committee('upper', name)
        comm.add_source(url)

        member_div = page.xpath("//div[@class = 'committee-members']")[0]

        seen = set()
        for link in member_div.xpath(".//a"):
            if not link.text:
                continue
            member = link.text.strip()

            next_elem = link.getnext()
            if (next_elem is not None and
                    next_elem.tag == 'a' and
                    next_elem.attrib['href'] == link.attrib['href']):
                # Sometimes NY is cool and splits names across a
                # couple links
                member = "%s %s" % (member, next_elem.text.strip())

            member = re.sub(r'\s+', ' ', member)

            if member in seen or not member:
                continue
            seen.add(member)

            name, role = parse_name(member)
            comm.add_member(name, role)

        self.save_committee(comm)
def scrape_house_committee(self, committee_name, link):
    """Scrape individual committee page and add members"""
    html = self.urlopen(link)
    doc = lxml.html.fromstring(html)

    subcommittee = False
    for h1 in doc.xpath('//h1/text()'):
        if 'subcommittee' in h1.lower():
            subcommittee = True
    subcomm_name = 'Subcommittee' if subcommittee else None
    if subcommittee:
        committee_name = committee_name.replace(' Subcommittee', '')

    com = Committee('lower', committee_name, subcomm_name)

    find_expr = "//div[@class='col1']/ul[position()<3]/li/a"
    for a in doc.xpath(find_expr):
        name = a.text
        role = (a.tail or '').strip(', ') or 'member'
        if name:
            com.add_member(name, role)

    com.add_source(link)
    if com['members']:
        self.save_committee(com)
def scrape_upper_committee(self, url):
    filename, resp = self.urlretrieve(url)
    root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))

    for link in root.xpath('/pdf2xml/page'):
        comm = None
        for line in link.findall('text'):
            text = line.findtext('b')
            if text is not None and text.startswith('Comisi'):
                comm = Committee('upper', text)
                comm.add_source(url)
            elif line.text and line.text.startswith('Hon.'):
                line_text = line.text.replace(u'–', '-')
                name_split = line_text.split(u'-', 1)
                title = 'member'
                if len(name_split) >= 2:
                    name_split[1] = name_split[1].strip()
                    if name_split[1] in ('Presidenta', 'Presidente'):
                        title = 'chairman'
                    elif name_split[1] in ('Vicepresidente', 'Vicepresidenta'):
                        title = 'vicechairman'
                    elif name_split[1] in ('Secretaria', 'Secretario'):
                        title = 'secretary'
                # guard: member lines can precede any committee header
                if name_split[0] != 'VACANTE' and comm is not None:
                    comm.add_member(name_split[0].replace('Hon.', ''), title)
        if comm is not None:
            self.save_committee(comm)

    os.remove(filename)
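# `convert_pdf` is assumed to shell out to poppler's pdftotext/pdftohtml,
# as scrapers in this style commonly do. A minimal sketch under that
# assumption (not necessarily the project's exact implementation):
import subprocess

def convert_pdf(filename, type='text'):
    commands = {
        'text': ['pdftotext', '-layout', filename, '-'],
        'text-nolayout': ['pdftotext', filename, '-'],
        'xml': ['pdftohtml', '-xml', '-stdout', filename],
    }
    # Returns the converted document on stdout.
    return subprocess.check_output(commands[type])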
def select_special_comm(self): main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php" with self.urlopen(main_url) as page: page = lxml.html.fromstring(page) for comm_names in page.xpath('//div[@class="content_box"]'): name = comm_names.xpath("h2")[0].text if name != None: committee = Committee("upper", name) committee.add_source(main_url) for senators in comm_names.xpath('ul[@class="nobullet"]/li'): senator = senators[0].text if "Chairperson" in senator: role = "Chairperson" senator = senator[5:-13].strip() else: role = "member" senator = senator[5:].strip() committee.add_member(senator, role) self.save_committee(committee) else: name = comm_names.xpath("h2/a")[0].text committee = Committee("upper", name) committee.add_source(main_url) for senators in comm_names.xpath('ul[@class="nobullet"]/li'): senator = senators[0].text if "Chairperson" in senator: role = "chairperson" senator = senator[5:-13].strip() else: role = "member" senator = senator[5:].strip() committee.add_member(senator, role) self.save_committee(committee)
def scrape(self, chamber, term):
    for t in self.metadata['terms']:
        if t['name'] == term:
            session = t['sessions'][-1]

    sessionsuffix = 'th'
    if str(session)[-1] == '1':
        sessionsuffix = 'st'
    elif str(session)[-1] == '2':
        sessionsuffix = 'nd'
    elif str(session)[-1] == '3':
        sessionsuffix = 'rd'

    insert = str(session) + sessionsuffix + str(term[0:4])
    chamber_letter = {'lower': 'A', 'upper': 'S'}[chamber]
    url = 'http://www.leg.state.nv.us/Session/%s/Committees/%s_Committees/' % (
        insert, chamber_letter)

    page = self.urlopen(url)
    root = lxml.html.fromstring(page)
    for com_a in root.xpath('//strong/a'):
        com_url = url + com_a.get('href')
        if com_a.text == 'Committee of the Whole':
            continue
        com = Committee(chamber, com_a.text)
        com.add_source(com_url)
        self.scrape_comm_members(chamber, com, com_url)
        self.save_committee(com)
def scrape_committee(self, term, chambers, href, name):
    page = self.get(href).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(href)
    members = page.xpath("//div[@class='view-content']"
                         "//a[contains(@href, 'members')]")

    if "/joint/" in href:
        chamber = "joint"
    elif "/senate/" in href:
        chamber = "upper"
    elif "/house/" in href:
        chamber = "lower"
    else:
        # Unknown chamber; warn and skip rather than guessing.
        self.warning("Could not determine chamber for %s" % (href))
        return

    cttie = Committee(chamber, name)
    for a in members:
        member = a.text
        role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0]
        role = {"Legislative Members": "member",
                "Chairman": "chair",
                "Vice Chairman": "member"}[role]
        if member is None or member.startswith("District"):
            continue
        cttie.add_member(member, role=role)
    cttie.add_source(href)
    self.save_committee(cttie)
def scrape_upper(self): url = "http://senadopr.us/Lists/Listado%20de%20Comisiones/Comisiones%20del%20Senado.aspx" with self.urlopen(url) as html: doc = lxml.html.fromstring(html) doc.make_links_absolute(url) table = doc.xpath( '//table[@id="{C05AFE0D-D977-4033-8D7B-C43ABF948A4A}-{3E52C91B-AFC8-4493-967A-C8A47AC4E7B6}"]' ) for link in table[0].iterchildren("tr"): td_column = list(link) name = td_column[0].find("a") if name is not None: com_source = name.get("href") # if committee does not have a url use the default. if com_source == "http://senadopr.us/": com_source = url com_name = name.text # check the committee name to see if it's a join one. if td_column[1].text == "Comisi\xf3n Conjunta": chamber = "joint" else: chamber = "upper" com = Committee(chamber, com_name) com.add_source(com_source) com.add_member(clean_spaces(td_column[2].find("a").text), "chairman") self.save_committee(com)
def scrape_senate_committee(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//h6/text()')[0]
    com = Committee(chamber='upper', committee=name)

    for member in doc.xpath('//div[@id="committeelist"]//a'):
        member_name = member.text.strip()

        # don't add clerks
        if member_name == 'Committee Clerk':
            continue

        tail = member.tail or ''  # guard: tail may be None
        if 'Committee Chair' in tail:
            role = 'chair'
        elif 'Majority Vice' in tail:
            role = 'majority vice chair'
        elif 'Minority Vice' in tail:
            role = 'minority vice chair'
        else:
            role = 'member'

        com.add_member(member_name, role=role)

    com.add_source(url)
    self.save_committee(com)
def scrape_approp_subcommittees(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    for strong in doc.xpath('//strong'):
        com = Committee(chamber='upper', committee='Appropriations',
                        subcommittee=strong.text.strip())
        com.add_source(url)

        legislators = strong.getnext().tail.replace('Senators', '').strip()
        for leg in re.split(', | and ', legislators):
            if leg.endswith('(C)'):
                role = 'chairman'
                leg = leg[:-4]
            elif leg.endswith('(VC)'):
                role = 'vice chairman'
                leg = leg[:-5]
            elif leg.endswith('(MVC)'):
                role = 'minority vice chairman'
                leg = leg[:-6]
            else:
                role = 'member'
            com.add_member(leg, role=role)

        self.save_committee(com)
def scrape(self, chamber, term):
    self.validate_term(term, latest_only=True)

    chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

    url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
            comm_name = comm_link.text.strip()

            # Drop leading "House" or "Senate" from name
            comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

            comm = Committee(chamber, comm_name)

            for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):
                name = mbr_link.text.strip()

                next_el = mbr_link.getnext()
                if next_el is not None and next_el.tag == 'i':
                    role = next_el.text.strip()
                else:
                    role = 'member'

                comm.add_member(name, role)

            self.save_committee(comm)
def scrape_senate_committee(self, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//h3/text()')[0]
    name = name.replace(' Committee', '')
    com = Committee(chamber='upper', committee=name)

    for member in doc.xpath('//div[@id="committeeright"]//a'):
        member_name = member.text.strip()

        # don't add clerks
        if member_name == 'Committee Clerk':
            continue
        # skip phone links
        if member.get("href").startswith("tel:"):
            continue

        tail = member.tail or ''  # guard: tail may be None
        if 'Committee Chair' in tail:
            role = 'chair'
        elif 'Majority Vice' in tail:
            role = 'majority vice chair'
        elif 'Minority Vice' in tail:
            role = 'minority vice chair'
        else:
            role = 'member'

        com.add_member(member_name, role=role)

    com.add_source(url)
    self.save_committee(com)
def _scrape_upper_committee(self, name, url2):
    cat = "Assignments.asp"
    url3 = "".join((url2, cat))

    committee = Committee('upper', name)
    committee.add_source(url2)

    page = self.lxmlize(url3)

    members = page.xpath('//table[@id="table38"]//font/a/b')
    for link in members:
        role = "member"
        if link == members[0]:
            role = "Chairman"
        if link == members[1]:
            role = "Vice-Chairman"

        name = link.xpath('string()')
        name = name.replace('Senator ', '')
        name = re.sub(r'\s{2,}', ' ', name).strip()

        committee.add_member(name, role)

    self.save_committee(committee)
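# `self.lxmlize` is an assumed helper on these scraper classes; a minimal
# sketch of the common behavior the call sites rely on (fetch a URL, parse
# it, absolutize links). Signatures vary: one scraper in this collection
# expects it to return a (page, text) pair instead of a bare tree.
def lxmlize(self, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    return doc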
def _scrape_lower_special_committees(self):
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    committee_list = page.xpath('//table[@id="table106"]//div[@class='
                                '"exBody1A"]/div[@class="accordion"]')[0]
    headers = committee_list.xpath('./h3')

    for header in headers:
        committee_name_text = header.xpath('string()')
        committee_name = committee_name_text.strip()
        committee_name = self._normalize_committee_name(committee_name)

        chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

        committee = Committee(chamber, committee_name)
        committee.add_source(url)

        committee_memberlist = header.xpath(
            './following-sibling::div[@class="pane"]'
            '//tr[@class="linkStyle2"]')

        for row in committee_memberlist:
            member_name = row.xpath('normalize-space(string(./td[1]))')
            member_name = ' '.join(filter(None, name_tools.split(member_name)))
            member_role = row.xpath('normalize-space(string(./td[2]))')
            member_role = self._normalize_member_role(member_role)

            committee.add_member(member_name, member_role)

        self.save_committee(committee)
def scrape_senate_committee(self, name, url2):
    cat = "Assignments.asp"
    url3 = "".join((url2, cat))

    committee = Committee("upper", name)
    committee.add_source(url2)

    text = self.get(url3).text
    page = lxml.html.fromstring(text)

    members = page.xpath('//table[@id="table38"]//font/a/b')
    for link in members:
        role = "member"
        if link == members[0]:
            role = "Chairman"
        if link == members[1]:
            role = "Vice-Chairman"

        name = link.xpath("string()")
        name = name.replace("Senator ", "")
        name = re.sub(r"\s{2,}", " ", name).strip()

        committee.add_member(name, role)

    self.save_committee(committee)
def scrape_joint_committee(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
    name = name[0].strip()

    comm = Committee('joint', name)
    comm.add_source(url)

    # members are linked either by MemberId or by a Senators page
    # (chain is itertools.chain)
    members = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                    doc.xpath('//a[contains(@href, "Senators")]'))

    seen = set()
    for a in members:
        parent_content = a.getparent().text_content()
        if ':' in parent_content:
            title = parent_content.split(':')[0].strip()
        else:
            title = 'member'

        name = a.text.split(' (')[0].strip()

        if (name, title) not in seen:
            comm.add_member(name, title)
            seen.add((name, title))

    if comm['members']:
        self.save_committee(comm)
def scrape_senate_comm(self):
    url = 'http://www.maine.gov/legis/senate/Senate-Standing-Committees.html'
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    # committee titles
    for item in doc.xpath('//span[@style="FONT-SIZE: 11pt"]'):
        text = item.text_content().strip()
        # some contain COMMITTEE ON & some are blank; drop those
        if not text or text.startswith('COMMITTEE'):
            continue

        # titlecase committee name
        com = Committee('upper', text.title())
        com.add_source(url)

        # go up two levels and take the following ul sibling
        for leg in item.xpath('../../following-sibling::ul[1]/li'):
            lname = leg.text_content().strip()
            if 'Chair' in lname:
                role = 'chair'
            else:
                role = 'member'
            lname = leg.text_content().strip().split(' of ')[0].strip()
            com.add_member(lname, role)

        self.save_committee(com)
def scrape_committee(self, chamber, name, url):
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    if page.xpath("//h3[. = 'Joint Committee']"):
        chamber = 'joint'

    subcommittee = page.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" not in subcommittee:
        subcommittee = None

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'member=')]"):
        member = link.text.strip()

        mtype = link.xpath("string(../preceding-sibling::td[1])")
        mtype = mtype.strip(": \r\n\t").lower()

        comm.add_member(member, mtype)

    if not comm['members']:
        self.warning('not saving %s, appears to be empty' % name)
    else:
        self.save_committee(comm)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/senators_ca.cfm')
    else:
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/representatives_ca.cfm')

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        committees = {}

        for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
            name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
            name = name[0:-4]  # trim the trailing four characters

            for link in li.xpath("a"):
                if not link.tail:
                    continue

                committee_name = link.tail.strip()
                committee_name = re.sub(r"\s+", " ", committee_name)
                subcommittee_name = None
                role = 'member'

                rest = link.getnext().text
                if rest:
                    match = re.match(r',\s+(Subcommittee on .*)\s+-', rest)
                    if match:
                        subcommittee_name = match.group(1)
                        role = rest.split('-')[1].strip().lower()
                    else:
                        role = rest.replace(', ', '').strip().lower()

                    if role == 'chairman':
                        role = 'chair'

                try:
                    committee = committees[(chamber, committee_name,
                                            subcommittee_name)]
                except KeyError:
                    committee = Committee(chamber, committee_name)
                    committee.add_source(url)

                    if subcommittee_name:
                        committee['subcommittee'] = subcommittee_name

                    committees[(chamber, committee_name,
                                subcommittee_name)] = committee

                committee.add_member(name, role)

        for committee in committees.values():
            self.save_committee(committee)
def scrape_house(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for a in doc.xpath('//td/a'):
        com_name = (a.text or '').strip()  # guard against empty anchors
        # blank entries in table
        if not com_name:
            continue
        if 'Reapportionment' in com_name or 'Horse Racing' in com_name:
            self.warning('skipping %s, known to be problematic' % com_name)
            continue

        com_url = a.get('href')
        com_html = self.urlopen(com_url)
        com_doc = lxml.html.fromstring(com_html)

        com = Committee('lower', com_name)

        for td in com_doc.xpath('//table[@id="commtable"]')[1].xpath('.//td'):
            leg = td.xpath('.//a/text()')
            if leg:
                leg = leg[0]
                pieces = td.text_content().split('\n')
                if len(pieces) == 2:
                    role = pieces[1].lower()
                else:
                    role = 'member'
                com.add_member(leg, role)

        com.add_source(com_url)
        self.save_committee(com)
def scrape_house(self):
    url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
    comm_cache = {}
    text = self.urlopen(url)
    page = lxml.html.fromstring(text)

    for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
        cells = row.xpath('td')

        name = cells[0].xpath('string()').strip()
        if name.startswith('Vacant'):
            continue

        font = cells[1].xpath('font')[0]
        committees = []

        if font.text:
            committees.append(font.text.strip())
        for br in font.xpath('br'):
            if br.text:
                committees.append(br.text.strip())
            if br.tail:
                committees.append(br.tail)

        for comm_name in committees:
            mtype = 'member'
            if comm_name.endswith(', Chairman'):
                mtype = 'chairman'
                comm_name = comm_name.replace(', Chairman', '')
            elif comm_name.endswith(', Co-Chairmain'):  # sic, as on the page
                mtype = 'co-chairmain'
                comm_name = comm_name.replace(', Co-Chairmain', '')
            elif comm_name.endswith(', Vice Chair'):
                mtype = 'vice chair'
                comm_name = comm_name.replace(', Vice Chair', '')
            elif comm_name.endswith(', Ex Officio'):
                mtype = 'ex officio'
                comm_name = comm_name.replace(', Ex Officio', '')
            elif comm_name.endswith(", Interim Member"):
                mtype = 'interim'
                comm_name = comm_name.replace(", Interim Member", "")

            if comm_name.startswith('Joint'):
                chamber = 'joint'
            else:
                chamber = 'lower'

            try:
                committee = comm_cache[comm_name]
            except KeyError:
                committee = Committee(chamber, comm_name)
                committee.add_source(url)
                comm_cache[comm_name] = committee

            committee.add_member(name, mtype)

    for committee in comm_cache.values():
        self.save_committee(committee)
def scrape_lower_committee(self, name, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    comm = Committee('lower', name)
    comm.add_source(url)

    seen = set()
    for link in page.xpath("//div[@class='commlinks']//a[contains(@href, 'mem')]"):
        member = link.text.strip()
        member = re.sub(r'\s+', ' ', member)

        name, role = parse_name(member)
        if name is None:
            continue

        # Figure out if this person is the chair.
        role_type = link.xpath('../../preceding-sibling::div[1]/text()')
        if role_type in (['Chair'], ['Co-Chair']):
            role = 'chair'
        else:
            role = 'member'

        if name not in seen:
            comm.add_member(name, role)
            seen.add(name)

    if comm['members']:
        self.save_committee(comm)
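# `parse_name` is an assumed helper shared by the New York scrapers above;
# it is not shown in this collection. A minimal sketch of the behavior the
# call sites imply (split "Jane Doe (Chair)" into a name and a lowercase
# role, defaulting to "member", returning None for an empty name):
import re

def parse_name(text):
    match = re.match(r'^(.*?)\s*\((.+?)\)\s*$', text)
    if match:
        name, role = match.groups()
        return name.strip(), role.strip().lower()
    return (text.strip() or None), 'member'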
def scrape(self, chamber, term):
    chamber_name = 'senate' if chamber == 'upper' else 'house'
    url = 'http://ilga.gov/{0}/committees/default.asp'.format(chamber_name)
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    top_level_com = None

    for a in doc.xpath('//a[contains(@href, "members.asp")]'):
        name = a.text.strip()
        code = a.getparent().getnext().text_content().strip()

        if 'Sub' in name:
            com = Committee(chamber, top_level_com, name, code=code)
        else:
            top_level_com = name
            com = Committee(chamber, name, code=code)

        com_url = a.get('href')
        self.scrape_members(com, com_url)
        com.add_source(com_url)

        if not com['members']:
            self.log('skipping empty committee on {0}'.format(com_url))
        else:
            self.save_committee(com)
def scrape_lower_committees(self):
    # id range for house committees on their website
    for comm_id in range(87, 124):
        comm_url = ('http://www.house.state.oh.us/index.php?option='
                    'com_displaycommittees&task=2&type=Regular&'
                    'committeeId=%d' % comm_id)
        with self.urlopen(comm_url) as page:
            page = lxml.html.fromstring(page)

            comm_name = page.xpath(
                'string(//table/tr[@class="committeeHeader"]/td)')
            comm_name = comm_name.replace("/", " ").strip()

            if not comm_name:
                continue

            if comm_id < 92:
                chamber = "joint"
            else:
                # assumption: the remaining IDs on this House site are
                # lower-chamber committees
                chamber = "lower"

            committee = Committee(chamber, comm_name)
            committee.add_source(comm_url)

            for link in page.xpath("//a[contains(@href, 'district')]"):
                name = link.text
                if name and name.strip():
                    committee.add_member(name.strip())

            self.save_committee(committee)
def scrape_reps_comm(self):
    url = 'http://www.maine.gov/legis/house/hsecoms.htm'
    page = self.urlopen(url)
    root = lxml.html.fromstring(page)

    count = 0
    for n in range(1, 12, 2):
        path = 'string(//body/center[%s]/h1/a)' % (n)
        comm_name = root.xpath(path)
        committee = Committee('lower', comm_name)
        count = count + 1

        path2 = '/html/body/ul[%s]/li/a' % (count)
        for el in root.xpath(path2):
            rep = el.text
            if rep.find('(') != -1:
                mark = rep.find('(')
                rep = rep[15:mark].strip()
            if 'chair' in rep.lower():
                role = 'chair'
                rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
            else:
                role = 'member'
            committee.add_member(rep, role)
        committee.add_source(url)

        self.save_committee(committee)
def scrape_house_committee(self, committee_name, link): """Scrape individual committee page and add members""" find_expr = "//div[@class='col1']/ul[position()<3]/li" with self.urlopen(link) as page: # Find individual committee urls page = lxml.html.fromstring(page) #sub_committee if (len(page.xpath("//div[@class='col2']/h3[3]/a"))>0): sub_committee_url = self.base_href + '/house/committees/' + page.xpath("//div[@class='col2']/h3[3]/a")[0].attrib['href'] sub_committee_name = "General Sub of " + committee_name self.scrape_house_sub_committee(sub_committee_name, sub_committee_url) else: sub_committee_name = None com = Committee('lower', committee_name, subcommittee=sub_committee_name) for el in page.xpath(find_expr): member = [item.strip() for item in el.text_content().split(',',1)] if len(member) > 1: member_name, role = member else: member_name, role = member[0], 'member' if member_name != "": com.add_member(member_name, role) com.add_source(link) self.save_committee(com)
def scrape_house_committees(self):
    base_url = 'http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey='
    with self.urlopen('http://house.mi.gov/mhrpublic/committee.aspx') as html:
        doc = lxml.html.fromstring(html)

        # get values out of drop down
        for opt in doc.xpath('//option'):
            name = opt.text
            # skip invalid choices
            if opt.text in ('Statutory Committees', 'Select One'):
                continue
            com_url = base_url + opt.get('value')
            with self.urlopen(com_url) as com_html:
                cdoc = lxml.html.fromstring(com_html)

                com = Committee(chamber='lower', committee=name)
                com.add_source(com_url)

                # all links to http:// pages in servicecolumn2 are legislators
                for a in cdoc.xpath('//div[@class="servicecolumn2"]'
                                    '//a[starts-with(@href, "http")]'):
                    name = a.text.strip()
                    text = a.xpath('following-sibling::span/text()')[0]
                    if 'Committee Chair' in text:
                        role = 'chair'
                    elif 'Vice-Chair' in text:
                        role = 'vice chair'
                    else:
                        role = 'member'
                    com.add_member(name, role=role)

                self.save_committee(com)
def scrape(self, chamber, term):
    url = self.urls[chamber]
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.base_urls[chamber])

    committee_types = {'upper': ['Standing', 'Select', 'Joint'],
                       'lower': ['Standing', 'Select']}

    for type_ in committee_types[chamber]:
        if type_ == 'Joint':
            _chamber = type_.lower()
        else:
            _chamber = chamber

        div = doc.xpath(
            '//div[contains(@class, "view-view-%sCommittee")]' % type_)[0]
        committees = div.xpath(
            'descendant::span[@class="field-content"]/a/text()')
        committees = map(strip, committees)
        urls = div.xpath(
            'descendant::span[@class="field-content"]/a/@href')

        for c, _url in zip(committees, urls):
            if c.endswith('Committee'):
                if type_ not in c:
                    c = '%s %s' % (type_, c)
            elif 'Subcommittee' not in c:
                c = '%s Committee on %s' % (type_, c)
            else:
                if type_ not in c:
                    c = '%s %s' % (type_, c)

            c = Committee(_chamber, c)
            c.add_source(_url)
            c.add_source(url)

            for member, role, kw in self.scrape_membernames(
                    c, _url, chamber, term):
                c.add_member(member, role, **kw)

            _found = False
            if len(c['members']) == 0:
                for member, role, kw in self.scrape_membernames(
                        c, _url + '/membersstaff', chamber, term):
                    _found = True
                    c.add_member(member, role, **kw)
            if _found:
                source = _url + '/membersstaff'
                c.add_source(source)

            if len(c['members']) == 0:
                cname = c['committee']
                msg = '%r must have at least one member.'
                raise ValueError(msg % cname)

            code = codes[chamber].get(c['committee'].lower())
            c['action_code'] = code

            self.save_committee(c)

    # Subcommittees
    div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
    for subcom in div.xpath('div/div[@class="item-list"]'):
        committee = subcom.xpath('h4/text()')[0]
        names = subcom.xpath('descendant::a/text()')
        names = map(strip, names)
        urls = subcom.xpath('descendant::a/@href')
        committee = 'Standing Committee on ' + committee

        for n, _url in zip(names, urls):
            c = Committee(chamber, committee, subcommittee=n)
            c.add_source(_url)
            c.add_source(url)

            for member, role, kw in self.scrape_membernames(
                    c, _url, chamber, term):
                c.add_member(member, role, **kw)

            _found = False
            if len(c['members']) == 0:
                for member, role, kw in self.scrape_membernames(
                        c, _url + '/membersstaff', chamber, term):
                    _found = True
                    c.add_member(member, role, **kw)
            if _found:
                source = _url + '/membersstaff'
                c.add_source(source)

            if len(c['members']) == 0:
                cname = c['committee']
                msg = '%r must have at least one member.'
                raise ValueError(msg % cname)

            self.save_committee(c)
def scrape_comm(self, chamber, term_name):
    url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
    with self.urlopen(url) as comm_page:
        root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())

        if chamber == 'h':
            chamber = "lower"
        else:
            chamber = "upper"

        for mr in root.xpath('//committee'):
            name = mr.xpath('string(name)')
            comm = Committee(chamber, name)

            chair = mr.xpath('string(chair)')
            chair = chair.replace(", Chairman", "")
            if len(chair) > 0:
                comm.add_member(chair, role="Chairman")

            vice_chair = mr.xpath('string(vice_chair)')
            vice_chair = vice_chair.replace(", Vice-Chairman", "")
            if len(vice_chair) > 0:
                comm.add_member(vice_chair, role="Vice-Chairman")

            members = mr.xpath('string(members)').split(";")
            for leg in members:
                # strip stray spaces and skip empty entries
                leg = leg.strip()
                if leg:
                    comm.add_member(leg)

            comm.add_source(url)
            self.save_committee(comm)
def scrape_committee(self, chamber, url):
    committee_page = self.lxmlize(url)

    name_node = self.get_node(
        committee_page,
        '//table[@id="MainContent_formViewCommitteeInformation"]/tr//h3')
    c_name = (name_node.text_content().strip()
              if name_node is not None and name_node.text_content()
              else None)

    if c_name:
        committee = Committee(chamber, clean_committee_name(c_name))

        members_xpath = (
            '//table[@id="MainContent_formViewCommitteeInformation_grid'
            'ViewCommitteeMembers"]/tbody/tr')
        members = self.get_nodes(committee_page, members_xpath)

        # column indices within each member row
        tds = {'title': 0, 'name': 1, 'role': 3}

        for member in members:
            m_title = member[tds['title']].text_content()
            m_name = self.get_node(
                member[tds['name']],
                './/a[contains(@href, "/Members/Legislator?SponCode=")]'
            ).text_content()

            role = member[tds['role']].text_content()

            if m_title == 'Senator':
                m_chamber = 'upper'
            elif m_title == 'Representative':
                m_chamber = 'lower'
            else:
                m_chamber = None

            if role in ('Chair', 'Co-Chair', 'Vice Chair', 'Member',
                        'Advisory'):
                if chamber == 'joint':
                    m_role = 'interim {}'.format(role.lower())
                else:
                    m_role = role.lower()
            else:
                m_role = None

            if m_role:
                committee.add_member(m_name, m_role, chamber=m_chamber)

        if not committee['members']:
            self.warning(
                'skipping blank committee {0} at {1}'.format(c_name, url))
        else:
            committee.add_source(url)

            # Interim committees are collected during the scraping
            # for joint committees, and most interim committees
            # have members from both chambers. However, a small
            # number of interim committees (right now, just 1) have
            # only members from one chamber, so the chamber is set
            # to their chamber instead of 'joint' for those
            # committees.
            if chamber == 'joint':
                m_chambers = set(
                    [mem['chamber'] for mem in committee['members']])
                if len(m_chambers) == 1:
                    committee['chamber'] = m_chambers.pop()

            self.save_committee(committee)
    else:
        self.warning('No legislative committee found at {}'.format(url))
def scrape(self, chamber, term):
    if chamber == 'lower':
        # Committee members from both houses are listed
        # together, so we only scrape once.
        return None

    session = None

    # Even though each term spans two years, committee
    # memberships don't appear to change, so we only
    # need to scrape the first year of the term.
    for t in self.metadata["terms"]:
        if term == t["name"]:
            session = t['sessions'][-1]
            break
    else:
        raise NoDataForPeriod(term)

    list_url = self.urls["list"] % (session, )
    committees = {}
    page = self.get(list_url).text
    page = lxml.html.fromstring(page)
    for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
        committees[el.text.strip()] = el.get("href")

    for c in committees:
        self.log(c)
        detail_url = self.urls["detail"] % (committees[c], )
        page = self.get(detail_url).text
        page = lxml.html.fromstring(page)

        if re.match(r'\d{1,2}-', c):
            c = c.split('-', 1)[1]

        jcomm = Committee('joint', c.strip())
        for table in page.xpath(
                ".//table[contains(@id, 'CommitteeMembers')]"):
            rows = table.xpath(".//tr")
            chamber = rows[0].xpath('.//td')[0].text_content().strip()
            chamber = 'upper' if chamber == 'Senator' else 'lower'

            comm = Committee(chamber, c.strip())
            for row in rows[1:]:
                tds = row.xpath('.//td')
                name = tds[0].text_content().strip()
                role = ('chairman'
                        if tds[3].text_content().strip() == 'Chairman'
                        else 'member')
                comm.add_member(name, role, chamber=chamber)
                jcomm.add_member(name, role, chamber=chamber)

            comm.add_source(detail_url)
            self.save_committee(comm)

        jcomm.add_source(detail_url)
        self.save_committee(jcomm)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    name = self._fix_committee_name(name)
    name = self._fix_committee_case(name)

    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # Get the subcommittee name.
    xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

    if subcommittee:
        subcommittee = page.xpath(xpath)
        if subcommittee:
            subcommittee = page.xpath(xpath).pop(0)
            subcommittee = self._fix_committee_name(
                subcommittee, parent=name, subcommittee=True)
            subcommittee = self._fix_committee_case(subcommittee)
        else:
            subcommittee = None

    # Dedupe.
    if (chamber, name, subcommittee) in self._seen:
        return
    self._seen.add((chamber, name, subcommittee))

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')

    for member_node in member_nodes:
        # Skip empty rows.
        if member_node.attrib['class'] == 'dxgvEmptyDataRow':
            continue

        mtype = member_node.xpath('string(td[1])').strip()
        if not mtype:
            mtype = 'member'

        member = member_node.xpath('string(td[3])').split()

        title = member[0]
        member = ' '.join(member[1:])

        if title == 'Senator':
            mchamber = 'upper'
        elif title == 'Representative':
            mchamber = 'lower'
        else:
            # skip non-legislative members
            continue

        comm.add_member(member, mtype, chamber=mchamber)

    for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                        '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'):
        sub_name = a.text.strip()
        sub_url = urlescape(a.attrib['href'])
        self.scrape_committee(chamber, name, sub_url,
                              subcommittee=sub_name)

    if not comm['members']:
        if subcommittee:
            self.warning(
                'Not saving empty subcommittee {}.'.format(subcommittee))
        else:
            self.warning('Not saving empty committee {}.'.format(name))
    else:
        self.save_committee(comm)
def scrape_committee(self, chamber, com_name, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    com = Committee(chamber, com_name)
    com.add_source(url)

    if 'stab=04' in url:
        for table in doc.xpath('//table[@class="grid"]'):
            rows = table.xpath('tr')
            sub_name = rows[0].getchildren()[0].text.strip()

            # new table - subcommittee
            if sub_name != 'Full Committee':
                sub_name = sub_name.replace("Subcommittee", "").strip()
                com = Committee(chamber, com_name, subcommittee=sub_name)
                com.add_source(url)

            for row in rows[1:]:
                name = row.getchildren()[0].text_content().strip()
                name, role = define_role(name)
                com.add_member(name, role)

            self.save_committee(com)
    else:
        table_source = doc.xpath('//table[@class="noncogrid"]')

        if table_source != []:
            for table in table_source:
                row = table.xpath(
                    'tr/td/a[contains(@href, "sponpage")]/text()')
                sub_name_source = table.xpath('tr/th/text()')

                if "Subcommittee" in sub_name_source[0]:
                    sub_name = sub_name_source[0]
                    sub_name = sub_name.replace("Subcommittee", "").strip()
                    com = Committee(chamber, com_name, subcommittee=sub_name)
                    com.add_source(url)

                for name in row:
                    name, role = define_role(name)
                    com.add_member(name, role)

                self.save_committee(com)
        else:
            row = doc.xpath('//table[@class="spco"]/tr[1]/td/text()')
            for name in row:
                name, role = define_role(name)
                com.add_member(name, role)
            self.save_committee(com)
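# `define_role` above is an assumed helper that is not shown in this
# collection. A minimal sketch matching its call sites (strip a trailing
# role annotation such as "(Chair)" from a member name and return the
# cleaned name plus a lowercase role); the marker strings are assumptions:
def define_role(name):
    for marker, role in (('(Chair)', 'chair'),
                         ('(Vice Chair)', 'vice chair'),
                         ('(Co-Chair)', 'co-chair')):
        if marker in name:
            return name.replace(marker, '').strip(), role
    return name.strip(), 'member'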
def scrape_upper_committee(self, url):
    filename, resp = self.urlretrieve(url)
    lines = convert_pdf(filename, 'text').split('\n')

    comm = None
    comm_name = ''
    title = ''
    MINIMUM_NAME_LENGTH = len('Hon _ _')

    for line in (x.decode('utf8') for x in lines):
        line = line.strip()
        if not line:
            continue

        if (line.startswith('Comisi') or
                line.startswith('COMISIONES') or
                line.startswith('SECRETAR')):

            if comm:
                # Joint committee rosters are not complete, unfortunately
                if "Conjunta" not in comm_name:
                    self.save_committee(comm)
                comm = None
                comm_name = ''

            if not (line.startswith('COMISIONES') or
                    line.startswith('SECRETAR')):
                comm_name = line

                # Remove "Committee" from committee names
                comm_name = (
                    comm_name.replace(u"Comisión de ", "")
                    .replace(u"Comisión Especial para el Estudio de ", "")
                    .replace(u"Comisión Especial para ", ""))
                comm_name = re.sub(r'(?u)^(las?|el|los)\s', "", comm_name)
                comm_name = comm_name[0].upper() + comm_name[1:]

        # Committee president is always listed right after committee name
        elif (not comm and comm_name and
              not re.search(r'^(?:Co.)?President', line) and
              not line.startswith('Miembr')):
            comm_name = comm_name + " " + line

        elif (not comm and
              (re.search(r'^(?:Co.)?President', line) or
               line.startswith('Miembr')) and
              len(line) > len('Presidente ') + MINIMUM_NAME_LENGTH):
            comm = Committee('upper', comm_name)
            comm.add_source(url)

        if comm:
            assert re.search(r'(?u)Hon\.?\s\w', line)
            (temp_title, name) = line.split("Hon")
            name = name.strip(". ")
            if temp_title.strip():
                title = temp_title

            # Translate titles to English for parity with other states
            if "President" in title:
                title = 'chairman'
            elif title.startswith("Vicepresident"):
                title = 'vicechairman'
            elif title.startswith("Secretari"):
                title = 'secretary'
            elif "Miembr" in title:
                title = 'member'
            else:
                raise AssertionError(
                    "Unknown member type: {}".format(title))

            # Many of the ex-officio members have appended titles
            if ", " in name:
                name = name.split(", ")[0]
            if name.lower() != 'vacante':
                comm.add_member(name, title)

    if comm and "Conjunta" not in comm_name:
        self.save_committee(comm)

    os.remove(filename)
def get_jfac(self, name, url):
    """Gets membership info for the Joint Finance and Appropriations
    Committee."""
    jfac_page = self.urlopen(url)
    html = lxml.html.fromstring(jfac_page)
    table = html.xpath('body/table/tr/td[2]/table')[0]

    committee = Committee('joint', name)

    for row in table.xpath('tr')[1:]:
        senate, house = row.xpath('td/strong')
        senate = senate.text.replace(u'\xa0', ' ')
        house = house.text.replace(u'\xa0', ' ')

        # "Name, Role" entries unpack into (name, role) positionally
        if ',' in senate:
            committee.add_member(*senate.split(','), chamber='upper')
        else:
            committee.add_member(senate, chamber='upper')
        if ',' in house:
            committee.add_member(*house.split(','), chamber='lower')
        else:
            committee.add_member(house, chamber='lower')

    committee.add_source(url)
    self.save_committee(committee)
def scrape_upper_committee(self, committee_name, url):
    page = self.lxmlize(url)

    committee = Committee('upper', committee_name)
    committee.add_source(url)

    # Committee member attributes.
    member_name = None
    member_role = None

    # Attempt to record the committee chair.
    committee_chair = self.get_node(
        page,
        '//div[@class="nys-senator" and div[@class="nys-senator--info"'
        ' and p[@class="nys-senator--title" and'
        ' normalize-space(text())="Chair"]]]')
    if committee_chair is not None:
        info_node = self.get_node(
            committee_chair,
            'div[@class="nys-senator--info" and p[@class='
            '"nys-senator--title" and contains(text(), "Chair")]]')
        if info_node is not None:
            # Attempt to retrieve the committee chair's name.
            member_name_text = self.get_node(
                info_node,
                './h4[@class="nys-senator--name"][1]/a[1]/text()')
            if member_name_text is not None:
                member_name = member_name_text.strip()
            else:
                warning = ('Could not find the name of the chair for the'
                           ' {} committee')
                self.logger.warning(warning.format(committee_name))

            # Attempt to retrieve the committee chair's role (explicitly).
            member_role_text = self.get_node(
                info_node,
                './p[@class="nys-senator--title" and contains(text(), '
                '"Chair")][1]/text()')
            if member_role_text is not None:
                member_role = member_role_text.strip()
            else:
                # This seems like a silly case, but could still be useful
                # to check for.
                warning = ('Could not find the role of the chair for the'
                           ' {} committee')
                self.logger.warning(warning.format(committee_name))

            if member_name is not None and member_role is not None:
                committee.add_member(member_name, member_role)
        else:
            warning = ('Could not find information for the chair of the'
                       ' {} committee.')
            self.logger.warning(warning.format(committee_name))
    else:
        warning = 'Missing chairperson for the {} committee.'
        self.logger.warning(warning.format(committee_name))

    # Get the list of regular committee members.
    member_nodes = self.get_nodes(
        page,
        '//div[contains(concat(" ", @class, " "), '
        '" c-senators-container ")]//div[@class="view-content"]/'
        ' div/a')

    # Attempt to record each committee member.
    for member_node in member_nodes:
        member_name = None

        member_name_text = self.get_node(
            member_node,
            './/div[@class="nys-senator--info"][1]/h4[@class='
            '"nys-senator--name"][1]/text()')
        if member_name_text is not None:
            member_name = member_name_text.strip()

        if member_name is not None:
            committee.add_member(member_name, 'member')
        else:
            warning = ('Could not find the name of a member in the {}'
                       ' committee')
            self.logger.warning(warning.format(committee_name))

    return committee
def scrape_lower_committee(self, name, url):
    com = Committee('lower', name)
    com.add_source(url)

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')

        # all members are tails of images (they use img tags for bullets);
        # the first three members are in the directiva div
        chair = directiva.xpath(
            'b[text()="Presidente:"]/following-sibling::img[1]')
        vchair = directiva.xpath(
            'b[text()="Vice Presidente:"]/following-sibling::img[1]')
        sec = directiva.xpath(
            'b[text()="Secretario(a):"]/following-sibling::img[1]')

        member = 0
        if chair:
            com.add_member(clean_spaces(chair[0].tail), 'chairman')
            member += 1
        if vchair:
            com.add_member(clean_spaces(vchair[0].tail), 'vice chairman')
            member += 1
        if sec:
            com.add_member(clean_spaces(sec[0].tail), 'secretary')
            member += 1

        for img in reps.xpath('.//img'):
            com.add_member(clean_spaces(img.tail))
            member += 1

        if member > 0:
            self.save_committee(com)
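# `clean_spaces` is an assumed helper used by the Puerto Rico scrapers
# above; a minimal sketch (collapse internal whitespace and trim, passing
# None through untouched since img.tail may be missing):
def clean_spaces(s):
    return ' '.join(s.split()) if s else s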
def scrape_lower(self, chamber, term):
    url = self.urls[chamber]
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.base_urls[chamber])

    committee_types = {'upper': ['Standing', 'Select', 'Joint'],
                       'lower': ['Standing', 'Select']}

    for type_ in committee_types[chamber]:
        if type_ == 'Joint':
            _chamber = type_.lower()
        else:
            _chamber = chamber

        for xpath in [
                '//div[contains(@class, "view-view-%sCommittee")]' % type_,
                '//div[contains(@id, "block-views-view_StandingCommittee-block_1")]',
                '//div[contains(@class, "views-field-title")]',
                ]:
            div = doc.xpath(xpath)
            if div:
                break
        div = div[0]

        committees = div.xpath(
            'descendant::span[@class="field-content"]/a/text()')
        committees = map(strip, committees)
        urls = div.xpath(
            'descendant::span[@class="field-content"]/a/@href')

        for c, _url in zip(committees, urls):
            if 'autism' in _url:
                # The autism page takes a stunning 10 minutes to respond
                # with a 403. Skip it.
                continue

            c = c.replace("Committee on ", "").replace(" Committee", "")
            c = Committee(_chamber, c)
            c.add_source(_url)
            c.add_source(url)

            for member, role in self.scrape_lower_members(_url):
                c.add_member(member, role)

            _found = False
            if not c['members']:
                for member, role in self.scrape_lower_members(
                        _url + '/membersstaff'):
                    _found = True
                    c.add_member(member, role)
            if _found:
                source = _url + '/membersstaff'
                c.add_source(source)

            if c['members']:
                self.save_committee(c)
            else:
                self.warning("No members found: {}".format(c))

    # Subcommittees
    div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
    for subcom in div.xpath('div/div[@class="item-list"]'):
        committee = subcom.xpath('h4/text()')[0]
        names = subcom.xpath('descendant::a/text()')
        names = map(strip, names)
        urls = subcom.xpath('descendant::a/@href')

        for n, _url in zip(names, urls):
            n = re.search(r'^Subcommittee.*?on (.*)$', n).group(1)
            c = Committee(chamber, committee, subcommittee=n)
            c.add_source(_url)
            c.add_source(url)

            for member, role in self.scrape_lower_members(_url):
                c.add_member(member, role)

            _found = False
            if not c['members']:
                for member, role in self.scrape_lower_members(
                        _url + '/membersstaff'):
                    _found = True
                    c.add_member(member, role)
            if _found:
                source = _url + '/membersstaff'
                c.add_source(source)

            if c['members']:
                self.save_committee(c)
            else:
                self.warning("No members found: {}".format(c))
def scrape(self, chamber, term):
    url = self.urls[chamber]
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.base_urls[chamber])

    committee_types = {'upper': ['Standing', 'Select', 'Joint'],
                       'lower': ['Standing', 'Select']}

    for type_ in committee_types[chamber]:
        if type_ == 'Joint':
            _chamber = type_.lower()
        else:
            _chamber = chamber

        for xpath in [
                '//div[contains(@class, "view-view-%sCommittee")]' % type_,
                '//div[contains(@id, "block-views-view_StandingCommittee-block_1")]',
                ]:
            div = doc.xpath(xpath)
            if div:
                break
        div = div[0]

        committees = div.xpath(
            'descendant::span[@class="field-content"]/a/text()')
        committees = map(strip, committees)
        urls = div.xpath(
            'descendant::span[@class="field-content"]/a/@href')

        for c, _url in zip(committees, urls):
            if 'autism' in _url:
                # The autism page takes a stunning 10 minutes to respond
                # with a 403. Skip it.
                continue

            if c.endswith('Committee'):
                if type_ not in c:
                    c = '%s %s' % (type_, c)
            elif 'Subcommittee' not in c:
                c = '%s Committee on %s' % (type_, c)
            else:
                if type_ not in c:
                    c = '%s %s' % (type_, c)

            c = Committee(_chamber, c)
            c.add_source(_url)
            c.add_source(url)

            for member, role, kw in self.scrape_membernames(
                    c, _url, chamber, term):
                c.add_member(member, role, **kw)

            _found = False
            if len(c['members']) == 0:
                for member, role, kw in self.scrape_membernames(
                        c, _url + '/membersstaff', chamber, term):
                    _found = True
                    c.add_member(member, role, **kw)
            if _found:
                source = _url + '/membersstaff'
                c.add_source(source)

            if len(c['members']) == 0:
                # Some committees weren't staffed in early 2013;
                # opting to skip rather than blow up the whole scrape.
                return

            if c['members']:
                self.save_committee(c)

    # Subcommittees
    div = doc.xpath('//div[contains(@class, "view-view-SubCommittee")]')[0]
    for subcom in div.xpath('div/div[@class="item-list"]'):
        committee = subcom.xpath('h4/text()')[0]
        names = subcom.xpath('descendant::a/text()')
        names = map(strip, names)
        urls = subcom.xpath('descendant::a/@href')
        committee = 'Standing Committee on ' + committee

        for n, _url in zip(names, urls):
            c = Committee(chamber, committee, subcommittee=n)
            c.add_source(_url)
            c.add_source(url)

            for member, role, kw in self.scrape_membernames(
                    c, _url, chamber, term):
                c.add_member(member, role, **kw)

            _found = False
            if len(c['members']) == 0:
                for member, role, kw in self.scrape_membernames(
                        c, _url + '/membersstaff', chamber, term):
                    _found = True
                    c.add_member(member, role, **kw)
            if _found:
                source = _url + '/membersstaff'
                c.add_source(source)

            if len(c['members']) == 0:
                # Some committees weren't staffed in early 2013;
                # opting to skip rather than blow up the whole scrape.
                return

            if c['members']:
                self.save_committee(c)
def scrape(self, session, chambers):
    year_slug = session[5:]

    # Load all committees via the private API
    committee_dump_url = \
        'http://legislature.vermont.gov/committee/loadList/{}/'.\
        format(year_slug)
    json_data = self.get(committee_dump_url).text
    committees = json.loads(json_data)['data']

    # Parse the information from each committee
    for info in committees:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.iteritems()}

        # Determine the chamber
        if info['CommitteeType'] == 'House Standing':
            chamber = 'lower'
        elif info['CommitteeType'] == 'Senate Standing':
            chamber = 'upper'
        elif info['CommitteeType'] == 'Joint Committee':
            chamber = 'joint'
        elif info['CommitteeType'] in ('Study Committee', 'Commissions'):
            if info['CommitteeName'].startswith("House"):
                chamber = 'lower'
            elif info['CommitteeName'].startswith("Senate"):
                chamber = 'upper'
            else:
                chamber = 'joint'
        else:
            raise AssertionError(
                "Unknown committee type found: '{}'".format(
                    info['CommitteeType']))

        comm = Committee(chamber=chamber, committee=info['CommitteeName'])

        # Determine membership and member roles.
        # First, parse the member list and make sure it isn't a placeholder
        REMOVE_TAGS_RE = r'<.*?>'
        members = [
            re.sub(REMOVE_TAGS_RE, '', x)
            for x in info['Members'].split('</br>')
        ]
        members = [x.strip() for x in members if x.strip()]
        for member in members:
            # Strip out titles, and exclude committee assistants
            if member.startswith("Rep. "):
                member = member[len("Rep. "):]
            elif member.startswith("Sen. "):
                member = member[len("Sen. "):]
            else:
                self.info("Non-legislator member found: {}".format(member))
                # assumption: skip non-legislators, as the comment above
                # intends
                continue

            # Determine the member's role in the committee
            if ',' in member:
                (member, role) = [x.strip() for x in member.split(',')]
                if 'jr' in role.lower() or 'sr' in role.lower():
                    raise AssertionError(
                        "Name suffix confused for a committee role")
            else:
                role = 'member'

            comm.add_member(legislator=member, role=role)

        comm.add_source(committee_dump_url)
        self.save_committee(comm)
def __missing__(self, key):
    # auto-create a joint Committee the first time a name is looked up
    val = Committee('joint', key)
    self[key] = val
    return val
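# The __missing__ hook above gives a dict subclass defaultdict-like
# behavior: the first lookup of a committee name creates, caches, and
# returns a new joint Committee. Hypothetical usage, assuming the method
# lives on a dict subclass named JointCommitteeDict:
#
#     joint_comms = JointCommitteeDict()
#     joint_comms['Legislative Audit'].add_member('Jane Doe')  # auto-created
#     joint_comms['Legislative Audit'].add_member('John Roe')  # reused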
def scrape_page(self, a, chamber, term):
    page, text = self.lxmlize(a.attrib['href'])
    committee = a.text_content()

    twitter_ids = re.findall(r"setUser\('(.*)'\)", text)
    twitter_id = twitter_ids[0] if twitter_ids != [] else None

    roles = {
        ", Chair": "chair",
        ", Vice-Chair": "member"
    }

    committee = Committee(chamber, committee, twitter=twitter_id)
    committee.add_source(a.attrib['href'])

    tables = page.xpath("//table")
    added = False
    seen_people = set([])
    for table in tables:
        people = table.xpath(".//a[contains(@href, 'MemberDetailPage')]")
        for person in people:
            person = person.text_content().strip()
            role = "member"
            for flag in roles:
                if person.endswith(flag):
                    role = roles[flag]
                    person = person[:-len(flag)].strip()
            if person in seen_people:
                continue
            if not person:
                continue
            seen_people.add(person)
            committee.add_member(person, role)
            added = True

    if added:
        self.save_committee(committee)
        return

    tables = page.xpath("//table")
    added = False
    seen_people = set([])
    for table in tables:
        if "committee members" in table.text_content().lower():
            for person in table.xpath(".//td/text()"):
                person = person.strip()
                if person != "":
                    if person in seen_people:
                        continue
                    seen_people.add(person)
                    committee.add_member(person, "member")
                    added = True

    if added:
        self.save_committee(committee)
        return

    self.warning("Unable to scrape!")
def scrape_upper(self, chamber, term):
    url = 'http://senate.ca.gov/committees'
    doc = self.lxmlize(url)

    standing_committees = doc.xpath(
        '//h2[text()="Standing Committees"]/../following-sibling::div//a')
    sub_committees = doc.xpath(
        '//h2[text()="Sub Committees"]/../following-sibling::div//a')
    joint_committees = doc.xpath(
        '//h2[text()="Joint Committees"]/../following-sibling::div//a')
    other_committees = doc.xpath(
        '//h2[text()="Other"]/../following-sibling::div//a')

    for committee in (standing_committees + sub_committees +
                      joint_committees + other_committees):
        (comm_name, ) = committee.xpath('text()')
        comm = Committee(chamber=chamber, committee=comm_name)

        (comm_url, ) = committee.xpath('@href')
        comm.add_source(comm_url)
        comm_doc = self.lxmlize(comm_url)

        if comm_name.startswith("Joint"):
            comm['chamber'] = 'joint'
            comm['committee'] = (comm_name.replace("Joint ", "")
                                 .replace("Committee on ", "")
                                 .replace(" Committee", ""))

        if comm_name.startswith("Subcommittee"):
            (full_comm_name, ) = comm_doc.xpath(
                '//div[@class="banner-sitename"]/a/text()')
            full_comm_name = re.search(
                r'^Senate (.*) Committee$', full_comm_name).group(1)
            comm['committee'] = full_comm_name

            comm_name = re.search(
                r'^Subcommittee.*?on (.*)$', comm_name).group(1)
            comm['subcommittee'] = comm_name

        members = comm_doc.xpath(
            '//a[(contains(@href, "/sd") or '
            'contains(@href, "assembly.ca.gov/a")) and '
            '(starts-with(text(), "Senator") or '
            'starts-with(text(), "Assembly Member"))]/text()')
        for member in members:
            if not member.strip():
                continue

            (mem_name, mem_role) = re.search(
                r'''(?ux)
                ^(?:Senator|Assembly\sMember)\s  # Legislator title
                (.+?)                            # Capture the full name
                (?:\s\((.{2,}?)\))?              # Role may follow in parens
                (?:\s\([RD]\))?                  # Optional party affiliation
                \s*$
                ''', member).groups()
            comm.add_member(legislator=mem_name,
                            role=mem_role if mem_role else 'member')

        assert comm['members'], \
            "No members found for committee {}".format(comm_name)
        self.save_committee(comm)
def select_special_comm(self):
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)

        for comm_names in page.xpath('//div[@class="content_box"]'):
            name = comm_names.xpath('h2')[0].text
            if name is not None:
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'Chairperson'
                        senator = senator[5:-13].strip()
                    else:
                        role = 'member'
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
            else:
                name = comm_names.xpath('h2/a')[0].text
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'chairperson'
                        senator = senator[5:-13].strip()
                    else:
                        role = 'member'
                        senator = senator[5:].strip()
                    committee.add_member(senator, role)
                self.save_committee(committee)
def _scrape_upper_chamber(self, session, chamber):
    self.log('Scraping upper chamber for committees.')

    if self._is_post_2015:
        url = '{base}{year}web/standing-committees'.format(
            base=self._senate_url_base, year=session[2:])
        comm_container_id = 'primary'
    else:
        url = '{base}{year}info/com-standing.htm'.format(
            base=self._senate_url_base, year=session[2:])
        comm_container_id = 'mainContent'

    page = self.lxmlize(url)

    comm_links = self.get_nodes(
        page,
        '//div[@id = "{}"]//p/a'.format(comm_container_id))

    for comm_link in comm_links:
        # Normalize to uppercase - varies between "Assigned bills"
        # and "Assigned Bills"
        if "ASSIGNED BILLS" in comm_link.text_content().upper():
            continue

        comm_link = comm_link.attrib['href']

        if self._is_post_2015:
            if "web" not in comm_link:
                continue
        else:
            if "comm" not in comm_link:
                continue

        comm_page = self.lxmlize(comm_link)

        if self._is_post_2015:
            comm_name = self.get_node(
                comm_page, '//h1[@class="entry-title"]/text()')
            members = self.get_nodes(
                comm_page, '//div[@id="bwg_standart_thumbnails_0"]/a')
        else:
            comm_name = self.get_node(
                comm_page, '//div[@id="mainContent"]/p/text()')
            members = self.get_nodes(
                comm_page, '//div[@id="mainContent"]//td/a')

        comm_name = comm_name.replace(' Committee', '')
        comm_name = comm_name.strip()

        committee = Committee(chamber, comm_name)

        for member in members:
            mem_link = member.attrib["href"]
            if "mem" not in mem_link:
                continue

            mem_parts = member.text_content().strip().split(',')

            # Senator title stripping, mainly for post-2015.
            mem_name = re.sub(r'^Senator\s+', '', mem_parts[0])

            # MO once omitted the comma between a member's name and his
            # district; the trailing district number gives it away.
            try:
                int(mem_name[-4:-2])
            except ValueError:
                pass
            else:
                mem_name = " ".join(mem_name.split(" ")[0:-1])

            # A role, when present, is the final comma-separated part;
            # duplicating it guarantees mem_parts[2] exists and holds it,
            # while role-less entries fall through to 'member'.
            mem_parts.append(mem_parts[-1])

            mem_role = 'member'
            if len(mem_parts) > 2:
                mem_role = mem_parts[2].lower()

            if mem_name == "":
                continue

            committee.add_member(mem_name, role=mem_role)

        committee.add_source(url)
        committee.add_source(comm_link)
        self.save_committee(committee)
def scrape_senate_committees(self, term_name, chamber):
    years = [t[2:] for t in term_name.split('-')]

    for year in years:
        if int(year) > int(str(dt.datetime.now().year)[2:]):
            self.log("Not running session %s, it's in the future." % (
                term_name))
            continue

        url = '{base}{year}info/com-standing.htm'.format(
            base=self.senate_url_base, year=year)
        page_string = self.get(url).text
        page = lxml.html.fromstring(page_string)
        comm_links = page.xpath('//div[@id = "mainContent"]//p/a')

        for comm_link in comm_links:
            if "Assigned bills" in comm_link.text_content():
                continue

            comm_link = comm_link.attrib['href']
            if "comm" not in comm_link:
                continue

            comm_page = lxml.html.fromstring(self.get(comm_link).text)

            comm_name = comm_page.xpath(
                "//div[@id='mainContent']/p/text()")[0].strip()
            comm_name = comm_name.replace(' Committee', '')
            comm_name = comm_name.strip()

            committee = Committee(chamber, comm_name)
            members = comm_page.xpath("//div[@id='mainContent']//li/a")

            for member in members:
                mem_link = member.attrib["href"]
                if "members" not in mem_link:
                    continue

                mem_parts = member.text_content().strip().split(',')
                mem_name = mem_parts[0]

                # MO once omitted the comma between a member's name and
                # his district; the trailing district number gives it away.
                try:
                    int(mem_name[-4:-2])
                except ValueError:
                    pass
                else:
                    mem_name = " ".join(mem_name.split(" ")[0:-1])

                # A role, when present, is the final comma-separated part;
                # duplicating it guarantees mem_parts[2] exists and holds
                # it, while role-less entries fall through to 'member'.
                mem_parts.append(mem_parts[-1])

                mem_role = 'member'
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower()

                if mem_name == "":
                    continue

                committee.add_member(mem_name, role=mem_role)

            committee.add_source(url)
            committee.add_source(comm_link)
            self.save_committee(committee)
def scrape_house(self):
    url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
    comm_cache = {}
    text = self.get(url).text
    page = lxml.html.fromstring(text)

    for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
        cells = row.xpath('td')
        name = cells[0].xpath('string()').strip()

        if name.startswith('Vacant'):
            continue

        font = cells[1]
        committees = []

        # Committee names sit in the cell's text and in the tails of
        # its <br> elements.
        if font is not None and font.text:
            committees.append(font.text.strip())
        for br in font.xpath('br'):
            if br.text:
                committees.append(br.text.strip())
            if br.tail:
                committees.append(br.tail)

        for comm_name in committees:
            mtype = 'member'

            if comm_name.endswith(', Chairman'):
                mtype = 'chairman'
                comm_name = comm_name.replace(', Chairman', '')
            elif comm_name.endswith(', Co-Chairmain'):
                # "Co-Chairmain" is matched verbatim, presumably as it
                # is misspelled on the page.
                mtype = 'co-chairman'
                comm_name = comm_name.replace(', Co-Chairmain', '')
            elif comm_name.endswith(', Vice Chair'):
                mtype = 'vice chair'
                comm_name = comm_name.replace(', Vice Chair', '')
            elif comm_name.endswith(', Ex Officio'):
                mtype = 'ex officio'
                comm_name = comm_name.replace(', Ex Officio', '')
            elif comm_name.endswith(", Interim Member"):
                mtype = 'interim'
                comm_name = comm_name.replace(", Interim Member", "")

            if comm_name.startswith('Joint'):
                chamber = 'joint'
            else:
                chamber = 'lower'

            try:
                committee = comm_cache[comm_name]
            except KeyError:
                if comm_name.strip() == "":
                    continue
                committee = Committee(chamber, comm_name)
                committee.add_source(url)
                comm_cache[comm_name] = committee

            committee.add_member(name, mtype)

    special = self.scrape_house_special(comm_cache.keys())
    for name, comm in special.items():
        comm_cache[name] = comm

    for committee in comm_cache.values():
        self.save_committee(committee)
def scrape_committees_pdf(self, year, chamber, filename, url):
    if chamber == 'lower' and year == 2015:
        text = self._fix_house_text(filename)
    else:
        text = convert_pdf(filename, type='text-nolayout')

    # The PDF breaks some committee names across lines inconsistently,
    # so normalize the known offenders.
    for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
             'Judicial Branch, Law Enforcement, and Justice'),
            (r'Natural Resources and\s+Transportation',
             'Natural Resources and Transportation'),
            (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
             'Federal Relations, Energy, and Telecommunications')):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

    def is_committee_name(line):
        if '(cont.)' in line.lower():
            return False
        for s in ('committee', ' and ', 'business', 'resources',
                  'legislative', 'administration', 'government',
                  'local', 'planning', 'judicial', 'natural',
                  'general', 'health', 'human', 'education'):
            if s in line.lower():
                return True
        if line.istitle() and len(line.split()) == 1:
            return True
        return False

    def is_legislator_name(line):
        return re.search(r'\([RD]', line)

    comm = None
    in_senate_subcommittees = False

    while True:
        try:
            line = next(lines)
        except StopIteration:
            break

        # Replace Unicode variants (non-breaking space, Unicode hyphen)
        # with ASCII equivalents.
        line = line.replace(u'\xa0', u' ').replace(u'\u2010', u'-')

        if 'Subcommittees' in line:
            # These appear in both chambers' lists, so de-dup the
            # scraping.
            if chamber == 'lower':
                break
            elif chamber == 'upper':
                self.info("Beginning scrape of joint subcommittees")
                in_senate_subcommittees = True
                chamber = 'joint'
                continue

        if is_committee_name(line):
            subcommittee = None
            if in_senate_subcommittees:
                committee = 'Joint Appropriations/Finance & Claims'
                subcommittee = line.strip()
            else:
                committee = line.strip()

            if comm and comm['members']:
                self.save_committee(comm)

            comm = Committee(chamber, committee=committee,
                             subcommittee=subcommittee)
            comm.add_source(url)

        elif is_legislator_name(line):
            name, party = line.rsplit('(', 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            if re.search(' Ch', party):
                role = 'chair'
            elif ' VCh' in party:
                role = 'vice chair'
            elif ' MVCh' in party:
                role = 'minority vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)

    if comm and comm['members']:
        self.save_committee(comm)
def scrape_comm(self, url, chamber):
    data = self.post(url).json()['Data']

    for item in data:
        comm_name = item['CommitteeName']
        committee = Committee(chamber, comm_name)

        chair_man = str(item['ChairName'])
        vice_chair = str(item['ViceChairName'])
        comm_id = item['CommitteeId']
        comm_url = self.get_comm_url(chamber, comm_id, comm_name)
        members = self.scrape_member_info(comm_url)

        if vice_chair != 'None':
            committee.add_member(vice_chair, 'Vice-Chair')
        if chair_man != 'None':
            committee.add_member(chair_man, 'Chairman')

        for member in members:
            # The vice chair and chairman were already added above.
            if chair_man not in member and vice_chair not in member:
                member = " ".join(member.split())
                if member:
                    committee.add_member(member)

        committee.add_source(comm_url)
        committee.add_source(url)
        self.save_committee(committee)
def scrape(self, chamber, term):
    self.validate_term(term)
    session = self.get_session_for_term(term)
    try:
        session_id = self.get_session_id(session)
    except KeyError:
        raise NoDataForPeriod

    # Not getting the floor committees; maybe try it during the new
    # session:
    # for committee_type in ('S', 'F'):
    #     self.scrape_index(chamber, session, session_id, committee_type)

    url = base_url + 'xml/committees.asp?session=%s' % session_id

    with self.urlopen(url) as page:
        root = etree.fromstring(page.bytes, etree.XMLParser(recover=True))
        body = '//body[@Body="%s"]/committee' % {'upper': 'S',
                                                 'lower': 'H'}[chamber]
        for com in root.xpath(body):
            c_id, name, short_name, sub = com.values()
            # The really good thing about the AZ xml api is that its
            # committee element tells you whether this is a
            # subcommittee or not.
            if sub == '1':
                # The bad thing is that the committee names are no
                # longer consistent, so try to get the parent name:
                parent = name.split('Subcommittee')[0].strip()
                # ...and maybe the subcommittee's name.
                try:
                    name = name[name.index('Subcommittee'):]
                except ValueError:
                    # If that doesn't work out, fix it manually; that
                    # shouldn't be too hard, since the parent and
                    # subcommittee will be the same.
                    # self.log("I am my own grandpa: %s" % name)
                    pass
                c = Committee(chamber, parent, short_name=short_name,
                              subcommittee=name, session=session,
                              az_committee_id=c_id)
            else:
                c = Committee(chamber, name, short_name=short_name,
                              session=session, az_committee_id=c_id)

            c.add_source(url)

            # For some reason they don't always have any info on the
            # committees.
            try:
                self.scrape_com_info(session, session_id, c_id, c)
            except HTTPError:
                pass

            if not c['members']:
                continue
            self.save_committee(c)
def scrape_joint_committee(self, committee_name, url):
    com = Committee('joint', committee_name)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        if 'state.tn.us' in url:
            for el in page.xpath(
                    "//div[@class='Blurb']/table//tr"
                    "[2 <= position() and position() < 10]/td[1]/a"):
                member_name = el.text
                # Strip the fixed-width title prefix ("Senator ",
                # "Representative ", or a longer honorific).
                if 'Senator' in member_name:
                    member_name = member_name[8:]
                elif 'Representative' in member_name:
                    member_name = member_name[15:]
                else:
                    member_name = member_name[17:]
                com.add_member(member_name, 'member')

        elif 'gov-opps' in url:
            links = ['senate', 'house']
            for link in links:
                chamber_link = (self.base_href + '/' + link +
                                '/committees/gov-opps.html')
                with self.urlopen(chamber_link) as chamber_page:
                    chamber_page = lxml.html.fromstring(chamber_page)
                    for mem in chamber_page.xpath(
                            "//div[@class='col1']"
                            "/ul[position() <= 2]/li/a"):
                        member = [item.strip() for item in
                                  mem.text_content().split(',', 1)]
                        if len(member) > 1:
                            member_name, role = member
                        else:
                            member_name, role = member[0], 'member'
                        if member_name != "":
                            com.add_member(member_name, role)
                com.add_source(chamber_link)

        else:
            # If the member sections all state "TBA", skip saving this
            # committee.
            li_text = page.xpath(
                "//div[@class='col1']/ul[position() <= 3]/li/text()")
            if set(li_text) == set(['TBA']):
                return

            for el in page.xpath(
                    "//div[@class='col1']/ul[position() <= 3]/li/a"):
                member = [item.strip() for item in
                          el.text_content().split(',', 1)]
                if len(member) > 1:
                    member_name, role = member
                else:
                    member_name, role = member[0], 'member'
                if member_name != "":
                    com.add_member(member_name, role)

    com.add_source(url)
    self.save_committee(com)
def get_joint_committees_data(self, name, url):
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    committee = Committee('joint', name)

    def clean(text, extra):
        # Strip linebreaks, non-breaking spaces, and en-dashes from a
        # scraped string, along with the chamber title ("Sen."/"Rep.")
        # or a trailing comma.
        for ch in ['\r\n', u'\xa0', u'\u2013', extra]:
            if ch in text:
                text = text.replace(ch, ' ').encode(
                    'ascii', 'ignore').strip()
        return text

    table = html.xpath("//section[@class=' row-equal-height no-padding']")

    for td in table:
        # div[1] holds senate members, div[2] house members.
        for index, (title, mem_chamber) in enumerate(
                [('Sen.', 'senate'), ('Rep.', 'house')], start=1):
            members = td.xpath('div[%d]/div/div/div[2]/div/p/strong'
                               % index)
            if not members:
                continue

            member_string = list(members[0].itertext())
            mem_name = clean(member_string[0], title)

            if len(member_string) > 1:
                role = clean(member_string[1], ',')
                committee.add_member(mem_name, role=role,
                                     chamber=mem_chamber)
            else:
                committee.add_member(mem_name, chamber=mem_chamber)

    committee.add_source(url)
    self.save_committee(committee)
def scrape_current(self, chamber, term):
    if chamber == 'upper':
        chambers = ['special_committees', 'senate_committees']
    else:
        chambers = ['house_committees']

    committee_request = self.urlopen(ksapi.url + 'ctte/')
    committee_json = json.loads(committee_request)

    for com_type in chambers:
        committees = committee_json['content'][com_type]

        for committee_data in committees:
            # Set the chamber to joint if we are using the
            # special_committees.
            com_chamber = ('joint' if com_type == 'special_committees'
                           else chamber)

            committee = Committee(com_chamber, committee_data['TITLE'])

            com_url = ksapi.url + 'ctte/%s/' % committee_data['KPID']
            try:
                detail_json = self.urlopen(com_url)
            except scrapelib.HTTPError:
                self.warning("error fetching committee %s" % com_url)
                continue
            details = json.loads(detail_json)['content']

            for chair in details['CHAIR']:
                committee.add_member(chair['FULLNAME'], 'chairman')
            for vicechair in details['VICECHAIR']:
                committee.add_member(vicechair['FULLNAME'],
                                     'vice-chairman')
            for rankedmember in details['RMMEM']:
                committee.add_member(rankedmember['FULLNAME'],
                                     'ranking member')
            for member in details['MEMBERS']:
                committee.add_member(member['FULLNAME'])

            if not committee['members']:
                self.warning('skipping blank committee %s' %
                             committee_data['TITLE'])
            else:
                committee.add_source(com_url)
                self.save_committee(committee)
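# For reference, a hypothetical sketch of the ksapi payload shapes the
# scraper above consumes. Only the keys the code actually reads
# ('content', 'TITLE', 'KPID', 'CHAIR', 'VICECHAIR', 'RMMEM',
# 'MEMBERS', 'FULLNAME') are grounded in it; the sample values are
# invented.
_SAMPLE_CTTE_LIST = {
    'content': {
        'senate_committees': [{'TITLE': 'Ways and Means',
                               'KPID': 'ctte_s_wam_1'}],
        'special_committees': [],
        'house_committees': [],
    }
}
_SAMPLE_CTTE_DETAIL = {
    'content': {
        'CHAIR': [{'FULLNAME': 'Jane Doe'}],
        'VICECHAIR': [{'FULLNAME': 'John Roe'}],
        'RMMEM': [],
        'MEMBERS': [{'FULLNAME': 'Alex Poe'}],
    }
}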
def scrape_committees_pdf(self, year, chamber, filename, url):
    text = convert_pdf(filename, type='text-nolayout')

    # Hot garbage: the PDF breaks some committee names across lines,
    # so normalize the known offenders.
    for hotgarbage, replacement in (
            (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
             'Judicial Branch, Law Enforcement, and Justice'),
            (r'Natural Resources and\s+Transportation',
             'Natural Resources and Transportation'),
            (r'Federal Relations, Energy,\sand\sTelecommunications',
             'Federal Relations, Energy, and Telecommunications')):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

    def is_committee_name(line):
        if '(cont.)' in line.lower():
            return False
        for s in ('committee', ' and ', 'business', 'resources',
                  'legislative', 'administration', 'government',
                  'local', 'planning', 'judicial', 'natural',
                  'general', 'health', 'human'):
            if s in line.lower():
                return True
        if line.istitle() and len(line.split()) == 1:
            return True
        return False

    def is_legislator_name(line):
        return re.search(r'\([RD]', line)

    comm = None
    in_senate_subcommittees = False

    while True:
        try:
            line = next(lines)
        except StopIteration:
            break

        if 'Joint Appropriations/Finance &' in line:
            # Toss the line continuation and move on.
            next(lines)
            in_senate_subcommittees = True
            chamber = 'joint'
            continue

        if is_committee_name(line):
            subcommittee = None
            if in_senate_subcommittees:
                committee = 'Joint Appropriations/Finance & Claims'
                subcommittee = line
            else:
                committee = line

            if comm and comm['members']:
                self.save_committee(comm)

            comm = Committee(chamber, committee=committee,
                             subcommittee=subcommittee)
            comm.add_source(url)

        elif is_legislator_name(line):
            name, party = line.rsplit('(', 1)
            name = name.strip()
            if re.search('[^V] Ch', party):
                role = 'chair'
            elif 'V Ch' in party:
                role = 'vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)

    if comm and comm['members']:
        self.save_committee(comm)
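# Illustrative sketch, standalone and with hypothetical sample lines:
# the line classification both MT PDF scrapers above rely on. A member
# row is any line carrying a parenthesized party letter ("(R" or "(D");
# a committee heading is recognized by keyword. Only a few of the
# keywords are repeated here.
def _demo_classify(line):
    if re.search(r'\([RD]', line):
        return 'legislator'
    if any(s in line.lower() for s in
           ('committee', ' and ', 'resources', 'business')):
        return 'committee'
    return 'other'

# _demo_classify('Agriculture, Livestock, and Irrigation')  -> 'committee'
# _demo_classify('Jane Doe (R) Ch')                         -> 'legislator'
# _demo_classify('17')                                      -> 'other'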
def scrape(self, term, chambers):
    t = next((item for item in self.metadata["terms"]
              if item["name"] == term), None)
    session = max(t["sessions"])
    subcomms = self.get_subcommittee_info(session)

    api_base_url = "https://api.iga.in.gov"
    html_base_url = ("http://iga.in.gov/legislative/{}/committees/"
                     .format(session))

    client = ApiClient(self)
    r = client.get("committees", session=session)
    all_pages = client.unpaginate(r)

    for comm_info in all_pages:
        # This is kind of roundabout, but it's needed in order to take
        # advantage of all of our machinery that makes sure we're not
        # overloading their api.
        comm_link = comm_info["link"]
        comm_name = comm_link.split("/")[-1]

        if "withdrawn" in comm_name or "conference" in comm_name:
            continue

        try:
            comm_json = client.get("committee",
                                   committee_link=comm_link[1:])
        except HTTPError:
            self.logger.warning("Page does not exist")
            continue

        try:
            chamber = comm_json["chamber"]["name"]
        except KeyError:
            chamber = 'joint'
        else:
            if chamber == "Senate":
                chamber = "upper"
            elif chamber == "House":
                chamber = "lower"
            else:
                raise AssertionError(
                    "Unknown committee chamber {}".format(chamber))

        name = comm_json["name"]
        try:
            owning_comm = subcomms[name]
        except KeyError:
            name = name.replace("Statutory Committee on", "").strip()
            comm = Committee(chamber, name)
        else:
            name = name.replace("Statutory Committee on",
                                "").replace("Subcommittee", "").strip()
            comm = Committee(chamber, owning_comm, subcommittee=name)

        chair = self.process_special_members(comm, comm_json, "chair")
        vicechair = self.process_special_members(comm, comm_json,
                                                 "viceChair")
        ranking = self.process_special_members(comm, comm_json,
                                               "rankingMinMember")

        # Leadership is also listed in the membership, so make sure we
        # haven't already seen them.
        comm_members = [m for m in [chair, vicechair, ranking] if m]
        for mem in comm_json["members"]:
            mem_name = mem["firstName"] + " " + mem["lastName"]
            if mem_name not in comm_members:
                comm_members.append(mem_name)
                comm.add_member(mem_name)

        api_source = api_base_url + comm_link
        if comm_name[:10] == "committee_":
            # The HTML page only exists for links of the form
            # "committee_<name>"; guard so html_source is never unbound.
            html_source = html_base_url + comm_name[10:]
            comm.add_source(html_source)
        comm.add_source(api_source)
        self.save_committee(comm)
def scrape_joint_committee(self, committee_name, url):
    if 'state.tn.us' in url:
        com = Committee('joint', committee_name)
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        for el in page.xpath(
                "//div[@class='Blurb']/table//tr"
                "[2 <= position() and position() < 10]/td[1]"):
            if el.xpath('text()') == ['Vacant']:
                continue

            (member_name, ) = el.xpath('a/text()')
            if el.xpath('text()'):
                role = el.xpath('text()')[0].strip(' ,')
            else:
                role = 'member'

            # Strip the fixed-width title prefix from the name.
            if 'Senator' in member_name:
                member_name = member_name[8:]
            elif 'Representative' in member_name:
                member_name = member_name[15:]
            else:
                member_name = member_name[17:]

            com.add_member(member_name, role)

        com.add_source(url)
        self.save_committee(com)

    elif 'gov-opps' in url:
        com = Committee('joint', committee_name)
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        links = ['senate', 'house']
        for link in links:
            chamber_link = (self.base_href + '/' + link +
                            '/committees/gov-opps.html')
            chamber_page = self.get(chamber_link).text
            chamber_page = lxml.html.fromstring(chamber_page)

            OFFICER_SEARCH = ('//h2[contains(text(), "Committee Officers")]/'
                              'following-sibling::div/ul/li/a')
            MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                             'following-sibling::div/ul/li/a')

            for a in (chamber_page.xpath(OFFICER_SEARCH) +
                      chamber_page.xpath(MEMBER_SEARCH)):
                member_name = ' '.join([x.strip()
                                        for x in a.xpath('.//text()')
                                        if x.strip()])
                role = a.xpath('small')
                if role:
                    role = role[0].xpath('text()')[0].strip()
                else:
                    role = 'member'
                com.add_member(member_name, role)
            com.add_source(chamber_link)

        com.add_source(url)
        self.save_committee(com)

    else:
        self._scrape_committee(committee_name, url, 'joint')
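# Illustrative sketch of the fixed-width title stripping shared by the
# two TN joint-committee scrapers above. The slice offsets assume the
# prefix includes its trailing space: len('Senator ') == 8 and
# len('Representative ') == 15; the 17-character fallback presumably
# matches some longer honorific on the page. Sample names are
# hypothetical.
def _demo_strip_title(member_name):
    if 'Senator' in member_name:
        return member_name[8:]
    elif 'Representative' in member_name:
        return member_name[15:]
    return member_name[17:]

# _demo_strip_title('Senator Jane Doe')         -> 'Jane Doe'
# _demo_strip_title('Representative John Roe')  -> 'John Roe'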
def scrape(self, chamber, term):
    urls = {
        'upper': 'http://legis.delaware.gov/LIS/LIS%s.nsf/SCommittees',
        'lower': 'http://legis.delaware.gov/LIS/LIS%s.nsf/HCommittees'
    }

    # Mapping of term names to session numbers (see metadata).
    term2session = {
        "2015-2016": "148",
        "2013-2014": "147",
        "2011-2012": "146"
    }
    session = term2session[term]

    if chamber == "lower":
        # Only scrape joint committees once.
        self.scrape_joint_committees(term, session)

    url = urls[chamber] % (session, )
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    for row in page.xpath('//tr'):
        # The td check skips the header tr.
        if len(row.xpath('./td')) > 0:
            comm = row.xpath('.//a')[1]
            comm_name = comm.text_content().strip()
            comm_url = comm.attrib["href"]

            comm_page = lxml.html.fromstring(self.get(comm_url).text)
            comm_page.make_links_absolute(comm_url)

            committee = Committee(chamber, comm_name)
            committee.add_source(comm_url)
            committee.add_source(url)

            chair = comm_page.xpath(".//div[@class='sub_title']")
            chair = chair[0].text.replace("Chairman:", "").strip()
            committee.add_member(chair, "Chairman")

            for table in comm_page.xpath(".//table"):
                header, content = table.xpath(".//td")
                header = header.text_content().strip()
                content = content.text_content().strip()

                if "Vice" in header:
                    if content:
                        committee.add_member(content, "Vice-Chairman")
                elif header == "Members:":
                    for m in content.split("\n"):
                        committee.add_member(m.strip())

            self.save_committee(committee)