def scrape_senate_committee(self, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//h3/text()')[0]
    name = name.replace(' Committee', '')
    com = Committee(chamber='upper', committee=name)

    for member in doc.xpath('//div[@id="committeeright"]//a'):
        member_name = member.text.strip()

        # don't add clerks
        if member_name == 'Committee Clerk':
            continue
        # skip phone links (guard against anchors with no href)
        if member.get("href", "").startswith("tel:"):
            continue

        tail = member.tail or ''
        if 'Committee Chair' in tail:
            role = 'chair'
        elif 'Majority Vice' in tail:
            role = 'majority vice chair'
        elif 'Minority Vice' in tail:
            role = 'minority vice chair'
        else:
            role = 'member'

        com.add_member(member_name, role=role)

    com.add_source(url)
    self.save_committee(com)
def scrape_committee(self, name, url, chamber):
    com = Committee(chamber, name)
    com.add_source(url)
    data = self.get(url).text
    doc = lxml.html.fromstring(data)

    for leg in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        leg = leg.replace('Representative ', '')
        leg = leg.replace('Senator ', '')
        leg = leg.strip()
        if ' (' in leg:
            leg, role = leg.split(' (')
            if 'Vice-Chair' in role:
                role = 'vice-chair'
            elif 'Co-Chair' in role:
                role = 'co-chair'
            elif 'Chair' in role:
                role = 'chair'
            else:
                raise Exception('unknown role: %s' % role)
        else:
            role = 'member'
        com.add_member(leg, role)

    self.save_committee(com)
def scrape_committee(self, chamber, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//span[@class="committeeShortName"]/text()')
    if len(name) == 0:
        # Because of http://www.malegislature.gov/Committees/Senate/S29 this
        # XXX: hack had to be pushed in. Remove me ASAP. This just skips
        # malformed pages.
        self.warning("Had to skip this malformed page.")
        return
    name = name[0]

    com = Committee(chamber, name)
    com.add_source(url)

    # get both titles and names, order is consistent
    titles = doc.xpath('//p[@class="rankingMemberTitle"]/text()')
    names = doc.xpath('//p[@class="rankingMemberName"]/a/text()')

    for title, name in zip(titles, names):
        com.add_member(name, title)

    for member in doc.xpath('//div[@class="committeeRegularMembers"]//a/text()'):
        com.add_member(member)

    if com['members']:
        self.save_committee(com)
def scrape_assembly(self):
    """Scrape Assembly Committees"""
    assembly_committees_url = "http://assembly.state.ny.us/comm/"

    with self.urlopen(assembly_committees_url) as html:
        doc = lxml.html.fromstring(html)

    (standing_committees, subcommittees,
     legislative_commissions, task_forces) = doc.cssselect('#sitelinks ul')

    committee_paths = set(
        l.get('href') for l in standing_committees.cssselect("li a[href]")
        if l.get("href").startswith('?sec=mem'))

    for committee_path in committee_paths:
        committee_url = assembly_committees_url + committee_path
        with self.urlopen(committee_url) as chtml:
            cdoc = lxml.html.fromstring(chtml)

        for h in cdoc.cssselect("#content .pagehdg"):
            if h.text:
                committee_name = h.text.split('Committee Members')[0].strip()
                break

        committee = Committee("lower", committee_name)
        committee.add_source(committee_url)

        members = cdoc.cssselect("#sitelinks")[0]
        # the first member listed is the chair
        first = True
        for member in members.iter('span'):
            member = member.xpath('li/a')[0].text
            if first:
                committee.add_member(member, 'chair')
                first = False
            else:
                committee.add_member(member)

        self.save_committee(committee)
def select_special_comm(self):
    main_url = 'http://www.nebraskalegislature.gov/committees/select-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)

        for comm_names in page.xpath(
                '/html/body/div[@id="wrapper"]/div[@id="content"]'
                '/div[@id="content_text"]/div[@class="content_box_container"]'
                '/div[@class="content_box"]'):
            name = comm_names.xpath('h2')[0].text
            if name is not None:
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'Chairperson'
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
            else:
                name = comm_names.xpath('h2/a')[0].text
                committee = Committee('upper', name)
                committee.add_source(main_url)
                for senators in comm_names.xpath('ul[@class="nobullet"]/li'):
                    senator = senators[0].text
                    if 'Chairperson' in senator:
                        role = 'chairperson'
                        senator = senator[5:-13]
                    else:
                        role = 'member'
                        senator = senator[5:-1]
                    committee.add_member(senator, role)
                self.save_committee(committee)
def scrape_upper_committee(self, name, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        comm = Committee('upper', name)
        comm.add_source(url)

        member_div = page.xpath("//div[@class = 'committee-members']")[0]

        seen = set()
        for link in member_div.xpath(".//a"):
            if not link.text:
                continue
            member = link.text.strip()

            next_elem = link.getnext()
            if (next_elem is not None and
                    next_elem.tag == 'a' and
                    next_elem.attrib['href'] == link.attrib['href']):
                # Sometimes NY is cool and splits names across a
                # couple links
                member = "%s %s" % (member, next_elem.text.strip())

            member = re.sub(r'\s+', ' ', member)

            if member in seen or not member:
                continue
            seen.add(member)

            name, role = parse_name(member)
            comm.add_member(name, role)

        self.save_committee(comm)
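# `parse_name` above (and in the lower-committee scrapers further down) is a
# helper defined elsewhere in the repo and not shown in this section. A
# minimal sketch, assuming the site renders members as "Jane Doe (Chair)";
# the real helper may differ:
import re

def parse_name(member):
    """Split a trailing parenthesized role off of a member string."""
    match = re.match(r'(.+?)\s*\(([^)]+)\)\s*$', member)
    if match:
        return match.group(1).strip(), match.group(2).strip().lower()
    return member, 'member'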
def scrape_lower_committee(self, name, parent, url):
    page = lxml.html.fromstring(self.urlopen(url))
    page.make_links_absolute(url)

    if 'Joint' in name or (parent and 'Joint' in parent):
        chamber = 'joint'
    else:
        chamber = 'lower'

    if parent:
        comm = Committee(chamber, parent, subcommittee=name)
    else:
        comm = Committee(chamber, name)
    comm.add_source(url)

    xpath = "//a[contains(@href, 'District')]"
    for link in page.xpath(xpath):
        member = link.xpath('string()').strip()
        member = re.sub(r'\s+', ' ', member)
        if not member:
            continue
        match = re.match(r'((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)', member)
        member = match.group(4).strip()
        role = match.group(1) or 'member'
        comm.add_member(member, role.lower())

    self.save_committee(comm)
def standing_comm(self):
    main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
    with self.urlopen(main_url) as page:
        page = lxml.html.fromstring(page)

        for comm_links in page.xpath(
                '/html/body/div[@id="wrapper"]/div[@id="content"]'
                '/div[@id="content_text"]/div[@class="content_box_container"]'
                '/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a'):
            detail_link = comm_links.attrib['href']

            with self.urlopen(detail_link) as detail_page:
                detail_page = lxml.html.fromstring(detail_page)
                name = detail_page.xpath(
                    '/html/body[@class="home blog"]/div[@id="page"]'
                    '/div[@id="content"]/div[@class="content_header"]'
                    '/div[@class="content_header_right"]/a')[0].text
                # drop the final word of the heading to get the bare name
                comm_name = ' '.join(name.split()[:-1])

                committee = Committee('upper', comm_name)
                for senators in detail_page.xpath(
                        '/html/body[@class="home blog"]/div[@id="page"]'
                        '/div[@id="sidebar"]/ul[1]/li[1]/ul/li/a'):
                    senator = senators.text
                    if 'Chairperson' in senator:
                        role = 'Chairperson'
                        senator = senator[6:-13]
                    else:
                        role = 'member'
                        senator = senator[6:-1]
                    committee.add_member(senator, role)
                committee.add_source(main_url)
                committee.add_source(detail_link)
                self.save_committee(committee)
def scrape(self, chamber, term):
    self.validate_term(term, latest_only=True)

    chamber_abbr = {'upper': 's', 'lower': 'h'}[chamber]

    url = "http://le.utah.gov/asp/interim/standing.asp?house=%s" % chamber_abbr
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        for comm_link in page.xpath("//a[contains(@href, 'Com=')]"):
            comm_name = comm_link.text.strip()

            # Drop leading "House" or "Senate" from name
            comm_name = re.sub(r"^(House|Senate) ", "", comm_name)

            comm = Committee(chamber, comm_name)

            for mbr_link in comm_link.xpath(
                    "../../../font[2]/a[not(contains(@href, 'mailto'))]"):
                name = mbr_link.text.strip()

                next_el = mbr_link.getnext()
                if next_el is not None and next_el.tag == 'i':
                    mtype = next_el.text.strip()
                else:
                    mtype = 'member'

                comm.add_member(name, mtype)

            self.save_committee(comm)
def scrape_approp_subcommittees(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    for strong in doc.xpath('//strong'):
        com = Committee(chamber='upper', committee='Appropriations',
                        subcommittee=strong.text.strip())
        com.add_source(url)

        legislators = strong.getnext().tail.replace('Senators', '').strip()
        for leg in re.split(', | and ', legislators):
            if leg.endswith('(C)'):
                role = 'chairman'
                leg = leg[:-4]
            elif leg.endswith('(VC)'):
                role = 'vice chairman'
                leg = leg[:-5]
            elif leg.endswith('(MVC)'):
                role = 'minority vice chairman'
                leg = leg[:-6]
            else:
                role = 'member'
            com.add_member(leg, role=role)

        self.save_committee(com)
def scrape(self, term, chambers):
    com_url = 'http://www.dccouncil.washington.dc.us/committees'
    data = self.urlopen(com_url)
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(com_url)

    urls = set(doc.xpath('//a[contains(@href, "committee-on")]/@href'))

    for url in urls:
        data = self.urlopen(url)
        doc = lxml.html.fromstring(data)

        try:
            name = doc.xpath('//h1/text()')[0].replace('Committee on ', '')
        except IndexError:
            name = doc.xpath('//h2/text()')[0].replace('Committee on ', '')

        # skip link to Committees page
        if name == 'Committees':
            continue

        com = Committee('upper', name)

        for chair in doc.xpath(
                '//h3[text()="Committee Chair"]/following-sibling::p'):
            com.add_member(chair.text_content(), role='chairperson')

        for member in doc.xpath(
                '//h3[text()="Councilmembers"]/following-sibling::ul//a'):
            com.add_member(member.text_content(), role='member')

        com.add_source(url)
        self.save_committee(com)
def scrape_upper_committee(self, url):
    filename, resp = self.urlretrieve(url)
    root = lxml.etree.fromstring(convert_pdf(filename, 'xml'))

    for link in root.xpath('/pdf2xml/page'):
        comm = None
        for line in link.findall('text'):
            text = line.findtext('b')
            if text is not None and text.startswith('Comisi'):
                comm = Committee('upper', text)
                comm.add_source(url)
            elif line.text and line.text.startswith('Hon.'):
                line_text = line.text.replace(u'–', '-')
                name_split = line_text.split(u'-', 1)

                title = 'member'
                if len(name_split) >= 2:
                    name_split[1] = name_split[1].strip()
                    if name_split[1] in ('Presidenta', 'Presidente'):
                        title = 'chairman'
                    elif name_split[1] in ('Vicepresidente', 'Vicepresidenta'):
                        title = 'vicechairman'
                    elif name_split[1] in ('Secretaria', 'Secretario'):
                        title = 'secretary'

                if name_split[0] != 'VACANTE' and comm is not None:
                    comm.add_member(name_split[0].replace('Hon.', ''), title)

        if comm is not None:
            self.save_committee(comm)

    os.remove(filename)
def scrape_committee(self, term, chambers, href, name):
    page = self.get(href).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(href)

    members = page.xpath("//div[@class='view-content']"
                         "//a[contains(@href, 'members')]")

    if "/joint/" in href:
        chamber = "joint"
    elif "/senate/" in href:
        chamber = "upper"
    elif "/house/" in href:
        chamber = "lower"
    else:
        self.warning("Unknown chamber for %s" % href)
        return

    cttie = Committee(chamber, name)
    for a in members:
        member = a.text
        role = a.xpath("ancestor::div/h2[@class='pane-title']/text()")[0]
        role = {"Legislative Members": "member",
                "Chairman": "chair",
                "Vice Chairman": "member"}[role]
        if member is None or member.startswith("District"):
            continue
        cttie.add_member(member, role=role)

    cttie.add_source(href)
    self.save_committee(cttie)
def scrape_joint_committee(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//h1/text()') or doc.xpath('//h2/text()')
    name = name[0].strip()

    comm = Committee('joint', name)
    comm.add_source(url)

    members = chain(doc.xpath('//a[contains(@href, "MemberId")]'),
                    doc.xpath('//a[contains(@href, "Senators")]'))
    seen = set()
    for a in members:
        parent_content = a.getparent().text_content()
        if ':' in parent_content:
            title = parent_content.split(':')[0].strip()
        else:
            title = 'member'

        name = a.text.split(' (')[0].strip()
        if (name, title) not in seen:
            comm.add_member(name, title)
            seen.add((name, title))

    if comm['members']:
        self.save_committee(comm)
def _scrape_upper_committee(self, name, url2):
    cat = "Assignments.asp"
    url3 = "".join((url2, cat))

    committee = Committee('upper', name)
    committee.add_source(url2)

    page = self.lxmlize(url3)

    members = page.xpath('//table[@id="table38"]//font/a/b')
    for link in members:
        role = "member"
        if link == members[0]:
            role = "Chairman"
        if link == members[1]:
            role = "Vice-Chairman"

        name = link.xpath('string()')
        name = name.replace('Senator ', '')
        name = re.sub(r'\s{2,}', ' ', name).strip()

        committee.add_member(name, role)

    self.save_committee(committee)
def scrape_upper(self):
    url = ("http://senadopr.us/Lists/Listado%20de%20Comisiones/"
           "Comisiones%20del%20Senado.aspx")
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        table = doc.xpath(
            '//table[@id="{C05AFE0D-D977-4033-8D7B-C43ABF948A4A}-'
            '{3E52C91B-AFC8-4493-967A-C8A47AC4E7B6}"]')
        for link in table[0].iterchildren("tr"):
            td_column = list(link)
            name = td_column[0].find("a")
            if name is not None:
                com_source = name.get("href")
                # if committee does not have a url use the default.
                if com_source == "http://senadopr.us/":
                    com_source = url
                com_name = name.text

                # check the committee name to see if it's a joint one.
                if td_column[1].text == u"Comisi\xf3n Conjunta":
                    chamber = "joint"
                else:
                    chamber = "upper"

                com = Committee(chamber, com_name)
                com.add_source(com_source)
                com.add_member(clean_spaces(td_column[2].find("a").text),
                               "chairman")
                self.save_committee(com)
def scrape_senate_committee(self, name, url2):
    cat = "Assignments.asp"
    url3 = "".join((url2, cat))

    committee = Committee("upper", name)
    committee.add_source(url2)

    text = self.get(url3).text
    page = lxml.html.fromstring(text)

    members = page.xpath('//table[@id="table38"]//font/a/b')
    for link in members:
        role = "member"
        if link == members[0]:
            role = "Chairman"
        if link == members[1]:
            role = "Vice-Chairman"

        name = link.xpath("string()")
        name = name.replace("Senator ", "")
        name = re.sub(r"\s{2,}", " ", name).strip()

        committee.add_member(name, role)

    self.save_committee(committee)
def select_special_comm(self): main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php" with self.urlopen(main_url) as page: page = lxml.html.fromstring(page) for comm_names in page.xpath('//div[@class="content_box"]'): name = comm_names.xpath("h2")[0].text if name != None: committee = Committee("upper", name) committee.add_source(main_url) for senators in comm_names.xpath('ul[@class="nobullet"]/li'): senator = senators[0].text if "Chairperson" in senator: role = "Chairperson" senator = senator[5:-13].strip() else: role = "member" senator = senator[5:].strip() committee.add_member(senator, role) self.save_committee(committee) else: name = comm_names.xpath("h2/a")[0].text committee = Committee("upper", name) committee.add_source(main_url) for senators in comm_names.xpath('ul[@class="nobullet"]/li'): senator = senators[0].text if "Chairperson" in senator: role = "chairperson" senator = senator[5:-13].strip() else: role = "member" senator = senator[5:].strip() committee.add_member(senator, role) self.save_committee(committee)
def scrape_committee(self, chamber, name, url):
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    if page.xpath("//h3[. = 'Joint Committee']"):
        chamber = 'joint'

    subcommittee = page.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" not in subcommittee:
        subcommittee = None

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'member=')]"):
        member = link.text.strip()

        mtype = link.xpath("string(../preceding-sibling::td[1])")
        mtype = mtype.strip(": \r\n\t").lower()

        comm.add_member(member, mtype)

    if not comm['members']:
        self.warning('not saving %s, appears to be empty' % name)
    else:
        self.save_committee(comm)
def scrape_senate_comm(self):
    url = 'http://www.maine.gov/legis/senate/Senate-Standing-Committees.html'
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    # committee titles
    for item in doc.xpath('//span[@style="FONT-SIZE: 11pt"]'):
        text = item.text_content().strip()
        # some contain COMMITTEE ON & some are blank, drop those
        if not text or text.startswith('COMMITTEE'):
            continue

        # titlecase committee name
        com = Committee('upper', text.title())
        com.add_source(url)

        # up two levels and get the ul sibling
        for leg in item.xpath('../../following-sibling::ul[1]/li'):
            lname = leg.text_content().strip()
            role = 'chair' if 'Chair' in lname else 'member'
            lname = lname.split(' of ')[0].strip()
            com.add_member(lname, role)

        self.save_committee(com)
def scrape_senate_committee(self, term, link):
    with self.urlopen(link) as html:
        doc = lxml.html.fromstring(html)

        # strip first 30 and last 10 characters:
        # "Minnesota Senate Committees - __________ Committee"
        committee_name = doc.xpath('//title/text()')[0][30:-10]

        com = Committee('upper', committee_name)

        # rows before the first "Position: Name" line are plain members
        role = 'member'

        # first id=bio table is members
        for row in doc.xpath('//table[@id="bio"]')[0].xpath('tr'):
            row = fix_whitespace(row.text_content())

            # switch role
            if ':' in row:
                position, name = row.split(': ')
                role = position.lower().strip()
            else:
                name = row

            # add the member
            com.add_member(name.strip(), role)

        com.add_source(link)
        self.save_committee(com)
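# `fix_whitespace` above is another helper defined elsewhere in the repo. A
# plausible sketch, under the assumption that it only collapses runs of
# whitespace (including non-breaking spaces) into single spaces:
import re

def fix_whitespace(text):
    return re.sub(r'\s+', ' ', text.replace(u'\xa0', ' ')).strip()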
def scrape_house_committee(self, committee_name, link):
    """Scrape individual committee page and add members"""
    html = self.urlopen(link)
    doc = lxml.html.fromstring(html)

    subcommittee = False
    for h1 in doc.xpath('//h1/text()'):
        if 'subcommittee' in h1.lower():
            subcommittee = True

    subcomm_name = 'Subcommittee' if subcommittee else None
    if subcommittee:
        committee_name = committee_name.replace(' Subcommittee', '')

    com = Committee('lower', committee_name, subcomm_name)

    find_expr = "//div[@class='col1']/ul[position()<3]/li/a"
    for a in doc.xpath(find_expr):
        name = a.text
        role = (a.tail or '').strip(', ') or 'member'
        if name:
            com.add_member(name, role)

    com.add_source(link)
    if com['members']:
        self.save_committee(com)
def scrape_reps_comm(self):
    url = 'http://www.maine.gov/legis/house/hsecoms.htm'
    page = self.urlopen(url)
    root = lxml.html.fromstring(page)

    count = 0
    for n in range(1, 12, 2):
        path = 'string(//body/center[%s]/h1/a)' % (n)
        comm_name = root.xpath(path)
        committee = Committee('lower', comm_name)
        count += 1

        path2 = '/html/body/ul[%s]/li/a' % (count)
        for el in root.xpath(path2):
            rep = el.text
            if rep.find('(') != -1:
                mark = rep.find('(')
                rep = rep[15:mark].strip()
            if 'chair' in rep.lower():
                role = 'chair'
                rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
            else:
                role = 'member'
            committee.add_member(rep, role)

        committee.add_source(url)
        self.save_committee(committee)
def scrape_reps_comm(self):
    url = 'http://www.maine.gov/legis/house/hsecoms.htm'
    with self.urlopen(url) as page:
        root = lxml.html.fromstring(page)

        count = 0
        for n in range(1, 12, 2):
            path = 'string(//body/center[%s]/h1/a)' % (n)
            comm_name = root.xpath(path)
            committee = Committee('lower', comm_name)
            count += 1

            path2 = '/html/body/ul[%s]/li/a' % (count)
            for el in root.xpath(path2):
                rep = el.text
                if rep.find('(') != -1:
                    mark = rep.find('(')
                    rep = rep[15:mark]
                committee.add_member(rep)

            committee.add_source(url)
            self.save_committee(committee)
def scrape_senate_committee(self, url):
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//h6/text()')[0]
    com = Committee(chamber='upper', committee=name)

    for member in doc.xpath('//div[@id="committeelist"]//a'):
        member_name = member.text.strip()

        # don't add clerks
        if member_name == 'Committee Clerk':
            continue

        tail = member.tail or ''
        if 'Committee Chair' in tail:
            role = 'chair'
        elif 'Majority Vice' in tail:
            role = 'majority vice chair'
        elif 'Minority Vice' in tail:
            role = 'minority vice chair'
        else:
            role = 'member'

        com.add_member(member_name, role=role)

    com.add_source(url)
    self.save_committee(com)
def scrape(self, chamber, term): url = "http://www.assembly.ab.ca/net/index.aspx?p=membership_list" html = self.urlopen(url) doc = lxml.html.fromstring(html) doc.make_links_absolute(url) committees = doc.xpath('//div[@id="_ctl0_Panel_committees"]') divs = committees[0].xpath("div")[1:] for div in divs[:]: if "class" in div.attrib and div.attrib["class"] == "committeetype_header": divs.remove(div) divs = iter(divs) while True: try: name, _, content = itertools.islice(divs, 3) except ValueError, StopIteration: break committee_name = name.text_content()[4:] committee = Committee("lower", committee_name) for td in content.xpath("table/descendant::td"): if td.xpath('a[contains(@href, "number")]'): name = td.xpath("a")[0].text_content() role = (td.xpath("a")[0].tail or "").strip("() ") committee.add_member(name, role or "member") xpath = 'table/descendant::td/a[contains(@href, "committees")]/@href' committee_url = content.xpath(xpath).pop() committee.add_source(url) committee.add_source(committee_url) self.save_committee(committee)
def _scrape_lower_special_committees(self):
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    committee_list = page.xpath('//table[@id="table106"]//div[@class='
                                '"exBody1A"]/div[@class="accordion"]')[0]
    headers = committee_list.xpath('./h3')

    for header in headers:
        committee_name_text = header.xpath('string()')
        committee_name = committee_name_text.strip()
        committee_name = self._normalize_committee_name(committee_name)

        chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

        committee = Committee(chamber, committee_name)
        committee.add_source(url)

        committee_memberlist = header.xpath(
            './following-sibling::div[@class="pane"]'
            '//tr[@class="linkStyle2"]')

        for row in committee_memberlist:
            member_name = row.xpath('normalize-space(string(./td[1]))')
            member_name = ' '.join(filter(None, name_tools.split(member_name)))
            member_role = row.xpath('normalize-space(string(./td[2]))')
            member_role = self._normalize_member_role(member_role)
            committee.add_member(member_name, member_role)

        self.save_committee(committee)
def standing_comm(self): main_url = "http://www.nebraskalegislature.gov/committees/standing-committees.php" with self.urlopen(main_url) as page: page = lxml.html.fromstring(page) for comm_links in page.xpath( '//div[@id="content_text"]/div[@class="content_box_container"]/div[@class="content_box"][1]/ul[@class="nobullet"]/li/a' ): detail_link = comm_links.attrib["href"] with self.urlopen(detail_link) as detail_page: detail_page = lxml.html.fromstring(detail_page) name = detail_page.xpath( '//div[@id="content"]/div[@class="content_header"]/div[@class="content_header_right"]/a' )[0].text name = name.split() name = name[0:-1] comm_name = "" for x in range(len(name)): comm_name += name[x] + " " comm_name = comm_name[0:-1] committee = Committee("upper", comm_name) for senators in detail_page.xpath('//div[@id="sidebar"]/ul[1]/li[1]/ul/li/a'): senator = senators.text if "Chairperson" in senator: role = "Chairperson" senator = senator[6:-13].strip() else: role = "member" senator = senator[6:].strip() committee.add_member(senator, role) committee.add_source(main_url) committee.add_source(detail_link) self.save_committee(committee)
def scrape(self, chamber, term):
    if term != '2011-2012':
        raise NoDataForPeriod(term)

    if chamber == 'upper':
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/senators_ca.cfm')
    else:
        url = ('http://www.legis.state.pa.us/cfdocs/legis/'
               'home/member_information/representatives_ca.cfm')

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        committees = {}

        for li in page.xpath("//a[contains(@href, 'bio.cfm')]/../.."):
            name = li.xpath("string(b/a[contains(@href, 'bio.cfm')])")
            # drop the trailing 4 characters of the linked name
            name = name[0:-4]

            for link in li.xpath("a"):
                if not link.tail:
                    continue

                committee_name = link.tail.strip()
                committee_name = re.sub(r"\s+", " ", committee_name)
                subcommittee_name = None
                role = 'member'

                rest = link.getnext().text
                if rest:
                    match = re.match(r',\s+(Subcommittee on .*)\s+-', rest)
                    if match:
                        subcommittee_name = match.group(1)
                        role = rest.split('-')[1].strip().lower()
                    else:
                        role = rest.replace(', ', '').strip().lower()
                    if role == 'chairman':
                        role = 'chair'

                try:
                    committee = committees[(chamber, committee_name,
                                            subcommittee_name)]
                except KeyError:
                    committee = Committee(chamber, committee_name)
                    committee.add_source(url)
                    if subcommittee_name:
                        committee['subcommittee'] = subcommittee_name
                    committees[(chamber, committee_name,
                                subcommittee_name)] = committee

                committee.add_member(name, role)

        for committee in committees.values():
            self.save_committee(committee)
def scrape_house_special(self, scraped_committees):
    url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
    text = self.get(url).text
    page = lxml.html.fromstring(text)
    page.make_links_absolute('http://house.louisiana.gov')

    committees = {}
    for el in page.xpath("//a[contains(@href,'H_Cmtes/')]"):
        comm_name = el.xpath('normalize-space(string())')
        comm_name = self.normalize_committee_name(comm_name)

        # skip committees that have already been scraped from
        # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
        if comm_name not in scraped_committees:
            comm_url = el.get('href').replace('../', '')
            try:
                text = self.get(comm_url).text
            except HTTPError:
                self.logger.warning("Link not working, skipping.")
                continue

            # check for no record found
            if re.search('No records returned.', text):
                self.logger.warning("No record found, skipping.")
                continue

            chamber = 'joint' if comm_name.startswith('Joint') else 'lower'
            committee = Committee(chamber, comm_name)
            committee.add_source(url)

            page = lxml.html.fromstring(text)
            page.make_links_absolute('http://house.louisiana.gov')

            for row in page.xpath('//table[@id="table1"]//tbody/tr'):
                member_info = row.xpath('./td')
                mname = member_info[0].xpath('normalize-space(string())')
                mtype = member_info[1].xpath('normalize-space(string())')
                if mtype == 'Chairman':
                    mtype = 'chairman'
                elif mtype == 'Co-Chairmain':
                    mtype = 'co-chairmain'
                elif mtype == 'Vice Chair':
                    mtype = 'vice chair'
                elif mtype == 'Ex Officio':
                    mtype = 'ex officio'
                elif mtype == 'Interim Member':
                    mtype = 'interim'
                else:
                    mtype = 'member'
                committee.add_member(mname, mtype)

            committees[comm_name] = committee

    return committees
def scrape_committee(self, committee_url, committee_name, chamber):
    page = self.lxmlize(committee_url, ignore=[500])
    if page is None:
        return

    people = page.xpath("//div[@id='membership']//tbody/tr")

    c = Committee(chamber=chamber, committee=committee_name)
    for row in people:
        role, who = [x.text_content().strip() for x in row.xpath("./td")]
        c.add_member(who, role=role)

    c.add_source(committee_url)
    self.save_committee(c)
def scrape_senate(self):
    """Scrape Senate Committees"""
    for name, comm in nyss_billyslation.models.committees.items():
        name = name.title().replace('And', 'and')
        committee = Committee('upper', name)
        for member in comm.members:
            committee.add_member(member.fullname)
        self.save_committee(committee)
def scrape_lower_committee(self, name, url):
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        comm = Committee('lower', name)
        comm.add_source(url)

        for link in page.xpath("//a[contains(@href, 'mem?ad')]"):
            member = link.text.strip()
            comm.add_member(member)

        self.save_committee(comm)
def scrape_page(self, a, chamber, term):
    page, text = self.lxmlize(a.attrib['href'])

    committee_name = a.text_content()
    twitter_ids = re.findall(r"setUser\('(.*)'\)", text)
    twitter_id = twitter_ids[0] if twitter_ids != [] else None

    roles = {", Chair": "chair",
             ", Vice-Chair": "member"}

    committee = Committee(chamber, committee_name, twitter=twitter_id)
    committee.add_source(a.attrib['href'])

    tables = page.xpath("//table[@width='545' or @width='540']")
    added = False
    seen_people = set([])
    for table in tables:
        people = table.xpath(
            ".//a[contains(@href, 'MemberDetailPage')]/text()")
        for person in [x.strip() for x in people]:
            role = "member"
            for flag in roles:
                if person.endswith(flag):
                    role = roles[flag]
                    person = person[:-len(flag)].strip()
            if person in seen_people:
                continue
            seen_people.add(person)
            committee.add_member(person, role)
            added = True

    if added:
        self.save_committee(committee)
        return

    tables = page.xpath("//table[@width='466']")
    added = False
    seen_people = set([])
    for table in tables:
        if "committee members" in table.text_content().lower():
            for person in table.xpath(".//td/text()"):
                person = person.strip()
                if person != "":
                    if person in seen_people:
                        continue
                    seen_people.add(person)
                    committee.add_member(person, "member")
                    added = True

    if added:
        self.save_committee(committee)
        return

    self.warning("Unable to scrape!")
def scrape_reps_committees(self, term_name, chamber):
    url = '{base}ActiveCommittees.aspx'.format(base=self.reps_url_base)

    with self.urlopen(url) as page_string:
        page = lxml.html.fromstring(page_string)
        table = page.xpath('id("contentdata")/table[1]')[0]

        # Last tr has the date
        trs = table.xpath('tr')[:-1]
        for tr in trs:
            committee_parts = [part.strip()
                               for part in tr.text_content().split(',')]
            committee_name = committee_parts[0].title().strip()
            # a row without a comma has no status field
            status = (committee_parts[1].strip()
                      if len(committee_parts) > 1 else None)

            committee_url = tr.xpath('td/a')[0].attrib.get('href')
            committee_url = '{base}{url}'.format(base=self.reps_url_base,
                                                 url=committee_url)

            actual_chamber = chamber
            if committee_name.startswith('Joint'):
                actual_chamber = 'joint'

            committee = Committee(actual_chamber, committee_name,
                                  status=status)

            with self.urlopen(committee_url) as committee_page_string:
                committee_page = lxml.html.fromstring(committee_page_string)

                # First tr has the title (sigh)
                mem_trs = committee_page.xpath('id("memGroup")/tr')[1:]
                for mem_tr in mem_trs:
                    mem_code = None
                    mem_links = mem_tr.xpath('td/a[1]')
                    if len(mem_links):
                        mem_code = mem_links[0].attrib.get('href')

                    # Output is "Rubble, Barney, Neighbor"
                    mem_parts = mem_tr.text_content().strip().split(',')
                    if self.no_members_text in mem_parts:
                        continue
                    mem_name = (mem_parts[1].strip() + ' ' +
                                mem_parts[0].strip())
                    # Sometimes Senator abbreviation is in the name
                    mem_name = mem_name.replace('Sen. ', '')

                    mem_role = 'member'
                    if len(mem_parts) > 2:
                        # Handle the case where there is a comma in the
                        # role name
                        mem_role = ', '.join(
                            [p.strip() for p in mem_parts[2:]]).lower()

                    committee.add_member(mem_name, role=mem_role,
                                         _code=mem_code)

            committee.add_source(url)
            committee.add_source(committee_url)
            self.save_committee(committee)
def scrape(self, chamber, term):
    if chamber == 'lower':
        # Committee members from both houses are listed
        # together. So, we'll only scrape once.
        return None

    session = None

    # Even though each term spans two years, committee
    # memberships don't appear to change. So we only
    # need to scrape the first year of the term.
    for t in self.metadata["terms"]:
        if term == t["name"]:
            session = t['sessions'][-1]
            break
    else:
        raise NoDataForPeriod(term)

    list_url = self.urls["list"] % (session, )
    committees = {}
    page = self.urlopen(list_url)
    page = lxml.html.fromstring(page)

    for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
        committees[el.text.strip()] = el.get("href")

    for c in committees:
        self.log(c)
        detail_url = self.urls["detail"] % (committees[c], )
        page = self.urlopen(detail_url)
        page = lxml.html.fromstring(page)

        if re.match(r'\d{1,2}-', c):
            c = c.split('-', 1)[1]
        jcomm = Committee('joint', c.strip())

        for table in page.xpath(
                ".//table[contains(@id, 'CommitteeMembers')]"):
            rows = table.xpath(".//tr")
            chamber = rows[0].xpath('.//td')[0].text_content().strip()
            chamber = 'upper' if chamber == 'Senator' else 'lower'
            comm = Committee(chamber, c.strip())

            for row in rows[1:]:
                tds = row.xpath('.//td')
                name = tds[0].text_content().strip()
                role = ('chairman' if tds[3].text_content().strip() ==
                        'Chairman' else 'member')
                comm.add_member(name, role, chamber=chamber)
                jcomm.add_member(name, role, chamber=chamber)

            comm.add_source(detail_url)
            self.save_committee(comm)

        jcomm.add_source(detail_url)
        self.save_committee(jcomm)
def _committee_data(lines, chamber, url, name_dict):
    '''Given a list of lines of committee data from a td element on the
    committees page, extract the committee name and the members, and
    return a committee object. Also return the name dict, in case the
    calling function needs it for something.
    '''
    name_pattern = r'\s{,20}(?:(.+)\:)?\s{,20}(.+?) \((?:\w\-([^)]+))'

    # Functions to identify unused data.
    junk = [lambda s: s != 'On Call',
            lambda s: 'Staff:' not in s,
            lambda s: 'Secretary:' not in s,
            lambda s: s.strip(),
            lambda s: not s.isupper()]

    # Toss unused data.
    for j in junk:
        lines = list(filter(j, lines))

    if (len(lines) < 2) or (u'\xa0' in lines):
        return

    lines = lines[::-1]
    kw = {'chamber': chamber}
    kw['committee'] = lines.pop().strip()

    if lines[-1].startswith('Meets'):
        kw['meetings_info'] = lines.pop().strip()

    c = Committee(**kw)

    for name in reversed(lines):
        kwargs = {}
        m = re.search(name_pattern, name)
        if m:
            title, name, city = m.groups()
            if title:
                title = title.lower()

            house = re.search(r'(Sen\.|Rep\.)\s+', name)
            if house:
                house = house.group()
                if 'Sen.' in house:
                    kwargs['chamber'] = 'upper'
                elif 'Rep.' in house:
                    kwargs['chamber'] = 'lower'
                name = name.replace(house, '').strip()

            name_dict[city.lower()].add(name)
            c.add_member(name, role=(title or 'member'), **kwargs)

    c.add_source(url)

    return name_dict, c
def scrape_joint_comm(self, chamber, session):
    fileurl = 'http://www.maine.gov/legis/house/commlist.xls'

    joint = urllib.urlopen(fileurl).read()
    # write the spreadsheet in binary mode
    with open('me_joint.xls', 'wb') as f:
        f.write(joint)

    wb = xlrd.open_workbook('me_joint.xls')
    sh = wb.sheet_by_index(0)

    cur_comm_name = ''
    chamber = 'joint'
    committee = None

    for rownum in range(1, sh.nrows):
        comm_name = sh.cell(rownum, 0).value

        first_name = sh.cell(rownum, 3).value
        middle_name = sh.cell(rownum, 4).value
        last_name = sh.cell(rownum, 5).value
        jrsr = sh.cell(rownum, 6).value
        full_name = " ".join((first_name, middle_name, last_name, jrsr))

        party = sh.cell(rownum, 7).value
        legalres = sh.cell(rownum, 8).value
        address1 = sh.cell(rownum, 9).value
        address2 = sh.cell(rownum, 10).value
        town = sh.cell(rownum, 11).value
        state = sh.cell(rownum, 12).value
        zipcode = int(sh.cell(rownum, 13).value)
        phone = str(sh.cell(rownum, 14).value)
        home_email = sh.cell(rownum, 15).value
        leg_email = sh.cell(rownum, 16).value

        leg_chamber = sh.cell(rownum, 2).value
        chair = sh.cell(rownum, 1).value

        role = "member"
        if chair == 1:
            role = leg_chamber + " " + "Chair"

        # rows are grouped by committee; start a new one on a name change
        if comm_name != cur_comm_name:
            cur_comm_name = comm_name
            committee = Committee(chamber, comm_name)
            committee.add_source(fileurl)

        committee.add_member(full_name, role=role, party=party,
                             legalres=legalres, address1=address1,
                             address2=address2, town=town, state=state,
                             zipcode=zipcode, phone=phone,
                             home_email=home_email, leg_email=leg_email)
        # re-saving after each row leaves the final, complete membership
        self.save_committee(committee)
def scrape_session(self, term, chambers, session):
    sid = self.metadata['session_details'][session]['_guid']
    committees = self.cservice.GetCommitteesBySession(sid)

    # if committees.strip() == "":
    #     return  # If we get here, it's a problem.
    # Commenting this out for future debugging. - PRT

    committees = committees['CommitteeListing']
    for committee in committees:
        cid = committee['Id']
        committee = self.cservice.GetCommittee(cid)

        name, typ, guid, code, description = [
            committee[x] for x in
            ['Name', 'Type', 'Id', 'Code', 'Description']]
        chamber = {"House": "lower",
                   "Senate": "upper",
                   "Joint": "joint"}[typ]

        ctty = None
        if code in self.ctty_cache:
            ctty = self.ctty_cache[code]
            if ((ctty['chamber'] != chamber) and
                    (description and 'joint' in description.lower())):
                ctty['chamber'] = 'joint'
            else:
                ctty = None

        if ctty is None:
            ctty = Committee(chamber, name, code=code, _guid=guid,
                             description=description)
            self.ctty_cache[code] = ctty

        members = committee['Members']['CommitteeMember']
        for member in members:
            name = "{First} {Last}".format(**dict(member['Member']['Name']))
            role = member['Role']
            ctty.add_member(name, role, _guid=member['Member']['Id'])

        ctty.add_source(self.csource)
        ctty.add_source(CTTIE_URL.format(**{"sid": sid, "cttie": guid}))
        self.save_committee(ctty)
def test_committee():
    c = Committee('upper', 'committee name')
    c.add_member('Washington', role='chairman')
    c.add_member('Filmore', note='note')
    assert_equal(c['members'],
                 [{'name': 'Washington', 'role': 'chairman'},
                  {'name': 'Filmore', 'role': 'member', 'note': 'note'}])
def get_jmfc(self, name, url):
    """Gets the Joint Millennium Fund Committee info"""
    with self.urlopen(url) as jmfc_page:
        html = lxml.html.fromstring(jmfc_page)
        committee = Committee('joint', name)

        table = html.xpath('//table')[2]
        for row in table.xpath('tbody/tr'):
            senate, house = [td.text.replace('\r\n', ' ').replace(u'\xa0', ' ')
                             for td in row.xpath('td')]
            # drop the title prefixes; str.strip('Sen.') would also eat
            # leading letters of the name itself
            committee.add_member(*senate.replace('Sen.', '').strip().split(','))
            committee.add_member(*house.replace('Rep.', '').strip().split(','))

        committee.add_source(url)
        self.save_committee(committee)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    if subcommittee:
        split_sub = subcommittee.split('-')
        if len(split_sub) > 1:
            subcommittee = '-'.join(split_sub[1:])
        subcommittee = re.sub(r'^(HOUSE|SENATE)\s+', '',
                              subcommittee.strip())

    if (name, subcommittee) in self._seen:
        return
    self._seen.add((name, subcommittee))

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for tr in page.xpath('//table[@class="gridtable"]/'
                             'tr[position()>1]'):
            if tr.xpath('string(td[1])'):
                mtype = tr.xpath('string(td[1])')
            else:
                mtype = 'member'

            member = tr.xpath('string(td[3])').split()
            title = member[0]
            member = ' '.join(member[1:])

            if title == 'Senator':
                mchamber = 'upper'
            elif title == 'Representative':
                mchamber = 'lower'
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        for a in page.xpath('//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib['href'])
            self.scrape_committee(chamber, name, sub_url,
                                  subcommittee=sub_name)

    if not comm['members']:
        self.warning('not saving empty committee %s' % name)
    else:
        self.save_committee(comm)
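# `urlescape` above comes from the scraper utilities and is not shown in
# this section. A hedged sketch, assuming it only percent-encodes characters
# (such as spaces) that are unsafe in an href, via the Python 2 stdlib:
import urllib

def urlescape(url):
    return urllib.quote(url, safe=':/?&=%')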
def scrape_upper_committee(self, name, url):
    page = lxml.html.fromstring(self.urlopen(url))

    comm = Committee('upper', name)
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'biographies')]"):
        member = link.xpath("string()").strip()
        member = re.sub(r'\s+', ' ', member)
        if not member:
            continue
        comm.add_member(member)

    self.save_committee(comm)
def scrape_joint_committees(self, term, session):
    url = ("http://legis.delaware.gov/legislature.nsf/testside.html"
           "?OpenPage&BaseTarget=right")
    page = self.lxmlize(url)

    joint_comms = page.xpath("//a[text()='Joint Committees']")
    comm_list = joint_comms[0].getnext()

    for li in comm_list.xpath("./li/a"):
        comm_name = li.text
        comm_link = li.attrib["href"]
        if comm_name.strip() == "Sunset":
            # I don't even want to go into it.
            new_link = ("http://legis.delaware.gov/Sunset/"
                        "Sunset.nsf/general+Info/JSC+Members?opendocument")
            assert new_link != comm_link, \
                "Remove Sunset Committee special casing"
            comm_link = new_link

        committee = Committee("joint", comm_name)
        committee.add_source(comm_link)

        comm_page = self.lxmlize(comm_link)
        people = comm_page.xpath("//a/b")

        things_to_replace = ["Senator", "Representative", "(D)", "(R)",
                             "House Minority Whip", "House Majority Whip",
                             "Senate Minority Whip", "Senate Majority Whip",
                             "House Minority Leader", "House Majority Leader",
                             "Senate Minority Leader", "Senate Majority Leader",
                             "President Pro Tempore", "Speaker of the House"]

        for person in people:
            person_name = person.text_content()
            for thing in things_to_replace:
                person_name = person_name.replace(thing, "")
            person_name = person_name.strip().strip(",")

            role = "Member"
            if person_name.strip()[-1] == ")":
                person_name, role = person_name.rsplit("(", 1)
                role = role.replace(")", "").strip()
            elif ", Vice-Chair" in person_name:
                role = "Vice-Chair"
                person_name = person_name.replace(", Vice-Chair", "")
            elif ", Chair" in person_name:
                role = "Chair"
                person_name = person_name.replace(", Chair", "")

            person_name = person_name.strip().strip(",").strip()
            committee.add_member(person_name, role)

        self.save_committee(committee)
def scrape_house_committee(self, name, url):
    com = Committee('lower', name)
    com.add_source(url)

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')

        # all members are tails of images (they use img tags for bullets)
        # first three members are in the directiva div
        chair = directiva.xpath(
            'b[text()="Presidente:"]/following-sibling::img[1]')
        vchair = directiva.xpath(
            'b[text()="Vice Presidente:"]/following-sibling::img[1]')
        sec = directiva.xpath(
            'b[text()="Secretario(a):"]/following-sibling::img[1]')

        if chair:
            com.add_member(clean_spaces(chair[0].tail), 'chairman')
        if vchair:
            com.add_member(clean_spaces(vchair[0].tail), 'vice chairman')
        if sec:
            com.add_member(clean_spaces(sec[0].tail), 'secretary')

        for img in reps.xpath('.//img'):
            com.add_member(clean_spaces(img.tail))

    self.save_committee(com)
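# `clean_spaces`, used here and in scrape_upper above, is another helper not
# shown in this section. A minimal sketch, assuming it normalizes internal
# whitespace and tolerates the None tails produced by the img-bullet markup:
import re

def clean_spaces(text):
    if text is None:
        return None
    return re.sub(r'\s+', ' ', text).strip()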
def scrape_lower_committee(self, name, url):
    com = Committee('lower', name)
    com.add_source(url)

    doc = self.lxmlize(url)

    contact, directiva, reps = doc.xpath('//div[@class="sbox"]/div[2]')

    # all members are tails of images (they use img tags for bullets)
    # first three members are in the directiva div
    chair = directiva.xpath(
        'b[text()="Presidente:"]/following-sibling::img[1]')
    vchair = directiva.xpath(
        'b[text()="Vice Presidente:"]/following-sibling::img[1]')
    sec = directiva.xpath(
        'b[text()="Secretario(a):"]/following-sibling::img[1]')

    member = 0
    if chair and chair[0].tail is not None:
        com.add_member(clean_spaces(chair[0].tail), 'chairman')
        member += 1
    if vchair and vchair[0].tail is not None:
        com.add_member(clean_spaces(vchair[0].tail), 'vice chairman')
        member += 1
    if sec and sec[0].tail is not None:
        com.add_member(clean_spaces(sec[0].tail), 'secretary')
        member += 1

    for img in reps.xpath('.//img'):
        member_name = clean_spaces(img.tail)
        if member_name is not None:
            com.add_member(member_name)
            member += 1

    if member > 0:
        self.save_committee(com)
def scrape_comm(self, chamber, term_name):
    url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
    with self.urlopen(url) as comm_page:
        root = lxml.etree.fromstring(comm_page, lxml.etree.HTMLParser())

        if chamber == 'h':
            chamber = 'lower'
        else:
            chamber = 'upper'

        for mr in root.xpath('//committee'):
            name = mr.xpath('string(name)')
            comm = Committee(chamber, name)

            chair = mr.xpath('string(chair)')
            chair = chair.replace(', Chairman', '')
            if chair:
                comm.add_member(chair, role='Chairman')

            vice_chair = mr.xpath('string(vice_chair)')
            vice_chair = vice_chair.replace(', Vice-Chairman', '')
            if vice_chair:
                comm.add_member(vice_chair, role='Vice-Chairman')

            members = mr.xpath('string(members)').split(';')
            for leg in members:
                # guard against empty entries from trailing semicolons
                leg = leg.strip()
                if leg:
                    comm.add_member(leg)

            comm.add_source(url)
            self.save_committee(comm)
def get_joint_committees_data(self, name, url):
    page = self.get(url).text
    html = lxml.html.fromstring(page)
    committee = Committee('joint', name)

    table = html.xpath("//section[@class=' row-equal-height no-padding']")

    def _clean(text, extra_tokens):
        # strip layout artifacts (newlines, nbsp, en-dash) and the given
        # extra tokens (a title prefix or a trailing comma)
        for ch in ['\r\n', u'\xa0', u'\u2013'] + extra_tokens:
            if ch in text:
                text = text.replace(ch, ' ').encode('ascii', 'ignore').strip()
        return text

    for td in table:
        # senate members sit in the first column div, house in the second
        for chamber, prefix, div_idx in (('senate', 'Sen.', 1),
                                         ('house', 'Rep.', 2)):
            members = td.xpath('div[%d]/div/div/div[2]/div/p/strong' % div_idx)
            if not members:
                continue

            member_string = list(members[0].itertext())
            member_name = _clean(member_string[0], [prefix])
            if len(member_string) > 1:
                role = _clean(member_string[1], [','])
                committee.add_member(member_name, role=role, chamber=chamber)
            else:
                committee.add_member(member_name, chamber=chamber)

    committee.add_source(url)
    self.save_committee(committee)
def scrape_lower_committee(self, name, url):
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    comm = Committee('lower', name)
    comm.add_source(url)

    for link in page.xpath("//a[contains(@href, 'mem?ad')]"):
        member = link.text.strip()
        member = re.sub(r'\s+', ' ', member)
        name, role = parse_name(member)
        comm.add_member(name, role)

    self.save_committee(comm)
def get_jlfc(self, name, url):
    """Gets info for the Joint Legislative Oversight Committee"""
    jlfc_page = self.urlopen(url)
    html = lxml.html.fromstring(jlfc_page)
    committee = Committee('joint', name)

    member_path = '//h3[contains(text(), "%s")]/following-sibling::p[1]'
    for chamber in ('Senate', 'House'):
        members = html.xpath(member_path % chamber)[0]\
                      .text_content().split('\r\n')
        for member in members:
            if member.strip():
                committee.add_member(
                    *member.replace(u'\xa0', ' ').split(','),
                    chamber=_REV_CHAMBERS[chamber.lower()])

    committee.add_source(url)
    self.save_committee(committee)
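# `_REV_CHAMBERS` above maps a lowercased chamber word to a billy chamber
# code; the mapping is inferred from how it is indexed above and may live in
# a shared constants module. A sketch of the assumed mapping:
_REV_CHAMBERS = {'senate': 'upper', 'house': 'lower'}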
def scrape(self, chamber, term):
    self.validate_term(term, latest_only=True)

    chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]

    url = ('http://www.leg.state.vt.us/legdir/comms.cfm?Body=%s&Session=2014'
           % chamber_abbr)
    html = self.urlopen(url)
    page = lxml.html.fromstring(html)

    for li in page.xpath("//li"):
        # Strip the room number from the committee name
        comm_name = re.match(r'[^\(]+', li.text_content()).group(0).strip()

        # Strip chamber from beginning of committee name
        comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '', comm_name)

        # normalize case of committee name
        comm_name = comm_name.title()

        comm = Committee(chamber, comm_name)
        comm.add_source(url)

        for tr in li.xpath("../../following-sibling::tr"):
            name = tr.text_content().strip()

            # Break when we reach the next committee
            if 'COMMITTEE' in name:
                break

            match = re.search(
                r'^([\w\s\.]+),\s+'
                r'(Chair|Vice Chair|Vice-Chair|Ranking Member|Clerk)$', name)
            if match:
                name = match.group(1)
                mtype = match.group(2).lower()
            else:
                mtype = 'member'

            if not name.startswith(DOUBLED_NAMES):
                name = re.sub(r'of [\w\s\.]+$', '', name)

            comm.add_member(name, mtype)

        if comm['members']:
            self.save_committee(comm)
def scrape_senate_committees(self, term_name, chamber):
    years = [t[2:] for t in term_name.split('-')]

    for year in years:
        if int(year) > int(str(dt.datetime.now().year)[2:]):
            self.log("Not running session %s, it's in the future." %
                     term_name)
            continue

        url = '{base}{year}info/com-standing.htm'.format(
            base=self.senate_url_base, year=year)
        page_string = self.urlopen(url)
        page = lxml.html.fromstring(page_string)
        ps = page.xpath('id("mainContent")/table/*[3]/p')

        for p in ps:
            links = p.xpath('a[1]')
            if not links:
                continue
            a = links[0]
            committee_name = a.text_content().strip()
            committee_url = a.attrib.get('href')

            if 'joint' in committee_name.lower():
                c = "joint"
            else:
                c = chamber

            committee = Committee(c, committee_name)

            committee_page_string = self.urlopen(committee_url)
            committee_page = lxml.html.fromstring(committee_page_string)
            lis = committee_page.xpath(
                "//div[@id='mainContent']/ul/ul[1]/li")
            if len(lis) == 0:
                # This MIGHT cause issues.
                lis = committee_page.xpath("//div[@id='mainContent']//li")

            for li in lis:
                mem_parts = li.text_content().strip().split(',')
                mem_name = mem_parts[0]
                mem_role = 'member'
                if len(mem_parts) > 2:
                    mem_role = mem_parts[2].lower()
                if mem_name == "":
                    continue
                committee.add_member(mem_name, role=mem_role)

            committee.add_source(url)
            committee.add_source(committee_url)
            self.save_committee(committee)
def scrape_committee(self, chamber, url):
    html = self.get(url, verify=False).text
    doc = lxml.html.fromstring(html)

    name = doc.xpath('//title/text()')[0]
    com = Committee(chamber, name)
    com.add_source(url)

    members = doc.xpath('//a[contains(@href, "/Legislators/Profile")]')
    for member in members:
        title = member.xpath('../span')
        role = title[0].text.lower() if title else 'member'
        com.add_member(member.text, role)

    if com['members']:
        self.save_committee(com)
def scrape_house_special(self, scraped_committees):
    url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
    text = self.urlopen(url)
    page = lxml.html.fromstring(text)
    page.make_links_absolute('http://house.louisiana.gov')

    committees = {}
    for el in page.xpath("//a[contains(@href,'../H_Cmtes/')]"):
        comm_name = el.xpath('normalize-space(string())')
        comm_name = self.normalize_committee_name(comm_name)

        # skip committees that have already been scraped from
        # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
        if comm_name not in scraped_committees:
            comm_url = el.get('href').replace('../', '')
            committees[comm_name] = comm_url

    for name, url in committees.items():
        chamber = 'joint' if name.startswith('Joint') else 'lower'
        committee = Committee(chamber, name)
        committee.add_source(url)

        text = self.urlopen(url)
        page = lxml.html.fromstring(text)
        page.make_links_absolute('http://house.louisiana.gov')

        for row in page.xpath('//table[@id="table1"]//tbody/tr'):
            member_info = row.xpath('./td')
            mname = member_info[0].xpath('normalize-space(string())')
            mtype = member_info[1].xpath('normalize-space(string())')
            if mtype == 'Chairman':
                mtype = 'chairman'
            elif mtype == 'Co-Chairmain':
                mtype = 'co-chairmain'
            elif mtype == 'Vice Chair':
                mtype = 'vice chair'
            elif mtype == 'Ex Officio':
                mtype = 'ex officio'
            elif mtype == 'Interim Member':
                mtype = 'interim'
            else:
                mtype = 'member'
            committee.add_member(mname, mtype)

        committees[name] = committee

    return committees
def _scrape_standing_committees(self):
    """Scrapes the Standing Committees page of the Nebraska state
    legislature."""
    main_url = 'http://www.nebraskalegislature.gov/committees/standing-committees.php'
    page = self.lxmlize(main_url)

    committee_nodes = self.get_nodes(
        page,
        '//div[@class="main-content"]/div[@class="panel panel-leg"][1]/'
        'div[@class="list-group"]/a[@class="list-group-item"]')

    for committee_node in committee_nodes:
        committee_page_url = committee_node.attrib['href']
        committee_page = self.lxmlize(committee_page_url)

        name_text = self.get_node(
            committee_page,
            '//div[@class="container view-front"]/div[@class="row"]/'
            'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]')
        # drop the final word of the heading to get the bare committee name
        committee_name = ' '.join(name_text.split()[:-1])

        committee = Committee('upper', committee_name)

        members = self.get_nodes(
            committee_page,
            '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
            'div[@class="block-box"][1]/ul[@class="list-unstyled '
            'feature-content"]/li/a/text()[normalize-space()]')

        for member in members:
            member_name = re.sub(r'\bSen\.\s+', '', member)
            member_name = re.sub(r', Chairperson', '', member_name).strip()
            if 'Chairperson' in member:
                member_role = 'Chairperson'
            else:
                member_role = 'member'
            committee.add_member(member_name, member_role)

        committee.add_source(main_url)
        committee.add_source(committee_page_url)

        self.save_committee(committee)
def scrape(self, chamber, term):
    if chamber == 'lower':
        # Committee members from both houses are listed
        # together. So, we'll only scrape once.
        return None

    year = None

    # Even though each term spans two years, committee
    # memberships don't appear to change. So we only
    # need to scrape the first year of the term.
    for t in self.metadata["terms"]:
        if term == t["name"]:
            year = t["start_year"]
            break
    if not year:
        raise NoDataForPeriod(term)

    list_url = self.urls["list"] % (year, )
    committees = {}
    with self.urlopen(list_url) as page:
        page = lxml.html.fromstring(page)
        for el in page.xpath(".//a[contains(@href, 'CommitteeMembers')]"):
            committees[el.text] = el.get("href")

    for c in committees:
        self.log(c)
        detail_url = self.urls["detail"] % (committees[c],)
        with self.urlopen(detail_url) as page:
            page = lxml.html.fromstring(page)
            for table in page.xpath(
                    ".//table[contains(@id, 'CommitteeMembers')]"):
                rows = table.xpath(".//tr")
                chamber = rows[0].xpath('.//td')[0].text_content().strip()
                chamber = 'upper' if chamber == 'Senator' else 'lower'
                comm = Committee(chamber, c)

                for row in rows[1:]:
                    tds = row.xpath('.//td')
                    name = tds[0].text_content().strip()
                    role = ('chairman' if tds[3].text_content().strip() ==
                            'Chairman' else 'member')
                    self.log(name)
                    self.log(role)
                    comm.add_member(name, role)

                comm.add_source(detail_url)
                self.save_committee(comm)
def scrape_upper_committee(self, link, name):
    url = re.sub(r'\s+', '', link.attrib['href'])
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    comm = Committee('upper', name)
    comm.add_source(url)

    xpath = '//a[contains(@href, "?member=")]'
    for link in doc.xpath(xpath):
        name = link.text_content().strip()
        name = re.sub(r'^Delegate\s+', '', name)
        role = link.getnext().text or 'member'
        comm.add_member(name, role.strip())

    return comm
def scrape(self, chamber, term):
    self.validate_term(term, latest_only=True)

    chamber_abbr = {'upper': 'S', 'lower': 'H'}[chamber]

    url = ('http://www.leg.state.vt.us/lms/legdir/comms.asp?Body=%s' %
           chamber_abbr)
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        for li in page.xpath("//li"):
            # Strip the room number from the committee name
            comm_name = re.match(r'[^\(]+',
                                 li.text_content()).group(0).strip()

            # Strip chamber from beginning of committee name
            comm_name = re.sub(r'^(HOUSE|SENATE) COMMITTEE ON ', '',
                               comm_name)

            comm_name = comm_name.title()

            comm = Committee(chamber, comm_name)
            comm.add_source(url)

            for tr in li.xpath("../../following-sibling::tr"):
                # Break when we reach the next committee
                if tr.xpath("th/li"):
                    break

                name = tr.xpath("string()").strip()

                match = re.search(
                    r'^([\w\s\.]+),\s+'
                    r'(Chair|Vice Chair|Ranking Member|Clerk)$', name)
                if match:
                    name = match.group(1)
                    mtype = match.group(2).lower()
                else:
                    mtype = 'member'

                name = re.sub(r'of [\w\s\.]+$', '', name)
                comm.add_member(name, mtype)

            self.save_committee(comm)
def process_committee(self, data):
    if data['classification'] != 'committee':
        return

    parent = parse_psuedo_id(data['parent_id'])
    chamber = parent['classification']

    if 'name' in parent:
        com = Committee(chamber, parent['name'], subcommittee=data['name'])
    else:
        com = Committee(chamber, data['name'])

    for member in self.memberships[data['_id']]:
        com.add_member(member['person_name'], role=member['role'])

    for source in data['sources']:
        com.add_source(source['url'])

    self.save_committee(com)
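# `parse_psuedo_id` (spelling as used above) is assumed to decode a
# pupa-style pseudo id -- a tilde followed by a JSON object, e.g.
# '~{"classification": "lower", "name": "Rules"}' -- into a dict. A hedged
# sketch of that convention:
import json

def parse_psuedo_id(pseudo_id):
    assert pseudo_id.startswith('~'), "expected a ~-prefixed pseudo id"
    return json.loads(pseudo_id[1:])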