def scrape(self, chamber, term):
    """Scrape Arkansas committees; only the 2011-2012 term is supported."""
    if term != '2011-2012':
        raise NoDataForPeriod

    base_url = ("http://www.arkleg.state.ar.us/assembly/2011/2011R/"
                "Pages/Committees.aspx?committeetype=")

    for comm_chamber, url_ext in COMM_TYPES.iteritems():
        listing_url = urlescape(base_url + url_ext)
        with self.urlopen(listing_url) as html:
            doc = lxml.html.fromstring(html)

            for link in doc.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                # colspan=2 signals a subcommittee, but it's easier to pick
                # those up from links on the committee page, so we do that
                # in scrape_committee() and skip it here.
                if link.attrib.get('colspan') == '2':
                    continue

                name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '', link.text).strip()
                comm_url = urlescape(link.attrib['href'])
                if comm_chamber == 'task_force':
                    comm_chamber = 'joint'
                self.scrape_committee(comm_chamber, name, comm_url)
def scrape(self, chamber, term):
    """Scrape Arkansas committees, deriving the listing URL from the term."""
    # Look up this term's start year in the scraper metadata.
    for termdict in self.metadata['terms']:
        if termdict['name'] == term:
            break
    start_year = termdict['start_year']

    base_url = ("http://www.arkleg.state.ar.us/assembly/%s/%sR/"
                "Pages/Committees.aspx?committeetype="
                ) % (start_year, start_year)

    for comm_chamber, url_ext in COMM_TYPES.iteritems():
        listing_url = urlescape(base_url + url_ext)
        doc = lxml.html.fromstring(self.urlopen(listing_url))

        for link in doc.xpath('//td[@class="dxtl dxtl__B0"]/a'):
            # colspan=2 signals a subcommittee; those are picked up from
            # links on the committee page in scrape_committee(), so skip.
            if link.attrib.get('colspan') == '2':
                continue

            name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '', link.text).strip()
            comm_url = urlescape(link.attrib['href'])
            if comm_chamber == 'task_force':
                comm_chamber = 'joint'
            self.scrape_committee(comm_chamber, name, comm_url)
def scrape_bill(self, chamber, session, url):
    """Scrape one TX bill history page, then pull its version files
    from the matching billtext listing, and save the bill."""
    with self.urlopen(url) as data:
        if "Bill does not exist." in data:
            return

        bill = self.parse_bill_xml(chamber, session, data)
        bill.add_source(urlescape(url))

        versions_url = url.replace('billhistory', 'billtext/html')
        # URLs for versions inexplicably (H|S)(J|C) instead of (H|J)(CR|JR)
        versions_url = versions_url.replace('JR', 'J').replace('CR', 'C')
        versions_url = '/'.join(versions_url.split('/')[0:-1])

        bill_prefix = bill['bill_id'].split()[0]
        bill_num = int(bill['bill_id'].split()[1])
        # Version filenames zero-pad the bill number to five digits.
        long_bill_id = "%s%05d" % (bill_prefix, bill_num)

        try:
            with self.urlopen(versions_url) as versions_list:
                bill.add_source(urlescape(versions_url))
                for version in parse_ftp_listing(versions_list):
                    if version.startswith(long_bill_id):
                        version_name = version.split('.')[0]
                        version_url = urlparse.urljoin(
                            versions_url + '/', version)
                        bill.add_version(version_name,
                                         urlescape(version_url))
        except urllib2.URLError:
            # Sometimes the text is missing
            pass

        self.save_bill(bill)
def scrape(self, chamber, term):
    """Scrape Arkansas committee listings for the given term."""
    # Find the start year for this term in the metadata.
    for termdict in self.metadata['terms']:
        if termdict['name'] == term:
            break
    start_year = termdict['start_year']

    base_url = ("http://www.arkleg.state.ar.us/assembly/%s/%sR/"
                "Pages/Committees.aspx?committeetype="
                ) % (start_year, start_year)

    for comm_chamber, url_ext in COMM_TYPES.iteritems():
        listing_url = urlescape(base_url + url_ext)
        doc = lxml.html.fromstring(self.get(listing_url).text)
        doc.make_links_absolute(listing_url)

        for link in doc.xpath('//td[@class="dxtl dxtl__B0"]/a'):
            # colspan=2 marks a subcommittee; those are reached via links
            # on the committee page in scrape_committee(), so skip here.
            if link.attrib.get('colspan') == '2':
                continue

            name = re.sub(r'\s*-\s*(SENATE|HOUSE)$', '', link.text).strip()
            comm_url = urlescape(link.attrib['href'])
            if comm_chamber == 'task_force':
                comm_chamber = 'joint'
            self.scrape_committee(comm_chamber, name, comm_url)
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one OK bill page (title, sponsors, actions, versions,
    vote pages) and save it; skip pages that fail to fetch."""
    try:
        page = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return

    title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    # Infer the bill type from the bill id suffix.
    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill["subjects"] = self.subject_map[bill_id]

    # Sponsor links: 'otherAuth' ids mark coauthors.
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        if "otherAuth" in link.attrib["id"]:
            bill.add_sponsor("coauthor", name)
        else:
            bill.add_sponsor("author", name)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    # Skip the two header rows.
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == "H":
            actor = "lower"
        elif actor == "S":
            actor = "upper"

        bill.add_action(actor, action, date, type=action_type(action))

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib["href"]
        # Committee-report links live in the same table but are not versions.
        if "COMMITTEE REPORTS" in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib["href"]))

    self.save_bill(bill)
def scrape(self, chamber, term):
    """Scrape WV legislators for the latest term via the district maps."""
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
        chamber_abbrev = 'sen'
        title_abbrev = 'sen'
    else:
        chamber_abbrev = 'hse'
        title_abbrev = 'del'

    url = ("http://www.legis.state.wv.us/districts/maps/%s_dist.cfm"
           % (chamber_abbrev))
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # Placeholder link texts that are not actual legislators.
    placeholders = [
        'Members', 'Senate Members', 'House Members', 'Vacancy', 'VACANT'
    ]
    view_url = '%smemview' % title_abbrev
    for anchor in doc.xpath("//a[contains(@href, '%s')]" % view_url):
        name = anchor.xpath("string()").strip()
        leg_url = urlescape(anchor.attrib['href'])
        if name in placeholders:
            continue
        self.scrape_legislator(chamber, term, name, leg_url)
def scrape(self, chamber, term):
    """Scrape WV legislators for the latest term from the district maps."""
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
        chamber_abbrev, title_abbrev = 'sen', 'sen'
    else:
        chamber_abbrev, title_abbrev = 'hse', 'del'

    url = ("http://www.legis.state.wv.us/districts/maps/%s_dist.cfm"
           % (chamber_abbrev))
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # Member profile links contain e.g. 'senmemview' / 'delmemview'.
    view_url = '%smemview' % title_abbrev
    for anchor in doc.xpath("//a[contains(@href, '%s')]" % view_url):
        name = anchor.xpath("string()").strip()
        leg_url = urlescape(anchor.attrib['href'])
        # Skip navigation/placeholder entries.
        if name in ['Members', 'Senate Members', 'House Members', 'Vacancy']:
            continue
        self.scrape_legislator(chamber, term, name, leg_url)
def scrape(self, chamber, term):
    """Scrape WV legislators for the latest term from the chamber roster."""
    self.validate_term(term, latest_only=True)

    chamber_abbrev = 'Senate1' if chamber == 'upper' else 'House'
    url = 'http://www.legis.state.wv.us/%s/roster.cfm' % chamber_abbrev

    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    # Roster entries that are placeholders, not real legislators.
    non_members = [
        'Members', 'Senate Members', 'House Members', 'Vacancy',
        'VACANT', 'Vacant', 'To Be Announced', 'To Be Appointed'
    ]
    for anchor in doc.xpath("//a[contains(@href, '?member=')]"):
        if not anchor.text:
            continue
        name = anchor.xpath("string()").strip()
        leg_url = urlescape(anchor.attrib['href'])
        if name in non_members:
            continue
        self.scrape_legislator(chamber, term, name, leg_url)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape one committee page, its members, and any subcommittees."""
    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    with self.urlopen(url) as html:
        page = lxml.html.fromstring(html)

        # Member rows; the first table row is the header.
        for row in page.xpath('//table[@class="gridtable"]/'
                              'tr[position()>1]'):
            # First cell is the membership type; blank means plain member.
            mtype = row.xpath('string(td[1])') or 'member'
            # Third cell is "<Title> <Name>"; drop the title word.
            member = ' '.join(row.xpath('string(td[3])').split()[1:])
            comm.add_member(member, mtype)

        # Subcommittee links appear as a list on the page.
        for anchor in page.xpath('//ul/li/a'):
            self.scrape_committee(chamber, name,
                                  urlescape(anchor.attrib['href']),
                                  subcommittee=anchor.text.strip())

    self.save_committee(comm)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape an AR committee page (and, recursively, its
    subcommittees) into a Committee and save it if non-empty."""
    name = self._fix_committee_name(name)
    name = self._fix_committee_case(name)

    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # Get the subcommittee name.
    xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'

    if subcommittee:
        subcommittee = page.xpath(xpath)
        if subcommittee:
            subcommittee = page.xpath(xpath).pop(0)
            subcommittee = self._fix_committee_name(
                subcommittee, parent=name, subcommittee=True)
            subcommittee = self._fix_committee_case(subcommittee)
        else:
            subcommittee = None

    # Dedupe.
    if (chamber, name, subcommittee) in self._seen:
        return
    self._seen.add((chamber, name, subcommittee))

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    # Member rows; the first table row is the header.
    for tr in page.xpath('//table[@class="dxgvTable"]/tr[position()>1]'):
        # First cell is the membership type; blank means plain member.
        if tr.xpath('string(td[1])').strip():
            mtype = tr.xpath('string(td[1])').strip()
        else:
            mtype = 'member'

        # Third cell holds "<Title> <Name>"; split the title off.
        member = tr.xpath('string(td[3])').split()
        title = member[0]
        member = ' '.join(member[1:])

        if title == 'Senator':
            mchamber = 'upper'
        elif title == 'Representative':
            mchamber = 'lower'
        else:
            # skip non-legislative members
            continue

        comm.add_member(member, mtype, chamber=mchamber)

    # Recurse into subcommittee links.
    for a in page.xpath('//ul/li/a'):
        sub_name = a.text.strip()
        sub_url = urlescape(a.attrib['href'])
        self.scrape_committee(chamber, name, sub_url,
                              subcommittee=sub_name)

    if not comm['members']:
        self.warning('not saving empty committee %s' % name)
    else:
        self.save_committee(comm)
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one OK bill page: title, sponsors, actions, versions and
    vote pages, then save the bill."""
    page = lxml.html.fromstring(self.urlopen(url))

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    # Infer the bill type from the bill id suffix.
    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self.subject_map[bill_id]

    # Sponsor links: 'otherAuth' ids mark coauthors.
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsor('coauthor', name)
        else:
            bill.add_sponsor('author', name)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    # Skip the two header rows.
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        bill.add_action(actor, action, date, type=action_type(action))

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib['href']
        # Committee-report links live in the same table but are not versions.
        if 'COMMITTEE REPORTS' in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib['href']))

    self.save_bill(bill)
def scrape(self, chamber, term):
    """Scrape Arkansas committees; only the 2011-2012 term is supported."""
    if term != '2011-2012':
        raise NoDataForPeriod

    base_url = ("http://www.arkleg.state.ar.us/assembly/2011/2011R/"
                "Pages/Committees.aspx?committeetype=")

    for comm_chamber, url_ext in COMM_TYPES.iteritems():
        with self.urlopen(urlescape(base_url + url_ext)) as html:
            doc = lxml.html.fromstring(html)
            for link in doc.xpath('//td[@class="dxtl dxtl__B0"]/a'):
                committee_name = link.text.strip()
                committee_url = urlescape(link.attrib['href'])
                if comm_chamber == 'task_force':
                    comm_chamber = 'joint'
                self.scrape_committee(comm_chamber, committee_name,
                                      committee_url)
def scrape(self, chamber, term):
    """Scrape the AR legislator search results and visit each member page."""
    url = ('http://www.arkleg.state.ar.us/assembly/2011/2011R/Pages/'
           'LegislatorSearchResults.aspx?member=&committee=All&chamber=')
    with self.urlopen(url) as page:
        root = lxml.html.fromstring(page)
        # One row per legislator; the member link is in the first cell.
        row_link_xpath = ('//table[@class="dxgvTable"]'
                          '/tr[contains(@class, "dxgvDataRow")]'
                          '/td[1]/a')
        for anchor in root.xpath(row_link_xpath):
            self.scrape_member(chamber, term,
                               urlescape(anchor.attrib['href']))
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape a committee page (and its subcommittees) into a Committee,
    deduping on (chamber, name, subcommittee), and save it if non-empty."""
    name = self._fix_committee_name(name)
    name = self._fix_committee_case(name)

    page = self.urlopen(url)
    page = lxml.html.fromstring(page)

    # Get the subcommittee name.
    xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'
    if subcommittee:
        subcommittee = page.xpath(xpath).pop(0)
        subcommittee = self._fix_committee_name(subcommittee, parent=name,
                                                subcommittee=True)
        subcommittee = self._fix_committee_case(subcommittee)

    # Dedupe.
    if (chamber, name, subcommittee) in self._seen:
        return
    self._seen.add((chamber, name, subcommittee))

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    # Member rows; the first table row is the header.
    for tr in page.xpath('//table[@class="gridtable"]/tr[position()>1]'):
        # First cell is the membership type; blank means plain member.
        if tr.xpath("string(td[1])"):
            mtype = tr.xpath("string(td[1])")
        else:
            mtype = "member"

        # Third cell holds "<Title> <Name>"; split the title off.
        member = tr.xpath("string(td[3])").split()
        title = member[0]
        member = " ".join(member[1:])

        if title == "Senator":
            mchamber = "upper"
        elif title == "Representative":
            mchamber = "lower"
        else:
            # skip non-legislative members
            continue

        comm.add_member(member, mtype, chamber=mchamber)

    # Recurse into subcommittee links.
    for a in page.xpath("//ul/li/a"):
        sub_name = a.text.strip()
        sub_url = urlescape(a.attrib["href"])
        self.scrape_committee(chamber, name, sub_url,
                              subcommittee=sub_name)

    if not comm["members"]:
        self.warning("not saving empty committee %s" % name)
    else:
        self.save_committee(comm)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape a committee page and its subcommittees, saving non-empty
    committees and deduping on (name, subcommittee)."""
    if subcommittee:
        # Strip a leading "HOUSE -"/"SENATE -" chamber prefix from the
        # subcommittee name.
        split_sub = subcommittee.split('-')
        if len(split_sub) > 1:
            subcommittee = '-'.join(split_sub[1:])
        subcommittee = re.sub(r'^(HOUSE|SENATE)\s+', '',
                              subcommittee.strip())

    # Skip committees already scraped.
    if (name, subcommittee) in self._seen:
        return
    self._seen.add((name, subcommittee))

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        # Member rows; the first table row is the header.
        for tr in page.xpath('//table[@class="gridtable"]/'
                             'tr[position()>1]'):
            # First cell is the membership type; blank means plain member.
            if tr.xpath('string(td[1])'):
                mtype = tr.xpath('string(td[1])')
            else:
                mtype = 'member'

            # Third cell holds "<Title> <Name>"; split the title off.
            member = tr.xpath('string(td[3])').split()
            title = member[0]
            member = ' '.join(member[1:])

            if title == 'Senator':
                mchamber = 'upper'
            elif title == 'Representative':
                mchamber = 'lower'
            else:
                # skip non-legislative members
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        # Recurse into subcommittee links.
        for a in page.xpath('//ul/li/a'):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib['href'])
            self.scrape_committee(chamber, name, sub_url,
                                  subcommittee=sub_name)

    if not comm['members']:
        self.warning('not saving empty committee %s' % name)
    else:
        self.save_committee(comm)
def scrape(self, chamber, term):
    """Scrape AR legislators; only the 2011-2012 term is supported."""
    if term != "2011-2012":
        raise NoDataForPeriod

    url = (
        "http://www.arkleg.state.ar.us/assembly/2011/2011R/Pages/"
        "LegislatorSearchResults.aspx?member=&committee=All&chamber="
    )
    with self.urlopen(url) as page:
        root = lxml.html.fromstring(page)
        # One data row per legislator; the member link is in column 1.
        row_links = root.xpath('//table[@class="dxgvTable"]'
                               '/tr[contains(@class, "dxgvDataRow")]'
                               "/td[1]/a")
        for link in row_links:
            member_url = urlescape(link.attrib["href"])
            self.scrape_member(chamber, term, member_url)
def scrape(self, chamber, term):
    """Scrape AR legislators, deriving the search URL from the term."""
    # Locate this term's metadata entry to get its start year.
    for termdict in self.metadata['terms']:
        if termdict['name'] == term:
            break
    start_year = termdict['start_year']

    url = ('http://www.arkleg.state.ar.us/assembly/%s/%sR/Pages/'
           'LegislatorSearchResults.aspx?member=&committee=All&chamber='
           ) % (start_year, start_year)

    root = lxml.html.fromstring(self.urlopen(url))
    for anchor in root.xpath('//table[@class="dxgvTable"]'
                             '/tr[contains(@class, "dxgvDataRow")]'
                             '/td[1]/a'):
        self.scrape_member(chamber, term, urlescape(anchor.attrib['href']))
def scrape(self, chamber, term):
    """Scrape AR legislators for the given term via the search results page."""
    # Locate this term's metadata entry to get its start year.
    for termdict in self.metadata['terms']:
        if termdict['name'] == term:
            break
    start_year = termdict['start_year']

    url = ('http://www.arkleg.state.ar.us/assembly/%s/%sR/Pages/'
           'LegislatorSearchResults.aspx?member=&committee=All&chamber='
           ) % (start_year, start_year)

    root = lxml.html.fromstring(self.get(url).text)
    for anchor in root.xpath('//table[@class="dxgvTable"]'
                             '/tr[contains(@class, "dxgvDataRow")]'
                             '/td[1]/a'):
        self.scrape_member(chamber, term, urlescape(anchor.attrib['href']))
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape a committee page and its subcommittees into Committee objects.

    Fix: rows whose title is neither "Senator" nor "Representative"
    previously fell through with ``mchamber`` unbound (NameError on the
    first such row) or stale from a prior row; they are now skipped
    explicitly, matching the sibling scrape_committee variants.
    """
    if subcommittee:
        # Strip a leading "HOUSE -"/"SENATE -" chamber prefix from the
        # subcommittee name.
        split_sub = subcommittee.split("-")
        if len(split_sub) > 1:
            subcommittee = "-".join(split_sub[1:])
        subcommittee = re.sub(r"^(HOUSE|SENATE)\s+", "",
                              subcommittee.strip())

    # Skip committees already scraped.
    if (name, subcommittee) in self._seen:
        return
    self._seen.add((name, subcommittee))

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)

        # Member rows; the first table row is the header.
        for tr in page.xpath('//table[@class="gridtable"]/'
                             "tr[position()>1]"):
            # First cell is the membership type; blank means plain member.
            if tr.xpath("string(td[1])"):
                mtype = tr.xpath("string(td[1])")
            else:
                mtype = "member"

            # Third cell holds "<Title> <Name>"; split the title off.
            member = tr.xpath("string(td[3])").split()
            title = member[0]
            member = " ".join(member[1:])

            if title == "Senator":
                mchamber = "upper"
            elif title == "Representative":
                mchamber = "lower"
            else:
                # Skip non-legislative members; previously this branch
                # left mchamber undefined or carried over from the
                # previous row.
                continue

            comm.add_member(member, mtype, chamber=mchamber)

        # Recurse into subcommittee links.
        for a in page.xpath("//ul/li/a"):
            sub_name = a.text.strip()
            sub_url = urlescape(a.attrib["href"])
            self.scrape_committee(chamber, name, sub_url,
                                  subcommittee=sub_name)

    self.save_committee(comm)
def scrape(self, chamber, term):
    """Scrape WV legislators from the chamber's district-map page."""
    self.validate_term(term, latest_only=True)

    if chamber == "upper":
        chamber_abbrev = "sen"
        title_abbrev = "sen"
    else:
        chamber_abbrev = "hse"
        title_abbrev = "del"

    url = ("http://www.legis.state.wv.us/districts/maps/%s_dist.cfm"
           % (chamber_abbrev))
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)

    # Placeholder link texts that do not correspond to legislators.
    placeholders = ["Members", "Senate Members", "House Members",
                    "Vacancy", "VACANT", "Vacant"]
    view_url = "%smemview" % title_abbrev
    for anchor in doc.xpath("//a[contains(@href, '%s')]" % view_url):
        name = anchor.xpath("string()").strip()
        leg_url = urlescape(anchor.attrib["href"])
        if name in placeholders:
            continue
        self.scrape_legislator(chamber, term, name, leg_url)
def scrape(self, chamber, term):
    """Scrape WV legislators for the latest term from the chamber roster."""
    self.validate_term(term, latest_only=True)

    if chamber == 'upper':
        chamber_abbrev = 'Senate1'
    else:
        chamber_abbrev = 'House'
    url = 'http://www.legis.state.wv.us/%s/roster.cfm' % chamber_abbrev

    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    # Roster entries that are placeholders, not actual legislators.
    placeholders = ['Members', 'Senate Members', 'House Members',
                    'Vacancy', 'VACANT', 'Vacant', "To Be Announced"]
    for anchor in doc.xpath("//a[contains(@href, '?member=')]"):
        if not anchor.text:
            continue
        name = anchor.xpath("string()").strip()
        leg_url = urlescape(anchor.attrib['href'])
        if name in placeholders:
            continue
        self.scrape_legislator(chamber, term, name, leg_url)
def scrape_member(self, chamber, term, member_url):
    """Scrape one TX member page into a Legislator (or a Person for the
    Lt. Governor), passing raw address strings through **addrs, and save."""
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']

        td = table.xpath('//td[@valign="top"]')[0]

        # Member "type" label, e.g. 'Lt. Gov.' for the Lieutenant Governor.
        type = td.xpath('string(//div[1]/strong)').strip()

        full_name = td.xpath('string(//div[2]/strong)').strip()
        full_name = re.sub(r'\s+', ' ', full_name)

        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')

        # Collect capitol/district addresses by walking the <br>-separated
        # text following each labeled span.
        addrs = {}
        for atype, text in (('capital_address', 'Capitol address:'),
                            ('district_address', 'District address:')):
            aspan = root.xpath("//span[. = '%s']" % text)
            addrs[atype] = None

            if aspan:
                addrs[atype] = aspan[0].tail
                elem = aspan[0].getnext()
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        addrs[atype] += "\n" + elem.tail
                    elem = elem.getnext()

        # Party is abbreviated to its first letter on the page.
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'

        if type == 'Lt. Gov.':
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party, **addrs)
        else:
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url, **addrs)

        leg.add_source(urlescape(member_url))

        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]

        for link in comm_div.xpath('*/a'):
            name = link.text

            # Chair/vice-chair status is embedded in the link text.
            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if (name == "Oversight of HHS Eligibility System" and
                    term == '82'):
                comm_chamber = 'joint'
            else:
                comm_chamber = chamber

            # Appropriations subcommittees are encoded in the name.
            if name.startswith('Appropriations-S/C on '):
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term,
                             chamber=comm_chamber,
                             committee=name,
                             position=mtype)

        if type == 'Lt. Gov.':
            self.save_person(leg)
        else:
            if district:
                self.save_legislator(leg)
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one OK bill page (title, sponsors, actions, versions,
    votes) and save it unless it is a bogus placeholder.

    Fixes:
    - The bogus-sponsor check used assignment (``=``) instead of
      comparison (``==``), which overwrote the first sponsor's name
      with "Author Not Found." and made the flag always truthy.
    - A bill with no actions/versions but a real title and sponsor was
      previously neither skipped nor saved; it is now saved, matching
      the "Otherwise, save the bills" intent.
    """
    try:
        page = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    # Infer the bill type from the bill id suffix.
    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self.subject_map[bill_id]

    # Sponsor links: 'otherAuth' ids mark cosponsors.
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsor('cosponsor', name)
        else:
            bill.add_sponsor('primary', name)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    # Skip the two header rows.
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        bill.add_action(actor, action, date, type=action_type(action))

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib['href']
        # Committee-report links live in the same table but are not versions.
        if 'COMMITTEE REPORTS' in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url, mimetype='application/msword')

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib['href']))

    # If the bill has no actions and no versions, it's a bogus bill on
    # their website, which appears to happen occasionally. Skip.
    has_no_actions = not bill['actions']
    has_no_versions = not bill['versions']
    has_no_title = (bill['title'] == "Short Title Not Found.")
    # Comparison, not assignment: previously '=' clobbered the name.
    first_sponsor_is_bogus = (
        bill['sponsors'][0]['name'] == "Author Not Found.")
    has_no_sponsors = (len(bill['sponsors']) == 1) and first_sponsor_is_bogus

    if has_no_actions and has_no_versions:
        if has_no_title or has_no_sponsors:
            msg = '%r appears to be bogus. Skipping it.' % bill_id
            self.logger.warning(msg)
            return

    # Otherwise, save the bills.
    self.save_bill(bill)
def scrape_member(self, chamber, term, member_url):
    """Scrape one TX member detail page into a Legislator (or a Person
    for the Lt. Governor), including offices and committee roles."""
    page = self.get(member_url).text
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    sdiv = root.xpath('//div[@class="subtitle"]')[0]
    table = sdiv.getnext()

    photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                            '_imgMember"]')[0].attrib['src']

    td = table.xpath('//td[@valign="top"]')[0]

    # Member "type" label, e.g. 'Lt. Gov.' for the Lieutenant Governor.
    type = td.xpath('string(//div[1]/strong)').strip()

    # The name is the last non-empty <strong> text in the cell's divs.
    full_name = td.xpath('//div/strong/text()')
    full_name = [re.sub(r'\s+', ' ', x).strip() for x in full_name]
    if full_name == []:
        self.warning("ERROR: CAN'T GET FULL NAME")
        return
    full_name = full_name[-1]

    district = td.xpath('string(//div[3])').strip()
    district = district.replace('District ', '')

    # Party is abbreviated to its first letter on the page.
    party = td.xpath('string(//div[4])').strip()[0]
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'

    if type == 'Lt. Gov.':
        leg = Person(full_name)
        leg.add_role('Lt. Governor', term, party=party)
    else:
        leg = Legislator(term, chamber, district, full_name,
                         party=party, photo_url=photo_url,
                         url=member_url)

    leg.add_source(urlescape(member_url))

    # add addresses
    for atype, text in (('capitol', 'Capitol address'),
                        ('district', 'District address')):
        aspan = root.xpath("//span[. = '%s:']" % text)
        addr = ''
        phone = None
        if aspan:
            # cycle through brs
            addr = aspan[0].tail.strip()
            elem = aspan[0].getnext()
            # Address lines follow as <br> tails; a tail matching the
            # phone regex is the office phone number instead.
            while elem is not None and elem.tag == 'br':
                if elem.tail:
                    if not phone_re.match(elem.tail):
                        addr += "\n" + elem.tail
                    else:
                        phone = elem.tail
                elem = elem.getnext()
            # now add the addresses
            leg.add_office(atype, text, address=addr, phone=phone)

    # add committees
    comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                          '/following-sibling::div'
                          '[@class="rcwcontent"]')[0]

    for link in comm_div.xpath('*/a'):
        name = link.text
        # Chair/vice-chair status is embedded in the link text.
        if '(Vice Chair)' in name:
            mtype = 'vice chair'
        elif '(Chair)' in name:
            mtype = 'chair'
        else:
            mtype = 'member'

        name = clean_committee_name(link.text)

        # There's no easy way to determine whether a committee
        # is joint or not using the mobile legislator directory
        # (without grabbing a whole bunch of pages, at least)
        # so for now we will hard-code the one broken case
        if (name == "Oversight of HHS Eligibility System" and
                term == '82'):
            comm_chamber = 'joint'
        else:
            comm_chamber = chamber

        # Appropriations subcommittees are encoded in the name.
        if name.startswith('Appropriations-S/C on '):
            sub = name.replace('Appropriations-S/C on ', '')
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee='Appropriations',
                         subcommittee=sub,
                         position=mtype)
        else:
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee=name,
                         position=mtype)

    if type == 'Lt. Gov.':
        self.save_object(leg)
    else:
        if district:
            self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape one TX member page into a Legislator (or a Person for the
    Lt. Governor), including offices and committee roles."""
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)

        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()

        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib["src"]

        td = table.xpath('//td[@valign="top"]')[0]

        # Member "type" label, e.g. "Lt. Gov." for the Lieutenant Governor.
        type = td.xpath("string(//div[1]/strong)").strip()

        full_name = td.xpath("string(//div[2]/strong)").strip()
        full_name = re.sub(r"\s+", " ", full_name)

        district = td.xpath("string(//div[3])").strip()
        district = district.replace("District ", "")

        # Party is abbreviated to its first letter on the page.
        party = td.xpath("string(//div[4])").strip()[0]
        if party == "D":
            party = "Democratic"
        elif party == "R":
            party = "Republican"

        if type == "Lt. Gov.":
            leg = Person(full_name)
            leg.add_role("Lt. Governor", term, party=party)
        else:
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url,
                             url=member_url)

        leg.add_source(urlescape(member_url))

        # add addresses
        for atype, text in (("capitol", "Capitol address"),
                            ("district", "District address")):
            aspan = root.xpath("//span[. = '%s:']" % text)
            addr = ""
            phone = None
            if aspan:
                # cycle through brs
                addr = aspan[0].tail.strip()
                elem = aspan[0].getnext()
                # Address lines follow as <br> tails; a tail matching
                # the phone regex is the office phone number instead.
                while elem is not None and elem.tag == "br":
                    if elem.tail:
                        if not phone_re.match(elem.tail):
                            addr += "\n" + elem.tail
                        else:
                            phone = elem.tail
                    elem = elem.getnext()
                # now add the addresses
                leg.add_office(atype, text, address=addr, phone=phone)

        # add committees
        comm_div = root.xpath(
            '//div[string() = "Committee Membership:"]'
            "/following-sibling::div"
            '[@class="rcwcontent"]'
        )[0]

        for link in comm_div.xpath("*/a"):
            name = link.text
            # Chair/vice-chair status is embedded in the link text.
            if "(Vice Chair)" in name:
                mtype = "vice chair"
            elif "(Chair)" in name:
                mtype = "chair"
            else:
                mtype = "member"

            name = clean_committee_name(link.text)

            # There's no easy way to determine whether a committee
            # is joint or not using the mobile legislator directory
            # (without grabbing a whole bunch of pages, at least)
            # so for now we will hard-code the one broken case
            if name == "Oversight of HHS Eligibility System" and term == "82":
                comm_chamber = "joint"
            else:
                comm_chamber = chamber

            # Appropriations subcommittees are encoded in the name.
            if name.startswith("Appropriations-S/C on "):
                sub = name.replace("Appropriations-S/C on ", "")
                leg.add_role(
                    "committee member", term,
                    chamber=comm_chamber,
                    committee="Appropriations",
                    subcommittee=sub,
                    position=mtype,
                )
            else:
                leg.add_role("committee member", term,
                             chamber=comm_chamber, committee=name,
                             position=mtype)

        if type == "Lt. Gov.":
            self.save_object(leg)
        else:
            if district:
                self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Scrape one TX member page into a Legislator (or a Person for the
    Lt. Governor), including offices and committee roles."""
    page = self.urlopen(member_url)
    root = lxml.html.fromstring(page)
    root.make_links_absolute(member_url)

    sdiv = root.xpath('//div[@class="subtitle"]')[0]
    table = sdiv.getnext()

    photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                            '_imgMember"]')[0].attrib['src']

    td = table.xpath('//td[@valign="top"]')[0]

    # Member "type" label, e.g. 'Lt. Gov.' for the Lieutenant Governor.
    type = td.xpath('string(//div[1]/strong)').strip()

    full_name = td.xpath('string(//div[2]/strong)').strip()
    full_name = re.sub(r'\s+', ' ', full_name)

    district = td.xpath('string(//div[3])').strip()
    district = district.replace('District ', '')

    # Party is abbreviated to its first letter on the page.
    party = td.xpath('string(//div[4])').strip()[0]
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'

    if type == 'Lt. Gov.':
        leg = Person(full_name)
        leg.add_role('Lt. Governor', term, party=party)
    else:
        leg = Legislator(term, chamber, district, full_name,
                         party=party, photo_url=photo_url,
                         url=member_url)

    leg.add_source(urlescape(member_url))

    # add addresses
    for atype, text in (('capitol', 'Capitol address'),
                        ('district', 'District address')):
        aspan = root.xpath("//span[. = '%s:']" % text)
        addr = ''
        phone = None
        if aspan:
            # cycle through brs
            addr = aspan[0].tail.strip()
            elem = aspan[0].getnext()
            # Address lines follow as <br> tails; a tail matching the
            # phone regex is the office phone number instead.
            while elem is not None and elem.tag == 'br':
                if elem.tail:
                    if not phone_re.match(elem.tail):
                        addr += "\n" + elem.tail
                    else:
                        phone = elem.tail
                elem = elem.getnext()
            # now add the addresses
            leg.add_office(atype, text, address=addr, phone=phone)

    # add committees
    comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                          '/following-sibling::div'
                          '[@class="rcwcontent"]')[0]

    for link in comm_div.xpath('*/a'):
        name = link.text
        # Chair/vice-chair status is embedded in the link text.
        if '(Vice Chair)' in name:
            mtype = 'vice chair'
        elif '(Chair)' in name:
            mtype = 'chair'
        else:
            mtype = 'member'

        name = clean_committee_name(link.text)

        # There's no easy way to determine whether a committee
        # is joint or not using the mobile legislator directory
        # (without grabbing a whole bunch of pages, at least)
        # so for now we will hard-code the one broken case
        if (name == "Oversight of HHS Eligibility System" and
                term == '82'):
            comm_chamber = 'joint'
        else:
            comm_chamber = chamber

        # Appropriations subcommittees are encoded in the name.
        if name.startswith('Appropriations-S/C on '):
            sub = name.replace('Appropriations-S/C on ', '')
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee='Appropriations',
                         subcommittee=sub,
                         position=mtype)
        else:
            leg.add_role('committee member', term,
                         chamber=comm_chamber,
                         committee=name,
                         position=mtype)

    if type == 'Lt. Gov.':
        self.save_object(leg)
    else:
        if district:
            self.save_legislator(leg)
def scrape_member(self, chamber, term, member_url):
    """Older variant of the member scraper: collects capitol/district
    addresses into an ``addrs`` dict passed as keyword args to the
    Legislator/Person constructors instead of using add_office.

    :param chamber: legislative chamber passed through to Legislator
    :param term: term identifier string
    :param member_url: absolute URL of the member's detail page
    """
    with self.urlopen(member_url) as page:
        root = lxml.html.fromstring(page)
        root.make_links_absolute(member_url)
        # The member info table immediately follows the "subtitle" div.
        sdiv = root.xpath('//div[@class="subtitle"]')[0]
        table = sdiv.getnext()
        photo_url = table.xpath('//img[@id="ctl00_ContentPlaceHolder1'
                                '_imgMember"]')[0].attrib['src']
        td = table.xpath('//td[@valign="top"]')[0]
        # NOTE: 'type' shadows the builtin; holds the member's title.
        type = td.xpath('string(//div[1]/strong)').strip()
        full_name = td.xpath('string(//div[2]/strong)').strip()
        full_name = re.sub(r'\s+', ' ', full_name)
        district = td.xpath('string(//div[3])').strip()
        district = district.replace('District ', '')
        addrs = {}
        for atype, text in (('capital_address', 'Capitol address:'),
                            ('district_address', 'District address:')):
            aspan = root.xpath("//span[. = '%s']" % text)
            addrs[atype] = None
            if aspan:
                # Address lines are the .tail text of the label span
                # and of the <br> elements that follow it.
                addrs[atype] = aspan[0].tail
                elem = aspan[0].getnext()
                while elem is not None and elem.tag == 'br':
                    if elem.tail:
                        addrs[atype] += "\n" + elem.tail
                    elem = elem.getnext()
        # Party is abbreviated to a single leading letter on the page.
        party = td.xpath('string(//div[4])').strip()[0]
        if party == 'D':
            party = 'Democratic'
        elif party == 'R':
            party = 'Republican'
        if type == 'Lt. Gov.':
            # The Lt. Governor is not a Legislator; model as a Person.
            leg = Person(full_name)
            leg.add_role('Lt. Governor', term, party=party, **addrs)
        else:
            leg = Legislator(term, chamber, district, full_name,
                             party=party, photo_url=photo_url, **addrs)
        leg.add_source(urlescape(member_url))
        comm_div = root.xpath('//div[string() = "Committee Membership:"]'
                              '/following-sibling::div'
                              '[@class="rcwcontent"]')[0]
        for link in comm_div.xpath('*/a'):
            name = link.text
            # Leadership suffix in the link text determines the role.
            if '(Vice Chair)' in name:
                mtype = 'vice chair'
            elif '(Chair)' in name:
                mtype = 'chair'
            else:
                mtype = 'member'
            name = clean_committee_name(link.text)
            if name.startswith('Appropriations-S/C on '):
                # Appropriations subcommittees are recorded under the
                # parent committee with a subcommittee name.
                sub = name.replace('Appropriations-S/C on ', '')
                leg.add_role('committee member', term,
                             chamber=chamber,
                             committee='Appropriations',
                             subcommittee=sub,
                             position=mtype)
            else:
                leg.add_role('committee member', term,
                             chamber=chamber,
                             committee=name,
                             position=mtype)
        if type == 'Lt. Gov.':
            self.save_person(leg)
        else:
            # Skip members with no district listed.
            if district:
                self.save_legislator(leg)
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a single bill page: title, type, sponsors, actions,
    PDF versions, and vote links; save the bill unless the page is a
    known-empty placeholder.

    :param chamber: originating chamber for the Bill
    :param session: session identifier for the Bill
    :param bill_id: e.g. 'HB 1001' — substring tests below infer type
    :param url: bill detail page URL
    """
    try:
        page = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        # Some bill pages 404/500; log and move on rather than abort.
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return
    title = page.xpath("string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
    # Infer bill type from the id; order matters since 'R' matches
    # 'JR' and 'CR' too.
    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill["subjects"] = self.subject_map[bill_id]
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        # A colon in a sponsor name indicates a scrape error; fail loudly.
        if ":" in name:
            raise Exception(name)
        # 'otherAuth' ids are co-authors; plain 'Auth' is the primary.
        if "otherAuth" in link.attrib["id"]:
            bill.add_sponsor("cosponsor", name)
        else:
            bill.add_sponsor("primary", name)
    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    # First two rows are headers.
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue
        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        actor = tr.xpath("string(td[4])").strip()
        if actor == "H":
            actor = "lower"
        elif actor == "S":
            actor = "upper"
        attrs = dict(actor=actor, action=action, date=date)
        # Merge in categorizer output (action types, committees, etc.).
        attrs.update(**self.categorizer.categorize(action))
        bill.add_action(**attrs)
    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib["href"]
        # Committee reports are not bill versions; skipped here.
        if "COMMITTEE REPORTS" in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url, mimetype="application/pdf")
    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        # 'HT_' links are skipped — presumably house transcript pages,
        # not vote records; confirm against the site.
        if "HT_" not in link.attrib["href"]:
            self.scrape_votes(bill, urlescape(link.attrib["href"]))
    # # If the bill has no actions and no versions, it's a bogus bill on
    # # their website, which appears to happen occasionally. Skip.
    has_no_title = bill["title"] == "Short Title Not Found."
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    """Variant of the bill scraper using ``self.get`` (requests-style);
    committee reports are saved as documents rather than skipped.

    :param chamber: originating chamber for the Bill
    :param session: session identifier for the Bill
    :param bill_id: e.g. 'HB 1001' — substring tests below infer type
    :param url: bill detail page URL
    """
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        # Some bill pages 404/500; log and move on rather than abort.
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return
    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
    # Infer bill type from the id; order matters since 'R' matches
    # 'JR' and 'CR' too.
    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self.subject_map[bill_id]
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        # A colon in a sponsor name indicates a scrape error; fail loudly.
        if ':' in name:
            raise Exception(name)
        # 'otherAuth' ids are co-authors; plain 'Auth' is the primary.
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsor('cosponsor', name)
        else:
            bill.add_sponsor('primary', name)
    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    # First two rows are headers.
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue
        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'
        attrs = dict(actor=actor, action=action, date=date)
        # Merge in categorizer output (action types, committees, etc.).
        attrs.update(**self.categorizer.categorize(action))
        bill.add_action(**attrs)
    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib['href']
        name = link.text.strip()
        # Committee reports are attached as documents, not versions.
        if 'COMMITTEE REPORTS' in version_url:
            bill.add_document(name, version_url,
                              mimetype='application/pdf')
            continue
        bill.add_version(name, version_url, mimetype='application/pdf')
    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        # 'HT_' links are skipped — presumably not vote records;
        # confirm against the site.
        if 'HT_' not in link.attrib['href']:
            self.scrape_votes(bill, urlescape(link.attrib['href']))
    # # If the bill has no actions and no versions, it's a bogus bill on
    # # their website, which appears to happen occasionally. Skip.
    has_no_title = (bill['title'] == "Short Title Not Found.")
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        self.save_bill(bill)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape one committee page, add its legislative members, recurse
    into linked subcommittees, and save the committee if non-empty.

    :param chamber: committee chamber
    :param name: committee name (normalized below)
    :param url: committee page URL
    :param subcommittee: subcommittee name if this call is for one;
        the actual name is re-read from the page when present
    """
    name = self._fix_committee_name(name)
    name = self._fix_committee_case(name)
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    # Get the subcommittee name.
    xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'
    if subcommittee:
        subcommittee = page.xpath(xpath)
        if subcommittee:
            subcommittee = page.xpath(xpath).pop(0)
            subcommittee = self._fix_committee_name(subcommittee,
                                                    parent=name,
                                                    subcommittee=True)
            subcommittee = self._fix_committee_case(subcommittee)
        else:
            subcommittee = None
    # Dedupe.
    if (chamber, name, subcommittee) in self._seen:
        return
    self._seen.add((chamber, name, subcommittee))
    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)
    member_nodes = page.xpath('//table[@class="dxgvTable"]/tr')
    for member_node in member_nodes:
        # Skip empty rows.
        if member_node.attrib['class'] == 'dxgvEmptyDataRow':
            continue
        # First cell holds the role (e.g. Chair); blank means member.
        mtype = member_node.xpath('string(td[1])').strip()
        if not mtype:
            mtype = 'member'
        # Third cell is '<Title> <Full Name>'.
        member = member_node.xpath('string(td[3])').split()
        title = member[0]
        member = ' '.join(member[1:])
        if title == 'Senator':
            mchamber = 'upper'
        elif title == 'Representative':
            mchamber = 'lower'
        else:
            # skip non-legislative members
            continue
        comm.add_member(member, mtype, chamber=mchamber)
    # Recurse into subcommittee links found in the page's nav table.
    for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                        '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'):
        sub_name = a.text.strip()
        sub_url = urlescape(a.attrib['href'])
        self.scrape_committee(chamber, name, sub_url,
                              subcommittee=sub_name)
    if not comm['members']:
        if subcommittee:
            self.warning(
                'Not saving empty subcommittee {}.'.format(subcommittee))
        else:
            self.warning('Not saving empty committee {}.'.format(name))
    else:
        self.save_committee(comm)
def scrape_committee(self, chamber, name, url, subcommittee=None):
    """Scrape a committee page, record its legislative members, recurse
    into any linked subcommittees, and save the committee when it has
    at least one member.
    """
    name = self._fix_committee_case(self._fix_committee_name(name))

    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    # Resolve the subcommittee's display name from the page itself.
    xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()'
    if subcommittee:
        headings = doc.xpath(xpath)
        if headings:
            subcommittee = self._fix_committee_name(
                headings[0], parent=name, subcommittee=True)
            subcommittee = self._fix_committee_case(subcommittee)
        else:
            subcommittee = None

    # Skip committees already processed this run.
    key = (chamber, name, subcommittee)
    if key in self._seen:
        return
    self._seen.add(key)

    comm = Committee(chamber, name, subcommittee=subcommittee)
    comm.add_source(url)

    chamber_by_title = {'Senator': 'upper', 'Representative': 'lower'}
    for row in doc.xpath('//table[@class="dxgvTable"]/tr'):
        # Placeholder rows carry no member data.
        if row.attrib['class'] == 'dxgvEmptyDataRow':
            continue
        # Blank role cell defaults to plain membership.
        role = row.xpath('string(td[1])').strip() or 'member'
        words = row.xpath('string(td[3])').split()
        honorific = words[0]
        member_name = ' '.join(words[1:])
        member_chamber = chamber_by_title.get(honorific)
        if member_chamber is None:
            # Non-legislative members are ignored.
            continue
        comm.add_member(member_name, role, chamber=member_chamber)

    # Follow links to subcommittee pages and scrape each one.
    sub_links_xpath = ('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_'
                       '354150ac7dbd_ctl00_tblContainer"]//ul/li/a')
    for anchor in doc.xpath(sub_links_xpath):
        self.scrape_committee(chamber, name,
                              urlescape(anchor.attrib['href']),
                              subcommittee=anchor.text.strip())

    if comm['members']:
        self.save_committee(comm)
    elif subcommittee:
        self.warning('Not saving empty subcommittee {}.'.format(
            subcommittee))
    else:
        self.warning('Not saving empty committee {}.'.format(name))
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape a single bill page (title, sponsors, actions, .DOC
    versions, votes) and save it unless it looks like a bogus
    placeholder entry on the site.

    :param chamber: originating chamber for the Bill
    :param session: session identifier for the Bill
    :param bill_id: e.g. 'HB 1001' — substring tests below infer type
    :param url: bill detail page URL
    """
    try:
        page = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        # Some bill pages 404/500; log and move on rather than abort.
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return
    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
    # Infer bill type from the id; order matters since 'R' matches
    # 'JR' and 'CR' too.
    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self.subject_map[bill_id]
    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        # 'otherAuth' ids are co-authors; plain 'Auth' is the primary.
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsor('cosponsor', name)
        else:
            bill.add_sponsor('primary', name)
    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    # First two rows are headers.
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue
        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'
        bill.add_action(actor, action, date,
                        type=action_type(action))
    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib['href']
        # Committee reports are not bill versions.
        if 'COMMITTEE REPORTS' in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url)
    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib['href']))
    # If the bill has no actions and no versions, it's a bogus bill on
    # their website, which appears to happen occasionally. Skip.
    has_no_actions = not bill['actions']
    has_no_versions = not bill['versions']
    has_no_title = (bill['title'] == "Short Title Not Found.")
    sponsors = bill['sponsors']
    # BUG FIX: the original used a chained assignment here —
    # `first_sponsor_is_bogus = bill['sponsors'][0]['name'] = "Author
    # Not Found."` — which overwrote the first sponsor's name and made
    # the flag an always-truthy string; it also raised IndexError for
    # sponsorless bills. Use a comparison and guard the empty list.
    first_sponsor_is_bogus = (
        bool(sponsors) and sponsors[0]['name'] == "Author Not Found.")
    has_no_sponsors = (len(sponsors) == 1) and first_sponsor_is_bogus
    if has_no_actions and has_no_versions:
        if has_no_title or has_no_sponsors:
            msg = '%r appears to be bogus. Skipping it.' % bill_id
            self.logger.warning(msg)
            return
        # NOTE(review): a bill with no actions/versions but a real
        # title and sponsors falls through unsaved here, matching the
        # original control flow — confirm this is intentional.
    else:
        # Otherwise, save the bills.
        self.save_bill(bill)