def test_vote_event_pupa_identifier_dedupe():
    """Verify importer dedupes on pupa_id: re-imports are noop/update until
    the pupa_id itself changes, which forces a fresh insert."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(id='org-id', name='Legislature',
                                classification='legislature',
                                jurisdiction=j)

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 identifier='Roll Call No. 1')
    vote_event.pupa_id = 'foo'

    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    bi = BillImporter('jid', dmi, oi)

    # first import creates the record
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    vote_event.identifier = 'First Roll Call'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new pupa identifier, insert
    vote_event.pupa_id = 'bar'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_vote_event_bill_actions_two_stage():
    """Two vote events referencing the same bill action, imported in two
    separate passes, must not collide on the action's OneToOneField."""
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      from_organization=org1._id)
    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    # disambiguate them
    ve1.pupa_id = 'one'
    ve2.pupa_id = 'two'

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
def parse_vote_page(self, vote_url, bill):
    """Parse one roll-call HTML page into a VoteEvent for *bill*.

    Chamber is inferred from the URL, totals from the "Yeas" row's CSS
    class, and individual voters from a small state machine over the
    page's <td> text nodes.
    """
    vote_html = self.get(vote_url).text
    doc = lxml.html.fromstring(vote_html)

    # chamber
    if "senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    # date in the following format: Mar 23, 2009
    date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
    date = date.replace(u"\xa0", " ")  # strip non-breaking spaces
    date = datetime.datetime.strptime(date[18:], "%b %d, %Y")

    # motion
    motion = "".join(x.text_content() for x in doc.xpath('//td[@colspan="23"]'))
    if motion == "":
        motion = "No motion given"  # XXX: Double check this. See SJ 3.
    motion = motion.replace(u"\xa0", " ")

    # totals — the five cells after "Yeas" hold yes/no and three
    # not-voting-style categories that are folded into "other"
    tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
    totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
    yes_count = int(totals[0].split()[-1])
    no_count = int(totals[1].split()[-1])
    other_count = int(totals[2].split()[-1])
    other_count += int(totals[3].split()[-1])
    other_count += int(totals[4].split()[-1])
    passed = yes_count > no_count

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )
    vote.pupa_id = vote_url  # contains sequence number
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
    func = None
    for td in doc.xpath("//td/text()"):
        td = td.replace(u"\xa0", " ")
        if td.startswith("Voting Yea"):
            func = vote.yes
        elif td.startswith("Voting Nay"):
            func = vote.no
        elif td.startswith("Not Voting"):
            func = vote.other
        elif td.startswith("Excused"):
            func = vote.other
        elif func:
            td = td.rstrip("*")
            func(td)

    return vote
def scrape_senate_vote(self, bill, url, date):
    """Download the Senate vote PDF at *url* and yield parsed VoteEvent(s).

    Three-column PDFs are delegated to ``scrape_senate_vote_3col``; the
    default layout is split on "Yea:/Nay:/Absent:" section headers.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now — corrected from the real tallies below.
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url  # URL is unique per roll call

    text = convert_pdf(filename, "text").decode("utf-8")
    os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        # three-column layout gets its own parser
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Reversed so repeated .pop() walks the sections front-to-back.
    data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
    # BUG FIX: in Python 3 filter() returns a lazy iterator, which is always
    # truthy and has no .pop() — materialize it as a list.
    data = list(filter(None, data))
    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), "other")
        values = data.pop()
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            # strip PDF artifacts: ellipses, trailing dots, page numbers
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # sanity check: per-section tally matches the mapped-key tally
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        "pass"
        if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
        else "fail"
    )
    yield vote
def add_vote(self, bill, chamber, date, text, url):
    """Create a VoteEvent for *bill* from an action line's *text*, then pull
    the per-member roll call from *url* (house 'av' / senate 'sv' pages)."""
    counts = re.findall(r'Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)', text)
    yes_total = int(counts[0][0])
    no_total = int(counts[0][1])

    # classify the motion text; fall back to 'other' when nothing matches
    vote_class = next(
        (cls for pattern, cls in motion_classifiers.items()
         if re.match(pattern, text)),
        'other',
    )

    event = VoteEvent(
        chamber=chamber,
        start_date=TIMEZONE.localize(date),
        motion_text=text,
        result='pass' if yes_total > no_total else 'fail',
        classification=vote_class,
        bill=bill,
    )
    # last URL path segment is the per-roll-call identifier
    event.pupa_id = url.split('/')[-1]
    event.set_count('yes', yes_total)
    event.set_count('no', no_total)

    # fetch the vote itself
    if url:
        event.add_source(url)
        if 'av' in url:
            self.add_house_votes(event, url)
        elif 'sv' in url:
            self.add_senate_votes(event, url)

    return event
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Assemble a VoteEvent from a parsed *vote_record* dict.

    *vote_record* maps 'yes'/'no'/'excused'/'absent'/'other' to voter-name
    lists and 'date' to a datetime.
    """
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace('XHB', 'HB')
    categories = ('yes', 'no', 'excused', 'absent', 'other')
    did_pass = len(vote_record['yes']) > len(vote_record['no'])

    vote_event = VoteEvent(
        result='pass' if did_pass else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower',
    )
    # URL is unique per roll call, so it serves as the dedupe key
    vote_event.pupa_id = url

    for category in categories:
        vote_event.set_count(category, len(vote_record[category]))
    for category in categories:
        for voter in vote_record[category]:
            vote_event.vote(category, voter)

    vote_event.add_source(url)
    return vote_event
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Turn a parsed *vote_record* dict into a VoteEvent for *bill_id*."""
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace('XHB', 'HB')
    tallies = {kind: vote_record[kind]
               for kind in ('yes', 'no', 'excused', 'absent', 'other')}

    vote_event = VoteEvent(
        result='pass' if len(tallies['yes']) > len(tallies['no']) else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower')
    # the source URL is unique per roll call; use it for deduping
    vote_event.pupa_id = url

    for kind, voters in tallies.items():
        vote_event.set_count(kind, len(voters))
    for kind, voters in tallies.items():
        for voter in voters:
            vote_event.vote(kind, voter)

    vote_event.add_source(url)
    return vote_event
def scrape_votes(self, bill, bill_page, chamber):
    """Follow every "view_votes" link on *bill_page* and yield VoteEvents."""
    vote_links = bill_page.xpath(
        '//div[contains(@class, "col-sm-8")]//a[contains(@href, "view_votes")]')
    for vote_link in vote_links:
        vote_url = vote_link.attrib['href']
        # the link's table row carries the date and motion cells
        date_td, motion_td, *_ = vote_link.xpath('ancestor::tr/td')
        date = datetime.strptime(date_td.text, '%b %d, %Y')
        motion_text = motion_td.text_content()
        vote_page = self.lxmlize(vote_url)
        # result inferred from the motion wording
        passed = (
            'Passed' in motion_text or
            'Advanced' in motion_text
        )
        # alternating name/vote-type cells
        cells = vote_page.xpath('//table[contains(@class, "calendar-table")]//td')
        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=motion_text,
            classification='passage',
            result='pass' if passed else 'fail',
        )
        # the KeyID query param uniquely identifies a roll call
        query_params = urllib.parse.parse_qs(urllib.parse.urlparse(vote_url).query)
        vote.pupa_id = query_params['KeyID'][0]
        vote.add_source(vote_url)
        for chunk in range(0, len(cells), 2):
            name = cells[chunk].text
            vote_type = cells[chunk + 1].text
            if name and vote_type:
                vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), 'other'), name)
        yield vote
def add_vote(self, bill, chamber, date, text, url):
    """Build a VoteEvent for *bill* from the action *text*, then attach the
    member-level roll call scraped from *url* when one is linked."""
    tallies = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)", text)
    ayes = int(tallies[0][0])
    noes = int(tallies[0][1])

    # first matching classifier wins; default to "other"
    classification = "other"
    for pattern, label in motion_classifiers.items():
        if re.match(pattern, text):
            classification = label
            break

    event = VoteEvent(
        chamber=chamber,
        start_date=TIMEZONE.localize(date),
        motion_text=text,
        result="pass" if ayes > noes else "fail",
        classification=classification,
        bill=bill,
    )
    # final URL segment is the roll-call sequence identifier
    event.pupa_id = url.split("/")[-1]
    event.set_count("yes", ayes)
    event.set_count("no", noes)

    # fetch the vote itself
    if url:
        event.add_source(url)
        if "av" in url:
            self.add_house_votes(event, url)
        elif "sv" in url:
            self.add_senate_votes(event, url)

    return event
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Construct a VoteEvent from the parsed *vote_record* mapping."""
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace("XHB", "HB")
    kinds = ("yes", "no", "excused", "absent", "other")
    outcome = "pass" if len(vote_record["yes"]) > len(vote_record["no"]) else "fail"

    vote_event = VoteEvent(
        result=outcome,
        chamber=chamber,
        start_date=vote_record["date"].strftime("%Y-%m-%d"),
        motion_text=motion_text,
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber="upper" if bill_id[0] == "S" else "lower",
    )
    # the roll-call URL is stable and unique — use it as the dedupe key
    vote_event.pupa_id = url

    for kind in kinds:
        vote_event.set_count(kind, len(vote_record[kind]))
    for kind in kinds:
        for voter in vote_record[kind]:
            vote_event.vote(kind, voter)

    vote_event.add_source(url)
    return vote_event
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape one MN House roll-call page and yield a VoteEvent."""
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
    resp = self.get(vote_url)
    html = resp.text

    # sometimes the link is broken, will redirect to NO_VOTE_URL
    if resp.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)
    try:
        motion = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")[0]
    except IndexError:
        self.logger.warning("Bill was missing a motion number, skipping")
        return

    # second <h3> holds "<yeas> ... <nays> ..." tokens
    vote_count = doc.xpath(
        ".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
    yeas = int(vote_count[0])
    nays = int(vote_count[3])

    # second paragraph has date
    paragraphs = doc.xpath(".//div[@id='leg_PageContent']/div/p/text()")
    date = None
    for p in paragraphs:
        try:
            date = datetime.datetime.strptime(p.strip(), '%m/%d/%Y').date()
            break
        except ValueError:
            pass
    if date is None:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    vote = VoteEvent(chamber='lower',
                     start_date=date,
                     motion_text=motion,
                     result='pass' if yeas > nays else 'fail',
                     classification='passage',
                     legislative_session=session,
                     bill=bill_id,
                     bill_chamber=chamber)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url  # one page per roll call

    # first table has YEAs
    for name in doc.xpath('//table[1]/tr/td/font/text()'):
        vote.yes(name.strip())
    # second table is nays
    for name in doc.xpath('//table[2]/tr/td/font/text()'):
        vote.no(name.strip())
    yield vote
def parse_vote(self, bill, link):
    """Parse the roll-call page at *link* and yield a VoteEvent for *bill*.

    The first <h3> carries "<Chamber> ... <status> ... <date>"; subsequent
    <h3>s carry per-category tallies like "Yea - (38)". Voter names follow
    as <a> links ordered yes-voters first, then no, then everyone else.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if len(opinions) > 0:
        temp = opinions[0].split()
        vote_chamber = temp[0]
        vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
        vote_status = " ".join(temp[2:-2])
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        # NOTE(review): yes_count/no_count/p_count/a_count are only bound when
        # a matching "(N)" heading exists; a page missing a category would
        # raise NameError below — TODO confirm all pages carry all four.
        for i in opinions:
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # This is likely not a vote-count text chunk
                # It's probably '`On roll call the vote was:`
                pass
            else:
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count

        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime('%Y-%m-%d'),
            chamber=vote_chamber,
            motion_text=vote_status,
            result='pass' if yes_count > no_count else 'fail',
            classification='passage',
        )
        vote.pupa_id = link  # one page per roll call
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('abstain', p_count)
        vote.set_count('absent', a_count)
        vote.add_source(link)

        # names are positional: first yes_count entries are yeas, the next
        # no_count are nays, the remainder are other
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_senate_vote(self, bill, url, date):
    """Fetch the Senate vote PDF at *url* and yield parsed VoteEvent(s).

    Three-column PDFs are routed to ``scrape_senate_vote_3col``; otherwise
    the text is split on "Yea:/Nay:/Absent:" headers and names extracted.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = VoteEvent(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # setting 'fail' for now — replaced with the real outcome below.
        result='fail',
        classification='passage',
        bill=bill)
    vote.add_source(url)
    vote.pupa_id = url  # URL is unique per roll call

    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)

    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        # three-column layout handled by a dedicated parser
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Reversed so repeated .pop() walks the sections front-to-back.
    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    # BUG FIX: Python 3's filter() is a lazy iterator — always truthy and
    # without .pop(); it must be materialized as a list.
    data = list(filter(None, data))
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {'yes': 0, 'no': 0, 'other': 0}
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            # strip PDF artifacts: ellipses, trailing dots, page numbers
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # sanity check: section tally matches the mapped-key tally
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = 'pass' if vote_count['yes'] > (
        vote_count['no'] + vote_count['other']) else 'fail'
    yield vote
def parse_vote(self, bill, link):
    """Parse the roll-call page at *link* and yield a VoteEvent for *bill*.

    The first <h3> carries "<Chamber> ... <status> ... <date>"; subsequent
    <h3>s carry per-category tallies like "Yea - (38)". Voter names follow
    as <a> links ordered yes-voters first, then no, then everyone else.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if len(opinions) > 0:
        temp = opinions[0].split()
        vote_chamber = temp[0]
        vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
        vote_status = " ".join(temp[2:-2])
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

        # Default every tally to zero so a page missing a category cannot
        # leave a name unbound below.
        yes_count = no_count = p_count = a_count = 0
        for i in opinions:
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # Not a vote-count chunk (e.g. 'On roll call the vote was:').
                # BUG FIX: the old bare `except: pass` fell through and reused
                # the stale `count` from the previous iteration in the checks
                # below, mis-assigning tallies; skip the chunk instead.
                continue
            if "yea" in i.lower():
                yes_count = count
            elif "nay" in i.lower():
                no_count = count
            elif "present" in i.lower():
                p_count = count
            elif "absent" in i.lower():
                a_count = count

        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime('%Y-%m-%d'),
            chamber=vote_chamber,
            motion_text=vote_status,
            result='pass' if yes_count > no_count else 'fail',
            classification='passage',
        )
        vote.pupa_id = link  # one page per roll call
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('abstain', p_count)
        vote.set_count('absent', a_count)
        vote.add_source(link)

        # names are positional: first yes_count entries are yeas, the next
        # no_count are nays, the remainder are other
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape one MN House roll-call page at *vote_url*, yielding a VoteEvent."""
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
    resp = self.get(vote_url)
    html = resp.text

    # sometimes the link is broken, will redirect to NO_VOTE_URL
    if resp.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)
    try:
        motion = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")[0]
    except IndexError:
        self.logger.warning("Bill was missing a motion number, skipping")
        return

    # second <h3> holds the tallies: token 0 is yeas, token 3 is nays
    vote_count = doc.xpath(".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
    yeas = int(vote_count[0])
    nays = int(vote_count[3])

    # second paragraph has date
    paragraphs = doc.xpath(".//div[@id='leg_PageContent']/div/p/text()")
    date = None
    for p in paragraphs:
        try:
            date = datetime.datetime.strptime(p.strip(), '%m/%d/%Y').date()
            break
        except ValueError:
            pass
    if date is None:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    vote = VoteEvent(chamber='lower',
                     start_date=date,
                     motion_text=motion,
                     result='pass' if yeas > nays else 'fail',
                     classification='passage',
                     legislative_session=session,
                     bill=bill_id,
                     bill_chamber=chamber)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url  # one page per roll call

    # first table has YEAs
    for name in doc.xpath('//table[1]/tr/td/font/text()'):
        vote.yes(name.strip())
    # second table is nays
    for name in doc.xpath('//table[2]/tr/td/font/text()'):
        vote.no(name.strip())
    yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield VoteEvents for every vote date in *chamber* for *session*.

    POSTs each date to the chamber's votes.asp endpoint and converts the
    parsed vote dicts into VoteEvents, skipping votes whose bill is unknown.
    """
    url = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
    }[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)
    for date in dates:
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote['count']
                # chamber comes from the vote metadata, not the argument
                chamber = {
                    "H": "lower",
                    "S": "upper"
                }[vote['meta']['chamber']]
                try:
                    bill_id = self._bill_id_by_type[(chamber, vote['meta']['bill'])]
                except KeyError:
                    # BUG FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit; only the missing-key
                    # lookup should be tolerated here.
                    self.warning('no such bill_id %s %s',
                                 chamber, vote['meta']['bill'])
                    continue
                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote['time'].strftime('%Y-%m-%d'),
                    motion_text=vote['meta']['extra']['motion'],
                    result='pass' if count['passage'] else 'fail',
                    classification='passage',
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count('yes', int(count['YEAS']))
                v.set_count('no', int(count['NAYS']))
                v.set_count('other', int(count['NOT VOTING']))
                v.add_source(vote['source'])
                v.pupa_id = vote['source']  # source URL is unique per vote
                for vt in vote['votes']:
                    key = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(vt['vote'], 'other')
                    v.vote(key, vt['name'])
                yield v
def parse_committee_votes(self, bill, url):
    """Yield committee VoteEvents for *bill* from the committee page at *url*."""
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
    committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

        # Date — try both separators the site uses.
        # NOTE(review): if neither format matches, `date` stays a string and
        # tz.localize(date) below would fail — TODO confirm both formats
        # cover every row.
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            date = link.xpath('../../td')[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            break

        # Motion
        motion = link.text_content().split(' - ')[-1].strip()
        motion = 'Committee vote (%s): %s' % (committee, motion)

        # Roll call
        vote_url = link.attrib['href']
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification='other',
            result='pass' if rollcall['passed'] else 'fail',
            bill=bill,
        )
        vote.pupa_id = vote_url  # vote summary URL is unique per roll call
        vote.set_count('yes', rollcall['yes_count'])
        vote.set_count('no', rollcall['no_count'])
        vote.set_count('other', rollcall['other_count'])

        for voteval in ('yes', 'no', 'other'):
            for name in rollcall.get(voteval + '_votes', []):
                vote.vote(voteval, name)

        vote.add_source(url)
        vote.add_source(vote_url)
        yield vote
def scrape_votes(self, bill, bill_page, chamber):
    """Follow every "view_votes" link in the bill's history table and yield
    a VoteEvent per roll call."""
    vote_links = bill_page.xpath(
        '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
    )
    for vote_link in vote_links:
        vote_url = vote_link.attrib['href']
        # the link's table row carries the date and motion cells
        date_td, motion_td, *_ = vote_link.xpath('ancestor::tr/td')
        date = datetime.strptime(date_td.text, '%b %d, %Y')
        motion_text = motion_td.text_content()
        vote_page = self.lxmlize(vote_url)
        # result inferred from the motion wording
        passed = ('Passed' in motion_text or
                  'Advanced' in motion_text)
        # alternating name/vote-type cells
        cells = vote_page.xpath(
            '//div[contains(@class,"table-responsive")]/table//td')

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=motion_text,
            classification='passage',
            result='pass' if passed else 'fail',
        )

        # totals come from labelled summary fields, not the name cells
        yes_count = self.process_count(vote_page, 'Yes:')
        no_count = self.process_count(vote_page, 'No:')
        exc_count = self.process_count(vote_page, 'Excused - Not Voting:')
        absent_count = self.process_count(vote_page, 'Absent - Not Voting:')
        present_count = self.process_count(vote_page, 'Present - Not Voting:')

        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', exc_count)
        vote.set_count('absent', absent_count)
        vote.set_count('abstain', present_count)

        # the KeyID query param uniquely identifies a roll call
        query_params = urllib.parse.parse_qs(
            urllib.parse.urlparse(vote_url).query)
        vote.pupa_id = query_params['KeyID'][0]
        vote.add_source(vote_url)
        for chunk in range(0, len(cells), 2):
            name = cells[chunk].text
            vote_type = cells[chunk + 1].text
            if name and vote_type:
                vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), 'other'), name)
        yield vote
def parse_committee_votes(self, bill, url):
    """Yield committee VoteEvents for *bill* from the committee page at *url*."""
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    chamber = "upper" if "Senate" in doc.xpath("string(//h1)") else "lower"
    committee = tuple(doc.xpath("//h2")[0].itertext())[-2].strip()
    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

        # Date — try both separators the site uses.
        # NOTE(review): if neither format matches, `date` remains a string
        # and tz.localize(date) below would fail — TODO confirm coverage.
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            date = link.xpath("../../td")[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            break

        # Motion
        motion = link.text_content().split(" - ")[-1].strip()
        motion = "Committee vote (%s): %s" % (committee, motion)

        # Roll call
        vote_url = link.attrib["href"]
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification="other",
            result="pass" if rollcall["passed"] else "fail",
            bill=bill,
        )
        vote.pupa_id = vote_url  # vote summary URL is unique per roll call
        vote.set_count("yes", rollcall["yes_count"])
        vote.set_count("no", rollcall["no_count"])
        vote.set_count("other", rollcall["other_count"])

        for voteval in ("yes", "no", "other"):
            for name in rollcall.get(voteval + "_votes", []):
                vote.vote(voteval, name)

        vote.add_source(url)
        vote.add_source(vote_url)
        yield vote
def scrape_votes(self, bill, bill_page, chamber):
    """Follow every "view_votes" link in the bill's history table and yield
    a VoteEvent per roll call."""
    vote_links = bill_page.xpath(
        '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
    )
    for vote_link in vote_links:
        vote_url = vote_link.attrib["href"]
        # the link's table row carries the date and motion cells
        date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
        date = datetime.strptime(date_td.text, "%b %d, %Y")
        motion_text = motion_td.text_content()
        vote_page = self.lxmlize(vote_url)
        # result inferred from the motion wording
        passed = "Passed" in motion_text or "Advanced" in motion_text
        # alternating name/vote-type cells
        cells = vote_page.xpath(
            '//div[contains(@class,"table-responsive")]/table//td')

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=motion_text,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # totals come from labelled summary fields, not the name cells
        yes_count = self.process_count(vote_page, "Yes:")
        no_count = self.process_count(vote_page, "No:")
        exc_count = self.process_count(vote_page, "Excused - Not Voting:")
        absent_count = self.process_count(vote_page, "Absent - Not Voting:")
        present_count = self.process_count(vote_page, "Present - Not Voting:")

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", exc_count)
        vote.set_count("absent", absent_count)
        vote.set_count("abstain", present_count)

        # the KeyID query param uniquely identifies a roll call
        query_params = urllib.parse.parse_qs(
            urllib.parse.urlparse(vote_url).query)
        vote.pupa_id = query_params["KeyID"][0]
        vote.add_source(vote_url)
        for chunk in range(0, len(cells), 2):
            name = cells[chunk].text
            vote_type = cells[chunk + 1].text
            if name and vote_type:
                vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), "other"), name)
        yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield VoteEvents for every vote date in *chamber* for *session*."""
    url = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes"),
    }[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)
    for date in dates:
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote["count"]
                # chamber comes from the vote metadata, not the argument
                chamber = {
                    "H": "lower",
                    "S": "upper"
                }[vote["meta"]["chamber"]]
                try:
                    bill_id = self._bill_id_by_type[(chamber, vote["meta"]["bill"])]
                except KeyError:
                    # skip votes for bills we have not seen
                    self.warning("no such bill_id %s %s",
                                 chamber, vote["meta"]["bill"])
                    continue
                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote["time"].strftime("%Y-%m-%d"),
                    motion_text=vote["meta"]["extra"]["motion"],
                    result="pass" if count["passage"] else "fail",
                    classification="passage",
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count("yes", int(count["YEAS"]))
                v.set_count("no", int(count["NAYS"]))
                v.set_count("other", int(count["NOT VOTING"]))
                v.add_source(vote["source"])
                v.pupa_id = vote["source"]  # source URL is unique per vote
                for vt in vote["votes"]:
                    key = {"Y": "yes", "N": "no"}.get(vt["vote"], "other")
                    v.vote(key, vt["name"])
                yield v
def scrape_votes(self, bill, page):
    """Yield floor-action VoteEvents for *bill* from the AZ bill-status API."""
    base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
    for header in page['FloorHeaders']:
        params = {
            'billStatusId': page['BillId'],
            'billStatusActionId': header['BillStatusActionId'],
            'includeVotes': 'true',
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode('utf-8'))
        for action in actions:
            if action['Action'] == 'No Action':
                continue
            action_date = datetime.datetime.strptime(action['ReportDate'],
                                                     '%Y-%m-%dT%H:%M:%S')
            vote = VoteEvent(
                chamber={
                    'S': 'upper',
                    'H': 'lower',
                }[header['LegislativeBody']],
                motion_text=action['Action'],
                classification='passage',
                result=(
                    'pass'
                    if action['UnanimouslyAdopted'] or action['Ayes'] > action['Nays']
                    else 'fail'
                ),
                start_date=action_date.strftime('%Y-%m-%d'),
                bill=bill,
            )
            vote.add_source(resp.url)
            # API tallies may be null — coerce to 0
            vote.set_count('yes', action['Ayes'] or 0)
            vote.set_count('no', action['Nays'] or 0)
            vote.set_count('other', (action['Present'] or 0))
            vote.set_count('absent', (action['Absent'] or 0))
            vote.set_count('excused', (action['Excused'] or 0))
            vote.set_count('not voting', (action['NotVoting'] or 0))
            for v in action['Votes']:
                vote_type = {
                    'Y': 'yes',
                    'N': 'no',
                }.get(v['Vote'], 'other')
                vote.vote(vote_type, v['Legislator']['FullName'])
            # URL alone is not unique per action; append the referral number
            vote.pupa_id = resp.url + str(action['ReferralNumber'])
            yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield VoteEvents for every vote date in *chamber* for *session*."""
    url = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
    }[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)
    for date in dates:
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote['count']
                # chamber comes from the vote metadata, not the argument
                chamber = {
                    "H": "lower",
                    "S": "upper"
                }[vote['meta']['chamber']]
                try:
                    bill_id = self._bill_id_by_type[(chamber, vote['meta']['bill'])]
                except KeyError:
                    # skip votes for bills we have not seen
                    self.warning('no such bill_id %s %s',
                                 chamber, vote['meta']['bill'])
                    continue
                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote['time'].strftime('%Y-%m-%d'),
                    motion_text=vote['meta']['extra']['motion'],
                    result='pass' if count['passage'] else 'fail',
                    classification='passage',
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count('yes', int(count['YEAS']))
                v.set_count('no', int(count['NAYS']))
                v.set_count('other', int(count['NOT VOTING']))
                v.add_source(vote['source'])
                v.pupa_id = vote['source']  # source URL is unique per vote
                for vt in vote['votes']:
                    key = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(vt['vote'], 'other')
                    v.vote(key, vt['name'])
                yield v
def scrape_votes(self, bill, page):
    """Yield floor-action VoteEvents for *bill* from the AZ bill-status API."""
    base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
    for header in page['FloorHeaders']:
        params = {
            'billStatusId': page['BillId'],
            'billStatusActionId': header['BillStatusActionId'],
            'includeVotes': 'true',
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode('utf-8'))
        for action in actions:
            if action['Action'] == 'No Action':
                continue
            action_date = datetime.datetime.strptime(
                action['ReportDate'], '%Y-%m-%dT%H:%M:%S')
            vote = VoteEvent(
                chamber={
                    'S': 'upper',
                    'H': 'lower',
                }[header['LegislativeBody']],
                motion_text=action['Action'],
                classification='passage',
                result=('pass' if action['UnanimouslyAdopted'] or
                        action['Ayes'] > action['Nays'] else 'fail'),
                start_date=action_date.strftime('%Y-%m-%d'),
                bill=bill,
            )
            vote.add_source(resp.url)
            # API tallies may be null — coerce to 0
            vote.set_count('yes', action['Ayes'] or 0)
            vote.set_count('no', action['Nays'] or 0)
            vote.set_count('other', (action['Present'] or 0))
            vote.set_count('absent', (action['Absent'] or 0))
            vote.set_count('excused', (action['Excused'] or 0))
            vote.set_count('not voting', (action['NotVoting'] or 0))
            for v in action['Votes']:
                vote_type = {
                    'Y': 'yes',
                    'N': 'no',
                }.get(v['Vote'], 'other')
                vote.vote(vote_type, v['Legislator']['FullName'])
            # URL alone is not unique per action; append the referral number
            vote.pupa_id = resp.url + str(action['ReferralNumber'])
            yield vote
def scrape_votes(self, bill, page):
    """Yield floor-action VoteEvents for *bill* from the AZ bill-status API."""
    base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
    for header in page["FloorHeaders"]:
        params = {
            "billStatusId": page["BillId"],
            "billStatusActionId": header["BillStatusActionId"],
            "includeVotes": "true",
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode("utf-8"))
        for action in actions:
            if action["Action"] == "No Action":
                continue
            action_date = datetime.datetime.strptime(
                action["ReportDate"], "%Y-%m-%dT%H:%M:%S")
            vote = VoteEvent(
                chamber={
                    "S": "upper",
                    "H": "lower"
                }[header["LegislativeBody"]],
                motion_text=action["Action"],
                classification="passage",
                result=("pass" if action["UnanimouslyAdopted"] or
                        action["Ayes"] > action["Nays"] else "fail"),
                start_date=action_date.strftime("%Y-%m-%d"),
                bill=bill,
            )
            vote.add_source(resp.url)
            # API tallies may be null — coerce to 0
            vote.set_count("yes", action["Ayes"] or 0)
            vote.set_count("no", action["Nays"] or 0)
            vote.set_count("other", (action["Present"] or 0))
            vote.set_count("absent", (action["Absent"] or 0))
            vote.set_count("excused", (action["Excused"] or 0))
            vote.set_count("not voting", (action["NotVoting"] or 0))
            for v in action["Votes"]:
                vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                vote.vote(vote_type, v["Legislator"]["FullName"])
            # URL alone is not unique per action; append the referral number
            vote.pupa_id = resp.url + str(action["ReferralNumber"])
            yield vote
def asvote(self):
    """Build and return a VoteEvent from this parsed vote record."""
    outcome = 'pass' if self.passed() else 'fail'
    event = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result=outcome,
        classification='passage',
        bill=self.bill,
    )
    # the vote URL embeds a sequence number, so it is unique per vote
    event.pupa_id = self.url
    for option, tally in (('yes', self.yes_count()),
                          ('no', self.no_count()),
                          ('other', self.other_count())):
        event.set_count(option, tally)
    for voter in self.yes_votes():
        event.yes(voter)
    for voter in self.no_votes():
        event.no(voter)
    for voter in self.other_votes():
        event.vote('other', voter)
    event.add_source(self.url)
    return event
def asvote(self):
    """Convert this parsed vote record into a VoteEvent and return it."""
    built = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result="pass" if self.passed() else "fail",
        classification="passage",
        bill=self.bill,
    )
    # URL contains a sequence number, making it a stable unique id
    built.pupa_id = self.url
    built.set_count("yes", self.yes_count())
    built.set_count("no", self.no_count())
    built.set_count("other", self.other_count())
    # record individual voters, grouped by how they voted
    for who in self.yes_votes():
        built.yes(who)
    for who in self.no_votes():
        built.no(who)
    for who in self.other_votes():
        built.vote("other", who)
    built.add_source(self.url)
    return built
def process_vote(self, votes, url, base_url, bill, legislators,
                 chamber_dict, vote_results):
    """Yield a VoteEvent for each roll call in ``votes["items"]``.

    :param votes: JSON payload whose ``"items"`` list holds vote records
    :param url: source page the votes came from (attached to each event)
    :param base_url: API root used to follow nested ``"link"`` entries
    :param bill: Bill object the votes belong to
    :param legislators: maps voter ids (as found in yeas/nays lists) to names
    :param chamber_dict: maps the API's chamber codes to 'upper'/'lower'
    :param vote_results: maps result strings (lowercased) to a truthy/falsy
        passed flag
    """
    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url + v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue
        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            chamber = "lower" if "house" in v["apn"] else "upper"
        # the vote date lives under either "date" or "occurred"
        try:
            date = self._tz.localize(
                datetime.datetime.strptime(v["date"], "%m/%d/%y"))
            date = "{:%Y-%m-%d}".format(date)
        except KeyError:
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                self.logger.warning("No date found for vote, skipping")
                continue
        try:
            motion = v["action"]
        except KeyError:
            motion = v["motiontype"]
        # Sometimes Ohio's SOLAR will only return part of the JSON,
        # so in that case skip
        if (not motion and
                isinstance(v['yeas'], str) and
                isinstance(v['nays'], str)):
            # fixed: local was previously misspelled 'waringText'
            warning_text = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(warning_text.format(v['revno']))
            continue
        result = v.get("results") or v.get("passed")
        if result is None:
            # no explicit result given -- infer it from the tallies
            result = "passed" if len(v['yeas']) > len(v['nays']) else "failed"
        passed = vote_results[result.lower()]
        # NOTE: both branches of the original `if "committee" in v` built
        # identical VoteEvents (the organization kwarg was commented out),
        # so the duplicated constructor call is collapsed into one.
        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         bill=bill,
                         classification='passed')
        # Concatenate the bill identifier and vote identifier to avoid collisions
        vote.pupa_id = '{}:{}'.format(bill.identifier.replace(' ', ''),
                                      v['revno'])
        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        yes_count = 0
        no_count = 0
        absent_count = 0
        excused_count = 0
        for voter_id in v["yeas"]:
            vote.yes(legislators[voter_id])
            yes_count += 1
        for voter_id in v["nays"]:
            vote.no(legislators[voter_id])
            no_count += 1
        if "absent" in v:
            for voter_id in v["absent"]:
                vote.vote('absent', legislators[voter_id])
                absent_count += 1
        if "excused" in v:
            for voter_id in v["excused"]:
                vote.vote('excused', legislators[voter_id])
                excused_count += 1
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('absent', absent_count)
        vote.set_count('excused', excused_count)
        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            # fixed: was `type(val) == list`; isinstance is the idiom
            if (isinstance(val, list) and len(val) > 0 and
                    key not in ["yeas", "nays", "absent", "excused"]):
                if val[0] in legislators:
                    self.logger.warning(
                        "{k} looks like a vote type that's not being counted."
                        " Double check it?".format(k=key))
        vote.add_source(url)
        yield vote
def scrape_votes(self, url, motion, date, chamber, bill):
    """Yield a single VoteEvent parsed from the vote PDF at *url*.

    The motion string is mapped through ``self._vote_mapping`` to a
    normalized motion text and pass/fail flag; voter names are collected
    from the PDF text by scanning for section precursors ("yeas--" etc.).
    """
    vote_pdf, resp = self.urlretrieve(url)
    text = convert_pdf(vote_pdf, 'text')
    os.remove(vote_pdf)
    # this way we get a key error on a missing vote type
    motion, passed = self._vote_mapping[motion]
    yes_votes = []
    no_votes = []
    other_votes = []
    absent_votes = []
    not_voting_votes = []
    # point at array to add names to
    cur_array = None
    # each precursor switches name collection to its list; 'disclaimer'
    # switches collection off (None)
    precursors = (
        ('yeas--', yes_votes),
        ('nays--', no_votes),
        ('absent or those not voting--', absent_votes),
        ('absent and those not voting--', absent_votes),
        ('not voting--', not_voting_votes),
        ('voting present--', other_votes),
        ('present--', other_votes),
        ('disclaimer', None),
    )
    # split lines on newline, recombine lines that don't end in punctuation
    lines = _combine_lines(text.decode().split('\n'))
    for line in lines:
        # check if the line starts with a precursor, switch to that array
        for pc, arr in precursors:
            if pc in line.lower():
                cur_array = arr
                line = line.replace(pc, '')
        # split names
        for name in line.split(','):
            name = name.strip()
            # move on if that's all there was
            if not name:
                continue
            # None or a Total indicate the end of a section
            if 'None.' in name:
                cur_array = None
            match = re.match(r'(.+?)\. Total--.*', name)
            if match:
                # last name of a section is glued to the "Total--" summary
                cur_array.append(match.groups()[0])
                cur_array = None
            # append name if it looks ok
            junk_in_name = False
            for junk in ('on final passage', 'Necessary', 'who would have',
                         'being a tie', 'therefore', 'Vacancies', 'a pair',
                         'Total-', 'ATTORNEY', 'on final passage', 'SPEAKER',
                         'BOARD', 'TREASURER', 'GOVERNOR', 'ARCHIVES',
                         'SECRETARY'):
                if junk in name:
                    junk_in_name = True
                    break
            if cur_array is not None and not junk_in_name:
                # strip trailing .
                if name[-1] == '.':
                    name = name[:-1]
                cur_array.append(name)
    # return vote object
    yes_count = len(yes_votes)
    no_count = len(no_votes)
    absent_count = len(absent_votes)
    not_voting_count = len(not_voting_votes)
    other_count = len(other_votes)
    vote = VoteEvent(chamber=chamber,
                     start_date=self._tz.localize(date),
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification='passage',
                     bill=bill)
    # URL alone can repeat across bills; suffix with the bill identifier
    vote.pupa_id = url + '#' + bill.identifier
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('absent', absent_count)
    vote.set_count('not voting', not_voting_count)
    vote.set_count('other', other_count)
    vote.add_source(url)
    for yes_vote in yes_votes:
        vote.vote('yes', yes_vote)
    for no_vote in no_votes:
        vote.vote('no', no_vote)
    for absent_vote in absent_votes:
        vote.vote('absent', absent_vote)
    for not_voting_vote in not_voting_votes:
        vote.vote('not voting', not_voting_vote)
    for other_vote in other_votes:
        vote.vote('other', other_vote)
    yield vote
def process_vote(self, votes, url, base_url, bill, legislators,
                 chamber_dict, vote_results):
    """Yield a VoteEvent for each roll call in ``votes["items"]``.

    :param votes: JSON payload whose ``"items"`` list holds vote records
    :param url: source page for the votes (attached to each event)
    :param base_url: API root used to follow nested ``"link"`` entries
    :param bill: Bill object the votes belong to
    :param legislators: maps voter ids (as found in yeas/nays lists) to names
    :param chamber_dict: maps the API's chamber codes to 'upper'/'lower'
    :param vote_results: maps result strings (lowercased) to a truthy/falsy
        passed flag
    """
    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url + v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue
        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            chamber = "lower" if "house" in v["apn"] else "upper"
        # date may appear under either "date" or "occurred"
        try:
            date = self._tz.localize(
                datetime.datetime.strptime(v["date"], "%m/%d/%y"))
            date = "{:%Y-%m-%d}".format(date)
        except KeyError:
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                self.logger.warning("No date found for vote, skipping")
                continue
        try:
            motion = v["action"]
        except KeyError:
            motion = v["motiontype"]
        # Sometimes Ohio's SOLAR will only return part of the JSON,
        # so in that case skip
        if (not motion and
                isinstance(v['yeas'], str) and
                isinstance(v['nays'], str)):
            # fixed: local was previously misspelled 'waringText'
            warning_text = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(warning_text.format(v['revno']))
            continue
        result = v.get("results") or v.get("passed")
        if result is None:
            # no explicit result -- infer from the tallies
            result = "passed" if len(v['yeas']) > len(v['nays']) else "failed"
        passed = vote_results[result.lower()]
        # NOTE: both branches of the original `if "committee" in v` built
        # identical VoteEvents (the organization kwarg was commented out),
        # so the duplicated constructor call is collapsed into one.
        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         bill=bill,
                         classification='passed')
        vote.pupa_id = str(v['revno'])
        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        yes_count = 0
        no_count = 0
        absent_count = 0
        excused_count = 0
        for voter_id in v["yeas"]:
            vote.yes(legislators[voter_id])
            yes_count += 1
        for voter_id in v["nays"]:
            vote.no(legislators[voter_id])
            no_count += 1
        if "absent" in v:
            for voter_id in v["absent"]:
                vote.vote('absent', legislators[voter_id])
                absent_count += 1
        if "excused" in v:
            for voter_id in v["excused"]:
                vote.vote('excused', legislators[voter_id])
                excused_count += 1
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('absent', absent_count)
        vote.set_count('excused', excused_count)
        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            # fixed: was `type(val) == list`; isinstance is the idiom
            if (isinstance(val, list) and len(val) > 0 and
                    key not in ["yeas", "nays", "absent", "excused"]):
                if val[0] in legislators:
                    self.logger.warning(
                        "{k} looks like a vote type that's not being counted."
                        " Double check it?".format(k=key))
        vote.add_source(url)
        yield vote
def scrape_votes(self, bill, url):
    """Yield one vote object per roll call (RCS#) found on the page.

    Scans chamber header paragraphs, reads the motion/RCS/date from the
    following siblings, then accumulates counts and voter names until a
    '*****' separator ends the roll call.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))
    # RCS numbers already processed, to skip duplicated roll calls
    seen_rcs = set()
    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={"re": re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if "HOUSE" in header.xpath("string()"):
            chamber = "lower"
            motion_index = 8
        else:
            chamber = "upper"
            motion_index = 13
        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index
        ).strip()
        motion = re.sub(r"\s+", " ", motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return
        # motion text may carry its own PASSED/FAILED verdict
        match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == "PASSED"
        else:
            passed = None
        rcs_p = header.xpath("following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
        rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)
        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r"\d+/\d+/\d+", date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)
        # names before the YEAS header belong to a previous section
        seen_yes = False
        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace("\r\n", " ").strip()
            if "*****" in line:
                break
            regex = (
                r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)"
            )
            match = re.match(regex, line)
            if match:
                if match.group(1) == "YEAS" and "RCS#" not in line:
                    vtype = "yes"
                    seen_yes = True
                elif match.group(1) == "NAYS" and seen_yes:
                    vtype = "no"
                elif match.group(1) == "VACANT":
                    continue  # skip these
                elif seen_yes:
                    vtype = "other"
                if seen_yes and match.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                for name in line.split(" "):
                    if not name:
                        continue
                    if "HOUSE" in name or "SENATE " in name:
                        continue
                    votes[vtype].append(name.strip())
        if bad_vote:
            continue
        if passed is None:
            passed = counts["yes"] > (counts["no"] + counts["other"])
        # NOTE(review): constructor here is `Vote`, not `VoteEvent` as in
        # the sibling scrapers — presumably an import alias; confirm.
        vote = Vote(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.set_count("yes", counts["yes"])
        vote.set_count("no", counts["no"])
        vote.set_count("other", counts["other"])
        # URL + RCS number gives a unique id per roll call
        vote.pupa_id = url + "#" + rcs
        vote.add_source(url)
        for name in votes["yes"]:
            vote.yes(name)
        for name in votes["no"]:
            # a ':' signals a mis-parsed name; fail loudly
            if ":" in name:
                raise Exception(name)
            vote.no(name)
        for name in votes["other"]:
            vote.vote("other", name)
        yield vote
def parse_vote_page(self, vote_url, bill):
    """Parse one MD vote page into a VoteEvent and return it."""
    vote_html = self.get(vote_url).text
    doc = lxml.html.fromstring(vote_html)
    # chamber
    if 'senate' in vote_url:
        chamber = 'upper'
    else:
        chamber = 'lower'
    # date in the following format: Mar 23, 2009
    date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
    date = date.replace(u'\xa0', ' ')
    date = datetime.datetime.strptime(date[18:], '%b %d, %Y')
    # motion
    motion = ''.join(x.text_content() for x in doc.xpath('//td[@colspan="23"]'))
    if motion == '':
        motion = "No motion given"  # XXX: Double check this. See SJ 3.
    motion = motion.replace(u'\xa0', ' ')
    # totals: the five cells after "Yeas" hold yes/no/not-voting/excused/etc.
    tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get('class')
    totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
    yes_count = int(totals[0].split()[-1])
    no_count = int(totals[1].split()[-1])
    other_count = int(totals[2].split()[-1])
    other_count += int(totals[3].split()[-1])
    other_count += int(totals[4].split()[-1])
    passed = yes_count > no_count
    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )
    vote.pupa_id = vote_url  # contains sequence number
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
    func = None
    for td in doc.xpath('//td/text()'):
        td = td.replace(u'\xa0', ' ')
        if td.startswith('Voting Yea'):
            func = vote.yes
        elif td.startswith('Voting Nay'):
            func = vote.no
        elif td.startswith('Not Voting'):
            # NOTE(review): `vote.other` is used as a bound method here but
            # the sibling scrapers record others via vote.vote('other', name)
            # — confirm VoteEvent actually exposes .other
            func = vote.other
        elif td.startswith('Excused'):
            func = vote.other
        elif func:
            td = td.rstrip('*')
            func(td)
    return vote
def scrape_vote(self, bill, date, url):
    """Yield a VoteEvent parsed from an SD vote detail page.

    Skips pages whose header reads 'No Bill Action' and pages where no
    motion text can be extracted.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    header = page.xpath("string(//h3[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return
    # header format: "<bill>, <location>, <motion...>"
    location = header.split(', ')[1]
    if location.startswith('House'):
        chamber = 'lower'
    elif location.startswith('Senate'):
        chamber = 'upper'
    elif location.startswith('Joint'):
        chamber = 'legislature'
    else:
        raise ScrapeError("Bad chamber: %s" % location)
    motion = ', '.join(header.split(', ')[2:]).strip()
    if motion:
        # If we can't detect a motion, skip this vote
        yes_count = int(
            page.xpath("string(//span[contains(@id, 'tdAyes')])"))
        no_count = int(
            page.xpath("string(//span[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//span[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//span[contains(@id, 'tdAbsent')])"))
        passed = yes_count > no_count
        # classify the motion text
        if motion.startswith('Do Pass'):
            type = 'passage'
        elif motion == 'Concurred in amendments':
            type = 'amendment'
        elif motion == 'Veto override':
            type = 'veto_override'
        else:
            type = 'other'
        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification=type,
                         bill=bill)
        # The vote page URL has a unique ID
        # However, some votes are "consent calendar" events,
        # and relate to the passage of _multiple_ bills
        # These can't be modeled yet in Pupa, but for now we can
        # append a bill ID to the URL that forms the `pupa_id`
        # https://github.com/opencivicdata/pupa/issues/308
        vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))
        vote.add_source(url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)
        # voter names sit in the cell preceding each vote-option cell
        for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
            option_or_person = td.text.strip()
            if option_or_person in ('Aye', 'Yea'):
                vote.yes(td.getprevious().text.strip())
            elif option_or_person == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif option_or_person == 'Excused':
                vote.vote('excused', td.getprevious().text.strip())
            elif option_or_person == 'Absent':
                vote.vote('absent', td.getprevious().text.strip())
        yield vote
def _build_lower_votes(self):
    """Yield a VoteEvent for each floor-vote table on the NY assembly page."""
    url = self.shared_url + '&Floor%26nbspVotes=Y'
    self.urls.add(votes=url)
    self.bill.add_source(url)
    doc = self.urls.votes.doc
    if doc is None:
        return
    # Grab bill information.
    try:
        pre = doc.xpath('//pre')[0].text_content().strip()
        no_votes = 'There are no votes for this bill in this legislative '
        if pre == no_votes:
            raise ValueError('No votes for this bill.')
    # Skip bill if votes can't be found.
    except (IndexError, ValueError):
        return
    for table in doc.xpath('//table'):
        # the date is the sibling of the "DATE:" caption span
        date = table.xpath('caption/span[contains(., "DATE:")]')
        date = next(date[0].itersiblings()).text
        date = datetime.datetime.strptime(date, '%m/%d/%Y')
        date = date.replace(tzinfo=timezone('UTC'))
        spanText = table.xpath('caption/span/text()')
        motion = spanText[2].strip()+spanText[3].strip()
        # tally is rendered as "...: <yes>/<no>"
        votes = table.xpath('caption/span/span')[0].text.split(':')[1].split('/')
        yes_count, no_count = map(int, votes)
        passed = yes_count > no_count
        vote = VoteEvent(
            chamber='lower',
            start_date=date,
            motion_text=motion,
            bill=self.bill,
            result='pass' if passed else 'fail',
            classification='passage'
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        absent_count = 0
        excused_count = 0
        # table cells alternate name, vote-code; pair them up
        tds = table.xpath('tr/td/text()')
        votes = [tds[i:i+2] for i in range(0, len(tds), 2)]
        vote_dictionary = {
            'Y': 'yes',
            'NO': 'no',
            'ER': 'excused',
            'AB': 'absent',
            'NV': 'not voting'
        }
        for vote_pair in votes:
            name, vote_val = vote_pair
            vote.vote(vote_dictionary[vote_val], name)
            if vote_val == 'AB':
                absent_count += 1
            elif vote_val == 'ER':
                excused_count += 1
        vote.set_count('absent', absent_count)
        vote.set_count('excused', excused_count)
        vote.add_source(url)
        # URL alone repeats across motions on the same page; add motion text
        vote.pupa_id = url + motion + spanText[1]
        yield vote
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape every CA bill of one measure type for *session*.

    For each bill, builds a Bill with versions, titles, sponsors, actions
    and votes from the CA database models, yielding each VoteEvent and then
    the Bill itself.

    :param chamber: 'upper' or 'lower' — chamber the measure type belongs to
    :param session: session year string used to query CABill
    :param bill_type: openstates classification for these measures
    :param type_abbr: measure-type abbreviation to query on (e.g. 'AB')
    :param committee_abbr_regex: regex matching committee abbreviations in
        action text, so they can be expanded to full committee names
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)
    for bill in bills:
        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue
        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')
        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()
        # Get digest text (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # ensure a space after every closing paren
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)
        for version in bill.versions:
            if not version.bill_xml:
                continue
            version_date = self._tz.localize(
                version.bill_version_action_date)
            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)
            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)
            fsbill.add_version_link(
                version_name, version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())
            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)
            if title:
                all_titles.add(title)
            type_ = [bill_type]
            if version.appropriation == 'Yes':
                type_.append('appropriation')
            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')
            if version.subject:
                subject = clean_title(version.subject)
        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue
        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags
        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)
        # sponsors come from the last version processed above
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()
                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)
            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)
            attrs = self.categorizer.categorize(act_str)
            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)
            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)
            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)
                # fixed: filter() returns a lazy iterator in Python 3, and
                # len(list(...)) in the assert below exhausted it, so the
                # zip() pairing never ran and abbreviations were left
                # unexpanded in act_str.  Materialize once.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees
                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}
                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'
            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'
            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info
            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators
            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue
            # re-categorize using the committee-expanded action string
            kwargs.update(self.categorizer.categorize(act_str))
            action = fsbill.add_action(
                act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))
        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False
            if not vote.location:
                continue
            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
            else:
                raise ScrapeError("Bad location: %s" % full_loc)
            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''
            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'
            motion = motion.strip()
            # strip session/chamber/bill-number noise out of the motion text
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)
            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue
            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}
            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)
            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)
            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))
            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))
            yield fsvote
        yield fsbill
    self.session.expire_all()
def parse_vote(self, bill, link):
    """Yield a VoteEvent parsed from the KS roll-call page at *link*.

    Emits warnings and yields nothing when the request fails, the page is
    rate-limited or missing, or no per-category counts are present.
    """
    # Server sometimes sends proper error headers,
    # sometimes not
    try:
        self.info("Get {}".format(link))
        text = requests.get(link).text
    except requests.exceptions.HTTPError as err:
        self.warning("{} fetching vote {}, skipping".format(err, link))
        return
    if 'Varnish cache server' in text:
        self.warning("Scrape rate is too high, try re-scraping with "
                     "The --rpm set to a lower number")
        return
    if 'Page Not Found' in text or 'Page Unavailable' in text:
        self.warning("missing vote, skipping")
        return
    member_doc = lxml.html.fromstring(text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    chamber_date_line = ''.join(
        member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
    chamber_date_line_words = chamber_date_line.split()
    vote_chamber = chamber_date_line_words[0]
    vote_date = datetime.datetime.strptime(
        chamber_date_line_words[-1], '%m/%d/%Y')
    vote_status = " ".join(chamber_date_line_words[2:-2])
    opinions = member_doc.xpath(
        "//div[@id='main_content']/h3[position() > 1]/text()")
    if len(opinions) > 0:
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        # fixed: these counts were previously bound only inside the loop,
        # so a page missing any category raised NameError further down
        yes_count = 0
        no_count = 0
        p_count = 0
        a_count = 0
        for i in opinions:
            try:
                # the tally is in parentheses, e.g. "Yea - (39)"
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # This is likely not a vote-count text chunk
                # It's probably '`On roll call the vote was:`
                pass
            else:
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count
        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime('%Y-%m-%d'),
            chamber=vote_chamber,
            motion_text=vote_status,
            result='pass' if yes_count > no_count else 'fail',
            classification='passage',
        )
        vote.pupa_id = link
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('abstain', p_count)
        vote.set_count('absent', a_count)
        vote.add_source(link)
        # voter names appear in one flat anchor list, ordered yes-voters
        # first, then no-voters, then everyone else
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_vote(self, bill, vote_url, chamber, date):
    """Yield a VoteEvent parsed from a CO vote-summary page."""
    page = self.lxmlize(vote_url)
    try:
        motion = page.xpath("//font/text()")[2]
    except IndexError:
        self.warning("Vote Summary Page Broken ")
        return
    # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
    if ('AM' in motion or 'PM' in motion) and '/' in motion:
        motion = "Motion not given."
    if 'withdrawn' not in motion:
        # counts are read relative to the 'Aye' / 'Absent' / '17C' labels
        yes_no_counts = page.xpath("//tr/td[preceding-sibling::td/descendant::"
                                   "font[contains(text(),'Aye')]]/font/text()")
        other_counts = page.xpath("//tr/td[preceding-sibling::td/descendant::"
                                  "font[contains(text(),'Absent')]]/font/text()")
        abstain_counts = page.xpath("//tr/td[preceding-sibling::td/descendant::"
                                    "font[contains(text(),'17C')]]/font/text()")
        yes_count = int(yes_no_counts[0])
        no_count = int(yes_no_counts[2])
        exc_count = int(other_counts[2])
        absent_count = int(other_counts[0])
        abstain_count = 0
        if abstain_counts:
            abstain_count = int(abstain_counts[0])
        # fix for
        # http://leg.colorado.gov/content/hb19-1029vote65e72e
        if absent_count == -1:
            absent_count = 0
        passed = yes_count > no_count
        vote = VoteEvent(chamber=chamber,
                         start_date=self._tz.localize(date),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         bill=bill,
                         classification='passage',
                         )
        vote.pupa_id = vote_url
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', exc_count)
        vote.set_count('absent', absent_count)
        vote.set_count('abstain', abstain_count)
        vote.add_source(vote_url)
        rolls = page.xpath("//tr[preceding-sibling::tr/descendant::"
                           "td/div/b/font[contains(text(),'Vote')]]")
        # single-letter roll-call codes -> vote options
        vote_abrv = {'Y': 'yes',
                     'N': 'no',
                     'E': 'excused',
                     'A': 'absent',
                     '-': 'absent',
                     '17C': 'abstain'}
        for roll in rolls:
            voted = roll.xpath(".//td/div/font/text()")[0].strip()
            voter = roll.xpath(".//td/font/text()")[0].strip()
            # 'V' rows carry no usable vote
            if voted == 'V':
                continue
            vote.vote(vote_abrv[voted], voter)
        yield vote
def scrape_votes(self, bill_page, page_url, bill, insert, year):
    """Scrape Nevada final-passage votes from a bill page and yield VoteEvents.

    :param bill_page: raw HTML of the bill page
    :param page_url: URL of the bill page (added as a secondary source)
    :param bill: Bill object to attach votes to
    :param insert: session path fragment used to build the vote detail URL
    :param year: unused here; kept for caller compatibility
    """
    root = lxml.html.fromstring(bill_page)
    trs = root.xpath('/html/body/div/table[6]//tr')
    assert len(trs) >= 1, "Didn't find the Final Passage Votes' table"
    # First row is the table header.
    for tr in trs[1:]:
        links = tr.xpath('td/a[contains(text(), "Passage")]')
        if len(links) == 0:
            self.warning("Non-passage vote found for {}; ".format(bill.identifier) +
                         "probably a motion for the calendar. It will be skipped.")
        else:
            assert len(links) == 1, \
                "Too many votes found for XPath query, on bill {}".format(bill.identifier)
            link = links[0]
            motion = link.text
            if 'Assembly' in motion:
                chamber = 'lower'
            else:
                chamber = 'upper'
            # Cells carry either a date ("Apr 21, 2017") or a
            # "<category> <count>" pair; collect both by regex shape.
            votes = {}
            tds = tr.xpath('td')
            for td in tds:
                if td.text:
                    text = td.text.strip()
                    date = re.match('... .*?, ....', text)
                    count = re.match('(?P<category>.*?) (?P<votes>[0-9]+)[,]?', text)
                    if date:
                        vote_date = datetime.strptime(text, '%b %d, %Y')
                    elif count:
                        votes[count.group('category')] = int(count.group('votes'))
            # KeyError here means the row lacked an expected category —
            # assumes Yea/Nay/Excused/Not Voting/Absent are always present.
            yes = votes['Yea']
            no = votes['Nay']
            excused = votes['Excused']
            not_voting = votes['Not Voting']
            absent = votes['Absent']
            other = excused + not_voting + absent
            passed = yes > no
            vote = VoteEvent(chamber=chamber,
                             start_date=self._tz.localize(vote_date),
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification='passage',
                             bill=bill,
                             )
            vote.set_count('yes', yes)
            vote.set_count('no', no)
            vote.set_count('other', other)
            vote.set_count('not voting', not_voting)
            vote.set_count('absent', absent)
            # try to get vote details
            try:
                vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link.get('href'))
                vote.pupa_id = vote_url
                vote.add_source(vote_url)
                # Some bills link the same roll call twice; only the
                # first occurrence is emitted.
                if vote_url in self._seen_votes:
                    self.warning('%s is included twice, skipping second', vote_url)
                    continue
                else:
                    self._seen_votes.add(vote_url)
                page = self.get(vote_url).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                for el in root.xpath('//table[2]/tr'):
                    tds = el.xpath('td')
                    name = tds[1].text_content().strip()
                    vote_result = tds[2].text_content().strip()
                    if vote_result == 'Yea':
                        vote.yes(name)
                    elif vote_result == 'Nay':
                        vote.no(name)
                    else:
                        vote.vote('other', name)
                vote.add_source(page_url)
            except scrapelib.HTTPError:
                # Keep the tallies even when the detail page is missing.
                self.warning("failed to fetch vote page, adding vote without details")
            yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one roll-call page and yield a VoteEvent for *bill*.

    :param bill: Bill the vote belongs to
    :param date: start date passed straight through to the VoteEvent
    :param url: vote page URL; also used as the pupa_id
    """
    doc = lxml.html.fromstring(self.get(url).text)
    header = doc.xpath("string(//h4[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    # Header shape: "<bill>, <location>, <motion...>"
    header_parts = header.split(', ')
    location = header_parts[1]
    for prefix, body in (('House', 'lower'),
                         ('Senate', 'upper'),
                         ('Joint', 'legislature')):
        if location.startswith(prefix):
            chamber = body
            break
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header_parts[2:]).strip()
    # If we can't detect a motion, skip this vote entirely.
    if not motion:
        return

    def tally(cell_id):
        # Totals live in <td> cells whose ids contain the given marker.
        return int(doc.xpath("string(//td[contains(@id, '%s')])" % cell_id))

    yes_count = tally('tdAyes')
    no_count = tally('tdNays')
    excused_count = tally('tdExcused')
    absent_count = tally('tdAbsent')
    passed = yes_count > no_count

    if motion.startswith('Do Pass'):
        classification = 'passage'
    else:
        classification = {'Concurred in amendments': 'amendment',
                          'Veto override': 'veto_override'}.get(motion, 'other')

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=classification,
                     bill=bill
                     )
    vote.pupa_id = url  # vote id is in URL
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    # Each vote cell is preceded by the legislator's name cell.
    for cell in doc.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
        name_cell = cell.getprevious()
        if cell.text in ('Aye', 'Yea'):
            vote.yes(name_cell.text.strip())
        elif cell.text == 'Nay':
            vote.no(name_cell.text.strip())
        elif cell.text == 'Excused':
            vote.vote('excused', name_cell.text.strip())
        elif cell.text == 'Absent':
            vote.vote('absent', name_cell.text.strip())

    yield vote
def scrape_votes(self, session):
    """Scrape New Hampshire roll-call votes from the state's flat files.

    Reads RollCallSummary.txt to build VoteEvents keyed by body+vote_num,
    then RollCallHistory.txt to attach individual legislator votes, and
    finally yields every assembled VoteEvent.

    :param session: session-year string to filter records by
    """
    votes = {}
    other_counts = defaultdict(int)
    last_line = []
    vote_url = 'http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt'
    lines = self.get(vote_url).content.decode('utf-8').splitlines()
    for line in lines:
        if len(line) < 2:
            continue
        if line.strip() == "":
            continue
        line = line.split('|')
        # Records are pipe-delimited with 14 fields; a record containing an
        # embedded newline arrives as two short lines that must be rejoined.
        if len(line) < 14:
            if len(last_line + line[1:]) == 14:
                # BUGFIX: previously `line = last_line`, which discarded the
                # continuation fields the length check above just validated.
                line = last_line + line[1:]
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
                # BUGFIX: without this continue the short line fell through
                # and line[5]/line[11] below raised IndexError.
                continue
        session_yr = line[0].replace('\xef\xbb\xbf', '')  # strip UTF-8 BOM
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'
        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
            time = pytz.timezone('America/New_York').localize(time).isoformat()
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(chamber=actor,
                        start_date=time,
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=self.bills_by_id[bill_id])
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.add_source(vote_url)
            vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
            votes[body + vote_num] = vote

    # Second pass: per-legislator votes, keyed back to the summary records.
    for line in self.get('http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt') \
            .content.decode('utf-8').splitlines():
        if len(line) < 2:
            continue
        # 2016|H|2|330795||Yea|
        # 2012 | H | 2 | 330795 | 964 | HB309 | Yea | 1/4/2012 8:27:03 PM
        session_yr, body, v_num, _, employee, bill_id, vote, date = \
            line.split('|')
        if not bill_id:
            continue
        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = " ".join(self.legislators[employee]['name'].split())
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue
            vote = vote.strip()
            if body + v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body + v_num))
                continue
            # code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body + v_num].yes(leg)
            elif vote == 'Nay':
                votes[body + v_num].no(leg)
            else:
                votes[body + v_num].vote('other', leg)
                # hack-ish, but will keep the vote count sync'd
                other_counts[body + v_num] += 1
                votes[body + v_num].set_count('other', other_counts[body + v_num])

    for vid, vote in votes.items():
        yield vote
def _parse_votes(self, url, vote, bill):
    '''Given a vote url and a vote object, extract the voters and the
    vote counts from the vote page and update the vote object.

    Returns a VoteEvent (or a PDF committee vote's asvote()) on success,
    or None when the document is missing or unparseable.
    '''
    # PDF documents are committee votes handled by a dedicated parser.
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return
        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return
    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion.
    try:
        motion = doc.xpath('//br')[-1].tail.strip()
    except IndexError:
        # Some of them mysteriously have no motion listed.
        motion = vote['action']
    if not motion:
        motion = vote['action']
    vote['motion'] = motion

    action = vote['action']
    vote_url = vote['vote_url']
    # Rebind `vote` from the scraped-data dict to a real VoteEvent.
    vote = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder; real result computed below
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote.pupa_id = vote_url  # URL contains sequence number
    vote.add_source(vote_url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    # Third table holds "<code>\xa0<name>" cells for each voter.
    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue
        # Unpacking the filter object assumes exactly two non-empty parts.
        v, name = filter(None, text.split(u'\xa0'))
        # Considering Name is brackets as short name
        regex = re.compile(r".*?\((.*?)\)")
        short_name = re.findall(regex, name)
        if len(short_name) > 0:
            note = 'Short Name: ' + short_name[0]
        else:
            note = ''
        # Name without brackets like 'Kary, Douglas'
        name = re.sub(r"[\(\[].*?[\)\]]", "", name)
        if v == 'Y':
            vote.yes(name, note=note)
        elif v == 'N':
            vote.no(name, note=note)
        elif v == 'E':
            vote.vote('excused', name, note=note)
        elif v == 'A':
            vote.vote('absent', name, note=note)

    # code to deterimine value of `passed`
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote.result = 'pass' if passed else 'fail'
    return vote
def scrape_vote(self, bill, vote_id, session):
    """Fetch one Delaware roll call by id from the JSON API and yield a VoteEvent.

    :param bill: Bill the vote belongs to
    :param vote_id: Delaware rollCallId
    :param session: legislative session identifier
    """
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    form = {
        'rollCallId': vote_id,
        'sort': '',
        'group': '',
        'filter': '',
    }
    self.info('Fetching vote {} for {}'.format(vote_id, bill.identifier))
    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if page:
        roll = page['Model']
        vote_chamber = self.chamber_map[roll['ChamberName']]
        # "7/1/16 01:00 AM"
        vote_date = dt.datetime.strptime(roll['TakenAtDateTime'],
                                         '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')
        # TODO: What does this code mean?
        vote_motion = roll['RollCallVoteType']
        vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
        # Everything that is neither yes nor no is lumped into 'other'.
        other_count = (int(roll['NotVotingCount']) +
                       int(roll['VacantVoteCount']) +
                       int(roll['AbsentVoteCount']) +
                       int(roll['ConflictVoteCount'])
                       )
        vote = VoteEvent(chamber=vote_chamber,
                         start_date=vote_date,
                         motion_text=vote_motion,
                         result=vote_passed,
                         classification='other',
                         bill=bill,
                         legislative_session=session
                         )
        vote_pdf_url = 'https://legis.delaware.gov' \
                       '/json/RollCallController/GenerateRollCallPdf' \
                       '?rollCallId={}&chamberId={}'.format(
                           vote_id, self.chamber_codes[vote_chamber])
        # Vote URL is just a generic search URL with POSTed data,
        # so provide a different link
        vote.add_source(vote_pdf_url)
        vote.pupa_id = vote_pdf_url
        vote.set_count('yes', roll['YesVoteCount'])
        vote.set_count('no', roll['NoVoteCount'])
        vote.set_count('other', other_count)
        for row in roll['AssemblyMemberVotes']:
            # AssemblyMemberId looks like it should work here,
            # but for some sessions it's bugged to only return session
            try:
                voter = self.legislators_by_short[str(row['ShortName'])]
                name = voter['DisplayName']
            except KeyError:
                self.warning('could not find legislator short name %s',
                             row['ShortName'])
                name = row['ShortName']
            if row['SelectVoteTypeCode'] == 'Y':
                vote.yes(name)
            elif row['SelectVoteTypeCode'] == 'N':
                vote.no(name)
            else:
                vote.vote('other', name)
        yield vote
def scrape_vote_history(self, bill, vurl):
    """
    Obtain the information on a vote and link it to the related Bill

    :param bill: related bill
    :param vurl: source for the voteEvent information.
    :return: voteEvent object
    """
    html = self.get(vurl).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(vurl)

    # skip first two rows
    for row in doc.xpath('//table/tr')[2:]:
        tds = row.getchildren()
        if len(tds) != 11:
            self.warning('irregular vote row: %s' % vurl)
            continue
        timestamp, motion, vote, yeas, nays, nv, exc, pres, abst, total, result = tds

        timestamp = timestamp.text.replace(u'\xa0', ' ')
        # NOTE(review): '%H' (24-hour) combined with '%p' — %p has no
        # effect alongside %H, so "02:30 PM" parses as 02:30. Confirm the
        # source actually emits 24-hour times before changing to %I.
        timestamp = datetime.datetime.strptime(timestamp,
                                               '%m/%d/%Y %H:%M %p')
        yeas = int(yeas.text)
        nays = int(nays.text)
        # Not-voting, excused, abstain and present are rolled into 'other'.
        others = int(nv.text) + int(exc.text) + \
            int(abst.text) + int(pres.text)
        assert yeas + nays + others == int(total.text)

        if result.text == 'Passed':
            passed = 'pass'
        else:
            passed = 'fail'

        vote_link = vote.xpath('a')[0]
        if '[H]' in vote_link.text:
            chamber = 'lower'
        else:
            chamber = 'upper'

        vote = VoteEvent(
            chamber=chamber,  # 'upper' or 'lower'
            start_date=timestamp.strftime('%Y-%m-%d'),  # 'YYYY-MM-DD' format
            motion_text=motion.text,
            result=passed,
            classification='passage',  # Can also be 'other'
            # Provide a Bill instance to link with the VoteEvent...
            bill=bill,
        )
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('other', others)
        vote.add_source(vurl)

        # obtain vote rollcall from pdf and add it to the VoteEvent object
        rollcall_pdf = vote_link.get('href')
        self.scrape_rollcall(vote, rollcall_pdf)
        vote.add_source(rollcall_pdf)

        # The rollcall PDF URL is unique per vote; skip repeats.
        if rollcall_pdf in self._seen_vote_ids:
            self.warning('duplicate usage of %s, skipping', rollcall_pdf)
            continue
        else:
            self._seen_vote_ids.add(rollcall_pdf)
        vote.pupa_id = rollcall_pdf  # distinct KEY for each one

        yield vote
def parse_vote(self, bill, action, act_chamber, act_date, url):
    """Parse every roll call in an Alaska journal page and yield VoteEvents.

    :param bill: Bill the votes belong to
    :param action: unused here; kept for caller compatibility
    :param act_chamber: chamber the action occurred in
    :param act_date: date of the action (datetime; formatted to YYYY-MM-DD)
    :param url: journal page URL; used for sourcing and pupa_id
    """
    re_vote_text = re.compile(r'The question (?:being|to be reconsidered):\s*"(.*?\?)"', re.S)
    re_header = re.compile(r'\d{2}-\d{2}-\d{4}\s{10,}\w{,20} Journal\s{10,}\d{,6}\s{,4}')

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    if len(doc.xpath('//pre')) < 2:
        return

    # Find all chunks of text representing voting reports.
    votes_text = doc.xpath('//pre')[1].text_content()
    votes_text = re_vote_text.split(votes_text)
    # Alternating (motion, tally-text) pairs after the split.
    votes_data = zip(votes_text[1::2], votes_text[2::2])

    iVoteOnPage = 0

    # Process each.
    for motion, text in votes_data:
        iVoteOnPage += 1
        yes = no = other = 0

        # Summary tallies look like "YEAS:  25   NAYS:  10 ...".
        tally = re.findall(r'\b([YNEA])[A-Z]+:\s{,3}(\d{,3})', text)
        for vtype, vcount in tally:
            vcount = int(vcount) if vcount != '-' else 0
            if vtype == 'Y':
                yes = vcount
            elif vtype == 'N':
                no = vcount
            else:
                other += vcount

        vote = VoteEvent(
            bill=bill,
            start_date=act_date.strftime('%Y-%m-%d'),
            chamber=act_chamber,
            motion_text=motion,
            result='pass' if yes > no else 'fail',
            classification='passage',
        )
        vote.set_count('yes', yes)
        vote.set_count('no', no)
        vote.set_count('other', other)

        # Suffix the page position so multiple votes on one page get
        # distinct pupa_ids.
        vote.pupa_id = (url + ' ' + str(iVoteOnPage)) if iVoteOnPage > 1 else url

        # In lengthy documents, the "header" can be repeated in the middle
        # of content. This regex gets rid of it.
        vote_lines = re_header.sub('', text)
        vote_lines = vote_lines.split('\r\n')

        vote_type = None
        for vote_list in vote_lines:
            if vote_list.startswith('Yeas: '):
                vote_list, vote_type = vote_list[6:], 'yes'
            elif vote_list.startswith('Nays: '):
                vote_list, vote_type = vote_list[6:], 'no'
            elif vote_list.startswith('Excused: '):
                vote_list, vote_type = vote_list[9:], 'other'
            elif vote_list.startswith('Absent: '):
                # BUGFIX: 'Absent: ' is 8 characters; the slice was [9:],
                # which chopped the first letter of the first absent name.
                vote_list, vote_type = vote_list[8:], 'other'
            elif vote_list.strip() == '':
                vote_type = None
            if vote_type:
                for name in vote_list.split(','):
                    name = name.strip()
                    if name:
                        vote.vote(vote_type, name)

        vote.add_source(url)
        yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one roll-call page (span-based layout) and yield a VoteEvent.

    :param bill: Bill the vote belongs to
    :param date: start date passed straight through to the VoteEvent
    :param url: vote page URL; combined with the bill id for pupa_id
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    header = page.xpath("string(//h3[contains(@id, 'hdVote')])")

    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return
    # Header shape: "<bill>, <location>, <motion...>"
    location = header.split(', ')[1]

    if location.startswith('House'):
        chamber = 'lower'
    elif location.startswith('Senate'):
        chamber = 'upper'
    elif location.startswith('Joint'):
        chamber = 'legislature'
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ', '.join(header.split(', ')[2:]).strip()
    if motion:
        # If we can't detect a motion, skip this vote
        yes_count = int(
            page.xpath("string(//span[contains(@id, 'tdAyes')])"))
        no_count = int(
            page.xpath("string(//span[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//span[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

        passed = yes_count > no_count

        if motion.startswith('Do Pass'):
            type = 'passage'
        elif motion == 'Concurred in amendments':
            type = 'amendment'
        elif motion == 'Veto override':
            type = 'veto_override'
        else:
            type = 'other'

        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification=type,
                         bill=bill
                         )
        # The vote page URL has a unique ID
        # However, some votes are "consent calendar" events,
        # and relate to the passage of _multiple_ bills
        # These can't be modeled yet in Pupa, but for now we can
        # append a bill ID to the URL that forms the `pupa_id`
        # https://github.com/opencivicdata/pupa/issues/308
        vote.pupa_id = '{}#{}'.format(url, bill.identifier.replace(' ', ''))

        vote.add_source(url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)

        for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
            # NOTE(review): td.text is assumed non-None for every cell in
            # this table — confirm empty cells cannot occur, else .strip()
            # raises AttributeError.
            option_or_person = td.text.strip()
            if option_or_person in ('Aye', 'Yea'):
                vote.yes(td.getprevious().text.strip())
            elif option_or_person == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif option_or_person == 'Excused':
                vote.vote('excused', td.getprevious().text.strip())
            elif option_or_person == 'Absent':
                vote.vote('absent', td.getprevious().text.strip())

        yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland roll-call PDF and return a populated VoteEvent.

    :param vote_url: URL of the PDF ('Senate' in the URL selects chamber)
    :param bill: Bill the vote belongs to
    :raises ValueError: when the tally line or matching names can't be found
    """
    filename, response = self.urlretrieve(vote_url)

    text = convert_pdf(filename, type='text').decode()
    lines = text.splitlines()

    if 'Senate' in vote_url:
        chamber = 'upper'
    else:
        chamber = 'lower'

    date_string = lines[0].split('Calendar Date:')[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # Locate the tally line ("... Yeas ... Nays ...").
    page_index = None
    for index, line in enumerate(lines):
        if 'Yeas' in line and 'Nays' in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']

    if page_index:
        # Tally cells are separated by runs of 2+ spaces, each "N Label".
        counts = re.split(r'\s{2,}', lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(' ', 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ['Consent Calendar' in line for line in lines[:page_index]])
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r'\s{2,}', lines[page_index - 1].strip())
        assert consent_calendar_bills, "Could not find bills for consent calendar vote"

    motion_keywords = [
        'favorable', 'reading', 'amendment', 'motion', 'introduced',
        'bill pass', 'committee'
    ]
    motion_lines = [
        3, 2, 4, 5
    ]  # Relative LineNumbers to be checked for existence of motion
    # Probe the candidate offsets above the tally until a line containing
    # a known motion keyword is found.
    for i in motion_lines:
        if any(motion_keyword in motion.lower()
               for motion_keyword in motion_keywords):
            break
        motion = re.split(r'\s{2,}', lines[page_index - i].strip())[0]
    else:
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        'Voting Nay', 'Not Voting', 'COPY', 'Excused',
        'indicates vote change'
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        if not current_line or 'Voting Yea' in current_line:
            page_index += 1
            continue

        # A show-stopper line marks the start of the next vote category.
        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = (vote_index + 1)
            continue

        names = re.split(r'\s{2,}', current_line)

        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_action_page(self, bill, page):
    """Walk a Massachusetts bill's action table, recording actions and
    building (currently un-yielded) House/Senate VoteEvents.

    :param bill: Bill to attach actions to
    :param page: parsed lxml document of the actions page
    """
    action_rows = page.xpath('//tbody/tr')
    for row in action_rows:
        action_date = row.xpath('td[1]/text()')[0]
        action_date = datetime.strptime(action_date, '%m/%d/%Y')
        action_year = action_date.year
        action_date = action_date.strftime('%Y-%m-%d')

        # NOTE(review): when td[2] is empty, action_actor keeps the value
        # from a previous row (or is unbound on the first row) — confirm
        # the source always fills the chamber cell on the first row.
        if row.xpath('td[2]/text()'):
            action_actor = row.xpath('td[2]/text()')[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]

        action_name = row.xpath('string(td[3])')

        # House votes
        if "Supplement" in action_name:
            actor = "lower"

            vote_action = re.findall(r'(.+)-\s*\d+\s*YEAS', action_name)[0].strip()
            y = int(re.findall(r'(\d+)\s*YEAS', action_name)[0])
            n = int(re.findall(r'(\d+)\s*NAYS', action_name)[0])

            # get supplement number
            n_supplement = int(re.findall(r'No\.\s*(\d+)', action_name)[0])

            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            housevote_pdf = 'https://malegislature.gov/Journal/House/{}/{}/RollCalls'.format(
                bill.legislative_session, action_year)
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)

            # Supplement number disambiguates votes within the same journal.
            cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)

            # XXX: disabled house votes on 8/1 to try to get MA importing again
            # will leaving this in and commented out once we resolve the ID issue
            # yield cached_vote

        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"  # placeholder

            vote_action = action_name.split(' -')[0]
            # Tally appears in either "<y> yeas ... <n> nays" or
            # "yeas <y> ... nays <n>" order depending on the journal.
            try:
                y, n = re.search(r'(\d+) yeas .*? (\d+) nays',
                                 action_name.lower()).groups()
                y = int(y)
                n = int(n)
            except AttributeError:
                y = int(re.search(r"yeas\s+(\d+)", action_name.lower()).group(1))
                n = int(re.search(r"nays\s+(\d+)", action_name.lower()).group(1))

            # TODO: other count isn't included, set later
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
            self.scrape_senate_vote(cached_vote, rollcall_pdf)
            cached_vote.add_source(rollcall_pdf)
            cached_vote.pupa_id = rollcall_pdf

            # XXX: also disabled, see above note
            # yield cached_vote

        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs['classification'],
        )
        for com in attrs.get('committees', []):
            action.add_related_entity(com, entity_type='organization')
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape every California bill of one type for a session from the
    CA database mirror, yielding VoteEvents and then each Bill.

    :param chamber: 'upper' or 'lower'
    :param session: session year string
    :param bill_type: pupa classification for these bills (e.g. 'bill')
    :param type_abbr: CA measure-type abbreviation used in the DB query
    :param committee_abbr_regex: precompiled regex for committee
        abbreviations; default is bound once at function definition time
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        # Sanity check: Senate bills shouldn't be scraped for the lower
        # chamber and vice versa.
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        # NOTE(review): `summary` is only bound when bill.versions is
        # non-empty; the `if not title: continue` below happens to guard
        # the later `if summary:` use, since title also stays '' then.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(
                version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"

            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(version_name, version_url_pdf,
                                    media_type='application/pdf',
                                    date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)

        for title in all_titles:
            fsbill.add_title(title)

        # NOTE(review): `version` here is the loop variable left over from
        # the versions loop above — sponsors come from the last version
        # with bill_xml. Confirm that is intentional.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            type_ = []

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # NOTE(review): filter() is a one-shot iterator on
                # Python 3 — the len(list(...)) assertion below consumes
                # it, leaving kwargs['committees'] and the zip() empty.
                # Confirm whether a list() materialization is intended.
                committees = filter(None, committees)
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()

            # De-duplicate identical (actor, text, date) actions.
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(
                act_str,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(committee,
                                          entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = ('http://leginfo.legislature.ca.gov/faces'
                          '/billVotesClient.xhtml?bill_id={}').format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            # Positional index disambiguates multiple votes per bill.
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill

    self.session.expire_all()
def parse_roll_call(self, bill, link, chamber, date):
    """Fetch and parse a single PA roll-call page into a VoteEvent.

    :param bill: pupa Bill the vote belongs to
    :param link: lxml <a> element whose href points at the roll-call page
    :param chamber: 'upper' or 'lower'
    :param date: naive datetime localized via the module-level ``tz``
    :returns: populated VoteEvent (caller is responsible for yielding it)
    """
    url = link.attrib['href']
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'

    motion = page.xpath(xpath).strip()
    motion = re.sub(r'\s+', ' ', motion)

    if motion == 'FP':
        motion = 'FINAL PASSAGE'

    # FIX: renamed local 'type' -> 'vote_type'; 'type' shadowed the builtin.
    if motion == 'FINAL PASSAGE':
        vote_type = 'passage'
    elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
        vote_type = 'amendment'
    else:
        vote_type = 'other'
        # fall back to the link's text for unclassified motions
        motion = link.text_content()

    yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
    nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
    lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
    nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
    other = lve + nv

    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=vote_type,
        result='pass' if yeas > (nays + other) else 'fail',
        bill=bill,
    )
    # pupa_id situation here is a bit weird, same vote can be used for
    # multiple bills see:
    # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11  # noqa
    # so we toss the bill id onto the end of the URL
    vote.pupa_id = url + '#' + bill.identifier
    vote.add_source(url)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.set_count('other', other)

    for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = div.text_content().strip()
        name = re.sub(r'^[\s,]+', '', name)
        name = re.sub(r'[\s,]+$', '', name)
        class_attr = div.attrib['class'].lower()
        if 'yea' in class_attr:
            voteval = 'yes'
        elif 'nay' in class_attr:
            voteval = 'no'
        elif 'nvote' in class_attr:
            voteval = 'other'
        elif 'lve' in class_attr:
            voteval = 'other'
        else:
            msg = 'Unrecognized vote val: %s' % class_attr
            raise Exception(msg)
        vote.vote(voteval, name)

    return vote
def scrape_pdf_for_votes(self, session, actor, date, motion, href):
    """Parse an IL roll-call PDF at *href* into a VoteEvent.

    Returns a populated VoteEvent, or False when the PDF yields no lines.
    Relies on module-level helpers/constants defined elsewhere in this file:
    ``find_columns_and_parse``, ``VOTE_VALUES``, ``session_details`` and
    ``_categorize_action``.
    """
    warned = False
    # vote indicator, a few spaces, a name, newline or multiple spaces
    # VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
    COUNT_RE = re.compile(
        r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$'
    )
    # NOTE(review): 'PREVAILED' mapping to 'fail' looks suspicious -- confirm
    # against actual IL journal documents before changing.
    PASS_FAIL_WORDS = {
        'PASSED': 'pass',
        'PREVAILED': 'fail',
        'ADOPTED': 'pass',
        'CONCURRED': 'pass',
        'FAILED': 'fail',
        'LOST': 'fail',
    }

    pdflines = self.fetch_pdf_lines(href)

    if not pdflines:
        return False

    yes_count = no_count = present_count = 0
    yes_votes = []
    no_votes = []
    present_votes = []
    excused_votes = []
    not_voting = []
    absent_votes = []
    passed = None
    counts_found = False
    vote_lines = []
    for line in pdflines:
        # consider pass/fail as a document property instead of a result of the vote count
        # extract the vote count from the document instead of just using counts of names
        if not line.strip():
            continue
        elif line.strip() in PASS_FAIL_WORDS:
            # Crash on duplicate pass/fail status that differs from previous status
            if passed is not None and passed != PASS_FAIL_WORDS[line.strip()]:
                raise Exception("Duplicate pass/fail matches in [%s]" % href)
            passed = PASS_FAIL_WORDS[line.strip()]
        elif COUNT_RE.match(line):
            # note: 4th group (NOT VOTING) is captured but unused here
            (yes_count, no_count, present_count,
             not_voting_count) = COUNT_RE.match(line).groups()
            yes_count = int(yes_count)
            no_count = int(no_count)
            present_count = int(present_count)
            counts_found = True
        elif counts_found:
            # only lines after the counts header can contain vote records
            for value in VOTE_VALUES:
                if re.search(r'^\s*({})\s+\w'.format(value), line):
                    vote_lines.append(line)
                    break

    votes = find_columns_and_parse(vote_lines)
    for name, vcode in votes.items():
        # resolve presiding-officer aliases to real legislator names
        if name == 'Mr. Speaker':
            name = session_details[session]['speaker']
        elif name == 'Mr. President':
            name = session_details[session]['president']
        else:
            # Converts "Davis,William" to "Davis, William".
            name = re.sub(r'\,([a-zA-Z])', r', \1', name)
        if vcode == 'Y':
            yes_votes.append(name)
        elif vcode == 'N':
            no_votes.append(name)
        elif vcode == 'P':
            present_votes.append(name)
        elif vcode == 'E':
            excused_votes.append(name)
        elif vcode == 'NV':
            not_voting.append(name)
        elif vcode == 'A':
            absent_votes.append(name)

    # fake the counts
    if yes_count == 0 and no_count == 0 and present_count == 0:
        yes_count = len(yes_votes)
        no_count = len(no_votes)
    else:
        # audit: header counts should match the parsed name lists
        if yes_count != len(yes_votes):
            self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                         (yes_count, len(yes_votes)))
            warned = True
        if no_count != len(no_votes):
            self.warning("Mismatched no count [expect: %i] [have: %i]" %
                         (no_count, len(no_votes)))
            warned = True

    if passed is None:
        if actor['classification'] == 'lower':
            # senate doesn't have these lines
            self.warning("No pass/fail word found; fall back to comparing yes and no vote.")
            warned = True
        passed = 'pass' if yes_count > no_count else 'fail'

    classification, _ = _categorize_action(motion)
    vote_event = VoteEvent(legislative_session=session,
                           motion_text=motion,
                           classification=classification,
                           organization=actor,
                           start_date=date,
                           result=passed)
    for name in yes_votes:
        vote_event.yes(name)
    for name in no_votes:
        vote_event.no(name)
    for name in present_votes:
        vote_event.vote('other', name)
    for name in excused_votes:
        vote_event.vote('excused', name)
    for name in not_voting:
        vote_event.vote('not voting', name)
    for name in absent_votes:
        vote_event.vote('absent', name)
    vote_event.set_count('yes', yes_count)
    vote_event.set_count('no', no_count)
    vote_event.set_count('other', present_count)
    vote_event.set_count('excused', len(excused_votes))
    vote_event.set_count('absent', len(absent_votes))
    vote_event.set_count('not voting', len(not_voting))
    vote_event.add_source(href)
    # for distinguishing between votes with the same id and on same day
    vote_event.pupa_id = href

    if warned:
        self.warning("Warnings were issued. Best to check %s" % href)
    return vote_event
def scrape_votes(self, bill, url):
    """Scrape OK roll-call votes for *bill* from *url*, yielding VoteEvents.

    Each RCS# (roll-call sequence) is processed at most once per page;
    a vote with trailing garbage after its count line is skipped entirely.
    """
    page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

    seen_rcs = set()

    re_ns = "http://exslt.org/regular-expressions"
    # FIX: raw string so '\s' is a regex escape, not a (deprecated) string escape
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={'re': re_ns}):
        bad_vote = False
        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 13

        motion = header.xpath(
            "string(following-sibling::p[%d])" % motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            passed = None

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break
            # FIX: both fragments raw -- the second half previously used
            # '\s'/'\d' in a plain string (deprecated escape sequences)
            regex = (r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL '
                     r'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')
            match = re.match(regex, line)
            if match:
                if match.group(1) == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif match.group(1) == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue    # skip these
                elif seen_yes:
                    vtype = 'other'
                if seen_yes and match.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # NOTE(review): the split delimiter was mangled in transit;
                # assuming a two-space split between names -- confirm against
                # live journal pages.
                for name in line.split('  '):
                    if not name:
                        continue
                    if 'HOUSE' in name or 'SENATE ' in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(chamber=chamber,
                    start_date=date.strftime('%Y-%m-%d'),
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    bill=bill,
                    classification='passage')
        vote.set_count('yes', counts['yes'])
        vote.set_count('no', counts['no'])
        vote.set_count('other', counts['other'])
        # RCS appended so multiple votes from the same page stay distinct
        vote.pupa_id = url + '#' + rcs
        vote.add_source(url)

        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            if ':' in name:
                raise Exception(name)
            vote.no(name)
        for name in votes['other']:
            vote.vote('other', name)

        yield vote
def parse_vote(self, bill, link):
    """Parse a KS roll-call page at *link*, yielding a VoteEvent.

    Skips (with a warning) rate-limit pages, missing pages, fetch errors,
    and pages with no vote-count headings.
    """
    # Server sometimes sends proper error headers,
    # sometimes not
    try:
        self.info("Get {}".format(link))
        text = requests.get(link).text
    except requests.exceptions.HTTPError as err:
        self.warning("{} fetching vote {}, skipping".format(err, link))
        return

    if "Varnish cache server" in text:
        self.warning("Scrape rate is too high, try re-scraping with "
                     "The --rpm set to a lower number")
        return

    if "Page Not Found" in text or "Page Unavailable" in text:
        self.warning("missing vote, skipping")
        return
    member_doc = lxml.html.fromstring(text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    chamber_date_line = "".join(
        member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
    chamber_date_line_words = chamber_date_line.split()
    vote_chamber = chamber_date_line_words[0]
    vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                           "%m/%d/%Y")
    vote_status = " ".join(chamber_date_line_words[2:-2])
    opinions = member_doc.xpath(
        "//div[@id='main_content']/h3[position() > 1]/text()")
    if len(opinions) > 0:
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = "upper" if vote_chamber == "Senate" else "lower"

        # FIX: initialize every count -- previously a page missing one of the
        # yea/nay/present/absent headings raised UnboundLocalError below.
        yes_count = no_count = p_count = a_count = 0

        for i in opinions:
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # This is likely not a vote-count text chunk
                # It's probably '`On roll call the vote was:`
                pass
            else:
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count

        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime("%Y-%m-%d"),
            chamber=vote_chamber,
            motion_text=vote_status,
            result="pass" if yes_count > no_count else "fail",
            classification="passage",
        )
        vote.pupa_id = link

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("abstain", p_count)
        vote.set_count("absent", a_count)

        vote.add_source(link)

        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        # a_links[0] is skipped; positions 1..yes_count are yeas, the next
        # no_count entries are nays, the remainder count as 'other'
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote("yes", re.sub(",", "", a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote("no", re.sub(",", "", a_links[i]).split()[0])
            else:
                vote.vote("other", re.sub(",", "", a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_action_page(self, bill, page):
    """Walk the MA actions table for *bill*: yield VoteEvents for House
    "Supplement" and Senate "Roll Call" rows, and attach every row as a
    categorized bill action.
    """
    action_rows = page.xpath('//tbody/tr')
    for row in action_rows:
        action_date = row.xpath('td[1]/text()')[0]
        action_date = datetime.strptime(action_date, '%m/%d/%Y')
        action_year = action_date.year
        action_date = action_date.strftime('%Y-%m-%d')

        # actor column may be empty; previous row's actor carries over
        if row.xpath('td[2]/text()'):
            action_actor = row.xpath('td[2]/text()')[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]

        action_name = row.xpath('string(td[3])')

        # House votes
        if "Supplement" in action_name:
            actor = "lower"

            vote_action = action_name.split(' -')[0]
            y = int(action_name.strip().split('-')[1].split('YEAS')[0])
            n = int(action_name.strip().split('YEAS to')[1].split('NAYS')[0])

            # get supplement number
            n_supplement = int(action_name.strip().split('No. ')[1].split(r')')[0])
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            housevote_pdf = 'http://www.mass.gov/legis/journal/combined{}RCs.pdf'.format(
                action_year
            )
            # note: 2014-2015 different format and no data on website for years prior to 2014
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)
            cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)
            yield cached_vote

        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(' -')[0]
            try:
                # FIX: raw string -- '\d' in a plain string literal is a
                # deprecated escape sequence
                y, n = re.search(r'(\d+) yeas .*? (\d+) nays',
                                 action_name.lower()).groups()
                y = int(y)
                n = int(n)
            except AttributeError:
                y = int(re.search(r"yeas\s*(\d*)", action_name.lower()).group(1))
                n = int(re.search(r"nays\s*(\d*)", action_name.lower()).group(1))

            # TODO: other count isn't included, set later
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
            self.scrape_senate_vote(cached_vote, rollcall_pdf)
            cached_vote.add_source(rollcall_pdf)
            # NOTE(review): unlike the House branch no pupa_id is set here --
            # confirm whether senate votes can be deduped safely on import.
            yield cached_vote

        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs['classification'],
        )
        for com in attrs.get('committees', []):
            action.add_related_entity(com, entity_type='organization')
def parse_vote_pdf(self, vote_url, bill):
    """Parse an MD vote PDF at *vote_url* into a VoteEvent for *bill*.

    Extracts the count line ("Yeas ... Nays ..."), locates the motion text
    by probing lines above the count line, then reads voter names below it,
    cross-checking name counts against the header counts.

    :raises ValueError: if no count line is found, or counts don't match
        the number of names extracted.
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()
    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"
    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")
    page_index = None
    # find the line holding the yes/no/other counts
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break
    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]
    if page_index:
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)
    passed = vote_counts[0] > vote_counts[1]
    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"
    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion
    # probe lines above the count line until one looks like a motion;
    # the for-else runs only when no probe matched
    for i in motion_lines:
        if any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")
    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )
    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)
    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    # names start two lines below the counts line
    page_index = page_index + 2
    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0
    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]
    while page_index < len(lines):
        current_line = lines[page_index].strip()
        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue
        # a section heading moves us on to the next vote_type bucket
        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue
        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[vote_index] += len(names)
        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1
    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")
    return vote
def scrape_vote(self, bill, action_text, url):
    """Scrape one MD floor-vote page at *url*, yielding a VoteEvent.

    *action_text* carries the motion, optional "(Y-N)" totals, and the
    pass/fail word; the page itself supplies the date, the full counts,
    and individual legislator votes.

    :raises Exception: on an unrecognized motion form.
    """
    doc = lxml.html.fromstring(self.get(url).text)

    # process action_text - might look like "Vote - Senate Floor -
    # Third Reading Passed (46-0) - 01/16/12"
    if action_text.startswith('Vote - Senate Floor - '):
        action_text = action_text[22:]
        chamber = 'upper'
    elif action_text.startswith('Vote - House Floor - '):
        action_text = action_text[21:]
        chamber = 'lower'
    # NOTE(review): if neither prefix matches, 'chamber' is unbound below --
    # callers appear to guarantee one of the two forms. TODO confirm.

    motion, unused_date = action_text.rsplit(' - ', 1)
    try:
        # FIX: raw string -- '\(' / '\d' in a plain string are deprecated
        # escape sequences
        yes_count, no_count = re.findall(r'\((\d+)-(\d+)\)', motion)[0]
        yes_count = int(yes_count)
        no_count = int(no_count)
    except IndexError:
        self.info("Motion text didn't contain vote totals, will get them from elsewhere")
        yes_count = None
        no_count = None

    if 'Passed' in motion:
        motion = motion.split(' Passed')[0]
        passed = True
    elif 'Adopted' in motion:
        motion = motion.split(' Adopted')[0]
        passed = True
    elif 'Rejected' in motion:
        motion = motion.split(' Rejected')[0]
        passed = False
    elif 'Failed' in motion:
        motion = motion.split(' Failed')[0]
        passed = False
    elif 'Concur' in motion:
        passed = True
    elif 'Floor Amendment' in motion:
        if yes_count and no_count:
            passed = yes_count > no_count
        else:
            passed = None
    elif 'overridden' in motion.lower():
        passed = True
        motion = 'Veto Override'
    elif 'Sustained' in motion:
        passed = False
        motion = 'Veto Override'
    else:
        raise Exception('unknown motion: %s' % motion)
    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=None,
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)

    vfunc = None

    nobrs = doc.xpath('//nobr/text()')
    for text in nobrs:
        text = text.replace(u'\xa0', ' ')
        if text.startswith('Calendar Date: '):
            if vote.start_date:
                self.warning('two dates!, skipping rest of bill')
                break
            vote.start_date = datetime.datetime.strptime(
                text.split(': ', 1)[1], '%b %d, %Y %H:%M %p'
            ).strftime('%Y-%m-%d')
        elif 'Yeas' in text and 'Nays' in text and 'Not Voting' in text:
            # page totals override the (possibly missing) action_text totals
            # FIX: both regex fragments raw (deprecated plain-string escapes)
            yeas, nays, nv, exc, absent = re.match(
                (
                    r'(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused '
                    r'\(Absent\)\s+(\d+) Absent'
                ),
                text).groups()
            vote.set_count('yes', int(yeas))
            vote.set_count('no', int(nays))
            vote.set_count('other', int(nv) + int(exc) + int(absent))
        elif 'Voting Yea' in text:
            vfunc = 'yes'
        elif 'Voting Nay' in text:
            vfunc = 'no'
        elif 'Not Voting' in text or 'Excused' in text:
            vfunc = 'other'
        elif vfunc:
            if ' and ' in text:
                legs = text.split(' and ')
            else:
                legs = [text]
            for leg in legs:
                # Strip the occasional asterisk - see #1512
                leg = leg.rstrip('*')
                vote.vote(vfunc, leg)

    vote.add_source(url)
    vote.pupa_id = url  # contains vote sequence number

    yield vote