def test_fix_bill_id():
    """IMPORT_TRANSFORMERS['bill']['identifier'] must be applied when a vote
    event's bill reference is resolved, so 'HB1' matches the stored 'HB 1'.
    """
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    from pupa.settings import IMPORT_TRANSFORMERS
    # normalize identifiers like 'HB1' / 'HB 0001' to 'HB 1'
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                         start_date='1900-04-02', classification='passage:bill',
                         result='fail', bill_chamber='lower', bill='HB1',
                         identifier='4', bill_action='passage',
                         organization=org1._id)

    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve.as_dict(),
    ])

    # reset the transformer so it doesn't leak into other tests
    IMPORT_TRANSFORMERS['bill'] = {}

    ve = VoteEvent.objects.get()
    # BUG FIX: the original line compared without asserting, so the test
    # could never fail on a bad identifier.
    assert ve.bill.identifier == 'HB 1'
def scrape_senate_vote(self, bill, url, date):
    """Download and parse a senate vote PDF for *bill*, yielding Vote objects.

    Falls through to scrape_senate_vote_3col() when the PDF uses the
    three-column "Yea: N  Nay: N  Absent: N" layout.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # setting 'fail' for now; corrected from tallies below.
        result='fail',
        classification='passage',
        bill=bill
    )
    vote.add_source(url)

    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)

    # FIX: raw string — '\s'/'\d' in a plain string are invalid escapes.
    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # reversed so .pop() consumes category headers before their name lists
    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    # FIX: materialize the filter — in Python 3 a filter object is always
    # truthy and has no .pop(), so the original loop could never work.
    data = list(filter(None, data))
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {
        'yes': 0,
        'no': 0,
        'other': 0
    }
    while True:
        if not data:
            break
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # sanity check: per-category tally matches the option tally
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = 'pass' if vote_count['yes'] > (vote_count['no'] +
                                                 vote_count['other']) else 'fail'
    yield vote
def add_vote(self, bill, chamber, date, text, url):
    """Build a VoteEvent from an action line like 'Ayes, 33; Noes, 12'.

    Classifies the motion via motion_classifiers and, when a vote page URL
    is given, scrapes the per-member roll call from it.
    """
    votes = re.findall(r'Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)', text)
    yes, no = int(votes[0][0]), int(votes[0][1])

    vtype = 'other'
    # FIX: renamed loop variable — the original shadowed the builtin `type`.
    for regex, motion_type in motion_classifiers.items():
        if re.match(regex, text):
            vtype = motion_type
            break

    v = VoteEvent(
        chamber=chamber,
        start_date=TIMEZONE.localize(date),
        motion_text=text,
        result='pass' if yes > no else 'fail',
        classification=vtype,
        bill=bill,
    )
    # use the URL tail as a stable dedupe id
    v.pupa_id = url.split('/')[-1]
    v.set_count('yes', yes)
    v.set_count('no', no)

    # fetch the vote itself
    if url:
        v.add_source(url)
        if 'av' in url:
            self.add_house_votes(v, url)
        elif 'sv' in url:
            self.add_senate_votes(v, url)

    return v
def _get_votes(self, date, actor, action, bill, url):
    """Yield a VoteEvent when the action text carries AYES/YEAS totals."""
    vre = r'(?P<leader>.*)(AYES|YEAS):\s+(?P<yeas>\d+)\s+(NOES|NAYS):\s+(?P<nays>\d+).*'
    normalized = action.upper()
    if 'YEAS' not in normalized and 'AYES' not in normalized:
        return
    match = re.match(vre, action)
    if not match:
        return
    groups = match.groupdict()
    yes_total = int(groups['yeas'])
    no_total = int(groups['nays'])
    vote = VoteEvent(
        chamber=actor,
        motion_text=groups['leader'],
        result='pass' if yes_total > no_total else 'fail',
        classification='passage',
        start_date=TIMEZONE.localize(date),
        bill=bill,
    )
    vote.add_source(url)
    yield vote
def get_vote_event(self, bill, act, votes, result):
    '''Make VoteEvent object from given Bill, action, votes and result.'''
    body = json.loads(act['organization_id'].lstrip('~'))
    event = VoteEvent(
        legislative_session=bill.legislative_session,
        motion_text=act['description'],
        organization=body,
        classification=None,
        start_date=act['date'],
        result=result,
        bill=bill,
    )

    # bill carries exactly two sources: the Legistar web page and the API URL
    web_source, api_source = (src['url'] for src in bill.sources)
    event.add_source(web_source)
    event.add_source(api_source + '/histories')

    for ballot in votes:
        option_name = ballot['VoteValueName'].lower()
        if option_name == 'suspended':
            continue
        option = self.VOTE_OPTIONS.get(option_name, option_name)
        event.vote(option, ballot['VotePersonName'].strip())

    return event
def test_vote_event_identifier_dedupe():
    """Vote events in the same session dedupe on their `identifier`."""
    juris = Jurisdiction.objects.create(id='jid', division_id='did')
    juris.legislative_sessions.create(name='1900', identifier='1900')

    sve = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                          classification='anything', result='passed',
                          motion_text='a vote on something',
                          identifier='Roll Call No. 1')
    mock = DumbMockImporter()
    bill_imp = BillImporter('jid', mock, mock)

    # first import inserts a fresh row
    _, what = VoteEventImporter('jid', mock, mock, bill_imp).import_item(sve.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # identical payload: nothing to do
    _, what = VoteEventImporter('jid', mock, mock, bill_imp).import_item(sve.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # changed field: same row is updated in place
    sve.result = 'failed'
    _, what = VoteEventImporter('jid', mock, mock, bill_imp).import_item(sve.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # changed identifier: treated as a brand-new vote event
    sve.identifier = 'Roll Call 2'
    _, what = VoteEventImporter('jid', mock, mock, bill_imp).import_item(sve.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_vote_event_pupa_identifier_dedupe():
    """`pupa_id` takes precedence over `identifier` when deduping."""
    juris = create_jurisdiction()
    juris.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(id='org-id', name='Legislature',
                                classification='legislature',
                                jurisdiction=juris)

    sve = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                          classification='anything', result='passed',
                          motion_text='a vote on something',
                          identifier='Roll Call No. 1')
    sve.pupa_id = 'foo'

    mock = DumbMockImporter()
    org_imp = OrganizationImporter('jid')
    bill_imp = BillImporter('jid', mock, org_imp)

    # initial import inserts
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # unchanged payload is a no-op
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # changed result updates in place
    sve.result = 'failed'
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # even a changed identifier is still an update — pupa_id matches
    sve.identifier = 'First Roll Call'
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # but a new pupa_id means a new row
    sve.pupa_id = 'bar'
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_full_vote_event():
    """End-to-end import: counts, votes and voter resolution all round-trip."""
    juris = Jurisdiction.objects.create(id='jid', division_id='did')
    juris.legislative_sessions.create(name='1900', identifier='1900')
    sp1 = ScrapePerson('John Smith', primary_org='lower')
    sp2 = ScrapePerson('Adam Smith', primary_org='lower')
    org = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      from_organization=org._id)
    vote_event = ScrapeVoteEvent(legislative_session='1900',
                                 motion_text='passage',
                                 start_date='1900-04-01',
                                 classification='passage:bill',
                                 result='pass', bill_chamber='lower',
                                 bill='HB 1', organization=org._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    # import the supporting objects, then the vote event itself
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict()])
    pi = PersonImporter('jid')
    pi.import_data([sp1.as_dict(), sp2.as_dict()])
    mi = MembershipImporter('jid', pi, oi, DumbMockImporter())
    mi.import_data([sp1._related[0].as_dict(), sp2._related[0].as_dict()])
    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])
    VoteEventImporter('jid', pi, oi, bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    imported = VoteEvent.objects.get()
    assert imported.legislative_session == LegislativeSession.objects.get()
    assert imported.motion_classification == ['passage:bill']
    assert imported.bill == Bill.objects.get()

    count = imported.counts.get()
    assert count.option == 'yes'
    assert count.value == 20

    cast_votes = list(imported.votes.all())
    assert len(cast_votes) == 2
    # each ballot resolved to the right person and option
    for ballot in imported.votes.all():
        if ballot.voter_name == 'John Smith':
            assert ballot.option == 'yes'
            assert ballot.voter == Person.objects.get(name='John Smith')
        else:
            assert ballot.option == 'no'
            assert ballot.voter == Person.objects.get(name='Adam Smith')
def test_vote_event_bill_actions_two_stage():
    # Mirrors the ve3/ve4 scenario in test_vote_event_bill_actions: two vote
    # events referencing one action must not collide on the OneToOneField.
    # Here the imports happen in two separate passes, ensuring the conflict
    # is still detected when the votes come from different scrapes.
    juris = create_jurisdiction()
    juris.legislative_sessions.create(name='1900', identifier='1900')
    house = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      from_organization=house._id)
    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    common = dict(legislative_session='1900', motion_text='passage',
                  start_date='1900-04-02', classification='passage:bill',
                  result='pass', bill_chamber='lower', bill='HB 1',
                  bill_action='passage', organization=house._id)
    ve1 = ScrapeVoteEvent(**common)
    ve2 = ScrapeVoteEvent(**common)
    # disambiguate them
    ve1.pupa_id = 'one'
    ve2.pupa_id = 'two'

    org_imp = OrganizationImporter('jid')
    org_imp.import_data([house.as_dict()])
    bill_imp = BillImporter('jid', org_imp, DumbMockImporter())
    bill_imp.import_data([bill.as_dict()])

    # stage one: the first vote event claims the action
    VoteEventImporter('jid', DumbMockImporter(), org_imp, bill_imp).import_data([
        ve1.as_dict(),
    ])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 1
    assert stored[0].bill_action is not None

    # stage two: importing both must keep the action pinned to the first,
    # exactly as if they had arrived in a single import
    VoteEventImporter('jid', DumbMockImporter(), org_imp, bill_imp).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
    ])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 2
    assert stored[0].bill_action is not None
    assert stored[1].bill_action is None
def viva_voce_votes(root, session, chamber):
    """Yield zero-count VoteEvents for voice votes found on the journal page."""
    for node in root.xpath(u'//div[starts-with(., "All Members are deemed")]'):
        candidate = MaybeViva(node)
        if not candidate.is_valid:
            continue
        passed = candidate.passed
        outcome = 'passage' if passed else 'other'
        event = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=outcome,
            result='pass' if passed else 'fail',
            classification=outcome,
            legislative_session=session[0:2],
            bill=candidate.bill_id,
            bill_chamber=candidate.chamber,
        )
        # voice votes record no individual tallies
        for option in ('yes', 'no', 'absent', 'not voting'):
            event.set_count(option, 0)
        yield event
def test_vote_event_bill_id_dedupe():
    """Without an identifier, vote events dedupe on their associated bill."""
    juris = create_jurisdiction()
    session = juris.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(id='org-id', name='House',
                                        classification='lower',
                                        jurisdiction=juris)
    bill = Bill.objects.create(id='bill-1', identifier='HB 1',
                               legislative_session=session,
                               from_organization=house)
    bill2 = Bill.objects.create(id='bill-2', identifier='HB 2',
                                legislative_session=session,
                                from_organization=house)

    sve = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                          classification='anything', result='passed',
                          motion_text='a vote on something',
                          bill=bill.identifier, bill_chamber='lower',
                          chamber='lower')
    mock = DumbMockImporter()
    org_imp = OrganizationImporter('jid')
    bill_imp = BillImporter('jid', mock, org_imp)

    # fresh row on first import
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # re-importing the identical event changes nothing
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # a changed field updates the same row
    sve.result = 'failed'
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # pointing at a different bill makes it a distinct vote event
    sve = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                          classification='anything', result='passed',
                          motion_text='a vote on something',
                          bill=bill2.identifier, bill_chamber='lower',
                          chamber='lower')
    _, what = VoteEventImporter('jid', mock, org_imp, bill_imp).import_item(sve.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_vote_event_bill_clearing():
    # Guard against stale vote events accumulating on a bill when small edits
    # make a scraped vote look new on each import.
    juris = create_jurisdiction()
    session = juris.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(id='org-id', name='House',
                                        classification='lower',
                                        jurisdiction=juris)
    bill = Bill.objects.create(id='bill-1', identifier='HB 1',
                               legislative_session=session,
                               from_organization=house)
    Bill.objects.create(id='bill-2', identifier='HB 2',
                        legislative_session=session,
                        from_organization=house)

    org_imp = OrganizationImporter('jid')
    mock = DumbMockImporter()
    bill_imp = BillImporter('jid', mock, org_imp)

    first = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                            classification='anything', result='passed',
                            motion_text='a vote on somthing',  # typo intentional
                            bill=bill.identifier, bill_chamber='lower',
                            chamber='lower'
                            )
    second = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                             classification='anything', result='passed',
                             motion_text='a vote on something else',
                             bill=bill.identifier, bill_chamber='lower',
                             chamber='lower'
                             )

    # import_data (not import_item) so the postimport hook runs
    VoteEventImporter('jid', mock, org_imp, bill_imp).import_data([
        first.as_dict(),
        second.as_dict()
    ])
    assert VoteEvent.objects.count() == 2

    # fixing the typo must replace the old event, not add a third
    first.motion_text = 'a vote on something'
    VoteEventImporter('jid', mock, org_imp, bill_imp).import_data([
        first.as_dict(),
        second.as_dict()
    ])
    assert VoteEvent.objects.count() == 2
def createVoteEvent(self, motion, agenda_item_version):
    """Build a VoteEvent from a motion dict and its agenda-item version."""
    version = agenda_item_version
    when = self.toDate(version['date'])

    event = VoteEvent(
        motion_text=motion['title_text'],
        result=RESULT_MAP[motion['result']],
        classification=motion['action'],
        start_date=when,
        legislative_session=version['session'],
    )

    # carry optional motion metadata through as extras
    if motion['mover']:
        event.extras['mover'] = motion['mover']
    if motion['body_text']:
        event.extras['body'] = motion['body_text']

    event.set_bill(version['bill_identifier'])
    event.add_source(version['url'])
    return event
def process_committee_vote(self, committee_action, bill):
    """Build a committee VoteEvent from an action's vote-count records.

    Returns None (after logging) when the action carries no vote data.
    """
    try:
        date = committee_action["ActionDate"]
        vote_info = committee_action["Vote"]
    except KeyError:
        self.logger.warning("Committee vote has no data. Skipping.")
        return
    date = self.date_format(date)

    # FIX: initialize all tallies — the original left yes_count/no_count
    # unbound (UnboundLocalError) when the feed had no "Yes"/"No" rows.
    yes_count = 0
    no_count = 0
    other_count = 0
    for v in vote_info:
        vote_count = 0 if v["VoteCount"] == "" else int(v["VoteCount"])
        if v["VoteType"] == "Yes":
            yes_count = vote_count
        elif v["VoteType"] == "No":
            no_count = vote_count
        else:
            other_count += vote_count

    result = 'pass' if yes_count > no_count else 'fail'
    v = VoteEvent(chamber='legislature',
                  start_date=date,
                  motion_text='Committee Vote',
                  result=result,
                  classification='committee',
                  bill=bill
                  )
    v.set_count('yes', yes_count)
    v.set_count('no', no_count)
    v.set_count('other', other_count)
    return v
def scrape_vote(self, bill, vote_json, session):
    """Yield a VoteEvent built from one Wyoming LSO vote JSON record."""
    if vote_json['amendmentNumber']:
        motion = '{}: {}'.format(vote_json['amendmentNumber'],
                                 vote_json['action'])
    else:
        motion = vote_json['action']

    passed = vote_json['yesVotesCount'] > vote_json['noVotesCount']
    event = VoteEvent(
        chamber=self.chamber_abbrev_map[vote_json['chamber']],
        start_date=self.parse_local_date(vote_json['voteDate']),
        motion_text=motion,
        result='pass' if passed else 'fail',
        legislative_session=session,
        bill=bill,
        classification='other',
    )

    # totals come straight from the JSON counters
    for option, counter in (('yes', 'yesVotesCount'),
                            ('no', 'noVotesCount'),
                            ('absent', 'absentVotesCount'),
                            ('excused', 'excusedVotesCount'),
                            ('other', 'conflictVotesCount')):
        event.set_count(option, vote_json[counter])

    for name in vote_json['yesVotes'].split(','):
        if name.strip():
            event.yes(name.strip())
    for name in vote_json['noVotes'].split(','):
        if name.strip():
            event.no(name.strip())

    # add votes with other classifications
    # option can be 'yes', 'no', 'absent',
    # 'abstain', 'not voting', 'paired', 'excused'
    for option, roster in (('absent', 'absentVotes'),
                           ('excused', 'excusedVotes'),
                           ('other', 'conflictVotes')):
        for name in vote_json[roster].split(','):
            if name.strip():
                event.vote(option=option, voter=name)

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, vote_json['billNumber'])
    event.add_source(source_url)

    yield event
def parse_vote_pdf(self, vote_url, bill):
    """Parse a Maryland vote-tally PDF into a single VoteEvent.

    Locates the 'Yeas ... Nays ...' counts line, heuristically finds the
    motion text on nearby lines, then walks the member-name columns and
    cross-checks name totals against the printed counts.

    Raises ValueError when counts can't be found or don't match the names.
    """
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type='text').decode()
    lines = text.splitlines()
    # chamber is inferred from the URL path, not the PDF contents
    if 'Senate' in vote_url:
        chamber = 'upper'
    else:
        chamber = 'lower'
    date_string = lines[0].split('Calendar Date:')[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")
    page_index = None
    # find the line holding the five vote totals
    for index, line in enumerate(lines):
        if 'Yeas' in line and 'Nays' in line:
            page_index = index
            break
    vote_counts = 5 * [0]
    # order matches the columns on the counts line
    vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']
    if page_index:
        counts = re.split(r'\s{2,}', lines[page_index].strip())
        for index, count in enumerate(counts):
            # each cell looks like '<number> <label>'
            number, string = count.split(' ', 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)
    passed = vote_counts[0] > vote_counts[1]
    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ['Consent Calendar' in line for line in lines[:page_index]])
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r'\s{2,}', lines[page_index - 1].strip())
        assert consent_calendar_bills, "Could not find bills for consent calendar vote"
    motion_keywords = [
        'favorable', 'reading', 'amendment', 'motion', 'introduced',
        'bill pass', 'committee'
    ]
    motion_lines = [
        3, 2, 4, 5
    ]  # Relative LineNumbers to be checked for existence of motion
    # try each candidate line until one contains a motion keyword
    for i in motion_lines:
        if any(motion_keyword in motion.lower()
               for motion_keyword in motion_keywords):
            break
        motion = re.split(r'\s{2,}', lines[page_index - i].strip())[0]
    else:
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='pass' if passed else 'fail',
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])

    # names start two lines below the counts line
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        'Voting Nay', 'Not Voting', 'COPY', 'Excused',
        'indicates vote change', 'Indicates Vote Change'
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()
        if not current_line or 'Voting Yea' in current_line:
            page_index += 1
            continue
        # a section header means the next option's names begin
        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = (vote_index + 1)
            continue
        names = re.split(r'\s{2,}', current_line)
        vote_name_counts[vote_index] += len(names)
        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1
    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Scrape one MN House roll-call page and yield a VoteEvent."""
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
    resp = self.get(vote_url)
    html = resp.text

    # sometimes the link is broken, will redirect to NO_VOTE_URL
    if resp.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(html)

    try:
        motion = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")[0]
    except IndexError:
        self.logger.warning("Bill was missing a motion number, skipping")
        return

    tallies = doc.xpath(".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
    yeas = int(tallies[0])
    nays = int(tallies[3])

    # second paragraph has date
    date = None
    for paragraph in doc.xpath(".//div[@id='leg_PageContent']/div/p/text()"):
        try:
            date = datetime.datetime.strptime(paragraph.strip(), '%m/%d/%Y').date()
            break
        except ValueError:
            continue
    if date is None:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    vote = VoteEvent(chamber='lower',
                     start_date=date,
                     motion_text=motion,
                     result='pass' if yeas > nays else 'fail',
                     classification='passage',
                     legislative_session=session,
                     bill=bill_id,
                     bill_chamber=chamber)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.add_source(vote_url)

    # first table has YEAs; second table is nays
    for voter in doc.xpath('//table[1]/tr/td/font/text()'):
        vote.yes(voter.strip())
    for voter in doc.xpath('//table[2]/tr/td/font/text()'):
        vote.no(voter.strip())

    yield vote
def scrape_bill(self, chamber, session, bill_id):
    """Scrape one Michigan bill page: metadata, sponsors, actions,
    roll-call votes referenced by actions, versions and documents.

    Yields VoteEvent objects as they're found, then the Bill itself.
    """
    # try and get bill for the first year of the session biennium
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))
        html = self.get(url).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute('http://legislature.mi.gov')

    title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        # sometimes district gets added as a link
        if name.isnumeric():
            continue
        if len(sponsors) > 1:
            # tail text after the link marks the primary sponsor
            classification = (
                'primary'
                if sponsor.tail and 'primary' in sponsor.tail
                else 'cosponsor'
            )
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)

        # check if action mentions a sub
        submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action,
                             re.IGNORECASE)
        if submatch and tds[2].xpath('a'):
            version_url = tds[2].xpath('a/@href')[0]
            version_name = tds[2].xpath('a/text()')[0].strip()
            version_name = 'Substitute {}'.format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # NOTE(review): mimetype is unbound if the URL ends in neither
            # .pdf nor .htm — add_version_link would raise NameError. Confirm
            # all substitute links use those extensions.
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version_url.lower().endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                results = self.parse_roll_call(vote_url, rc_num)

                if results is not None:
                    vote_passed = len(results['yes']) > len(results['no'])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result='pass' if vote_passed else 'fail',
                        classification='passage',
                    )

                    # check the expected counts vs actual
                    count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['yes']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['yes'])))
                    count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results['no']):
                        self.warning('vote count mismatch for %s %s, %d != %d' %
                                     (bill_id, action, count, len(results['no'])))

                    vote.set_count('yes', len(results['yes']))
                    vote.set_count('no', len(results['no']))
                    vote.set_count('other', len(results['other']))
                    for name in results['yes']:
                        vote.yes(name)
                    for name in results['no']:
                        vote.no(name)
                    for name in results['other']:
                        vote.vote('other', name)

                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # NOTE(review): same possible unbound mimetype as above for URLs
            # that end in neither .pdf nor .htm.
            if url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif url.endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def scrape_vote(self, bill, name, url):
    """Scrape a Connecticut roll-call page and yield a Vote.

    House and Senate pages lay out member columns differently; the column
    start indices and the offsets to the name/yea/nay cells are chosen from
    the URL ("VOTE/H" marks a House vote).
    """
    if "VOTE/H" in url:
        vote_chamber = 'lower'
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = 'upper'
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    # SECURITY NOTE(review): verify=False disables TLS certificate checks;
    # acceptable only as a deliberate workaround for the state site's
    # broken certificate chain.
    page = requests.get(url, verify=False).text

    if 'BUDGET ADDRESS' in page:
        return

    page = lxml.html.fromstring(page)

    yes_count = page.xpath(
        "string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1))

    no_count = page.xpath(
        "string(//span[contains(., 'Those voting Nay')])")
    no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1))

    other_count = page.xpath("string(//span[contains(., 'Those absent')])")
    other_count = int(re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1))

    need_count = page.xpath("string(//span[contains(., 'Necessary for')])")
    need_count = int(re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1))

    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
    date = date.replace(' ', '')
    # year comes from the bill's legislative session, not the page
    date = datetime.datetime.strptime(
        date + " " + bill.legislative_session, "%m/%d %Y").date()

    # not sure about classification.
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                # pass requires reaching the page's stated threshold,
                # not a simple yes > no majority
                result='pass' if yes_count > need_count else 'fail',
                classification='passage',
                bill=bill)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            # `name` is intentionally reused here: the motion-name parameter
            # is no longer needed once the Vote is constructed
            name = row.xpath("string(td[%d])" % (i + name_offset)).strip()
            if not name or name == 'VACANT':
                continue

            if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                vote.yes(name)
            elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                vote.no(name)
            else:
                vote.vote('other', name)

    yield vote
def scrape_votes(self, bill_page, page_url, bill, insert, year):
    """Parse the Nevada 'Final Passage Votes' table on a bill page and
    yield one VoteEvent per passage vote, with per-member detail when the
    linked vote report can be fetched.
    """
    root = lxml.html.fromstring(bill_page)
    trs = root.xpath('/html/body/div/table[6]//tr')
    assert len(trs) >= 1, "Didn't find the Final Passage Votes' table"

    for tr in trs[1:]:
        links = tr.xpath('td/a[contains(text(), "Passage")]')
        if len(links) == 0:
            self.warning("Non-passage vote found for {}; ".format(bill.identifier) +
                         "probably a motion for the calendar. It will be skipped.")
        else:
            assert len(links) == 1, \
                "Too many votes found for XPath query, on bill {}".format(bill.identifier)
            link = links[0]
            motion = link.text
            if 'Assembly' in motion:
                chamber = 'lower'
            else:
                chamber = 'upper'

            # collect the date cell and the per-option count cells
            votes = {}
            tds = tr.xpath('td')
            for td in tds:
                if td.text:
                    text = td.text.strip()
                    date = re.match('... .*?, ....', text)
                    count = re.match('(?P<category>.*?) (?P<votes>[0-9]+)[,]?', text)
                    if date:
                        # NOTE(review): vote_date is unbound if no cell
                        # matches the date pattern — confirm every row
                        # carries a date cell.
                        vote_date = datetime.strptime(text, '%b %d, %Y')
                    elif count:
                        votes[count.group('category')] = int(count.group('votes'))

            yes = votes['Yea']
            no = votes['Nay']
            excused = votes['Excused']
            not_voting = votes['Not Voting']
            absent = votes['Absent']
            other = excused + not_voting + absent
            passed = yes > no

            vote = VoteEvent(chamber=chamber,
                             start_date=self._tz.localize(vote_date),
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification='passage',
                             bill=bill,
                             )
            vote.set_count('yes', yes)
            vote.set_count('no', no)
            vote.set_count('other', other)
            vote.set_count('not voting', not_voting)
            vote.set_count('absent', absent)

            # try to get vote details
            try:
                vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link.get('href'))

                vote.pupa_id = vote_url
                vote.add_source(vote_url)

                # the same report can be linked from multiple rows
                if vote_url in self._seen_votes:
                    self.warning('%s is included twice, skipping second',
                                 vote_url)
                    continue
                else:
                    self._seen_votes.add(vote_url)

                page = self.get(vote_url).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                for el in root.xpath('//table[2]/tr'):
                    tds = el.xpath('td')
                    name = tds[1].text_content().strip()
                    vote_result = tds[2].text_content().strip()

                    if vote_result == 'Yea':
                        vote.yes(name)
                    elif vote_result == 'Nay':
                        vote.no(name)
                    else:
                        vote.vote('other', name)
                vote.add_source(page_url)
            except scrapelib.HTTPError:
                self.warning("failed to fetch vote page, adding vote without details")

            yield vote
def scrape_votes(self, session, zip_url):
    """Build Vote objects from the NH roll-call summary and history files
    inside the session ZIP, keyed by chamber+roll-call number.

    NOTE(review): `self.zf.open(...)` yields bytes lines in Python 3, so the
    string operations below (`.split('|')`, comparisons against 'Yea')
    assume the archive members have been opened/decoded as text — confirm.
    """
    votes = {}
    last_line = []

    for line in self.zf.open('tblrollcallsummary.txt'):
        if line.strip() == "":
            continue

        line = line.split('|')
        if len(line) < 14:
            # pipe-delimited record wrapped across physical lines
            if len(last_line + line[1:]) == 14:
                # NOTE(review): this discards the current fragment — did the
                # author mean `line = last_line + line[1:]`? Confirm against
                # a wrapped record in the source data.
                line = last_line
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp, '%m/%d/%Y %I:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(chamber=actor,
                        start_date=time.strftime("%Y-%m-%d"),
                        motion_text=motion,
                        result='pass' if passed else 'fail',
                        classification='passage',
                        bill=self.bills_by_id[bill_id])
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.add_source(zip_url)
            votes[body+vote_num] = vote

    for line in self.zf.open('tblrollcallhistory.txt'):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date \
            = line.split('|')

        if not bill_id:
            continue

        if session_yr == session and bill_id.strip() in self.bills_by_id:
            try:
                leg = self.legislators[employee]['name']
            except KeyError:
                self.warning("Error, can't find person %s" % employee)
                continue

            vote = vote.strip()
            if body+v_num not in votes:
                self.warning("Skipping processing this vote:")
                self.warning("Bad ID: %s" % (body+v_num))
                continue
            # NOTE(review): other_count is reset for every history line, so
            # set_count('other', ...) always stores 0 or 1 rather than a
            # running total of 'other' ballots — suspected bug, confirm.
            other_count = 0
            # code = self.legislators[employee]['seat']
            if vote == 'Yea':
                votes[body+v_num].yes(leg)
            elif vote == 'Nay':
                votes[body+v_num].no(leg)
            else:
                votes[body+v_num].other(leg)
                other_count += 1
            votes[body+v_num].set_count('other', other_count)

    for vote in votes.values():
        yield vote
def process_vote(self, votes, url, base_url, bill, legislators, chamber_dict, vote_results):
    """Yield a VoteEvent for each item in the Ohio SOLAR API's vote payload.

    :param votes: dict with an "items" list of vote records (possibly stubs)
    :param url: page URL recorded as each vote's source
    :param base_url: API root used to resolve nested vote "link" stubs
    :param bill: Bill the votes attach to
    :param legislators: mapping of API voter ids -> legislator names
    :param chamber_dict: mapping of API chamber values -> 'upper'/'lower'
    :param vote_results: mapping of result strings (lowercased) -> bool passed
    """
    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url + v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue
        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            # Fall back to guessing the chamber from the "apn" field.
            chamber = "lower" if "house" in v["apn"] else "upper"
        try:
            date = self._tz.localize(
                datetime.datetime.strptime(v["date"], "%m/%d/%y"))
            date = "{:%Y-%m-%d}".format(date)
        except KeyError:
            # Some records carry the date under "occurred" instead.
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                self.logger.warning("No date found for vote, skipping")
                continue
        try:
            motion = v["action"]
        except KeyError:
            motion = v["motiontype"]
        # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
        if (not motion and
                isinstance(v['yeas'], str) and
                isinstance(v['nays'], str)):
            waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(waringText.format(v['revno']))
            continue
        result = v.get("results") or v.get("passed")
        if result is None:
            # No explicit result given; infer from the tallies.
            if len(v['yeas']) > len(v['nays']):
                result = "passed"
            else:
                result = "failed"
        passed = vote_results[result.lower()]
        if "committee" in v:
            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                # organization=v["committee"],
                bill=bill,
                classification='passed')
        else:
            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                classification='passed',
                bill=bill)
        # Concatenate the bill identifier and vote identifier to avoid collisions
        vote.pupa_id = '{}:{}'.format(bill.identifier.replace(' ', ''), v['revno'])
        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        yes_count = 0
        no_count = 0
        absent_count = 0
        excused_count = 0
        for voter_id in v["yeas"]:
            vote.yes(legislators[voter_id])
            yes_count += 1
        for voter_id in v["nays"]:
            vote.no(legislators[voter_id])
            no_count += 1
        if "absent" in v:
            for voter_id in v["absent"]:
                vote.vote('absent', legislators[voter_id])
                absent_count += 1
        if "excused" in v:
            for voter_id in v["excused"]:
                vote.vote('excused', legislators[voter_id])
                excused_count += 1
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('absent', absent_count)
        vote.set_count('excused', excused_count)
        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            if (type(val) == list and len(val) > 0 and
                    key not in ["yeas", "nays", "absent", "excused"]):
                if val[0] in legislators:
                    self.logger.warning(
                        "{k} looks like a vote type that's not being counted."
                        " Double check it?".format(k=key))
        vote.add_source(url)
        yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one South Dakota roll-call page and yield a VoteEvent.

    Skips pages with no bill action and votes whose motion cannot be
    detected; raises ScrapeError on an unrecognized chamber name.
    """
    doc = lxml.html.fromstring(self.get(url).text)
    header = doc.xpath("string(//h4[contains(@id, 'hdVote')])")
    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return

    location = header.split(', ')[1]
    if location.startswith('House'):
        chamber = 'lower'
    elif location.startswith('Senate'):
        chamber = 'upper'
    elif location.startswith('Joint'):
        chamber = 'legislature'
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    # committee = ' '.join(location.split(' ')[1:]).strip()
    # if not committee or committee.startswith('of Representatives'):
    #     committee = None
    motion = ', '.join(header.split(', ')[2:]).strip()
    # If we can't detect a motion, skip this vote
    if not motion:
        return

    def total_from(marker):
        # Vote totals live in <td> cells whose id contains the marker.
        return int(doc.xpath("string(//td[contains(@id, '%s')])" % marker))

    yes_count = total_from('tdAyes')
    no_count = total_from('tdNays')
    excused_count = total_from('tdExcused')
    absent_count = total_from('tdAbsent')
    passed = yes_count > no_count

    # Classify the motion text.
    if motion.startswith('Do Pass'):
        motion_type = 'passage'
    elif motion == 'Concurred in amendments':
        motion_type = 'amendment'
    elif motion == 'Veto override':
        motion_type = 'veto_override'
    else:
        motion_type = 'other'

    vote = VoteEvent(chamber=chamber,
                     start_date=date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification=motion_type,
                     bill=bill
                     )
    vote.pupa_id = url  # vote id is in URL
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    # Each voter row pairs a name cell with a vote-text cell.
    option_by_text = {'Aye': 'yes', 'Yea': 'yes', 'Nay': 'no',
                      'Excused': 'excused', 'Absent': 'absent'}
    for td in doc.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
        option = option_by_text.get(td.text)
        if option is not None:
            vote.vote(option, td.getprevious().text.strip())

    yield vote
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a Michigan bill page: metadata, sponsors, actions, votes
    referenced from actions, versions, and analysis documents.

    Yields the VoteEvents found in the action history, then the Bill.
    """
    # try and get bill for the first year of the session biennium
    url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
        session[:4],
        bill_id.replace(" ", "-"),
    )
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ("Page Not Found" in html or
            "The bill you are looking for is not available yet" in html):
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[-4:],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        if ("Page Not Found" in html or
                "The bill you are looking for is not available yet" in html):
            self.warning(
                "Cannot open bill page for {}; skipping".format(bill_id))
            return
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute("http://legislature.mi.gov")
    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()
    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(" ")[0][1:]]
    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)
    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u"\xa0", " ")
        # sometimes district gets added as a link
        if name.isnumeric():
            continue
        if len(sponsors) > 1:
            classification = ("primary"
                              if sponsor.tail and "primary" in sponsor.tail
                              else "cosponsor")
        else:
            classification = "primary"
        bill.add_sponsorship(
            name=name.strip(),
            chamber=chamber,
            entity_type="person",
            primary=classification == "primary",
            classification=classification,
        )
    bill.subject = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')
    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath("td")  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(
            datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = "upper" if "SJ" in journal else "lower"
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)
        # check if action mentions a sub
        submatch = re.search(r"WITH SUBSTITUTE\s+([\w\-\d]+)", action,
                             re.IGNORECASE)
        if submatch and tds[2].xpath("a"):
            version_url = tds[2].xpath("a/@href")[0]
            version_name = tds[2].xpath("a/text()")[0].strip()
            version_name = "Substitute {}".format(version_name)
            self.info("Found Substitute {}".format(version_url))
            # NOTE(review): if the link is neither .pdf nor .htm, `mimetype`
            # is unbound here and add_version_link raises NameError (or
            # reuses a stale value from a prior iteration) -- confirm.
            if version_url.lower().endswith(".pdf"):
                mimetype = "application/pdf"
            elif version_url.lower().endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)
        # check if action mentions a vote
        rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if journal_link:
                objectname = journal_link[0].rsplit("=", 1)[-1]
                chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                    session,
                    chamber_name,
                    objectname,
                )
                results = self.parse_roll_call(vote_url, rc_num, session)
                if results is not None:
                    vote_passed = len(results["yes"]) > len(results["no"])
                    vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        bill=bill,
                        motion_text=action,
                        result="pass" if vote_passed else "fail",
                        classification="passage",
                    )
                    # check the expected counts vs actual
                    count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["yes"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d" %
                            (bill_id, action, count, len(results["yes"])))
                    count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                    count = int(count.groups()[0]) if count else 0
                    if count != len(results["no"]):
                        self.warning(
                            "vote count mismatch for %s %s, %d != %d" %
                            (bill_id, action, count,
                             len(results["no"])))
                    vote.set_count("yes", len(results["yes"]))
                    vote.set_count("no", len(results["no"]))
                    vote.set_count("other", len(results["other"]))
                    possible_vote_results = ["yes", "no", "other"]
                    for pvr in possible_vote_results:
                        for name in results[pvr]:
                            if session == "2017-2018":
                                names = name.split("\t")
                                # NOTE(review): the loop iterates `n` but
                                # records `name` -- looks like a bug
                                # (`vote.vote(pvr, n.strip())` intended);
                                # left as-is, confirm before changing.
                                for n in names:
                                    vote.vote(pvr, name.strip())
                            else:
                                # Prevents voter names like "House Bill No. 4451, entitled" and other sentences
                                if len(name.split()) < 5:
                                    vote.vote(pvr, name.strip())
                    vote.add_source(vote_url)
                    yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            # NOTE(review): same unbound-`mimetype` risk as above for
            # extensions other than .pdf/.htm -- confirm.
            if url.endswith(".pdf"):
                mimetype = "application/pdf"
            elif url.endswith(".htm"):
                mimetype = "text/html"
            bill.add_version_link(name, url, media_type=mimetype)
    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    yield bill
def _build_lower_votes(self):
    """Parse the Assembly floor-vote tables for this bill and yield a
    VoteEvent per vote table found on the page.
    """
    url = self.shared_url + '&Floor%26nbspVotes=Y'
    self.urls.add(votes=url)
    self.bill.add_source(url)
    doc = self.urls.votes.doc
    if doc is None:
        return
    # Grab bill information.
    try:
        pre = doc.xpath('//pre')[0].text_content().strip()
        no_votes = 'There are no votes for this bill in this legislative '
        # NOTE(review): this is an exact-equality check against what looks
        # like a message *prefix* -- a startswith() may have been intended;
        # confirm against the live page text.
        if pre == no_votes:
            raise ValueError('No votes for this bill.')
    # Skip bill if votes can't be found.
    except (IndexError, ValueError):
        return
    for table in doc.xpath('//table'):
        date = table.xpath('caption/span[contains(., "DATE:")]')
        date = next(date[0].itersiblings()).text
        date = datetime.datetime.strptime(date, '%m/%d/%Y')
        date = date.replace(tzinfo=timezone('UTC'))
        spanText = table.xpath('caption/span/text()')
        motion = spanText[2].strip() + spanText[3].strip()
        # Totals are rendered as "...: yes/no" in the caption.
        votes = table.xpath('caption/span/span')[0].text.split(
            ':')[1].split('/')
        yes_count, no_count = map(int, votes)
        passed = yes_count > no_count
        vote = VoteEvent(chamber='lower',
                         start_date=date,
                         motion_text=motion,
                         bill=self.bill,
                         result='pass' if passed else 'fail',
                         classification='passage')
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        absent_count = 0
        excused_count = 0
        # Cells alternate name / vote-code; pair them up.
        tds = table.xpath('tr/td/text()')
        votes = [tds[i:i + 2] for i in range(0, len(tds), 2)]
        vote_dictionary = {
            'Y': 'yes',
            'NO': 'no',
            'ER': 'excused',
            'AB': 'absent',
            'NV': 'not voting'
        }
        for vote_pair in votes:
            name, vote_val = vote_pair
            vote.vote(vote_dictionary[vote_val], name)
            if vote_val == 'AB':
                absent_count += 1
            elif vote_val == 'ER':
                excused_count += 1
        vote.set_count('absent', absent_count)
        vote.set_count('excused', excused_count)
        vote.add_source(url)
        # Disambiguate multiple votes on the same page/motion.
        vote.pupa_id = url + motion + spanText[1]
        yield vote
def scrape_votes(self, bill):
    """Fetch roll calls for *bill* from the Washington legislature web
    service and yield a Vote per roll call.

    Absent and excused tallies are folded into a single 'other' count.
    """
    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (
               bill_num, self.biennium))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")
        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()
        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count
        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]
        vote = Vote(chamber=chamber, start_date=date,
                    motion_text='{} (#{})'.format(motion, seq_no),
                    result='pass'
                           if yes_count > (no_count + other_count)
                           else 'fail',
                    classification='other', bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            # NOTE(review): "VOte" appears to match the element name the WA
            # service actually returns (not a typo in this code) -- confirm
            # against a live response before "fixing" the capitalization.
            vtype = xpath(sv, "string(wa:VOte)")
            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)
        yield vote
def scrape_votes(self, session):
    """Yield VoteEvents for every measure-history and committee-agenda vote
    returned by the Oregon OLIS API for *session*.
    """
    self.session_key = SESSION_KEYS[session]
    self.legislators = index_legislators(self, self.session_key)
    measures_response = self.api_client.get('votes', page=500,
                                            session=self.session_key)
    for measure in measures_response:
        bid = '{} {}'.format(measure['MeasurePrefix'],
                             measure['MeasureNumber'])
        # Floor / chamber votes recorded in the measure's history.
        measure_history = measure['MeasureHistoryActions']
        for event in measure_history:
            if event['MeasureVotes']:
                tally = self.tally_votes(event, 'measure')
                passed = self.passed_vote(tally)
                classification = self.determine_vote_classifiers(
                    event['ActionText'])
                when = datetime.datetime.strptime(
                    event['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                vote = VoteEvent(
                    start_date=when,
                    bill_chamber=self.chamber_code[bid[0]],
                    motion_text=event['ActionText'],
                    classification=classification,
                    result='pass' if passed else 'fail',
                    legislative_session=session,
                    bill=bid,
                    chamber=self.chamber_code[event['Chamber']]
                )
                vote.set_count('yes', tally['yes'])
                vote.set_count('no', tally['no'])
                vote.set_count('absent', tally['absent'])
                vote_call = event['MeasureVotes']
                self.add_individual_votes(vote, vote_call, 'measure')
                vote.add_source(
                    'https://olis.leg.state.or.us/liz/{session}'
                    '/Measures/Overview/{bid}'.format(
                        session=self.session_key,
                        bid=bid.replace(' ', '')
                    ))
                yield vote
        # Committee votes taken during agenda items.
        committee_history = measure['CommitteeAgendaItems']
        for event in committee_history:
            if event['CommitteeVotes']:
                tally = self.tally_votes(event, 'committee')
                passed = self.passed_vote(tally)
                # there is at least one event w/o an Action listed
                action = event['Action'] or event['Comments']
                classification = self.determine_vote_classifiers(action)
                when = datetime.datetime.strptime(
                    event['MeetingDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                # NOTE(review): 'CommitteCode' (no final "e") is used twice
                # below -- it appears to be the upstream API's own key
                # spelling; confirm against an API response before renaming.
                vote = VoteEvent(
                    start_date=when,
                    bill_chamber=self.chamber_code[bid[0]],
                    motion_text=action,
                    classification=classification,
                    result='pass' if passed else 'fail',
                    legislative_session=session,
                    bill=bid,
                    chamber=self.chamber_code[event['CommitteCode'][0]]
                )
                vote.set_count('yes', tally['yes'])
                vote.set_count('no', tally['no'])
                vote.set_count('absent', tally['absent'])
                vote_call = event['CommitteeVotes']
                self.add_individual_votes(vote, vote_call, 'committee')
                meeting_date = when.strftime('%Y-%m-%d-%H-%M')
                vote.add_source(
                    'https://olis.leg.state.or.us/liz/{session}/Committees'
                    '/{committee}/{meeting_date}/{bid}/Details'.format(
                        session=self.session_key,
                        committee=event['CommitteCode'],
                        meeting_date=meeting_date,
                        bid=bid.replace(' ', '')
                    ))
                yield vote
def handle_page(self):
    """Parse one Florida floor-vote PDF (as pre-split text lines) and yield
    a VoteEvent.

    The motion, totals, and voter sections sit at fixed line offsets, but
    those offsets shift when the motion is missing or spills onto extra
    lines, so the three index variables are adjusted together.
    """
    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9
    if len(self.lines) < 2:
        self.scraper.warning("Bad PDF! " + self.url)
        return
    motion = self.lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if not motion and not self.lines[MOTION_INDEX - 1].startswith(
            "Calendar Page:"):
        motion = self.lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"
        # A motion can continue over up to two additional lines; append
        # them and shift the later sections down accordingly.
        for _extra_motion_line in range(2):
            MOTION_INDEX += 1
            if self.lines[MOTION_INDEX].strip():
                motion = "{}, {}".format(
                    motion, self.lines[MOTION_INDEX].strip())
                TOTALS_INDEX += 1
                VOTE_START_INDEX += 1
            else:
                break
    (yes_count, no_count, nv_count) = [
        int(x) for x in re.search(
            r"^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$",
            self.lines[TOTALS_INDEX],
        ).groups()
    ]
    result = "pass" if yes_count > no_count else "fail"
    vote = VoteEvent(
        start_date=self.kwargs["date"],
        chamber=self.kwargs["chamber"],
        bill=self.kwargs["bill"],
        motion_text=motion,
        result=result,
        classification="passage",
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("not voting", nv_count)
    for line in self.lines[VOTE_START_INDEX:]:
        if not line.strip():
            break
        # Strip titles so the name-dash-district pattern still matches.
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")
        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for vtype, member in re.findall(
                r"\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*", line):
            vtype = {
                "Y": "yes",
                "N": "no",
                "EX": "excused",
                "AV": "abstain"
            }[vtype]
            member = member.strip()
            vote.vote(vtype, member)
    # check totals line up
    yes_count = no_count = nv_count = 0
    for vc in vote.counts:
        if vc["option"] == "yes":
            yes_count = vc["value"]
        elif vc["option"] == "no":
            no_count = vc["value"]
        else:
            nv_count += vc["value"]
    for vr in vote.votes:
        if vr["option"] == "yes":
            yes_count -= 1
        elif vr["option"] == "no":
            no_count -= 1
        else:
            nv_count -= 1
    if yes_count != 0 or no_count != 0:
        raise ValueError("vote count incorrect: " + self.url)
    if nv_count != 0:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.scraper.info(
            "Votes don't add up; looking for additional ones")
        for line in self.lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            for member in re.findall(
                    r"\s{8,}([A-Z][a-z\'].*?)-\d{1,3}", line):
                member = member.strip()
                vote.vote("not voting", member)
    yield vote
def parse_committee_votes(self, bill, url):
    """Scrape a Pennsylvania committee-votes page and yield a VoteEvent
    for each roll-call summary link found.
    """
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
    committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):
        # Date
        # NOTE(review): if neither format matches, `date` remains the raw
        # string and tz.localize() below will fail -- confirm both formats
        # cover the site's output.
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            date = link.xpath('../../td')[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            break
        # Motion
        motion = link.text_content().split(' - ')[-1].strip()
        motion = 'Committee vote (%s): %s' % (committee, motion)
        # Roll call
        vote_url = link.attrib['href']
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)
        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification='other',
            result='pass' if rollcall['passed'] else 'fail',
            bill=bill,
        )
        vote.pupa_id = vote_url
        vote.set_count('yes', rollcall['yes_count'])
        vote.set_count('no', rollcall['no_count'])
        vote.set_count('other', rollcall['other_count'])
        for voteval in ('yes', 'no', 'other'):
            for name in rollcall.get(voteval + '_votes', []):
                vote.vote(voteval, name)
        vote.add_source(url)
        vote.add_source(vote_url)
        yield vote
def handle_page(self):
    """Parse a Florida Senate committee-vote sheet (pre-split text lines)
    and yield a VoteEvent.

    Yea/Nay are distinguished by the *column position* of the member's
    vote code relative to the "Yea"/"Nay" header row.
    """
    (_, motion) = self.lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.scraper.warning("Vote appears to be empty")
        return
    # Locate the header row containing the repeated "Yea  Nay" columns.
    vote_top_row = [
        self.lines.index(x) for x in self.lines
        if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", x)
    ][0]
    yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = self.lines[vote_top_row].index("Nay")
    votes = {"yes": [], "no": [], "other": []}
    for line in self.lines[(vote_top_row + 1):]:
        if line.strip():
            member = re.search(
                r"""(?x)
                ^\s+(?:[A-Z\-]+)?\s+ # Possible vote indicator
                ([A-Z][a-z]+ # Name must have lower-case characters
                [\w\-\s]+) # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)? # Leadership has an all-caps title
                (?:\s{2,}.*)? # Name ends when many spaces are seen
                """,
                line,
            ).group(1)
            # sometimes members have trailing X's from other motions in the
            # vote sheet we aren't collecting
            member = re.sub(r"(\s+X)+", "", member)
            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(re.search(r"^\s+(X|VA|VC)\s+[A-Z][a-z]", line))
            if did_vote:
                # Check where the "X" or vote code is on the page
                vote_column = len(line) - len(line.lstrip())
                if vote_column <= yea_columns_end:
                    votes["yes"].append(member)
                elif vote_column >= nay_columns_begin:
                    votes["no"].append(member)
                else:
                    raise ValueError(
                        "Unparseable vote found for {0} in {1}:\n{2}".
                        format(member, self.url, line))
            else:
                votes["other"].append(member)
        # End loop as soon as no more members are found
        else:
            break
    totals = re.search(r"(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS",
                       self.text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    result = "pass" if (yes_count > no_count) else "fail"
    vote = VoteEvent(
        start_date=self.kwargs["date"],
        bill=self.kwargs["bill"],
        chamber="upper",
        motion_text=motion,
        classification="committee",
        result=result,
    )
    vote.add_source(self.url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", len(votes["other"]))
    # set voters
    for vtype, voters in votes.items():
        for voter in voters:
            voter = voter.strip()
            # Removes the few voter names with a ton of extra spaces with VA at the end.
            # Ex: Cruz VA
            if " VA" in voter:
                voter = " ".join(voter.split()[:-2])
            if len(voter) > 0:
                vote.vote(vtype, voter)
    yield vote
def scrape_action_page(self, bill, page):
    """Walk the Massachusetts bill-history table: record each action on
    *bill* and yield VoteEvents for House "Supplement" and Senate
    "Roll Call" actions.
    """
    action_rows = page.xpath('//tbody/tr')
    for row in action_rows:
        action_date = row.xpath('td[1]/text()')[0]
        action_date = datetime.strptime(action_date, '%m/%d/%Y')
        action_year = action_date.year
        action_date = action_date.strftime('%Y-%m-%d')
        # NOTE(review): if td[2] is empty on the first row, `action_actor`
        # is unbound when add_action runs below (or carries over from the
        # previous row) -- confirm the site always fills the actor cell.
        if row.xpath('td[2]/text()'):
            action_actor = row.xpath('td[2]/text()')[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]
        action_name = row.xpath('string(td[3])')
        # House votes
        if "Supplement" in action_name:
            actor = "lower"
            vote_action = action_name.split(' -')[0]
            # Tallies are embedded in the action text, e.g. "...-123 YEAS to 45 NAYS...".
            y = int(action_name.strip().split('-')[1].split('YEAS')[0])
            n = int(action_name.strip().split('YEAS to')[1].split(
                'NAYS')[0])
            # get supplement number
            n_supplement = int(action_name.strip().split('No. ')[1].split(
                r')')[0])
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)
            housevote_pdf = \
                'http://www.mass.gov/legis/journal/combined{}RCs.pdf'.format(
                    action_year
                )
            # note: 2014-2015 different format and no data on website for years prior to 2014
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)
            cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)
            yield cached_vote
        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(' -')[0]
            try:
                y, n = re.search('(\d+) yeas .*? (\d+) nays',
                                 action_name.lower()).groups()
                y = int(y)
                n = int(n)
            except AttributeError:
                # Fall back to matching the tallies independently.
                y = int(re.search(r"yeas\s*(\d*)",
                                  action_name.lower()).group(1))
                n = int(re.search(r"nays\s*(\d*)",
                                  action_name.lower()).group(1))
            # TODO: other count isn't included, set later
            # NOTE(review): unlike the House branch, no pupa_id is set on
            # this vote -- confirm deduplication is handled elsewhere.
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)
            rollcall_pdf = 'http://malegislature.gov' + row.xpath(
                'string(td[3]/a/@href)')
            self.scrape_senate_vote(cached_vote, rollcall_pdf)
            cached_vote.add_source(rollcall_pdf)
            yield cached_vote
        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs['classification'],
        )
        for com in attrs.get('committees', []):
            action.add_related_entity(com, entity_type='organization')
def handle_page(self):
    """Parse a Florida House committee-vote page and yield a VoteEvent,
    if the page publishes vote totals.
    """
    # Checks to see if any vote totals are provided
    if (len(
            self.doc.xpath(
                '//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()'
            )) > 0):
        (date, ) = self.doc.xpath(
            '//span[contains(@id, "lblDate")]/text()')
        date = format_datetime(
            datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p"),
            "US/Eastern")
        # ctl00_MainContent_lblTotal //span[contains(@id, "ctl00_MainContent_lblTotal")]
        yes_count = int(
            self.doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
        no_count = int(
            self.doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
        other_count = int(
            self.doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0])
        result = "pass" if yes_count > no_count else "fail"
        (committee, ) = self.doc.xpath(
            '//span[contains(@id, "lblCommittee")]/text()')
        (action, ) = self.doc.xpath(
            '//span[contains(@id, "lblAction")]/text()')
        motion = "{} ({})".format(action, committee)
        vote = VoteEvent(
            start_date=date,
            bill=self.kwargs["bill"],
            chamber="lower",
            motion_text=motion,
            result=result,
            classification="committee",
        )
        vote.add_source(self.url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", other_count)
        for member_vote in self.doc.xpath(
                '//ul[contains(@class, "vote-list")]/li'):
            if not member_vote.text_content().strip():
                continue
            # span[2] holds the member name, span[1] the vote code.
            (member, ) = member_vote.xpath("span[2]//text()")
            (member_vote, ) = member_vote.xpath("span[1]//text()")
            member = member.strip()
            if member_vote == "Y":
                vote.yes(member)
            elif member_vote == "N":
                vote.no(member)
            elif member_vote == "-":
                vote.vote("not voting", member)
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed
            elif re.search(r"\([YN]\)", member_vote):
                continue
            else:
                raise ValueError(
                    "Unknown vote type found: {}".format(member_vote))
        yield vote
def _process_votes(self, rollcalls, bill_id, original_chamber, session,
                   proxy):
    """Download each Indiana roll-call PDF via the openstates proxy, parse
    its fixed-layout text, and yield a VoteEvent per roll call.
    """
    # Result keywords found in the PDF header, mapped to passed/failed.
    result_types = {
        "FAILED": False,
        "DEFEATED": False,
        "PREVAILED": True,
        "PASSED": True,
        "SUSTAINED": True,
        "NOT SECONDED": False,
        "OVERRIDDEN": True,
        "ADOPTED": True,
    }
    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]
        try:
            (path, resp) = self.urlretrieve(proxy_link)
        except scrapelib.HTTPError as e:
            self.warning(e)
            self.warning(
                "Unable to contact openstates proxy, skipping vote {}".
                format(r["link"]))
            continue
        text = convert_pdf(path, "text").decode("utf-8")
        lines = text.split("\n")
        os.remove(path)
        chamber = ("lower"
                   if "house of representatives" in lines[0].lower()
                   else "upper")
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str,
                                               "%b %d, %Y %I:%M:%S %p")
        vote_date = pytz.timezone("America/Indiana/Indianapolis").localize(
            vote_date)
        vote_date = vote_date.isoformat()
        passed = None
        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break
        if passed is None:
            raise AssertionError("Missing bill passage type")
        motion = " ".join(lines[4].split()[:-2])
        # Tallies sit at the end of four consecutive lines.
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue
        vote = VoteEvent(
            chamber=chamber,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=original_chamber,
            start_date=vote_date,
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
        )
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("excused", excused)
        vote.set_count("not voting", not_voting)
        vote.add_source(proxy_link)
        # Section headers ("YEA-", "NAY-", ...) switch which option the
        # subsequent name lines are recorded under.
        currently_counting = ""
        possible_vote_lines = lines[8:]
        for line in possible_vote_lines:
            line = line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            line = line.replace("\xc2\xa0", " -")
            if "yea-" in line.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in line.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in line.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in line.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                pass
            elif re.search(r"v\. \d\.\d", line):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = line.split(" ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())
        yield vote
def _parse_votes(self, url, vote, bill):
    '''Given a vote url and a vote object, extract the voters and the
    vote counts from the vote page and update the vote object.

    Returns a VoteEvent (PDF pages are delegated to PDFCommitteeVote),
    or None when the page has no usable vote data.
    '''
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return
        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return
    yes_count, no_count, excused_count, absent_count = map(int, vals)
    # Get the motion.
    # NOTE(review): bare except here swallows *any* error, not just a
    # missing tail -- consider narrowing to AttributeError.
    try:
        motion = doc.xpath('//br')[-1].tail.strip()
    except:
        # Some of them mysteriously have no motion listed.
        motion = vote['action']
    if not motion:
        motion = vote['action']
    vote['motion'] = motion
    action = vote['action']
    vote_url = vote['vote_url']
    # `vote` is rebound from the incoming dict to the VoteEvent here.
    vote = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote.add_source(vote_url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)
    # Voter rows are "<code>\xa0<name>" cells in the third table.
    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue
        v, name = filter(None, text.split(u'\xa0'))
        # Considering Name is brackets as short name
        regex = re.compile(".*?\((.*?)\)")
        short_name = re.findall(regex, name)
        if len(short_name) > 0:
            note = 'Short Name: ' + short_name[0]
        else:
            note = ''
        # Name without brackets like 'Kary, Douglas'
        name = re.sub("[\(\[].*?[\)\]]", "", name)
        if v == 'Y':
            vote.yes(name, note=note)
        elif v == 'N':
            vote.no(name, note=note)
        elif v == 'E':
            vote.vote('excused', name, note=note)
        elif v == 'A':
            vote.vote('absent', name, note=note)
    # code to deterimine value of `passed`
    passed = None
    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)
    vote.result = 'pass' if passed else 'fail'
    return vote
def scrape_vote(self, bill, vote_url, chamber, date):
    """Scrape a single Colorado vote-summary page and yield a VoteEvent.

    Yields nothing when the page is broken or the motion was withdrawn.
    """
    summary = self.lxmlize(vote_url)
    try:
        motion = summary.xpath("//font/text()")[2]
    except IndexError:
        self.warning("Vote Summary Page Broken ")
        return

    # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
    # A "motion" that is really a timestamp means none was recorded.
    if "/" in motion and ("AM" in motion or "PM" in motion):
        motion = "Motion not given."

    # Withdrawn motions carry no roll call; yield nothing.
    if "withdrawn" in motion:
        return

    aye_cells = summary.xpath(
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'Aye')]]/font/text()"
    )
    absent_cells = summary.xpath(
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'Absent')]]/font/text()"
    )
    abstain_cells = summary.xpath(
        "//tr/td[preceding-sibling::td/descendant::"
        "font[contains(text(),'17C')]]/font/text()"
    )

    tally = {
        "yes": int(aye_cells[0]),
        "no": int(aye_cells[2]),
        "excused": int(absent_cells[2]),
        "absent": int(absent_cells[0]),
        "abstain": int(abstain_cells[0]) if abstain_cells else 0,
    }

    # fix for http://leg.colorado.gov/content/hb19-1029vote65e72e
    if tally["absent"] == -1:
        tally["absent"] = 0

    carried = tally["yes"] > tally["no"]

    vote = VoteEvent(
        chamber=chamber,
        start_date=self._tz.localize(date),
        motion_text=motion,
        result="pass" if carried else "fail",
        bill=bill,
        classification="passage",
    )
    vote.pupa_id = vote_url
    for option, count in tally.items():
        vote.set_count(option, count)
    vote.add_source(vote_url)

    roll_rows = summary.xpath(
        "//tr[preceding-sibling::tr/descendant::"
        "td/div/b/font[contains(text(),'Vote')]]"
    )
    option_by_code = {
        "Y": "yes",
        "N": "no",
        "E": "excused",
        "A": "absent",
        "-": "absent",
        "17C": "abstain",
    }
    for row in roll_rows:
        code = row.xpath(".//td/div/font/text()")[0].strip()
        member = row.xpath(".//td/font/text()")[0].strip()
        if code == "V":
            continue
        vote.vote(option_by_code[code], member)

    yield vote
def asvote(self):
    """Build and return a VoteEvent from this parsed committee vote."""
    vote = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result='pass' if self.passed() else 'fail',
        classification='passage',
        bill=self.bill,
    )
    # Totals first, then the individual voters.
    for option, count in (('yes', self.yes_count()),
                          ('no', self.no_count()),
                          ('other', self.other_count())):
        vote.set_count(option, count)
    for name in self.yes_votes():
        vote.yes(name)
    for name in self.no_votes():
        vote.no(name)
    for name in self.other_votes():
        vote.vote('other', name)
    vote.add_source(self.url)
    return vote
def scrape_vote(self, bill, motion, url):
    """Scrape a Maine roll-call page and yield a VoteEvent.

    :param bill: the Bill the vote belongs to
    :param motion: motion text for the vote
    :param url: roll-call page URL (also used as the pupa_id)
    """
    page = self.get(url, retry_on_404=True).text
    page = lxml.html.fromstring(page)

    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))
    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))
    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))
    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))
    # Absent and excused are lumped together as 'other'.
    other_count = abs_count + ex_count

    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'
    else:
        # BUGFIX: previously fell through and raised a confusing
        # NameError below; fail loudly with the offending URL instead.
        raise ValueError('unable to determine chamber from URL: %s' % url)

    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    # Dates appear both as "January 1, 2019" and "Jan. 1, 2019".
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")

    vote = VoteEvent(
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        result='pass' if outcome == 'PREVAILS' else 'fail',
        classification='passage',
        bill=bill,
    )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)
    vote.pupa_id = url

    # Roll-call table: name in column 2, vote code in column 4.
    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]
        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.vote('yes', name)
        elif vtype == 'N':
            vote.vote('no', name)
        elif vtype in ('X', 'E'):  # absent or excused
            vote.vote('other', name)

    yield vote
def handle_page(self):
    """Parse a Virginia bill page: abstract, text versions, actions, and
    votes.  Yields VoteEvents (via add_pupa_id) and mutates self.obj
    (the Bill) in place.

    Virginia publishes up to two history lines per vote, in no
    guaranteed order, so vote lines are cached and matched against the
    following line by identical vote counts.
    """
    summary = self.doc.xpath('/'.join([
        '//h4[starts-with(text(), "SUMMARY")]',
        '/following-sibling::p',
        'text()',
    ]))
    if summary and summary[0].strip():
        self.obj.add_abstract(abstract=summary[0].strip(), note='summary')

    # versions
    for va in self.doc.xpath(
            '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(u' \xa0')
        # NOTE(review): result of this rsplit is discarded — presumably
        # meant to be reassigned to desc ("chop off last part"); confirm.
        desc.rsplit(' ', 1)[0]  # chop off last part
        link = va.get('href')
        if 'http' not in link:
            link = '{}{}'.format(BASE_URL, link)
        date = datetime.datetime.strptime(date, '%m/%d/%y').date()
        # budget bills in VA are searchable but no full text available
        if '+men+' in link:
            self.warning(
                'not adding budget version, bill text not available')
        else:
            # VA duplicates reprinted bills, lets keep the original name
            self.obj.add_version_link(desc, link, date=date,
                                      media_type='text/html',
                                      on_duplicate='ignore')

    # actions
    cached_vote = None
    cached_action = None
    for ali in self.doc.xpath(
            '//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
        vote = None
        date, action = ali.text_content().split(u' \xa0')
        actor, action = action.split(': ', 1)

        # Bill history entries purely in parentheses tend to be
        # notes and not actions, so we'll skip them.
        if action.startswith('(') and action.endswith(')'):
            continue

        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(),
                                          '%m/%d/%y').date()

        # if action ends in (##-Y ##-N) remove that part
        vrematch = self.vote_strip_re.match(action)
        # The following conditional logic is messy to handle
        # Virginia's crazy and inconsistently formatted bill
        # histories. Someone less harried and tired than me
        # could probably make this much cleaner. - alo
        if vrematch:
            vote_action, y, n, o = vrematch.groups()
            y = int(y)
            n = int(n)
            # Set default count for "other" votes to 0. We have to
            # do this explicitly as it's excluded from the action
            # text when there were no abstentions (the only type of
            # "other" vote encountered thus far).
            if o is None:
                o = 0
            else:
                o = int(o)

            vote_url = ali.xpath('a/@href')

            # Caches relevant information from the current action if
            # vote count encountered, then searches for the presence
            # of identical counts in the next entry (we assume that
            # it's probably there). If matching votes are found, it
            # pulls the cached data to create a unified vote record.
            #
            # This is because Virginia usually publishes two lines
            # of history data for a single vote, without guaranteed
            # order, so we cache and unsafely attempt to match on
            # identical vote counts in the next line.
            if cached_vote is None:
                cached_action = action
                cached_vote = VoteEvent(
                    start_date=date,
                    chamber=actor,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=self.obj,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)
                cached_vote.set_count('other', o)
                if vote_url:
                    cached_vote.add_source(vote_url[0])
                else:
                    cached_vote.add_source(self.url)
                continue
            elif cached_vote is not None:
                if vote_action.startswith(u'VOTE:'):
                    # Current line is the "VOTE:" detail line; match it
                    # against the cached action line by counts.
                    counts = {
                        count['option']: count['value']
                        for count in cached_vote.counts
                    }
                    if (vote_url and
                            counts['yes'] == y and
                            counts['no'] == n and
                            counts['other'] == o):
                        vote = cached_vote
                        list(
                            self.scrape_page_items(VotePage,
                                                   url=vote_url[0],
                                                   obj=vote))
                        vote.add_source(vote_url[0])
                        action = cached_action
                elif cached_vote.motion_text.startswith('VOTE:'):
                    # Cached line was the "VOTE:" detail; current line
                    # supplies the real motion text.
                    counts = {
                        count['option']: count['value']
                        for count in cached_vote.counts
                    }
                    if (counts['yes'] == y and
                            counts['no'] == n and
                            counts['other'] == o):
                        vote = cached_vote
                        vote['motion'] = vote_action
                else:
                    # Cached vote doesn't match up to the current
                    # one. Save, then cache the current vote to
                    # begin the next search.
                    yield from add_pupa_id(cached_vote)
                    cached_vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        motion_text=vote_action,
                        result='pass' if y > n else 'fail',
                        classification='passage',
                        bill=self.obj,
                    )
                    cached_vote.set_count('yes', y)
                    cached_vote.set_count('no', n)
                    cached_vote.set_count('other', o)
                    if vote_url:
                        cached_vote.add_source(vote_url[0])
                    else:
                        cached_vote.add_source(self.url)
                    cached_action = action
                    continue

        if vote is not None:
            yield from add_pupa_id(vote)
        else:
            # If this action isn't a vote, but the last one was,
            # there's obviously no additional vote data to match.
            # Go ahead and save the cached data.
            if cached_vote is not None:
                yield from add_pupa_id(cached_vote)
            cached_vote = cached_action = None

        # categorize actions
        for pattern, atype in ACTION_CLASSIFIERS:
            if re.match(pattern, action):
                break
        else:
            atype = None

        # if matched a 'None' atype, don't add the action
        if atype != SKIP:
            self.obj.add_action(action, date, chamber=actor,
                                classification=atype)
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supercedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters

    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''
    if matter_ids:
        matters = [
            self.matter(matter_id) for matter_id in matter_ids.split(',')
        ]
        matters = filter(
            None, matters)  # Skip matters that are not yet in Legistar
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    # NOTE(review): recomputed unconditionally here but not read again
    # below — looks like a leftover; confirm before removing.
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
        float(window))

    for matter in matters:
        # If this Boolean field is True, then do not scrape the Bill.
        # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
        # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
        if matter['MatterRestrictViewViaWeb']:
            continue

        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        # Matters missing any core field are not importable; skip.
        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # An 'S'-prefixed file number is a substitute; keep the original
        # as an alternate identifier.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        # Actions, and any vote attached to each action.
        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)
            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(
                        raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                related_bill = self.endpoint(
                    '/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(
                    identifier=identifier,
                    legislative_session=related_bill_session,
                    relation_type='companion')
                # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
            .format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # Strip NUL characters, which break downstream storage.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def scrape_votes(self, bill):
    """Fetch roll calls for a Washington bill from the legislature's
    SOAP-ish web service and yield one Vote per roll call.
    """
    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (
               bill_num, self.biennium))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)

    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")

        # VoteDate is an ISO timestamp; keep only the date part.
        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(
            xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(
            xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))

        # Absent and excused are lumped together as 'other'.
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

        # Sequence number disambiguates multiple votes on one motion.
        vote = Vote(chamber=chamber, start_date=date,
                    motion_text='{} (#{})'.format(motion, seq_no),
                    result='pass' if yes_count > (no_count + other_count) else 'fail',
                    classification='other', bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            # NOTE(review): "wa:VOte" looks like a typo but presumably
            # mirrors the element name in the upstream XML — confirm
            # against the service response before "fixing".
            vtype = xpath(sv, "string(wa:VOte)")

            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)

        yield vote
def scrape_votes(self, bill, page):
    """Yield VoteEvents for every floor action on an Arizona bill.

    `page` is the decoded BillStatus JSON for the bill.
    """
    base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
    for floor_header in page["FloorHeaders"]:
        query = {
            "billStatusId": page["BillId"],
            "billStatusActionId": floor_header["BillStatusActionId"],
            "includeVotes": "true",
        }
        response = self.get(base_url, params=query)
        for floor_action in json.loads(response.content.decode("utf-8")):
            if floor_action["Action"] == "No Action":
                continue

            when = datetime.datetime.strptime(
                floor_action["ReportDate"], "%Y-%m-%dT%H:%M:%S")
            # Unanimous adoptions are recorded as passed regardless of
            # the tallies.
            carried = (floor_action["UnanimouslyAdopted"]
                       or floor_action["Ayes"] > floor_action["Nays"])

            vote = VoteEvent(
                chamber={
                    "S": "upper",
                    "H": "lower"
                }[floor_header["LegislativeBody"]],
                motion_text=floor_action["Action"],
                classification="passage",
                result="pass" if carried else "fail",
                start_date=when.strftime("%Y-%m-%d"),
                bill=bill,
            )
            vote.add_source(response.url)

            # Null counts from the API become 0.
            for option, field in (("yes", "Ayes"),
                                  ("no", "Nays"),
                                  ("other", "Present"),
                                  ("absent", "Absent"),
                                  ("excused", "Excused"),
                                  ("not voting", "NotVoting")):
                vote.set_count(option, floor_action[field] or 0)

            for member_vote in floor_action["Votes"]:
                option = {"Y": "yes", "N": "no"}.get(
                    member_vote["Vote"], "other")
                vote.vote(option, member_vote["Legislator"]["FullName"])

            vote.pupa_id = response.url + str(
                floor_action["ReferralNumber"])
            yield vote
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url,
                             bill_chamber):
    """Walk a Hawaii bill's action table: add each action to `bill` and
    yield a VoteEvent for any action line that contains vote tallies.
    """
    for action in action_table.xpath('*')[1:]:
        date = action[0].text_content()
        date = dt.datetime.strptime(date, "%m/%d/%Y").strftime('%Y-%m-%d')
        actor = action[1].text_content().upper()
        string = action[2].text_content()
        actor = {
            "S": "upper",
            "H": "lower",
            "D": "legislature",  # "Data Systems",
            "$": "Appropriation measure",
            "CONAM": "Constitutional Amendment"
        }[actor]
        act_type, committees = categorize_action(string)
        # XXX: Translate short-code to full committee name for the
        # matcher.
        real_committees = []
        if committees:
            for committee in committees:
                try:
                    committee = self.short_ids[committee]['name']
                    real_committees.append(committee)
                except KeyError:
                    # Unknown short code: silently skip the committee.
                    pass
        act = bill.add_action(string, date, chamber=actor,
                              classification=act_type)
        for committee in real_committees:
            act.add_related_entity(name=committee,
                                   entity_type="organization")

        # parse_vote returns (tallies-dict, motion) when the action
        # text contains a roll call, else a falsy value.
        vote = self.parse_vote(string)
        if vote:
            v, motion = vote
            vote = VoteEvent(start_date=date,
                             chamber=actor,
                             bill=bill_id,
                             bill_chamber=bill_chamber,
                             legislative_session=session,
                             motion_text=motion,
                             result='pass' if 'passed' in string.lower() else 'fail',
                             classification='passage')
            vote.add_source(url)
            vote.set_count('yes', int(v['n_yes'] or 0))
            vote.set_count('no', int(v['n_no'] or 0))
            vote.set_count('not voting', int(v['n_excused'] or 0))
            # "with reservations" votes still count as yes votes.
            for voter in split_specific_votes(v['yes']):
                vote.yes(voter)
            for voter in split_specific_votes(v['yes_resv']):
                vote.yes(voter)
            for voter in split_specific_votes(v['no']):
                vote.no(voter)
            for voter in split_specific_votes(v['excused']):
                vote.vote('not voting', voter)

            yield vote
def scrape_action_page(self, bill, page):
    """Parse a Massachusetts bill's action table: record actions on
    `bill` and build (currently disabled) VoteEvents for House
    "Supplement" entries and Senate "Roll Call" entries.
    """
    action_rows = page.xpath('//tbody/tr')
    for row in action_rows:
        action_date = row.xpath('td[1]/text()')[0]
        action_date = datetime.strptime(action_date, '%m/%d/%Y')
        action_year = action_date.year
        action_date = action_date.strftime('%Y-%m-%d')

        # NOTE(review): when td[2] is empty, action_actor keeps the
        # value from the previous row (intentional carry-over?) — and
        # is unbound if the very first row lacks it; confirm.
        if row.xpath('td[2]/text()'):
            action_actor = row.xpath('td[2]/text()')[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]

        action_name = row.xpath('string(td[3])')

        # House votes
        if "Supplement" in action_name:
            actor = "lower"
            vote_action = re.findall(r'(.+)-\s*\d+\s*YEAS',
                                     action_name)[0].strip()
            y = int(re.findall(r'(\d+)\s*YEAS', action_name)[0])
            n = int(re.findall(r'(\d+)\s*NAYS', action_name)[0])

            # get supplement number
            n_supplement = int(re.findall(r'No\.\s*(\d+)',
                                          action_name)[0])
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            housevote_pdf = 'https://malegislature.gov/Journal/House/{}/{}/RollCalls'.format(
                bill.legislative_session, action_year)
            # Fills in the individual voters from the journal PDF.
            self.scrape_house_vote(cached_vote, housevote_pdf,
                                   n_supplement)
            cached_vote.add_source(housevote_pdf)

            cached_vote.pupa_id = '{}#{}'.format(housevote_pdf,
                                                 n_supplement)

            # XXX: disabled house votes on 8/1 to try to get MA importing again
            # will leaving this in and commented out once we resolve the ID issue
            # yield cached_vote

        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(' -')[0]
            # Tally format varies: "## yeas ... ## nays" vs
            # "yeas ## ... nays ##".
            try:
                y, n = re.search(r'(\d+) yeas .*? (\d+) nays',
                                 action_name.lower()).groups()
                y = int(y)
                n = int(n)
            except AttributeError:
                y = int(re.search(r"yeas\s+(\d+)",
                                  action_name.lower()).group(1))
                n = int(re.search(r"nays\s+(\d+)",
                                  action_name.lower()).group(1))

            # TODO: other count isn't included, set later
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)
            rollcall_pdf = 'http://malegislature.gov' + row.xpath(
                'string(td[3]/a/@href)')
            # Fills in the individual voters from the roll-call PDF.
            self.scrape_senate_vote(cached_vote, rollcall_pdf)
            cached_vote.add_source(rollcall_pdf)
            cached_vote.pupa_id = rollcall_pdf
            # XXX: also disabled, see above note
            # yield cached_vote

        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs['classification'],
        )
        for com in attrs.get('committees', []):
            action.add_related_entity(com, entity_type='organization')
def parse_vote(self, bill, link):
    """Scrape a vote-detail page and yield a VoteEvent for `bill`.

    Yields nothing (with a warning) when the page has no vote headings.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if len(opinions) > 0:
        temp = opinions[0].split()
        vote_chamber = temp[0]
        vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
        vote_status = " ".join(temp[2:-2])
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

        # BUGFIX: default every tally to 0 so a missing heading can no
        # longer leave a count variable unbound (NameError below).
        yes_count = no_count = p_count = a_count = 0
        for i in opinions:
            # Headings look like "Yea - (31):"; pull the number in ().
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # BUGFIX: previously a bare `except: pass` fell through
                # and attributed the *previous* heading's count to this
                # category; skip unparseable headings instead.
                continue
            if "yea" in i.lower():
                yes_count = count
            elif "nay" in i.lower():
                no_count = count
            elif "present" in i.lower():
                p_count = count
            elif "absent" in i.lower():
                a_count = count

        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime('%Y-%m-%d'),
            chamber=vote_chamber,
            motion_text=vote_status,
            result='pass' if yes_count > no_count else 'fail',
            classification='passage',
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('abstain', p_count)
        vote.set_count('absent', a_count)
        vote.add_source(link)

        # Voter names are listed in order: yeas, then nays, then the
        # rest; index 0 is a non-voter link, so start at 1.
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def parse_roll_call(self, bill, link, chamber, date):
    """Scrape a Pennsylvania roll-call page and return a VoteEvent."""
    url = link.attrib['href']
    doc = lxml.html.fromstring(self.get(url).text)

    raw_motion = doc.xpath(
        'string(//div[@class="Column-OneFourth"]/div[3])').strip()
    motion = re.sub(r'\s+', ' ', raw_motion)
    if motion == 'FP':
        motion = 'FINAL PASSAGE'

    # Classify; unknown motions fall back to the link text.
    if motion == 'FINAL PASSAGE':
        vote_class = 'passage'
    elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
        vote_class = 'amendment'
    else:
        vote_class = 'other'
        motion = link.text_content()

    def total(label):
        # Count value sits in the element following its label div.
        node = doc.xpath("//div[text() = '%s']" % label)[0]
        return int(node.getnext().text)

    yeas = total('YEAS')
    nays = total('NAYS')
    # "Leave" and "not voting" are lumped together as 'other'.
    other = total('LVE') + total('N/V')

    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=vote_class,
        result='pass' if yeas > (nays + other) else 'fail',
        bill=bill,
    )
    # pupa_id situation here is a bit weird, same vote can be used for
    # multiple bills see:
    # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11  # noqa
    # so we toss the bill id onto the end of the URL
    vote.pupa_id = url + '#' + bill.identifier
    vote.add_source(url)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.set_count('other', other)

    for cell in doc.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = cell.text_content().strip()
        name = re.sub(r'^[\s,]+', '', name)
        name = re.sub(r'[\s,]+$', '', name)
        class_attr = cell.attrib['class'].lower()
        for marker, option in (('yea', 'yes'), ('nay', 'no'),
                               ('nvote', 'other'), ('lve', 'other')):
            if marker in class_attr:
                vote.vote(option, name)
                break
        else:
            raise Exception('Unrecognized vote val: %s' % class_attr)

    return vote
def scrape_votes(self, bill, bill_page, chamber):
    """Yield a VoteEvent for each roll-call link in a Nebraska bill's
    history table."""
    history_links = bill_page.xpath(
        '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
    )
    for history_link in history_links:
        vote_url = history_link.attrib['href']
        date_td, motion_td, *_ = history_link.xpath('ancestor::tr/td')
        when = datetime.strptime(date_td.text, '%b %d, %Y')
        motion_text = motion_td.text_content()

        roll_page = self.lxmlize(vote_url)
        # Outcome is inferred from the motion wording.
        carried = 'Passed' in motion_text or 'Advanced' in motion_text

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=TIMEZONE.localize(when),
            motion_text=motion_text,
            classification='passage',
            result='pass' if carried else 'fail',
        )
        for option, label in (('yes', 'Yes:'),
                              ('no', 'No:'),
                              ('excused', 'Excused - Not Voting:'),
                              ('absent', 'Absent - Not Voting:'),
                              ('abstain', 'Present - Not Voting:')):
            vote.set_count(option, self.process_count(roll_page, label))

        # The KeyID query parameter uniquely identifies the roll call.
        query_params = urllib.parse.parse_qs(
            urllib.parse.urlparse(vote_url).query)
        vote.pupa_id = query_params['KeyID'][0]
        vote.add_source(vote_url)

        # Voter cells alternate: name, vote value, name, vote value, ...
        cells = roll_page.xpath(
            '//div[contains(@class,"table-responsive")]/table//td')
        for chunk in range(0, len(cells), 2):
            name = cells[chunk].text
            vote_type = cells[chunk + 1].text
            if name and vote_type:
                vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), 'other'),
                          name)

        yield vote
def scrape_vote(self, bill, vote_id, session):
    """Fetch one Delaware roll call by ID from the JSON API and yield a
    Vote; yields nothing when the API returns an empty payload."""
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    form = {
        'rollCallId': vote_id,
        'sort': '',
        'group': '',
        'filter': '',
    }
    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if not page:
        return

    roll = page['Model']
    vote_chamber = self.chamber_map[roll['ChamberName']]
    # "7/1/16 01:00 AM"
    vote_date = dt.datetime.strptime(
        roll['TakenAtDateTime'], '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')

    # TODO: What does this code mean?
    vote_motion = roll['RollCallVoteType']

    vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
    # Everything that isn't an explicit yes/no is lumped into 'other'.
    other_count = sum(int(roll[key]) for key in ('NotVotingCount',
                                                 'VacantVoteCount',
                                                 'AbsentVoteCount',
                                                 'ConflictVoteCount'))

    vote = Vote(chamber=vote_chamber,
                start_date=vote_date,
                motion_text=vote_motion,
                result=vote_passed,
                classification='other',
                bill=bill.identifier,
                legislative_session=session
                )
    vote.add_source(vote_url)
    vote.set_count('yes', roll['YesVoteCount'])
    vote.set_count('no', roll['NoVoteCount'])
    vote.set_count('other', other_count)

    for member_vote in roll['AssemblyMemberVotes']:
        # AssemblyMemberId looks like it should work here,
        # but for some sessions it's bugged to only return session
        try:
            voter = self.legislators_by_short[str(member_vote['ShortName'])]
            name = voter['DisplayName']
        except KeyError:
            self.warning('could not find legislator short name %s',
                         member_vote['ShortName'])
            name = member_vote['ShortName']
        code = member_vote['SelectVoteTypeCode']
        if code == 'Y':
            vote.yes(name)
        elif code == 'N':
            vote.no(name)
        else:
            vote.vote('other', name)

    # bill.add_vote_event(vote)
    yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Download a Maryland roll-call PDF, extract counts, motion and
    individual voters, and return a populated VoteEvent.

    Raises ValueError when the count line can't be located or when the
    per-name tallies disagree with the printed totals.
    """
    filename, response = self.urlretrieve(vote_url)

    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()

    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # Find the line holding the five tallies.
    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]

    if page_index:
        # Each entry looks like "<number> <label>", separated by runs
        # of 2+ spaces.
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]])
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}",
                                          lines[page_index - 1].strip())
        assert (consent_calendar_bills
                ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion

    # Probe the candidate lines above the count line until one looks
    # like a motion; the for/else runs only when no probe succeeded.
    for i in motion_lines:
        if any(motion_keyword in motion.lower()
               for motion_keyword in motion_keywords):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # This condition covers for the bad formating in SB 1260
            motion = lines[page_index - 3]
        if not any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])

    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    # Walk the remaining lines; each show-stopper heading advances to
    # the next vote category, everything else is a row of names.
    while page_index < len(lines):
        current_line = lines[page_index].strip()
        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue
        if any(show_stopper in current_line
               for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue
        names = re.split(r"\s{2,}", current_line)
        vote_name_counts[vote_index] += len(names)
        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape(self, session=None):
    """Scrape all Vermont bills and resolutions for *session*.

    Pulls bill listings from three private-API endpoints, then for each
    bill loads its status page (sponsors, versions), its detailed-status
    feed (actions), and its roll-call feed (votes).  Yields Bill objects
    and VoteEvent objects.
    """
    HTML_TAGS_RE = r'<.*?>'
    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)
    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    # 'data' can be null in the JSON; normalize to an empty list.
    bills = json.loads(bills_json)['data'] or []
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])
    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])
    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        # Identify the bill type and chamber from the number prefix.
        # Longer prefixes must be tested before their shorter substrings
        # (e.g. 'J.R.H.' before 'H.').
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            # Amendments carry no chamber prefix; use the 'Body' field.
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".
                format(info['BillNumber'])
            )
        bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
        # put one space back in between type and number
        bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)
        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)
        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)
        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everything after the "Additional Sponsors" marker row is a
            # cosponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue
            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the "Less…" expander link the page sometimes includes.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )
        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )
        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            ).group(1)
        except AttributeError:
            # re.search returned None — no detailed-status link on page.
            self.warning("Bill {} appears to have no activity".
                         format(info['BillNumber']))
            yield bill
            continue
        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)
        chambers_passed = set()
        for action in actions:
            # Drop null fields so string containment checks don't blow up.
            action = {k: v for k, v in action.items() if v is not None}
            if "Signed by Governor" in action['FullStatus']:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")
            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif "Vetoed by the Governor" in action['FullStatus']:
                action_type = 'executive-veto'
            elif "Read first time" in action['FullStatus'] \
                    or "Read 1st time" in action['FullStatus']:
                action_type = 'introduction'
            elif "Reported favorably" in action['FullStatus']:
                action_type = 'committee-passage-favorable'
            elif actor == 'lower' and any(
                    x.lower().startswith('aspassed')
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(
                    x.lower().startswith(' aspassed') or
                    x.lower().startswith('aspassed')
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None
            bill.add_action(
                # Strip the inline HTML markup from the status text.
                description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strftime(
                    datetime.datetime.strptime(action['StatusDate'],
                                               '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                chamber=actor,
                classification=action_type
            )
        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)
        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']
            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Names come as "Lastname of Town"; keep only the name.
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()
                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)
            if ("Passed -- " in vote['FullStatus'] or
                    "Veto of Governor overridden" in vote['FullStatus']):
                did_pass = True
            elif ("Failed -- " in vote['FullStatus'] or
                    'Veto of the Governor sustained' in vote['FullStatus']):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")
            # Check vote counts
            yea_count = int(
                re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(
                re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))
            vote_to_add = VoteEvent(
                bill=bill,
                chamber=('lower' if vote['ChamberCode'] == 'H'
                         else 'upper'),
                start_date=datetime.datetime.strftime(
                    datetime.datetime.strptime(vote['StatusDate'],
                                               '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                motion_text=re.sub(HTML_TAGS_RE, "",
                                   vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))
            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)
            yield vote_to_add
        # Capture extra information- Not yet implemented
        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        yield bill
def parse_vote(self, bill, link): member_doc = lxml.html.fromstring(self.get(link).text) motion = member_doc.xpath("//div[@id='main_content']/h4/text()") opinions = member_doc.xpath("//div[@id='main_content']/h3/text()") if len(opinions) > 0: temp = opinions[0].split() vote_chamber = temp[0] vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y') vote_status = " ".join(temp[2:-2]) vote_status = vote_status if vote_status.strip() else motion[0] vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower' for i in opinions: try: count = int(i[i.find("(") + 1:i.find(")")]) except ValueError: # This is likely not a vote-count text chunk # It's probably '`On roll call the vote was:` pass else: if "yea" in i.lower(): yes_count = count elif "nay" in i.lower(): no_count = count elif "present" in i.lower(): p_count = count elif "absent" in i.lower(): a_count = count vote = VoteEvent( bill=bill, start_date=vote_date.strftime('%Y-%m-%d'), chamber=vote_chamber, motion_text=vote_status, result='pass' if yes_count > no_count else 'fail', classification='passage', ) vote.pupa_id = link vote.set_count('yes', yes_count) vote.set_count('no', no_count) vote.set_count('abstain', p_count) vote.set_count('absent', a_count) vote.add_source(link) a_links = member_doc.xpath("//div[@id='main_content']/a/text()") for i in range(1, len(a_links)): if i <= yes_count: vote.vote('yes', re.sub(',', '', a_links[i]).split()[0]) elif no_count != 0 and i > yes_count and i <= yes_count + no_count: vote.vote('no', re.sub(',', '', a_links[i]).split()[0]) else: vote.vote('other', re.sub(',', '', a_links[i]).split()[0]) yield vote else: self.warning("No Votes for: %s", link)
def build_vote(session, bill_id, url, vote_record, chamber, motion_text): passed = len(vote_record['yes']) > len(vote_record['no']) vote_event = VoteEvent( result='pass' if passed else 'fail', chamber=chamber, start_date=vote_record['date'].strftime('%Y-%m-%d'), motion_text=motion_text, classification='passage', legislative_session=session, bill=bill_id, bill_chamber='upper' if bill_id[0] is 'S' else 'lower' ) vote_event.pupa_id = url vote_event.set_count('yes', len(vote_record['yes'])) vote_event.set_count('no', len(vote_record['no'])) vote_event.set_count('excused', len(vote_record['excused'])) vote_event.set_count('absent', len(vote_record['absent'])) vote_event.set_count('other', len(vote_record['other'])) for vote_type in ['yes', 'no', 'excused', 'absent', 'other']: for voter in vote_record[vote_type]: vote_event.vote(vote_type, voter) vote_event.add_source(url) return vote_event
def scrape_action_page(self, bill, page):
    """Scrape a Massachusetts bill-status action table.

    Adds one action per table row to *bill* and builds (but currently
    does NOT yield — see the XXX notes) VoteEvents for House
    "Supplement" rows and Senate "Roll Call" rows.
    """
    action_rows = page.xpath("//tbody/tr")
    for row in action_rows:
        action_date = row.xpath("td[1]/text()")[0]
        action_date = datetime.strptime(action_date, "%m/%d/%Y")
        action_year = action_date.year
        action_date = action_date.strftime("%Y-%m-%d")
        # NOTE(review): action_actor is only reassigned when td[2] has
        # text, so a blank cell reuses the previous row's actor —
        # presumably intentional carry-over; confirm.
        if row.xpath("td[2]/text()"):
            action_actor = row.xpath("td[2]/text()")[0]
            action_actor = self.chamber_map_reverse[action_actor.strip()]
        action_name = row.xpath("string(td[3])")
        # House votes
        if "Supplement" in action_name:
            actor = "lower"
            # Motion text precedes the "- NN YEAS" tally in the cell.
            vote_action = re.findall(
                r"(.+)-\s*\d+\s*YEAS", action_name)[0].strip()
            y = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
            n = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])
            # get supplement number
            n_supplement = int(
                re.findall(r"No\.\s*(\d+)", action_name)[0])
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result="pass" if y > n else "fail",
                classification="passage",
                bill=bill,
            )
            cached_vote.set_count("yes", y)
            cached_vote.set_count("no", n)
            housevote_pdf = "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                bill.legislative_session, action_year
            )
            # Fills in individual voter names from the journal PDF.
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)
            cached_vote.pupa_id = "{}#{}".format(housevote_pdf, n_supplement)
            # XXX: disabled house votes on 8/1 to try to get MA importing again
            # will leaving this in and commented out once we resolve the ID issue
            # yield cached_vote
        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(" -")[0]
            # 2019 H86 Breaks our regex,
            # Ordered to a third reading --
            # see Senate Roll Call #25 and House Roll Call 56
            if "yeas" in action_name and "nays" in action_name:
                try:
                    # Counts usually precede the words: "21 yeas ... 5 nays".
                    y, n = re.search(
                        r"(\d+) yeas .*? (\d+) nays", action_name.lower()
                    ).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    # Fallback layout: "yeas 21 ... nays 5".
                    y = int(
                        re.search(r"yeas\s+(\d+)",
                                  action_name.lower()).group(1)
                    )
                    n = int(
                        re.search(r"nays\s+(\d+)",
                                  action_name.lower()).group(1)
                    )
                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=bill,
                )
                cached_vote.set_count("yes", y)
                cached_vote.set_count("no", n)
                rollcall_pdf = "http://malegislature.gov" + row.xpath(
                    "string(td[3]/a/@href)"
                )
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                cached_vote.pupa_id = rollcall_pdf
                # XXX: also disabled, see above note
                # yield cached_vote
        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs["classification"],
        )
        for com in attrs.get("committees", []):
            com = com.strip()
            action.add_related_entity(com, entity_type="organization")
def build_vote(session, bill_id, url, vote_record, chamber, motion_text): passed = len(vote_record['yes']) > len(vote_record['no']) vote_event = VoteEvent( result='pass' if passed else 'fail', chamber=chamber, start_date=vote_record['date'].strftime('%Y-%m-%d'), motion_text=motion_text, classification='passage', legislative_session=session, bill=bill_id, bill_chamber='upper' if bill_id[0] is 'S' else 'lower') vote_event.pupa_id = url vote_event.set_count('yes', len(vote_record['yes'])) vote_event.set_count('no', len(vote_record['no'])) vote_event.set_count('excused', len(vote_record['excused'])) vote_event.set_count('absent', len(vote_record['absent'])) vote_event.set_count('other', len(vote_record['other'])) for vote_type in ['yes', 'no', 'excused', 'absent', 'other']: for voter in vote_record[vote_type]: vote_event.vote(vote_type, voter) vote_event.add_source(url) return vote_event
def scrape_vote(self, bill, vote_url, chamber, date): page = self.lxmlize(vote_url) try: motion = page.xpath("//font/text()")[2] except IndexError: self.warning("Vote Summary Page Broken ") return # eg. http://leg.colorado.gov/content/sb18-033vote563ce6 if ('AM' in motion or 'PM' in motion) and '/' in motion: motion = "Motion not given." if 'withdrawn' not in motion: yes_no_counts = page.xpath( "//tr/td[preceding-sibling::td/descendant::" "font[contains(text(),'Aye')]]/font/text()") other_counts = page.xpath( "//tr/td[preceding-sibling::td/descendant::" "font[contains(text(),'Absent')]]/font/text()") abstain_counts = page.xpath( "//tr/td[preceding-sibling::td/descendant::" "font[contains(text(),'17C')]]/font/text()") yes_count = int(yes_no_counts[0]) no_count = int(yes_no_counts[2]) exc_count = int(other_counts[2]) absent_count = int(other_counts[0]) abstain_count = 0 if abstain_counts: abstain_count = int(abstain_counts[0]) # fix for # http://leg.colorado.gov/content/hb19-1029vote65e72e if absent_count == -1: absent_count = 0 passed = yes_count > no_count vote = VoteEvent( chamber=chamber, start_date=self._tz.localize(date), motion_text=motion, result='pass' if passed else 'fail', bill=bill, classification='passage', ) vote.pupa_id = vote_url vote.set_count('yes', yes_count) vote.set_count('no', no_count) vote.set_count('excused', exc_count) vote.set_count('absent', absent_count) vote.set_count('abstain', abstain_count) vote.add_source(vote_url) rolls = page.xpath("//tr[preceding-sibling::tr/descendant::" "td/div/b/font[contains(text(),'Vote')]]") vote_abrv = { 'Y': 'yes', 'N': 'no', 'E': 'excused', 'A': 'absent', '-': 'absent', '17C': 'abstain' } for roll in rolls: voted = roll.xpath(".//td/div/font/text()")[0].strip() voter = roll.xpath(".//td/font/text()")[0].strip() if voted == 'V': continue vote.vote(vote_abrv[voted], voter) yield vote
def scrape(self):
    """Scrape Chicago City Council legislation from Legistar.

    Yields Bill objects (and VoteEvents for actions with roll calls);
    bills whose detail page cannot be fetched are yielded with summary
    data only and their URLs collected in ``unreachable_urls``.
    """
    unreachable_urls = []
    for leg_summary in self.legislation(
            created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()
        if not title or not leg_summary['Intro\xa0Date']:
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            # doesn't have an intro date
            continue
        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})
        bill.add_source(leg_summary['url'])
        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            # Detail page unreachable: keep the summary-only bill.
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue
        for related_bill in leg_details.get('Related files', []):
            # NOTE(review): lower_title is computed but the checks below
            # test the original-case `title` — looks like the lowercase
            # form was intended; confirm before changing.
            lower_title = title.lower()
            if "sundry" in title or "miscellaneous" in title:
                # these are ominbus
                bill.add_related_bill(
                    identifier=related_bill['label'],
                    legislative_session=bill.legislative_session,
                    relation_type='replaces')
            # for now we're skipping related bills if they
            # don't contain words that make us think they're
            # in a ominbus relationship with each other
        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # Only the first listed sponsor is primary.
            if i == 0:
                primary = True
                sponsorship_type = "Primary"
            else:
                primary = False
                sponsorship_type = "Regular"
            sponsor_name = sponsor['label']
            # Does the Mayor/Clerk introduce legisislation as
            # individuals role holders or as the OFfice of City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'
            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(
                    sponsor_name,
                    sponsorship_type,
                    entity_type,
                    primary,
                    entity_id=_make_pseudo_id(name=sponsor_name))
        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                bill.add_subject(subject)
        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")
        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(action['Date']).date().isoformat()
            except AttributeError:
                # Actions without a parseable date are skipped, e.g.
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue
            if action_description:
                try:
                    # 'Action By' is a dict when it links to a body page...
                    responsible_org = action['Action\xa0By']['label']
                except TypeError:
                    # ...and a plain string otherwise.
                    responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'Chicago City Council'
                act = bill.add_action(
                    action_description,
                    action_date,
                    organization={'name': responsible_org},
                    classification=ACTION_CLASSIFICATION[action_description])
                if action_description == 'Referred':
                    try:
                        # Probe: subscripting succeeds for a single body
                        # (dict) and raises TypeError for a list of them.
                        leg_details['Current Controlling Legislative Body']['label']
                        controlling_bodies = [
                            leg_details['Current Controlling Legislative Body']]
                    except TypeError:
                        controlling_bodies = \
                            leg_details['Current Controlling Legislative Body']
                    if controlling_bodies:
                        for controlling_body in controlling_bodies:
                            body_name = controlling_body['label']
                            if body_name.startswith("Joint Committee"):
                                act.add_related_entity(body_name,
                                                       'organization')
                            else:
                                act.add_related_entity(
                                    body_name,
                                    'organization',
                                    entity_id=_make_pseudo_id(name=body_name))
                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    result, votes = self.extractVotes(action_detail_url)
                    if votes and result:
                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=None,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)
                        for option, voter in votes:
                            action_vote.vote(option, voter)
                        yield action_vote
        bill.extras = {'local_classification': leg_summary['Type']}
        yield bill
    print(unreachable_urls)
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source): """ takes the actor, date and row element and returns a Vote object """ spans = row.xpath(".//span") motion = row.text.replace(u"\u00a0", " ").replace("-", "").strip() motion = motion if motion else "passage" passed, yes_count, no_count, other_count = ( spans[0].text_content().rsplit("-", 3)) yes_votes = self.get_names(spans[1].tail) no_votes = self.get_names(spans[2].tail) other_votes = [] for span in spans[3:]: if span.text.startswith(("Absent", "Excused")): other_votes += self.get_names(span.tail) for key, val in { "adopted": "pass", "passed": "pass", "failed": "fail" }.items(): if key in passed.lower(): passed = val break vote = VoteEvent( chamber=actor, start_date=date, motion_text=motion, bill=bill_id, bill_chamber=bill_chamber, result=passed, classification="passage", legislative_session=session, ) vote.add_source(source) vote.set_count("yes", int(yes_count)) vote.set_count("no", int(no_count)) vote.set_count("absent", int(other_count)) for name in yes_votes: if name and name != "None": vote.yes(name) for name in no_votes: if name and name != "None": vote.no(name) for name in other_votes: if name and name != "None": vote.vote("absent", name) yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source): """ takes the actor, date and row element and returns a Vote object """ spans = row.xpath('.//span') motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip() motion = motion if motion else "passage" passed, yes_count, no_count, other_count = spans[0].text_content().rsplit('-', 3) yes_votes = self.get_names(spans[1].tail) no_votes = self.get_names(spans[2].tail) other_votes = [] for span in spans[3:]: if span.text.startswith(('Absent', 'Excused')): other_votes += self.get_names(span.tail) for key, val in {'adopted': 'pass', 'passed': 'pass', 'failed': 'fail'}.items(): if key in passed.lower(): passed = val break vote = VoteEvent(chamber=actor, start_date=date, motion_text=motion, bill=bill_id, bill_chamber=bill_chamber, result=passed, classification="passage", legislative_session=session) vote.add_source(source) vote.set_count('yes', int(yes_count)) vote.set_count('no', int(no_count)) vote.set_count('absent', int(other_count)) for name in yes_votes: if name and name != 'None': vote.yes(name) for name in no_votes: if name and name != 'None': vote.no(name) for name in other_votes: if name and name != 'None': vote.vote('absent', name) yield vote
def scrape_votes(self, vote_url, bill, chamber):
    """Parse a roll-call PDF at *vote_url* and yield one VoteEvent per page.

    Walks the PDF line by line accumulating the date/time, motion,
    per-category totals, and voter names; a vote is assembled and
    yielded when the trailing "NOT VOTING" section of a page is reached.

    NOTE(review): `yeas`, `nays`, `abstained`, `not_voting`, `motion`
    and `initial_date` are only bound when their trigger text appears —
    a PDF missing any of them would raise NameError.  Also, the
    "PASSES:" branch overwrites `abstained` rather than a separate
    counter; presumably intentional pooling, but confirm.
    """
    # Grabs text from pdf
    pdflines = [
        line.decode("utf-8") for line in
        convert_pdf(vote_url, "text").splitlines()
    ]
    vote_date = 0
    voters = defaultdict(list)
    for x in range(len(pdflines)):
        line = pdflines[x]
        # A date anywhere on the line marks the header of a new page.
        if re.search(r"(\d+/\d+/\d+)", line):
            initial_date = line.strip()
        # The time line also carries the motion text before the time.
        if ("AM" in line) or ("PM" in line):
            split_l = line.split()
            for y in split_l:
                if ":" in y:
                    time_location = split_l.index(y)
                    motion = " ".join(split_l[0:time_location])
                    time = split_l[time_location:]
                    if len(time) > 0:
                        time = "".join(time)
                        dt = initial_date + " " + time
                        dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                        vote_date = central.localize(dt)
                        vote_date = vote_date.isoformat()
                    # In rare case that no motion is provided
                    if len(motion) < 1:
                        motion = "No Motion Provided"
        # Totals appear as "<LABEL>: <n>".
        if "YEAS:" in line:
            yeas = int(line.split()[-1])
        if "NAYS:" in line:
            nays = int(line.split()[-1])
        if "ABSTAINED:" in line:
            abstained = int(line.split()[-1])
        if "PASSES:" in line:
            # NOTE(review): overwrites `abstained` (see docstring).
            abstained = int(line.split()[-1])
        if "NOT VOTING:" in line:
            not_voting = int(line.split()[-1])
        # Name sections use a spaced label, e.g. "YEAS :".
        if "YEAS :" in line:
            y = 0
            next_line = pdflines[x + y]
            while "NAYS : " not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("YEAS" not in next_line):
                    for v in next_line:
                        if v and "YEAS" not in v:
                            voters["yes"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1
        if line and "NAYS :" in line:
            y = 0
            next_line = 0
            next_line = pdflines[x + y]
            while ("ABSTAINED : " not in next_line) and \
                    ("PASSES :" not in next_line):
                next_line = next_line.split(" ")
                if next_line and "NAYS" not in next_line:
                    for v in next_line:
                        if v and "NAYS" not in v:
                            voters["no"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1
        if line and ("ABSTAINED :" in line or "PASSES :" in line):
            y = 2
            next_line = 0
            next_line = pdflines[x + y]
            while "NOT VOTING :" not in next_line:
                next_line = next_line.split(" ")
                if next_line and ("ABSTAINED" not in next_line or
                                  "PASSES" not in next_line):
                    for v in next_line:
                        if v:
                            voters["abstain"].append(v.strip())
                next_line = pdflines[x + y]
                y += 1
        if line and "NOT VOTING : " in line:
            # Estimate how many wrapped lines the section occupies.
            lines_to_go_through = math.ceil(
                not_voting / len(line.split()))
            next_line = pdflines[x]
            for y in range(lines_to_go_through):
                next_line = pdflines[x + y + 2].split(" ")
                for v in next_line:
                    if v:
                        voters["not voting"].append(v.strip())
            # End of a page: compute the outcome and emit the vote.
            if yeas > (nays + abstained + not_voting):
                passed = True
            else:
                passed = False
            ve = VoteEvent(
                chamber=chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="bill",
                bill=bill,
            )
            ve.add_source(vote_url)
            for how_voted, how_voted_voters in voters.items():
                for voter in how_voted_voters:
                    if len(voter) > 0:
                        ve.vote(how_voted, voter)
            # Resets voters dictionary before going onto next page in pdf
            voters = defaultdict(list)
            yield ve
def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
    """Yield a VoteEvent for each Indiana roll-call PDF in *rollcalls*.

    Each rollcall dict supplies a relative 'link' that is fetched via
    the document *proxy*; the PDF's fixed line layout provides the
    chamber, date, result, motion, totals, and voter names.
    """
    result_types = {
        'FAILED': False,
        'DEFEATED': False,
        'PREVAILED': True,
        'PASSED': True,
        'SUSTAINED': True,
        'NOT SECONDED': False,
        'OVERRIDDEN': True,
        'ADOPTED': True,
    }
    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]
        (path, resp) = self.urlretrieve(proxy_link)
        text = convert_pdf(path, 'text').decode("utf-8")
        lines = text.split("\n")
        # Temporary download is no longer needed once converted.
        os.remove(path)
        chamber = "lower" if "house of representatives" in \
            lines[0].lower() else "upper"
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(
            date_str, "%b %d, %Y %I:%M:%S %p")
        vote_date = pytz.timezone(
            'America/Indiana/Indianapolis').localize(vote_date)
        vote_date = vote_date.isoformat()
        passed = None
        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break
        if passed is None:
            raise AssertionError("Missing bill passage type")
        # Motion shares line 4 with the yeas total (its last token).
        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue
        vote = VoteEvent(chamber=chamber,
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=original_chamber,
                         start_date=vote_date,
                         motion_text=motion,
                         result="pass" if passed else "fail",
                         classification="passage")
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('excused', excused)
        vote.set_count('not voting', not_voting)
        vote.add_source(proxy_link)
        # Names are listed under section headers ("YEA - n", etc.);
        # track which section we are currently inside.
        currently_counting = ""
        possible_vote_lines = lines[8:]
        for l in possible_vote_lines:
            # Repair UTF-8 nbsp artifacts from the PDF conversion.
            l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            l = l.replace("\xc2\xa0", " -")
            if "yea-" in l.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in l.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in l.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in l.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                # Haven't reached the first section header yet.
                pass
            elif re.search(r'v\. \d\.\d', l):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = l.split(" ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())
        yield vote
def parse_vote_page(self, vote_url, bill): vote_html = self.get(vote_url).text doc = lxml.html.fromstring(vote_html) # chamber if "senate" in vote_url: chamber = "upper" else: chamber = "lower" # date in the following format: Mar 23, 2009 date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text date = date.replace(u"\xa0", " ") date = datetime.datetime.strptime(date[18:], "%b %d, %Y") # motion motion = "".join(x.text_content() for x in doc.xpath('//td[@colspan="23"]')) if motion == "": motion = "No motion given" # XXX: Double check this. See SJ 3. motion = motion.replace(u"\xa0", " ") # totals tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class") totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:] yes_count = int(totals[0].split()[-1]) no_count = int(totals[1].split()[-1]) other_count = int(totals[2].split()[-1]) other_count += int(totals[3].split()[-1]) other_count += int(totals[4].split()[-1]) passed = yes_count > no_count vote = VoteEvent( bill=bill, chamber=chamber, start_date=date.strftime("%Y-%m-%d"), motion_text=motion, classification="passage", result="pass" if passed else "fail", ) vote.pupa_id = vote_url # contains sequence number vote.set_count("yes", yes_count) vote.set_count("no", no_count) vote.set_count("other", other_count) # go through, find Voting Yea/Voting Nay/etc. and next tds are voters func = None for td in doc.xpath("//td/text()"): td = td.replace(u"\xa0", " ") if td.startswith("Voting Yea"): func = vote.yes elif td.startswith("Voting Nay"): func = vote.no elif td.startswith("Not Voting"): func = vote.other elif td.startswith("Excused"): func = vote.other elif func: td = td.rstrip("*") func(td) return vote