def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Turn a parsed roll-call record into a populated ``VoteEvent``.

    ``vote_record`` is read by key: ``'date'`` (must offer ``strftime``) and
    the voter collections ``'yes'``, ``'no'``, ``'excused'``, ``'absent'``
    and ``'other'``.  The result is ``'pass'`` only when the yes voters
    strictly outnumber the no voters.  ``url`` is used both as the source
    and as the event's ``pupa_id``.
    """
    # Votes on a substitute are labelled "XHB"; fold them back into "HB".
    bill_id = bill_id.replace('XHB', 'HB')

    options = ('yes', 'no', 'excused', 'absent', 'other')
    outcome = 'fail'
    if len(vote_record['yes']) > len(vote_record['no']):
        outcome = 'pass'

    vote_event = VoteEvent(
        result=outcome,
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower',
    )
    vote_event.pupa_id = url

    # Totals first, then the individual voters, in the same option order.
    for option in options:
        vote_event.set_count(option, len(vote_record[option]))
    for option in options:
        for voter in vote_record[option]:
            vote_event.vote(option, voter)

    vote_event.add_source(url)
    return vote_event
def add_vote(self, bill, chamber, date, text, url):
    """Build a ``VoteEvent`` from an action text containing "Ayes N, Noes M".

    The first tally found in ``text`` supplies the yes/no counts; the
    classification is the first entry of ``motion_classifiers`` whose
    pattern matches ``text`` (``'other'`` if none does).  When ``url`` is
    truthy it is added as a source and the individual votes are fetched via
    ``add_house_votes`` ('av' in the URL) or ``add_senate_votes`` ('sv').

    Raises IndexError when ``text`` contains no tally.
    """
    tallies = re.findall(r'Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)', text)
    first_tally = tallies[0]
    yes = int(first_tally[0])
    no = int(first_tally[1])

    # First classifier (in dict order) that matches wins.
    vtype = next(
        (label for pattern, label in motion_classifiers.items()
         if re.match(pattern, text)),
        'other',
    )

    outcome = 'pass' if yes > no else 'fail'
    v = VoteEvent(
        chamber=chamber,
        start_date=TIMEZONE.localize(date),
        motion_text=text,
        result=outcome,
        classification=vtype,
        bill=bill,
    )
    v.set_count('yes', yes)
    v.set_count('no', no)

    if not url:
        return v

    # Fetch the roll call itself; the URL tells which chamber's parser to use.
    v.add_source(url)
    if 'av' in url:
        self.add_house_votes(v, url)
    elif 'sv' in url:
        self.add_senate_votes(v, url)
    return v
def scrape_senate_vote(self, bill, url, date):
    """Download a Senate roll-call PDF, parse it and yield the vote.

    Args:
        bill: bill object passed through to ``Vote``.
        url: location of the PDF; also recorded as the vote's source.
        date: object offering ``strftime``; used for ``start_date``.

    Yields:
        The populated vote.  Pages in the three-column "Yea: n Nay: n
        Absent: n" layout are delegated to ``scrape_senate_vote_3col``.
        Nothing is yielded when the file cannot be retrieved.
    """
    try:
        filename, _resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # Placeholder; the real outcome is set once the roll is tallied.
        result='fail',
        classification='passage',
        bill=bill
    )
    vote.add_source(url)

    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)

    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # The capturing split yields label/value chunks.  The list is reversed
    # so that pop() consumes them in document order.  It must be a real
    # list: filter() returns an iterator in Python 3, which is always
    # truthy and has no pop().
    data = [
        chunk
        for chunk in re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
        if chunk
    ]

    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {
        'yes': 0,
        'no': 0,
        'other': 0
    }

    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            # Drop dot leaders, a trailing period and surrounding digits.
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
            # Each label is expected to map to its own option bucket.
            assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # Replace the placeholder result with the tallied one.
    vote.result = 'pass' if vote_count['yes'] > (vote_count['no'] + vote_count['other']) else 'fail'
    yield vote
def scrape_vote(self, bill, vote_json, session):
    """Yield a ``VoteEvent`` built from one vote record of the JSON API.

    Args:
        bill: bill object passed through to ``VoteEvent``.
        vote_json: mapping with the count fields (``yesVotesCount`` ...),
            the comma-separated name fields (``yesVotes`` ...), plus
            ``amendmentNumber``, ``action``, ``chamber``, ``voteDate`` and
            ``billNumber``.
        session: session identifier; also used to build the source URL.

    Voter names are stripped for every option.  Previously only yes/no
    names were stripped, so absent/excused/conflict names kept the leading
    whitespace left over from splitting on commas.
    """
    if vote_json['amendmentNumber']:
        motion = '{}: {}'.format(
            vote_json['amendmentNumber'], vote_json['action'])
    else:
        motion = vote_json['action']

    result = 'pass' if vote_json['yesVotesCount'] > vote_json['noVotesCount'] else 'fail'

    v = VoteEvent(
        chamber=self.chamber_abbrev_map[vote_json['chamber']],
        start_date=self.parse_local_date(vote_json['voteDate']),
        motion_text=motion,
        result=result,
        legislative_session=session,
        bill=bill,
        classification='other',
    )

    v.set_count(option='yes', value=vote_json['yesVotesCount'])
    v.set_count('no', vote_json['noVotesCount'])
    v.set_count('absent', vote_json['absentVotesCount'])
    v.set_count('excused', vote_json['excusedVotesCount'])
    v.set_count('other', vote_json['conflictVotesCount'])

    def names(field):
        # Split a comma-separated name list, dropping blanks and padding.
        return [
            name.strip()
            for name in vote_json[field].split(',')
            if name.strip()
        ]

    for name in names('yesVotes'):
        v.yes(name)
    for name in names('noVotes'):
        v.no(name)

    # Remaining roll fields; conflicts are recorded under 'other'.
    for field, option in (
        ('absentVotes', 'absent'),
        ('excusedVotes', 'excused'),
        ('conflictVotes', 'other'),
    ):
        for name in names(field):
            v.vote(option=option, voter=name)

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, vote_json['billNumber'])
    v.add_source(source_url)

    yield v
def scrape_vote(self, bill, date, motion, url):
    """Yield a ``VoteEvent`` scraped from a roll-call page, if one exists.

    Args:
        bill: bill object passed through to ``VoteEvent``.
        date: date of the vote; used as ``start_date``.  The previous code
            ignored it and hard-coded '2017-03-04'.
            NOTE(review): passed through unchanged -- confirm the caller
            supplies a value ``VoteEvent`` accepts.
        motion: motion text.
        url: roll-call page; a URL ending in 'Senate' is an upper-chamber
            vote, anything else lower.

    Nothing is yielded when the page says it is 'not yet official', or
    when fetching or parsing fails (links are sometimes dead).
    """
    vote = None
    try:
        page = self.get(url).text
        # Vote pages are sometimes linked before they go live.
        if 'not yet official' not in page:
            page = lxml.html.fromstring(page)

            actor = 'upper' if url.endswith('Senate') else 'lower'

            votevals = ['yes', 'no', 'not voting', 'other']
            count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

            yes_count = int(page.xpath(count_path % "Yeas").split()[-1])
            no_count = int(page.xpath(count_path % "Nays").split()[-1])
            not_voting_count = int(page.xpath(count_path % "Non Voting").split()[-1])
            other_count = int(page.xpath(count_path % "Present").split()[-1])

            passed = yes_count > no_count + not_voting_count + other_count
            vote = VoteEvent(start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification='passage',
                             chamber=actor,
                             bill=bill)

            # The "Excused" total is optional: an absent cell gives an empty
            # string, so split()[-1] / int() fail and the option is skipped.
            try:
                excused_count = int(page.xpath(count_path % "Excused").split()[-1])
            except (IndexError, ValueError):
                pass
            else:
                vote.set_count('excused', excused_count)
                votevals.append('excused')

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('not voting', not_voting_count)
            vote.set_count('other', other_count)
            vote.add_source(url)

            xpath = (
                '//*[contains(@class, "ms-standardheader")]/'
                'following-sibling::table')
            divs = page.xpath(xpath)

            # Name tables are paired with the options positionally.
            for (voteval, div) in zip(votevals, divs):
                for a in div.xpath('.//a'):
                    name = a.text_content().strip()
                    if name:
                        vote.vote(voteval, name)
    except Exception:
        # Sometimes the link is there but is dead; skip the vote on a
        # best-effort basis.  Unlike a bare ``except``, this does not
        # swallow KeyboardInterrupt / SystemExit / GeneratorExit.
        return

    if vote is not None:
        yield vote
def scrape_vote(self, chamber, session, bill_id, vote_url):
    """Yield the lower-chamber ``VoteEvent`` found at ``vote_url``.

    Nothing is yielded when the request ends up on the "no vote found"
    page, when the page has no motion heading, or when none of the
    paragraphs parses as an ``%m/%d/%Y`` date.  ``chamber`` is recorded as
    the bill's chamber; the vote itself is always ``'lower'``.
    """
    NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'

    response = self.get(vote_url)
    markup = response.text

    # A broken link redirects to NO_VOTE_URL.
    if response.url == NO_VOTE_URL:
        return

    doc = lxml.html.fromstring(markup)

    headings = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")
    if not headings:
        self.logger.warning("Bill was missing a motion number, skipping")
        return
    motion = headings[0]

    # The second h3 holds the totals; tokens 0 and 3 are yeas and nays.
    tally_tokens = doc.xpath(".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
    yeas = int(tally_tokens[0])
    nays = int(tally_tokens[3])

    # The date is the first paragraph that parses as one.
    date = None
    for paragraph in doc.xpath(".//div[@id='leg_PageContent']/div/p/text()"):
        try:
            date = datetime.datetime.strptime(paragraph.strip(), '%m/%d/%Y').date()
        except ValueError:
            continue
        break
    if date is None:
        self.logger.warning("No date could be found for vote on %s" % motion)
        return

    outcome = 'pass' if yeas > nays else 'fail'
    vote = VoteEvent(chamber='lower',
                     start_date=date,
                     motion_text=motion,
                     result=outcome,
                     classification='passage',
                     legislative_session=session,
                     bill=bill_id,
                     bill_chamber=chamber)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.add_source(vote_url)
    vote.pupa_id = vote_url

    # First table lists the yeas, second table the nays.
    for yea_name in doc.xpath('//table[1]/tr/td/font/text()'):
        vote.yes(yea_name.strip())
    for nay_name in doc.xpath('//table[2]/tr/td/font/text()'):
        vote.no(nay_name.strip())

    yield vote
def test_full_vote_event():
    """Import a fully populated vote event and check what gets persisted."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    # Scrape-side fixtures: two people, their chamber, a bill and its vote.
    john = ScrapePerson('John Smith', primary_org='lower')
    adam = ScrapePerson('Adam Smith', primary_org='lower')
    house = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      from_organization=house._id)

    vote_event = ScrapeVoteEvent(
        legislative_session='1900',
        motion_text='passage',
        start_date='1900-04-01',
        classification='passage:bill',
        result='pass',
        bill_chamber='lower',
        bill='HB 1',
        organization=house._id,
    )
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    # Import the dependencies before the vote event itself.
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict()])

    person_importer = PersonImporter('jid')
    person_importer.import_data([john.as_dict(), adam.as_dict()])

    membership_importer = MembershipImporter(
        'jid', person_importer, org_importer, DumbMockImporter())
    membership_importer.import_data(
        [john._related[0].as_dict(), adam._related[0].as_dict()])

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([bill.as_dict()])

    vote_importer = VoteEventImporter(
        'jid', person_importer, org_importer, bill_importer)
    vote_importer.import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill == Bill.objects.get()

    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20

    votes = list(ve.votes.all())
    assert len(votes) == 2
    for cast in ve.votes.all():
        if cast.voter_name == 'John Smith':
            expected_option, expected_name = 'yes', 'John Smith'
        else:
            expected_option, expected_name = 'no', 'Adam Smith'
        assert cast.option == expected_option
        assert cast.voter == Person.objects.get(name=expected_name)
def scrape_votes(self, bill, page):
    """Yield a ``VoteEvent`` for every floor action of a bill.

    Args:
        bill: bill object passed through to ``VoteEvent``.
        page: mapping with ``'BillId'`` and ``'FloorHeaders'``; each header
            supplies ``'BillStatusActionId'`` and ``'LegislativeBody'``
            ('S' or 'H').

    Actions whose ``'Action'`` is 'No Action' are skipped.  A vote passes
    when it was unanimously adopted or the ayes outnumber the nays.  The
    counts are treated as 0 when the API reports them as null, so the
    comparison cannot raise TypeError on ``None``.
    """
    base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
    chambers = {
        'S': 'upper',
        'H': 'lower',
    }
    for header in page['FloorHeaders']:
        params = {
            'billStatusId': page['BillId'],
            'billStatusActionId': header['BillStatusActionId'],
            'includeVotes': 'true',
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode('utf-8'))
        for action in actions:
            if action['Action'] == 'No Action':
                continue
            action_date = datetime.datetime.strptime(action['ReportDate'],
                                                     '%Y-%m-%dT%H:%M:%S')
            # Normalise once so the result and the counts agree.
            ayes = action['Ayes'] or 0
            nays = action['Nays'] or 0
            passed = action['UnanimouslyAdopted'] or ayes > nays

            vote = VoteEvent(
                chamber=chambers[header['LegislativeBody']],
                motion_text=action['Action'],
                classification='passage',
                result='pass' if passed else 'fail',
                start_date=action_date.strftime('%Y-%m-%d'),
                bill=bill,
            )
            vote.add_source(resp.url)
            vote.set_count('yes', ayes)
            vote.set_count('no', nays)
            vote.set_count('other', (action['Present'] or 0))
            vote.set_count('absent', (action['Absent'] or 0))
            vote.set_count('excused', (action['Excused'] or 0))
            vote.set_count('not voting', (action['NotVoting'] or 0))

            for v in action['Votes']:
                # Anything other than Y/N is recorded as 'other'.
                vote_type = {
                    'Y': 'yes',
                    'N': 'no',
                }.get(v['Vote'], 'other')
                vote.vote(vote_type, v['Legislator']['FullName'])
            vote.pupa_id = resp.url + str(action['ReferralNumber'])
            yield vote
def parse_vote(self, bill, link):
    """Yield the ``VoteEvent`` described by the roll-call page at ``link``.

    The first ``h3`` under ``#main_content`` is read as
    "<chamber> ... <status words> ... <mm/dd/YYYY>".  Every ``h3`` carrying
    a parenthesised number supplies a total for the yea / nay / present /
    absent bucket named in its text.  Totals not present on the page
    default to 0.  Previously a missing or unparsable total left the
    variable unbound, or silently reused the number from the previous
    heading.

    A warning is logged and nothing is yielded when the page has no ``h3``
    headings.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if not opinions:
        self.warning("No Votes for: %s", link)
        return

    temp = opinions[0].split()
    vote_chamber = 'upper' if temp[0] == 'Senate' else 'lower'
    vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
    vote_status = " ".join(temp[2:-2])
    # Fall back to the h4 motion text when the heading carries no status.
    vote_status = vote_status if vote_status.strip() else motion[0]

    yes_count = no_count = p_count = a_count = 0
    for heading in opinions:
        try:
            count = int(heading[heading.find("(") + 1:heading.find(")")])
        except ValueError:
            # No "(n)" in this heading, so it is not a total.
            continue
        lowered = heading.lower()
        if "yea" in lowered:
            yes_count = count
        elif "nay" in lowered:
            no_count = count
        elif "present" in lowered:
            p_count = count
        elif "absent" in lowered:
            a_count = count

    vote = VoteEvent(
        bill=bill,
        start_date=vote_date.strftime('%Y-%m-%d'),
        chamber=vote_chamber,
        motion_text=vote_status,
        result='pass' if yes_count > no_count else 'fail',
        classification='passage',
    )
    vote.pupa_id = link
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('abstain', p_count)
    vote.set_count('absent', a_count)
    vote.add_source(link)

    a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
    # The first link is skipped.  The rest are assigned by position: the
    # first yes_count entries are yeas, the next no_count are nays, and
    # everything after that is 'other'.
    for position, label in enumerate(a_links[1:], start=1):
        voter = label.replace(',', '').split()[0]
        if position <= yes_count:
            vote.vote('yes', voter)
        elif position <= yes_count + no_count:
            vote.vote('no', voter)
        else:
            vote.vote('other', voter)
    yield vote
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a committee vote parsed from the first table of ``page``.

    The yes / no / other voters are read from up to three cells of one
    table row; names are the text following each ``<br>`` in a cell.  When
    the row has fewer than three cells, only the first is used (as yes
    votes).  ``uniqid`` becomes the vote's identifier and is kept in
    ``extras['_vote_id']``.
    """
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        # New website layout: the voter cells are in the third row.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # Collect the names listed in one cell (one per <br> tail).
        if obj is None:
            return {
                "type": None,
                "count": None,
                "votes": []
            }
        names = []
        for br in obj.xpath(".//br"):
            # Test the tail text explicitly instead of relying on the
            # truthiness of the lxml element.
            tail = br.tail.strip() if br.tail else ''
            if tail:
                names.append(tail)
        return {
            "type": typ,
            "count": len(names),
            "votes": names
        }

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }

    yes_count = vote_dict['yes']['count']
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0

    # (A leftover debug print of the motion was removed here.)
    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                identifier=str(uniqid),
                result='pass' if (yes_count > no_count) else 'fail',
                classification='passage',
                bill=bill)
    vote.extras = {'_vote_id': uniqid}
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            vote.vote(key, voter)

    yield vote
def scrape_vote(self, bill, motion, url):
    """Yield the ``VoteEvent`` described by the roll-call page at ``url``.

    Args:
        bill: bill object passed through to ``VoteEvent``.
        motion: motion text.
        url: roll-call page; must contain ``chamber=House`` or
            ``chamber=Senate``.

    Raises:
        ValueError: if the chamber cannot be determined from ``url``
            (previously an UnboundLocalError further down), or if the date
            matches neither "%B %d, %Y" nor "%b. %d, %Y".
    """
    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'
    else:
        raise ValueError('cannot determine chamber from vote URL: %s' % url)

    page = self.get(url, retry_on_404=True).text
    page = lxml.html.fromstring(page)

    def labelled(label):
        # Text of the cell that follows the <td> whose text is ``label``.
        cell = page.xpath("//td[text() = '%s']" % label)[0]
        return cell.xpath("string(following-sibling::td)")

    yes_count = int(labelled('Yeas (Y):'))
    no_count = int(labelled('Nays (N):'))
    abs_count = int(labelled('Absent (X):'))
    ex_count = int(labelled('Excused (E):'))
    # Absent and excused members are reported together as 'other'.
    other_count = abs_count + ex_count

    date = labelled('Date:')
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        # Months are sometimes abbreviated, e.g. "Feb. 3, 2017".
        date = datetime.datetime.strptime(date, "%b. %d, %Y")

    outcome = labelled('Outcome:')

    vote = VoteEvent(
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        result='pass' if outcome == 'PREVAILS' else 'fail',
        classification='passage',
        bill=bill,
    )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    member_cell = page.xpath("//td[text() = 'Member']")[0]
    # Skip the header row; column 2 is the name, column 4 the vote code.
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.vote('yes', name)
        elif vtype == 'N':
            vote.vote('no', name)
        elif vtype in ('X', 'E'):
            vote.vote('other', name)

    yield vote
def record_votes(root, session, chamber):
    """Yield a ``VoteEvent`` for every valid vote element under ``root``.

    Candidate ``div`` elements are selected with the module-level
    ``vote_selectors`` and wrapped in ``MaybeVote``; invalid ones are
    skipped.  The first two characters of ``session`` are used as the
    legislative session, and no start date is set.
    """
    selector = '//div{}'.format(''.join(vote_selectors))
    for el in root.xpath(selector):
        mv = MaybeVote(el)
        if not mv.is_valid:
            continue

        if mv.passed:
            label, outcome = 'passage', 'pass'
        else:
            label, outcome = 'other', 'fail'

        v = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text=label,
            result=outcome,
            classification=label,
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber
        )

        v.set_count('yes', mv.yeas or 0)
        v.set_count('no', mv.nays or 0)
        v.set_count('not voting', mv.present or 0)

        for yea in mv.votes['yeas']:
            v.yes(yea)
        for nay in mv.votes['nays']:
            v.no(nay)
        # 'present' members are recorded as not voting.
        for present in mv.votes['present']:
            v.vote('not voting', present)
        for absent in mv.votes['absent']:
            v.vote('absent', absent)

        yield v
def parse_roll_call(self, bill, link, chamber, date):
    """Fetch the roll call behind ``link`` and return it as a ``VoteEvent``.

    'FP' / 'FINAL PASSAGE' motions are classified as passage, concurrence
    in amendments as amendment, and anything else as 'other' (the link text
    then serves as the motion).  Leave (LVE) and not-voting (N/V) members
    are combined into 'other'.

    Raises Exception when a member cell carries an unrecognised vote class.
    """
    url = link.attrib['href']
    page = lxml.html.fromstring(self.get(url).text)

    motion = page.xpath('string(//div[@class="Column-OneFourth"]/div[3])').strip()
    motion = re.sub(r'\s+', ' ', motion)
    if motion == 'FP':
        motion = 'FINAL PASSAGE'

    if motion == 'FINAL PASSAGE':
        classification = 'passage'
    elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
        classification = 'amendment'
    else:
        classification = 'other'
        motion = link.text_content()

    def total(label):
        # The number sits in the element right after the label's div.
        return int(page.xpath("//div[text() = '%s']" % label)[0].getnext().text)

    yeas = total('YEAS')
    nays = total('NAYS')
    lve = total('LVE')
    nv = total('N/V')
    other = lve + nv

    outcome = 'pass' if yeas > (nays + other) else 'fail'
    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=classification,
        result=outcome,
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.set_count('other', other)

    # Checked in order against the lower-cased class attribute.
    class_markers = (
        ('yea', 'yes'),
        ('nay', 'no'),
        ('nvote', 'other'),
        ('lve', 'other'),
    )
    for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = div.text_content().strip()
        name = re.sub(r'^[\s,]+', '', name)
        name = re.sub(r'[\s,]+$', '', name)
        class_attr = div.attrib['class'].lower()
        for marker, option in class_markers:
            if marker in class_attr:
                voteval = option
                break
        else:
            msg = 'Unrecognized vote val: %s' % class_attr
            raise Exception(msg)
        vote.vote(voteval, name)

    return vote
def scrape_vote(self, bill, vote_id, session):
    """Yield the vote for roll call ``vote_id``, fetched from the JSON API.

    Nothing is yielded when the response body is empty/falsy.  Not-voting,
    vacant, absent and conflict totals are summed into 'other'.  Voters are
    looked up in ``self.legislators_by_short``; unknown short names are
    logged and used as-is.
    """
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    form = {
        'rollCallId': vote_id,
        'sort': '',
        'group': '',
        'filter': '',
    }

    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if not page:
        return

    roll = page['Model']
    vote_chamber = self.chamber_map[roll['ChamberName']]

    # Timestamps look like "7/1/16 01:00 AM".
    taken_at = dt.datetime.strptime(roll['TakenAtDateTime'], '%m/%d/%y %I:%M %p')
    vote_date = taken_at.strftime('%Y-%m-%d')

    # TODO: What does this code mean?
    vote_motion = roll['RollCallVoteType']

    if roll['RollCallStatus'] == 'Passed':
        vote_passed = 'pass'
    else:
        vote_passed = 'fail'

    other_count = (
        int(roll['NotVotingCount'])
        + int(roll['VacantVoteCount'])
        + int(roll['AbsentVoteCount'])
        + int(roll['ConflictVoteCount'])
    )

    vote = Vote(chamber=vote_chamber,
                start_date=vote_date,
                motion_text=vote_motion,
                result=vote_passed,
                classification='other',
                bill=bill.identifier,
                legislative_session=session
                )
    vote.add_source(vote_url)
    vote.set_count('yes', roll['YesVoteCount'])
    vote.set_count('no', roll['NoVoteCount'])
    vote.set_count('other', other_count)

    for row in roll['AssemblyMemberVotes']:
        # AssemblyMemberId looks like it should work here, but for some
        # sessions it's bugged to only return session.
        try:
            voter = self.legislators_by_short[str(row['ShortName'])]
            name = voter['DisplayName']
        except KeyError:
            self.warning('could not find legislator short name %s',
                         row['ShortName'])
            name = row['ShortName']

        code = row['SelectVoteTypeCode']
        if code == 'Y':
            vote.yes(name)
        elif code == 'N':
            vote.no(name)
        else:
            vote.vote('other', name)

    yield vote
def handle_page(self):
    """Yield the committee ``VoteEvent`` described by ``self.doc``.

    The date, committee and action come from fixed-id ``span`` elements,
    the totals from the text of the last nested table, and the individual
    votes from the innermost table cells.  Parenthesised votes such as
    "(Y)" are skipped.

    Raises ValueError on an unknown vote marker.
    """
    (raw_date, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
    parsed = datetime.datetime.strptime(raw_date, '%m/%d/%Y %I:%M:%S %p')
    date = format_datetime(parsed, 'US/Eastern')

    totals = self.doc.xpath('//table//table')[-1].text_content()
    totals = re.sub(r'(?mu)\s+', " ", totals).strip()
    tally = re.search(
        r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
        r'Total Missed:\s+(\d+)', totals)
    yes_count, no_count, other_count = (int(x) for x in tally.groups())

    result = 'fail'
    if yes_count > no_count:
        result = 'pass'

    (committee, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
    (action, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs['bill'],
        chamber='lower',
        motion_text=motion,
        result=result,
        classification='committee',
    )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', other_count)

    for cell in self.doc.xpath('//table//table//table//td'):
        if not cell.text_content().strip():
            continue

        # span[2] holds the member, span[1] the vote marker.
        (member, ) = cell.xpath('span[2]//text()')
        (marker, ) = cell.xpath('span[1]//text()')

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote('not voting', member)
        elif re.search(r'\([YN]\)', marker):
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed.
            continue
        else:
            raise ValueError(
                "Unknown vote type found: {}".format(marker))

    yield vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Build a Senate ``VoteEvent`` from one vote record.

    Args:
        vote_data: mapping with ``'voteDate'`` (YYYY-MM-DD), ``'voteType'``
            ('FLOOR' or 'COMMITTEE'), ``'committee'`` (for committee
            votes) and ``'memberVotes'``, whose ``'items'`` maps roll codes
            to ``{'items': [legislator, ...]}``.
        bill: bill object passed through to ``VoteEvent``.
        url: recorded as the vote's source.

    Returns:
        The vote.  AYE and AYEWR count as yes, NAY as no, and EXC / ABS /
        ABD as 'other'; the result is 'pass' when yes outnumbers no.

    Raises:
        ValueError: for an unknown vote type or an unparsable date.

    Every roll code is read through the same guard.  The 'other' codes
    previously indexed ``['items']`` directly and raised KeyError when a
    code was present without a member list.
    """
    vote_datetime = datetime.datetime.strptime(vote_data['voteDate'],
                                               '%Y-%m-%d')
    if vote_data['voteType'] == 'FLOOR':
        motion = 'Floor Vote'
    elif vote_data['voteType'] == 'COMMITTEE':
        motion = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote = VoteEvent(
        chamber='upper',
        start_date=vote_datetime.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        # Placeholder; overwritten once the roll has been tallied.
        result='fail',
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data['memberVotes']['items']

    def members(code):
        # Legislators recorded under a roll code; empty when the code or
        # its 'items' list is missing.
        return (vote_rolls.get(code) or {}).get('items') or []

    yes_count, no_count, other_count = 0, 0, 0

    # Count all yea votes (AYEWR: aye with reservations).
    for code in ('AYE', 'AYEWR'):
        for legislator in members(code):
            vote.yes(legislator['fullName'])
            yes_count += 1

    # Count all nay votes.
    for legislator in members('NAY'):
        vote.no(legislator['fullName'])
        no_count += 1

    # Count all other types of votes.
    for code in ('EXC', 'ABS', 'ABD'):
        for legislator in members(code):
            vote.vote('other', legislator['fullName'])
            other_count += 1

    vote.result = 'pass' if yes_count > no_count else 'fail'
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    return vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Build a Senate ``VoteEvent`` from one vote record.

    Args:
        vote_data: mapping with ``"voteDate"`` (YYYY-MM-DD), ``"voteType"``
            ("FLOOR" or "COMMITTEE"), ``"committee"`` (for committee
            votes) and ``"memberVotes"``, whose ``"items"`` maps roll codes
            to ``{"items": [legislator, ...]}``.
        bill: bill object passed through to ``VoteEvent``.
        url: recorded as the vote's source.

    Returns:
        The vote.  AYE and AYEWR count as yes, NAY as no, and EXC / ABS /
        ABD as "other"; the result is "pass" when yes outnumbers no.

    Raises:
        ValueError: for an unknown vote type or an unparsable date.

    Every roll code is read through the same guard.  The "other" codes
    previously indexed ``["items"]`` directly and raised KeyError when a
    code was present without a member list.
    """
    vote_datetime = datetime.datetime.strptime(vote_data["voteDate"], "%Y-%m-%d")
    if vote_data["voteType"] == "FLOOR":
        motion = "Floor Vote"
    elif vote_data["voteType"] == "COMMITTEE":
        motion = "{} Vote".format(vote_data["committee"]["name"])
    else:
        raise ValueError("Unknown vote type encountered.")

    vote = VoteEvent(
        chamber="upper",
        start_date=vote_datetime.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        # Placeholder; overwritten once the roll has been tallied.
        result="fail",
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data["memberVotes"]["items"]

    def members(code):
        # Legislators recorded under a roll code; empty when the code or
        # its "items" list is missing.
        return (vote_rolls.get(code) or {}).get("items") or []

    yes_count, no_count, other_count = 0, 0, 0

    # Count all yea votes (AYEWR: aye with reservations).
    for code in ("AYE", "AYEWR"):
        for legislator in members(code):
            vote.yes(legislator["fullName"])
            yes_count += 1

    # Count all nay votes.
    for legislator in members("NAY"):
        vote.no(legislator["fullName"])
        no_count += 1

    # Count all other types of votes.
    for code in ("EXC", "ABS", "ABD"):
        for legislator in members(code):
            vote.vote("other", legislator["fullName"])
            other_count += 1

    vote.result = "pass" if yes_count > no_count else "fail"
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    return vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Build a Senate ``VoteEvent`` from one vote record.

    Args:
        vote_data: mapping with ``'voteDate'`` (YYYY-MM-DD), ``'voteType'``
            ('FLOOR' or 'COMMITTEE'), ``'committee'`` (for committee
            votes) and ``'memberVotes'``, whose ``'items'`` maps roll codes
            to ``{'items': [legislator, ...]}``.
        bill: bill object passed through to ``VoteEvent``.
        url: recorded as the vote's source.

    Returns:
        The vote.  AYE and AYEWR count as yes, NAY as no, and EXC / ABS /
        ABD as 'other'; the result is 'pass' when yes outnumbers no.

    Raises:
        ValueError: for an unknown vote type or an unparsable date.

    Every roll code is read through the same guard.  The 'other' codes
    previously indexed ``['items']`` directly and raised KeyError when a
    code was present without a member list.
    """
    vote_datetime = datetime.datetime.strptime(
        vote_data['voteDate'], '%Y-%m-%d')
    if vote_data['voteType'] == 'FLOOR':
        motion = 'Floor Vote'
    elif vote_data['voteType'] == 'COMMITTEE':
        motion = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote = VoteEvent(
        chamber='upper',
        start_date=vote_datetime.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        # Placeholder; overwritten once the roll has been tallied.
        result='fail',
        bill=bill,
    )
    vote.add_source(url)

    vote_rolls = vote_data['memberVotes']['items']

    def members(code):
        # Legislators recorded under a roll code; empty when the code or
        # its 'items' list is missing.
        return (vote_rolls.get(code) or {}).get('items') or []

    yes_count, no_count, other_count = 0, 0, 0

    # Count all yea votes (AYEWR: aye with reservations).
    for code in ('AYE', 'AYEWR'):
        for legislator in members(code):
            vote.yes(legislator['fullName'])
            yes_count += 1

    # Count all nay votes.
    for legislator in members('NAY'):
        vote.no(legislator['fullName'])
        no_count += 1

    # Count all other types of votes.
    for code in ('EXC', 'ABS', 'ABD'):
        for legislator in members(code):
            vote.vote('other', legislator['fullName'])
            other_count += 1

    vote.result = 'pass' if yes_count > no_count else 'fail'
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    return vote
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Build a populated ``VoteEvent`` from a parsed roll-call record.

    Args:
        session: legislative session identifier.
        bill_id: bill identifier; a leading 'S' marks an upper-chamber bill.
        url: roll-call URL, used as both the source and the ``pupa_id``.
        vote_record: mapping with ``'date'`` (must offer ``strftime``) and
            the voter collections ``'yes'``, ``'no'``, ``'excused'``,
            ``'absent'`` and ``'other'``.
        chamber: chamber in which the vote took place.
        motion_text: text of the motion.

    Returns:
        The vote event; its result is 'pass' only when yes voters strictly
        outnumber no voters.
    """
    passed = len(vote_record['yes']) > len(vote_record['no'])
    vote_event = VoteEvent(
        result='pass' if passed else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        # Compare by value: ``is`` against a string literal only works by
        # accident of interning and emits a SyntaxWarning on Python 3.8+.
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower'
    )
    vote_event.pupa_id = url

    vote_types = ['yes', 'no', 'excused', 'absent', 'other']
    for vote_type in vote_types:
        vote_event.set_count(vote_type, len(vote_record[vote_type]))
    for vote_type in vote_types:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
    """Record every action row on ``bill`` and yield the votes found.

    Each child of ``action_table`` after the first is read as
    (date, actor code, action text).  The action is added to ``bill`` with
    any committees that ``categorize_action`` reports and that
    ``self.short_ids`` knows by name.  When ``self.parse_vote`` finds a
    vote in the action text, a ``VoteEvent`` is yielded; it passes when the
    text contains "passed".

    Raises KeyError for an actor code missing from the lookup table.
    """
    actor_by_code = {
        "S": "upper",
        "H": "lower",
        "D": "legislature",  # "Data Systems",
        "$": "Appropriation measure",
        "CONAM": "Constitutional Amendment"
    }

    for row in action_table.xpath('*')[1:]:
        raw_date = row[0].text_content()
        date = dt.datetime.strptime(raw_date, "%m/%d/%Y").strftime('%Y-%m-%d')
        actor_code = row[1].text_content().upper()
        text = row[2].text_content()
        actor = actor_by_code[actor_code]

        act_type, committees = categorize_action(text)

        # XXX: Translate short-code to full committee name for the matcher.
        # Unknown short codes are dropped.
        real_committees = []
        for short_code in committees or ():
            try:
                real_committees.append(self.short_ids[short_code]['name'])
            except KeyError:
                pass

        act = bill.add_action(text, date, chamber=actor,
                              classification=act_type)
        for committee in real_committees:
            act.add_related_entity(name=committee, entity_type="organization")

        parsed = self.parse_vote(text)
        if not parsed:
            continue

        v, motion = parsed
        outcome = 'pass' if 'passed' in text.lower() else 'fail'
        vote = VoteEvent(start_date=date,
                         chamber=actor,
                         bill=bill_id,
                         bill_chamber=bill_chamber,
                         legislative_session=session,
                         motion_text=motion,
                         result=outcome,
                         classification='passage')
        vote.add_source(url)

        vote.set_count('yes', int(v['n_yes'] or 0))
        vote.set_count('no', int(v['n_no'] or 0))
        vote.set_count('not voting', int(v['n_excused'] or 0))

        # 'yes_resv' voters are recorded as plain yes votes.
        for voter in split_specific_votes(v['yes']):
            vote.yes(voter)
        for voter in split_specific_votes(v['yes_resv']):
            vote.yes(voter)
        for voter in split_specific_votes(v['no']):
            vote.no(voter)
        for voter in split_specific_votes(v['excused']):
            vote.vote('not voting', voter)

        yield vote
def handle_page(self):
    """Yield the committee ``VoteEvent`` described by ``self.doc``.

    Date, totals, committee and action are read from ``span`` elements
    whose ids contain ``lblDate``, ``lblYeas``, ``lblNays``, ``lblMissed``,
    ``lblCommittee`` and ``lblAction``.  Individual votes come from the
    ``li`` items of the ``vote-list`` lists.  Parenthesised votes such as
    "(Y)" are skipped.

    Raises ValueError on an unknown vote marker.
    """
    def first_text(label_id):
        # First text node of the span whose id contains ``label_id``.
        return self.doc.xpath(
            '//span[contains(@id, "%s")]/text()' % label_id)[0]

    raw_date, = self.doc.xpath('//span[contains(@id, "lblDate")]/text()')
    parsed = datetime.datetime.strptime(raw_date, '%m/%d/%Y %I:%M:%S %p')
    date = format_datetime(parsed, 'US/Eastern')

    yes_count = int(first_text("lblYeas"))
    no_count = int(first_text("lblNays"))
    other_count = int(first_text("lblMissed"))

    result = 'fail'
    if yes_count > no_count:
        result = 'pass'

    committee, = self.doc.xpath(
        '//span[contains(@id, "lblCommittee")]/text()')
    action, = self.doc.xpath('//span[contains(@id, "lblAction")]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(
        start_date=date,
        bill=self.kwargs['bill'],
        chamber='lower',
        motion_text=motion,
        result=result,
        classification='committee',
    )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', other_count)

    for item in self.doc.xpath('//ul[contains(@class, "vote-list")]/li'):
        if not item.text_content().strip():
            continue

        # span[2] holds the member, span[1] the vote marker.
        member, = item.xpath('span[2]//text()')
        marker, = item.xpath('span[1]//text()')

        if marker == "Y":
            vote.yes(member)
        elif marker == "N":
            vote.no(member)
        elif marker == "-":
            vote.vote('not voting', member)
        elif re.search(r'\([YN]\)', marker):
            # Parenthetical votes appear to not be counted in the
            # totals for Yea, Nay, _or_ Missed.
            continue
        else:
            raise ValueError(
                "Unknown vote type found: {}".format(marker))

    yield vote
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a committee vote parsed from the first table of ``page``.

    Voter names are the non-blank direct text nodes of the yes / no /
    other cells, taken from the 1st, 3rd and 5th cells of the row.  With
    fewer than three cells, only the first is used (as yes votes).
    ``uniqid`` becomes the vote's identifier and is kept in
    ``extras["_vote_id"]``.
    """
    table = page.xpath("//table")[0]
    row = table.xpath(".//tr")[0]
    if row[0].text_content() == "Votes:":
        # New website layout: the voter cells are in the third row.
        row = table.xpath(".//tr")[2]

    cells = row.xpath(".//td")
    if len(cells) < 3:
        yes = cells[0]
        no = other = None
    else:
        # NOTE(review): a row with 3 or 4 cells passes the guard above but
        # fails this 5-way unpack with ValueError -- confirm the layout.
        yes, _, no, _, other = row.xpath(".//td")[:5]

    def proc_block(obj, typ):
        # Collect the names listed in one cell.
        if obj is None:
            return {"type": None, "count": None, "votes": []}
        names = [text.strip() for text in obj.xpath("./text()") if text.strip()]
        return {"type": typ, "count": len(names), "votes": names}

    vote_dict = {
        "yes": proc_block(yes, "yes"),
        "no": proc_block(no, "no"),
        "other": proc_block(other, "other"),
    }

    yes_count = vote_dict["yes"]["count"]
    no_count = vote_dict["no"]["count"] or 0
    other_count = vote_dict["other"]["count"] or 0

    outcome = "pass" if (yes_count > no_count) else "fail"
    vote = Vote(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        identifier=str(uniqid),
        result=outcome,
        classification="passage",
        bill=bill,
    )
    vote.extras = {"_vote_id": uniqid}
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    for option, block in vote_dict.items():
        for voter in block["votes"]:
            vote.vote(option, voter)

    yield vote
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a committee vote parsed from the first table of ``page``.

    The yes / no / other voters are read from up to three cells of one
    table row; names are the text following each ``<br>`` in a cell.  When
    the row has fewer than three cells, only the first is used (as yes
    votes).  ``uniqid`` becomes the vote's identifier and is kept in
    ``extras['_vote_id']``.
    """
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        # New website layout: the voter cells are in the third row.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # Collect the names listed in one cell (one per <br> tail).
        if obj is None:
            return {"type": None, "count": None, "votes": []}
        names = []
        for br in obj.xpath(".//br"):
            # Test the tail text explicitly instead of relying on the
            # truthiness of the lxml element.
            tail = br.tail.strip() if br.tail else ''
            if tail:
                names.append(tail)
        return {"type": typ, "count": len(names), "votes": names}

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }

    yes_count = vote_dict['yes']['count']
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0

    # (A leftover debug print of the motion was removed here.)
    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                identifier=str(uniqid),
                result='pass' if (yes_count > no_count) else 'fail',
                classification='passage',
                bill=bill)
    vote.extras = {'_vote_id': uniqid}
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            vote.vote(key, voter)

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Yield the vote found in the plain-text roll call at ``url``.

    The page is matched against a "YEAS n ... NAYS n ... ABSENT [OR NOT
    VOTING] n ..." pattern; the text after each total is split on runs of
    two or more whitespace characters to obtain voter names.  ``url`` is
    also added to ``bill`` as a source.  The vote's chamber is ``actor``
    when that is "upper" or "lower", otherwise an empty string.
    """
    page = self.get(url).text
    bill.add_source(url)

    vote_re = re.compile(
        r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
        r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
        r"(\d+)(.*)",
        re.MULTILINE | re.DOTALL,
    )
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))

    passed = yes_count > no_count
    vote_chamber = actor if actor in ("upper", "lower") else ""

    vote = Vote(
        chamber=vote_chamber,
        start_date=date,
        motion_text=motion,
        result="pass" if passed else "fail",
        identifier=str(uniqid),
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)

    def names(group_index):
        # Names are separated by two or more whitespace characters.
        return re.split(r"\s{2,}", match.group(group_index).strip())

    yes_votes = names(2)
    no_votes = names(4)
    other_votes = names(7)

    for name in yes_votes:
        if name:
            vote.yes(name)
    for name in no_votes:
        if name:
            vote.no(name)
    for name in other_votes:
        if name:
            vote.vote("other", name)

    yield vote
def parse_vote(self, chamber, bill, row, action_text, action_date, url):
    """Yield a ``Vote`` built from the tallies displayed in an action row.

    The four totals (in favour, against, abstained, absent) are read from the
    ``smalltxt`` span that sits beside each Spanish label inside ``row``.
    The attached vote PDF is registered on the bill as a document.
    """

    def tally(label):
        # One lookup per label; the query text matches the page markup.
        query = (
            './/div[label[contains(text(), "%s")]]'
            '/span[contains(@class,"smalltxt")]/text()' % label
        )
        return int(row.xpath(query)[0])

    yes = tally("A Favor")
    no = tally("En Contra")
    abstain = tally("Abstenido")
    absent = tally("Ausente")

    vote_chamber = self.parse_vote_chamber(chamber, action_text)

    if u"Votación Final" in action_text:
        classification = "passage"
    else:
        classification = "other"

    vote = Vote(
        chamber=vote_chamber,
        start_date=action_date,
        motion_text=action_text,
        result="pass" if (yes > no) else "fail",
        bill=bill,
        classification=classification,
    )
    vote.add_source(url)
    for option, total in (
        ("yes", yes),
        ("no", no),
        ("absent", absent),
        ("abstain", abstain),
    ):
        vote.set_count(option, total)

    # The attached vote PDF should not become a bill version,
    # so it is added as a document instead.
    # TODO: maybe this should be set as the source?
    self.parse_version(bill, row, is_document=True)

    yield vote
def scrape_votes(self, bill):
    """Yield one ``Vote`` per roll call returned by the GetRollCalls service.

    The bill number is the second whitespace-separated token of
    ``bill.identifier``.  Absent and excused counts are summed into a single
    "other" count, and a vote passes only when the yeas outnumber nays plus
    others.
    """
    chamber_by_agency = {"House": "lower", "Senate": "upper"}

    bill_num = bill.identifier.split()[1]
    url = (
        "http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
        "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium)
    )
    response = self.get(url)
    page = lxml.etree.fromstring(response.content)

    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")

        # Only the date part of the timestamp is kept.
        day = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        chamber = chamber_by_agency[xpath(rc, "string(wa:Agency)")]

        if yes_count > (no_count + other_count):
            result = "pass"
        else:
            result = "fail"

        vote = Vote(
            chamber=chamber,
            start_date=date,
            motion_text="{} (#{})".format(motion, seq_no),
            result=result,
            classification="other",
            bill=bill,
        )
        for option, total in (("yes", yes_count), ("no", no_count), ("other", other_count)):
            vote.set_count(option, total)
        vote.add_source(url)

        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            vtype = xpath(sv, "string(wa:VOte)")
            if vtype == "Yea":
                vote.yes(name)
            elif vtype == "Nay":
                vote.no(name)
            else:
                vote.vote("other", name)

        yield vote
def viva_voce_votes(root, session, chamber):
    """Yield a ``VoteEvent`` for each valid "All Members are deemed" block.

    Every matching ``<div>`` is wrapped in ``MaybeViva``; invalid ones are
    skipped.  All four counts are recorded as zero and the start date is
    left as ``None``.
    """
    for el in root.xpath(u'//div[starts-with(., "All Members are deemed")]'):
        mv = MaybeViva(el)
        if mv.is_valid:
            v = VoteEvent(chamber=chamber,
                          start_date=None,
                          motion_text='passage' if mv.passed else 'other',
                          result='pass' if mv.passed else 'fail',
                          classification='passage' if mv.passed else 'other',
                          legislative_session=session[0:2],
                          bill=mv.bill_id,
                          bill_chamber=mv.chamber)
            for option in ('yes', 'no', 'absent', 'not voting'):
                v.set_count(option, 0)
            yield v
def handle_page(self):
    """Yield a committee ``VoteEvent`` parsed from ``self.doc``.

    The date, committee and action come from labelled ``<span>`` elements;
    the totals come from the text of the last nested table; individual votes
    come from the cells of the innermost tables.

    Yields:
        One ``VoteEvent`` for the bill given in ``self.kwargs['bill']``.

    Raises:
        ValueError: If a member's vote marker is not ``Y``, ``N``, ``-`` or
            a parenthesised ``(Y)`` / ``(N)``, or if a single-element
            unpacking finds zero or several nodes.
    """
    (date, ) = self.doc.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
    date = datetime.datetime.strptime(
        date, '%m/%d/%Y %I:%M:%S %p').isoformat().replace('T', ' ')

    totals = self.doc.xpath('//table//table')[-1].text_content()
    totals = re.sub(r'(?mu)\s+', " ", totals).strip()
    # Both fragments are raw strings so "\s" and "\d" reach the regex engine
    # without relying on Python keeping unknown escapes as they are.
    (yes_count, no_count, other_count) = [int(x) for x in re.search(
        r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
        r'Total Missed:\s+(\d+)', totals).groups()]
    result = 'pass' if yes_count > no_count else 'fail'

    (committee, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
    (action, ) = self.doc.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
    motion = "{} ({})".format(action, committee)

    vote = VoteEvent(start_date=date,
                     bill=self.kwargs['bill'],
                     chamber='lower',
                     motion_text=motion,
                     result=result,
                     classification='committee',
                     )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', other_count)

    for cell in self.doc.xpath('//table//table//table//td'):
        if not cell.text_content().strip():
            continue

        # The second span holds the member name, the first the vote marker.
        (member, ) = cell.xpath('span[2]//text()')
        (member_vote, ) = cell.xpath('span[1]//text()')

        if member_vote == "Y":
            vote.yes(member)
        elif member_vote == "N":
            vote.no(member)
        elif member_vote == "-":
            vote.vote('not voting', member)
        # Parenthetical votes appear to not be counted in the
        # totals for Yea, Nay, _or_ Missed
        elif re.search(r'\([YN]\)', member_vote):
            continue
        else:
            raise ValueError("Unknown vote type found: {}".format(member_vote))

    yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Yield a ``VoteEvent`` built from one vote row.

    The first ``<span>`` in ``row`` holds "<outcome>-<yes>-<no>-<other>".
    The tails of the second and third spans list the yes and no voters, and
    any later span whose text starts with "Absent" or "Excused" lists voters
    that are recorded as absent.
    """
    spans = row.xpath(".//span")

    # Fall back to a generic motion when the row has no usable text.
    motion = row.text.replace(u"\u00a0", " ").replace("-", "").strip() or "passage"

    summary = spans[0].text_content()
    passed, yes_count, no_count, other_count = summary.rsplit("-", 3)

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(("Absent", "Excused")):
            other_votes.extend(self.get_names(span.tail))

    # Map the outcome wording to a result; unrecognised text is kept as is.
    outcome_text = passed.lower()
    result = passed
    for keyword, value in (("adopted", "pass"), ("passed", "pass"), ("failed", "fail")):
        if keyword in outcome_text:
            result = value
            break

    vote = VoteEvent(
        chamber=actor,
        start_date=date,
        motion_text=motion,
        bill=bill_id,
        bill_chamber=bill_chamber,
        result=result,
        classification="passage",
        legislative_session=session,
    )
    vote.add_source(source)
    vote.set_count("yes", int(yes_count))
    vote.set_count("no", int(no_count))
    vote.set_count("absent", int(other_count))

    for record, names in ((vote.yes, yes_votes), (vote.no, no_votes)):
        for name in names:
            if name and name != "None":
                record(name)
    for name in other_votes:
        if name and name != "None":
            vote.vote("absent", name)

    yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield a ``VoteEvent`` for every vote posted for ``chamber``.

    Each vote date is posted to the chamber's ``votes.asp`` page and the
    parsed result is walked.  Votes whose bill cannot be found in
    ``self._bill_id_by_type`` are logged and skipped.

    Args:
        chamber: ``"upper"`` or ``"lower"``; selects the vote listing URL.
        session: Legislative session recorded on every vote.

    Yields:
        One ``VoteEvent`` per parsed vote with a known bill.
    """
    url = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes"),
    }[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)

    for date in dates:
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote['count']
                # The chamber is taken from the vote's own metadata.
                chamber = {
                    "H": "lower",
                    "S": "upper"
                }[vote['meta']['chamber']]

                try:
                    bill_id = self._bill_id_by_type[(chamber, vote['meta']['bill'])]
                except KeyError:
                    # Only a failed lookup means "unknown bill"; any other
                    # error is a real bug and must not be swallowed.
                    self.warning('no such bill_id %s %s', chamber, vote['meta']['bill'])
                    continue

                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote['time'].strftime('%Y-%m-%d'),
                    motion_text=vote['meta']['extra']['motion'],
                    result='pass' if count['passage'] else 'fail',
                    classification='passage',
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count('yes', int(count['YEAS']))
                v.set_count('no', int(count['NAYS']))
                v.set_count('other', int(count['NOT VOTING']))
                v.add_source(vote['source'])
                v.pupa_id = vote['source']

                for vt in vote['votes']:
                    key = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(vt['vote'], 'other')
                    v.vote(key, vt['name'])

                yield v
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Fetch a plain-text roll call and yield it as a ``Vote``.

    The page text is matched against a YEAS / NAYS / ABSENT layout in which
    each heading is followed by its count and then the voter names.  Names
    are separated by runs of two or more whitespace characters.

    Args:
        bill: Bill object; ``url`` is also added to it as a source.
        actor: Chamber of the action; anything other than ``'upper'`` or
            ``'lower'`` is recorded as an empty chamber.
        date: Start date passed through to the vote.
        motion: Motion text.
        url: Roll-call page URL.
        uniqid: Value stored (as a string) as the vote identifier.

    Yields:
        One populated ``Vote``.

    Raises:
        AttributeError: If the page does not match the expected layout.
    """
    page = self.get(url).text
    bill.add_source(url)

    # Raw strings: in a plain literal "\s" and "\d" are invalid escape
    # sequences, which newer Python versions warn about.
    vote_re = re.compile(r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                         r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
                         r'(\d+)(.*)',
                         re.MULTILINE | re.DOTALL)
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))

    passed = yes_count > no_count
    vote_chamber = actor if actor in ('upper', 'lower') else ''

    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                identifier=str(uniqid),
                classification='passage',
                bill=bill)
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    yes_votes = re.split(r'\s{2,}', match.group(2).strip())
    no_votes = re.split(r'\s{2,}', match.group(4).strip())
    other_votes = re.split(r'\s{2,}', match.group(7).strip())

    # Splitting an empty block yields [''], so blank names are skipped.
    for yes in yes_votes:
        if yes:
            vote.yes(yes)
    for no in no_votes:
        if no:
            vote.no(no)
    for other in other_votes:
        if other:
            vote.vote('other', other)

    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Download a text roll call from ``url`` and yield the parsed ``Vote``.

    Counts follow the YEAS, NAYS and ABSENT headings; the text after each
    count is that heading's list of voters, split on two-or-more whitespace
    characters.  The URL is added as a source to both the bill and the vote.
    """
    page = self.get(url).text
    bill.add_source(url)

    pattern = (
        r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
        r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
        r'(\d+)(.*)'
    )
    match = re.compile(pattern, re.MULTILINE | re.DOTALL).search(page)

    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))

    if actor == 'upper' or actor == 'lower':
        vote_chamber = actor
    else:
        vote_chamber = ''

    outcome = 'pass' if yes_count > no_count else 'fail'
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=motion,
                result=outcome,
                identifier=str(uniqid),
                classification='passage',
                bill=bill)
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)

    def names_in(group_number):
        # Name blocks are groups 2 (yes), 4 (no) and 7 (other).
        return re.split(r'\s{2,}', match.group(group_number).strip())

    yes_votes, no_votes, other_votes = names_in(2), names_in(4), names_in(7)

    for voter in filter(None, yes_votes):
        vote.yes(voter)
    for voter in filter(None, no_votes):
        vote.no(voter)
    for voter in filter(None, other_votes):
        vote.vote('other', voter)

    yield vote
def scrape_votes(self, bill):
    """Yield a ``Vote`` for each roll call the web service lists for ``bill``.

    Absent and excused totals are merged into one "other" count.  The motion
    text carries the roll call's sequence number, and the result is "pass"
    only when yeas exceed nays plus others.
    """
    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium))
    page = lxml.etree.fromstring(self.get(url).content)

    for rc in xpath(page, "//wa:RollCall"):

        def count_of(tag):
            # Each tally lives in <tag>/<Count> under the roll call.
            return int(xpath(rc, "string(wa:%s/wa:Count)" % tag))

        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")

        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

        yes_count = count_of("YeaVotes")
        no_count = count_of("NayVotes")
        abs_count = count_of("AbsentVotes")
        ex_count = count_of("ExcusedVotes")
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

        passed = yes_count > (no_count + other_count)
        vote = Vote(chamber=chamber,
                    start_date=date,
                    motion_text='{} (#{})'.format(motion, seq_no),
                    result='pass' if passed else 'fail',
                    classification='other',
                    bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            vtype = xpath(sv, "string(wa:VOte)")

            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)

        yield vote
def scrape_votes(self, bill):
    """Query the roll-call web service for ``bill`` and yield its votes.

    Each ``RollCall`` element becomes one ``Vote``.  Absent and excused
    counts are combined as "other", and a vote is marked "pass" only when
    the yeas exceed the nays and others together.
    """
    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (bill_num, self.biennium))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)

    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")

        # Drop the time portion of the timestamp before parsing.
        raw_date = xpath(rc, "string(wa:VoteDate)")
        date = datetime.datetime.strptime(raw_date.split("T")[0], "%Y-%m-%d").date()

        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        other_count = abs_count + ex_count

        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

        if yes_count > (no_count + other_count):
            result = 'pass'
        else:
            result = 'fail'

        vote = Vote(chamber=chamber,
                    start_date=date,
                    motion_text=motion,
                    result=result,
                    classification='other',
                    bill=bill)
        for option, total in (('yes', yes_count), ('no', no_count), ('other', other_count)):
            vote.set_count(option, total)
        vote.add_source(url)

        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            vtype = xpath(sv, "string(wa:VOte)")
            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)

        yield vote
def parse_committee_votes(self, bill, url):
    """Yield a ``VoteEvent`` for each committee vote linked from ``url``.

    The chamber is taken from the page's ``<h1>`` text and the committee
    name from the second-to-last text node of the first ``<h2>``.  Every
    ``listVoteSummary.cfm`` link is followed to obtain its roll call.
    """
    bill.add_source(url)
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    if 'Senate' in doc.xpath('string(//h1)'):
        chamber = 'upper'
    else:
        chamber = 'lower'
    committee = list(doc.xpath('//h2')[0].itertext())[-2].strip()

    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

        # Date: try each known format; the raw text is kept if none parses.
        date_text = link.xpath('../../td')[0].text_content()
        date = date_text
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            try:
                date = datetime.datetime.strptime(date_text, fmt)
            except ValueError:
                continue
            break

        # Motion: the text after the last " - " in the link.
        motion = 'Committee vote (%s): %s' % (
            committee, link.text_content().split(' - ')[-1].strip())

        # Roll call
        vote_url = link.attrib['href']
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification='other',
            result='pass' if rollcall['passed'] else 'fail',
            bill=bill,
        )
        vote.pupa_id = vote_url
        vote.set_count('yes', rollcall['yes_count'])
        vote.set_count('no', rollcall['no_count'])
        vote.set_count('other', rollcall['other_count'])

        for voteval in ('yes', 'no', 'other'):
            names = rollcall.get(voteval + '_votes', [])
            for name in names:
                vote.vote(voteval, name)

        vote.add_source(url)
        vote.add_source(vote_url)

        yield vote
def viva_voce_votes(root, session, chamber):
    """Generate vote events for "All Members are deemed" journal entries.

    Each matching ``<div>`` is interpreted by ``MaybeViva``.  Entries that
    are not valid are ignored; the rest become a ``VoteEvent`` with every
    count set to zero and no start date.
    """
    blocks = root.xpath(u'//div[starts-with(., "All Members are deemed")]')
    for el in blocks:
        mv = MaybeViva(el)
        if not mv.is_valid:
            continue

        v = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text="passage" if mv.passed else "other",
            result="pass" if mv.passed else "fail",
            classification="passage" if mv.passed else "other",
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber,
        )
        # No individual tallies are recorded for these entries.
        for option in ("yes", "no", "absent", "not voting"):
            v.set_count(option, 0)

        yield v
def viva_voce_votes(root, session, chamber):
    """Yield zero-count ``VoteEvent`` objects for viva voce entries.

    ``root`` is searched for ``<div>`` elements whose text starts with
    "All Members are deemed".  Each is handed to ``MaybeViva`` and only
    valid results produce a vote.
    """
    zeroed_counts = ('yes', 'no', 'absent', 'not voting')

    for el in root.xpath(u'//div[starts-with(., "All Members are deemed")]'):
        mv = MaybeViva(el)
        if not mv.is_valid:
            continue

        event_fields = dict(
            chamber=chamber,
            start_date=None,
            motion_text='passage' if mv.passed else 'other',
            result='pass' if mv.passed else 'fail',
            classification='passage' if mv.passed else 'other',
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber,
        )
        v = VoteEvent(**event_fields)
        for option in zeroed_counts:
            v.set_count(option, 0)
        yield v
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Yield the ``VoteEvent`` described by ``row``.

    ``row`` must contain at least three ``<span>`` elements: a summary of
    the form "<outcome>-<yes>-<no>-<other>", then the yes and no headings
    whose tails list the voters.  Later spans starting with "Absent" or
    "Excused" contribute voters counted as absent.
    """

    def listed(names):
        # Skip blanks and the literal "None" placeholder.
        return [name for name in names if name and name != 'None']

    spans = row.xpath('.//span')

    motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    if not motion:
        motion = "passage"

    passed, yes_count, no_count, other_count = spans[0].text_content().rsplit('-', 3)

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(('Absent', 'Excused')):
            other_votes += self.get_names(span.tail)

    # Translate the outcome wording; the first keyword found decides.
    lowered = passed.lower()
    if 'adopted' in lowered:
        passed = 'pass'
    elif 'passed' in lowered:
        passed = 'pass'
    elif 'failed' in lowered:
        passed = 'fail'

    vote = VoteEvent(chamber=actor,
                     start_date=date,
                     motion_text=motion,
                     bill=bill_id,
                     bill_chamber=bill_chamber,
                     result=passed,
                     classification="passage",
                     legislative_session=session)
    vote.add_source(source)
    vote.set_count('yes', int(yes_count))
    vote.set_count('no', int(no_count))
    vote.set_count('absent', int(other_count))

    for name in listed(yes_votes):
        vote.yes(name)
    for name in listed(no_votes):
        vote.no(name)
    for name in listed(other_votes):
        vote.vote('absent', name)

    yield vote
def parse_committee_votes(self, bill, url):
    """Scrape the committee vote listing at ``url`` and yield its votes.

    One ``VoteEvent`` is produced per ``listVoteSummary.cfm`` link.  The
    roll call behind each link is obtained from
    ``parse_upper_committee_vote_rollcall``; both the listing and the
    roll-call URL are recorded as sources.
    """
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    heading = doc.xpath("string(//h1)")
    chamber = "upper" if "Senate" in heading else "lower"

    h2_texts = tuple(doc.xpath("//h2")[0].itertext())
    committee = h2_texts[-2].strip()

    date_formats = ("%m/%d/%Y", "%m-%d-%Y")
    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):
        # Date: first format that parses wins.
        for fmt in date_formats:
            date = link.xpath("../../td")[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            else:
                break

        # Motion
        motion_tail = link.text_content().split(" - ")[-1].strip()
        motion = "Committee vote (%s): %s" % (committee, motion_tail)

        # Roll call
        vote_url = link.attrib["href"]
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

        result = "pass" if rollcall["passed"] else "fail"
        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification="other",
            result=result,
            bill=bill,
        )
        vote.pupa_id = vote_url
        for option in ("yes", "no", "other"):
            vote.set_count(option, rollcall[option + "_count"])

        for voteval in ("yes", "no", "other"):
            for name in rollcall.get(voteval + "_votes", []):
                vote.vote(voteval, name)

        vote.add_source(url)
        vote.add_source(vote_url)

        yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield a ``VoteEvent`` for each vote found on the chamber's vote pages.

    Vote dates are looked up first; each date is posted to ``votes.asp`` and
    the parsed page is walked.  A vote whose bill is missing from
    ``self._bill_id_by_type`` is logged with a warning and skipped.
    """
    chamber_codes = {"H": "lower", "S": "upper"}
    option_codes = {"Y": "yes", "N": "no"}

    listing = {"upper": "SVotes", "lower": "HVotes"}[chamber]
    url = "%s/%s" % (RI_URL_BASE, listing)
    action = "%s/%s" % (url, "votes.asp")

    for date in self.get_vote_dates(url, session):
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote["count"]
                meta = vote["meta"]
                # The vote's own metadata decides the chamber.
                vote_chamber = chamber_codes[meta["chamber"]]

                try:
                    bill_id = self._bill_id_by_type[(vote_chamber, meta["bill"])]
                except KeyError:
                    self.warning("no such bill_id %s %s", vote_chamber, meta["bill"])
                    continue

                v = VoteEvent(
                    chamber=vote_chamber,
                    start_date=vote["time"].strftime("%Y-%m-%d"),
                    motion_text=meta["extra"]["motion"],
                    result="pass" if count["passage"] else "fail",
                    classification="passage",
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=vote_chamber,
                )
                v.set_count("yes", int(count["YEAS"]))
                v.set_count("no", int(count["NAYS"]))
                v.set_count("other", int(count["NOT VOTING"]))
                v.add_source(vote["source"])
                v.pupa_id = vote["source"]

                for vt in vote["votes"]:
                    v.vote(option_codes.get(vt["vote"], "other"), vt["name"])

                yield v
def scrape_chamber_votes(self, chamber, session):
    """Scrape every posted vote for ``chamber`` and yield ``VoteEvent`` objects.

    For each available vote date the ``votes.asp`` form is posted and the
    response parsed.  Votes referring to a bill that is not present in
    ``self._bill_id_by_type`` produce a warning and are skipped.
    """
    if chamber == "upper":
        url = "%s/%s" % (RI_URL_BASE, "SVotes")
    else:
        # Any other key is invalid, exactly as with a dict lookup.
        url = {"lower": "%s/%s" % (RI_URL_BASE, "HVotes")}[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)

    for date in dates:
        response = self.post_to(action, date)
        votes = self.parse_vote_page(response, url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote['count']
                chamber = {"H": "lower", "S": "upper"}[vote['meta']['chamber']]

                try:
                    lookup_key = (chamber, vote['meta']['bill'])
                    bill_id = self._bill_id_by_type[lookup_key]
                except KeyError:
                    self.warning('no such bill_id %s %s', chamber, vote['meta']['bill'])
                    continue

                result = 'pass' if count['passage'] else 'fail'
                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote['time'].strftime('%Y-%m-%d'),
                    motion_text=vote['meta']['extra']['motion'],
                    result=result,
                    classification='passage',
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count('yes', int(count['YEAS']))
                v.set_count('no', int(count['NAYS']))
                v.set_count('other', int(count['NOT VOTING']))
                v.add_source(vote['source'])
                v.pupa_id = vote['source']

                for vt in vote['votes']:
                    # Anything that is not an explicit Y or N counts as "other".
                    if vt['vote'] == 'Y':
                        key = 'yes'
                    elif vt['vote'] == 'N':
                        key = 'no'
                    else:
                        key = 'other'
                    v.vote(key, vt['name'])

                yield v
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """Parse one vote row and yield the resulting ``VoteEvent``.

    The row's first ``<span>`` gives the outcome and the three counts,
    separated by hyphens.  The second and third spans are followed by the
    yes and no voter lists.  Spans after that are read only when their text
    begins with "Absent" or "Excused"; those voters are stored as absent.
    """
    spans = row.xpath('.//span')

    cleaned = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    motion = cleaned if cleaned else "passage"

    pieces = spans[0].text_content().rsplit('-', 3)
    passed, yes_count, no_count, other_count = pieces

    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)

    other_votes = []
    for extra in spans[3:]:
        if not extra.text.startswith(('Absent', 'Excused')):
            continue
        other_votes += self.get_names(extra.tail)

    # Outcome keywords in priority order; unknown wording is left unchanged.
    outcomes = {'adopted': 'pass', 'passed': 'pass', 'failed': 'fail'}
    matched = [value for key, value in outcomes.items() if key in passed.lower()]
    if matched:
        passed = matched[0]

    vote = VoteEvent(chamber=actor,
                     start_date=date,
                     motion_text=motion,
                     bill=bill_id,
                     bill_chamber=bill_chamber,
                     result=passed,
                     classification="passage",
                     legislative_session=session)
    vote.add_source(source)
    for option, raw in (('yes', yes_count), ('no', no_count)):
        vote.set_count(option, int(raw))
    vote.set_count('absent', int(other_count))

    for name in yes_votes:
        if not name or name == 'None':
            continue
        vote.yes(name)
    for name in no_votes:
        if not name or name == 'None':
            continue
        vote.no(name)
    for name in other_votes:
        if not name or name == 'None':
            continue
        vote.vote('absent', name)

    yield vote
def asvote(self):
    """Return this roll call as a ``VoteEvent``.

    Chamber, date, motion, outcome, counts and voter lists all come from the
    object's own accessor methods; ``self.url`` is recorded as the source.
    """
    result = 'pass' if self.passed() else 'fail'
    v = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result=result,
        classification='passage',
        bill=self.bill,
    )

    # Accessors are called lazily so the call order is count-by-count.
    for option, counter in (
        ('yes', self.yes_count),
        ('no', self.no_count),
        ('other', self.other_count),
    ):
        v.set_count(option, counter())

    for voter in self.yes_votes():
        v.yes(voter)
    for voter in self.no_votes():
        v.no(voter)
    for voter in self.other_votes():
        v.vote('other', voter)

    v.add_source(self.url)
    return v
def parse_vote(
    self, bill, journal_entry_number, action, act_chamber, act_date, url
):
    """Yield a ``VoteEvent`` whose tallies are read from the action text.

    Whitespace-separated tokens made of a non-digit followed by a digit are
    treated as tallies: a token containing "Y" sets the yes count, "N" the
    no count, and "E" or "A" adds to the other count.  The result follows
    the words PASSED / FAILED in the action, falling back to comparing yes
    and no.
    """
    yes = 0
    no = 0
    other = 0

    for token in action.split():
        if not re.match(r"[\D][\d]", token):
            continue
        if "Y" in token:
            yes = int(token[1:])
        elif "N" in token:
            no = int(token[1:])
        elif "E" in token or "A" in token:
            other += int(token[1:])

    # Explicit wording wins over the arithmetic comparison.
    for marker, outcome in (("PASSED", "pass"), ("FAILED", "fail")):
        if marker in action:
            result = outcome
            break
    else:
        result = "pass" if yes > no else "fail"

    vote = VoteEvent(
        bill=bill,
        start_date=act_date.strftime("%Y-%m-%d"),
        chamber=act_chamber,
        motion_text=action + " #" + journal_entry_number,
        result=result,
        classification="passage",
    )
    for option, total in (("yes", yes), ("no", no), ("other", other)):
        vote.set_count(option, total)
    vote.add_source(url)

    yield vote
def asvote(self):
    """Convert this roll call into a ``VoteEvent`` and return it.

    The event's ``pupa_id`` is set to ``self.url``, which is also added as
    the source.  Counts and voter names are pulled from the object's
    accessor methods.
    """
    outcome = "pass" if self.passed() else "fail"
    v = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result=outcome,
        classification="passage",
        bill=self.bill,
    )
    v.pupa_id = self.url  # URL contains sequence number

    v.set_count("yes", self.yes_count())
    v.set_count("no", self.no_count())
    v.set_count("other", self.other_count())

    recorders = (
        (self.yes_votes, v.yes),
        (self.no_votes, v.no),
        (self.other_votes, lambda voter: v.vote("other", voter)),
    )
    for voters, record in recorders:
        for voter in voters():
            record(voter)

    v.add_source(self.url)
    return v
def process_committee_vote(self, committee_action, bill):
    """Build a committee ``VoteEvent`` from a committee action record.

    Args:
        committee_action: Mapping with an ``"ActionDate"`` value and a
            ``"Vote"`` list whose items carry ``"VoteType"`` and
            ``"VoteCount"``.
        bill: Bill the vote is attached to.

    Returns:
        The populated ``VoteEvent``, or ``None`` (after logging a warning)
        when either key is missing from ``committee_action``.
    """
    try:
        date = committee_action["ActionDate"]
        vote_info = committee_action["Vote"]
    except KeyError:
        self.logger.warning("Committee vote has no data. Skipping.")
        return
    date = self.date_format(date)

    # Start every tally at zero so a payload without a "Yes" or "No" entry
    # does not leave a count unbound when the result is computed.
    yes_count = 0
    no_count = 0
    other_count = 0
    for entry in vote_info:
        # An empty string stands for a zero count.
        vote_count = 0 if entry["VoteCount"] == "" else int(entry["VoteCount"])
        if entry["VoteType"] == "Yes":
            yes_count = vote_count
        elif entry["VoteType"] == "No":
            no_count = vote_count
        else:
            other_count += vote_count

    result = "pass" if yes_count > no_count else "fail"
    v = VoteEvent(
        chamber="legislature",
        start_date=date,
        motion_text="Committee Vote",
        result=result,
        classification="committee",
        bill=bill,
    )
    v.set_count("yes", yes_count)
    v.set_count("no", no_count)
    v.set_count("other", other_count)
    return v
def process_committee_vote(self, committee_action, bill):
    """Build a committee ``VoteEvent`` from a committee action record.

    Args:
        committee_action: Mapping with an ``"ActionDate"`` value and a
            ``"Vote"`` list whose items carry ``"VoteType"`` and
            ``"VoteCount"``.
        bill: Bill the vote is attached to.

    Returns:
        The populated ``VoteEvent``, or ``None`` (after logging a warning)
        when either key is missing from ``committee_action``.
    """
    try:
        date = committee_action["ActionDate"]
        vote_info = committee_action["Vote"]
    except KeyError:
        self.logger.warning("Committee vote has no data. Skipping.")
        return
    date = self.date_format(date)

    # All tallies default to zero; without this a record lacking a "Yes" or
    # "No" entry raised UnboundLocalError at the comparison below.
    yes_count = no_count = other_count = 0
    for tally in vote_info:
        vote_count = 0 if tally["VoteCount"] == "" else int(tally["VoteCount"])
        if tally["VoteType"] == "Yes":
            yes_count = vote_count
        elif tally["VoteType"] == "No":
            no_count = vote_count
        else:
            other_count += vote_count

    result = 'fail'
    if yes_count > no_count:
        result = 'pass'

    v = VoteEvent(chamber='legislature',
                  start_date=date,
                  motion_text='Committee Vote',
                  result=result,
                  classification='committee',
                  bill=bill
                  )
    v.set_count('yes', yes_count)
    v.set_count('no', no_count)
    v.set_count('other', other_count)
    return v
def process_committee_vote(self, committee_action, bill):
    """Build a committee ``VoteEvent`` from a committee action record.

    Args:
        committee_action: Mapping with an ``"ActionDate"`` value and a
            ``"Vote"`` list whose items carry ``"VoteType"`` and
            ``"VoteCount"``.
        bill: Bill the vote is attached to.

    Returns:
        The populated ``VoteEvent``, or ``None`` (after logging a warning)
        when either key is missing from ``committee_action``.
    """
    try:
        date = committee_action["ActionDate"]
        vote_info = committee_action["Vote"]
    except KeyError:
        self.logger.warning("Committee vote has no data. Skipping.")
        return
    date = self.date_format(date)

    # Zero-initialised so that a missing "Yes" or "No" entry is treated as a
    # count of zero and never as an unbound local.
    counts = {'yes': 0, 'no': 0, 'other': 0}
    for item in vote_info:
        vote_count = 0 if item["VoteCount"] == "" else int(item["VoteCount"])
        if item["VoteType"] == "Yes":
            counts['yes'] = vote_count
        elif item["VoteType"] == "No":
            counts['no'] = vote_count
        else:
            counts['other'] += vote_count

    result = 'pass' if counts['yes'] > counts['no'] else 'fail'
    v = VoteEvent(chamber='legislature',
                  start_date=date,
                  motion_text='Committee Vote',
                  result=result,
                  classification='committee',
                  bill=bill)
    for option in ('yes', 'no', 'other'):
        v.set_count(option, counts[option])
    return v
def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy): result_types = { 'FAILED': False, 'DEFEATED': False, 'PREVAILED': True, 'PASSED': True, 'SUSTAINED': True, 'NOT SECONDED': False, 'OVERRIDDEN': True, 'ADOPTED': True, } for r in rollcalls: proxy_link = proxy["url"] + r["link"] (path, resp) = self.urlretrieve(proxy_link) text = convert_pdf(path, 'text').decode("utf-8") lines = text.split("\n") os.remove(path) chamber = "lower" if "house of representatives" in lines[0].lower() else "upper" date_parts = lines[1].strip().split()[-3:] date_str = " ".join(date_parts).title() + " " + lines[2].strip() vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p") vote_date = pytz.timezone('America/Indiana/Indianapolis').localize(vote_date) vote_date = vote_date.isoformat() passed = None for res, val in result_types.items(): # We check multiple lines now because the result of the # roll call vote as parsed can potentially be split. # PDF documents suck. for line in lines[3:5]: if res in line.upper(): passed = val break if passed is None: raise AssertionError("Missing bill passage type") motion = " ".join(lines[4].split()[:-2]) try: yeas = int(lines[4].split()[-1]) nays = int(lines[5].split()[-1]) excused = int(lines[6].split()[-1]) not_voting = int(lines[7].split()[-1]) except ValueError: self.logger.warning("Vote format is weird, skipping") continue vote = VoteEvent(chamber=chamber, legislative_session=session, bill=bill_id, bill_chamber=original_chamber, start_date=vote_date, motion_text=motion, result="pass" if passed else "fail", classification="passage") vote.set_count('yes', yeas) vote.set_count('no', nays) vote.set_count('excused', excused) vote.set_count('not voting', not_voting) vote.add_source(proxy_link) currently_counting = "" possible_vote_lines = lines[8:] for l in possible_vote_lines: l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING") l = l.replace("\xc2\xa0", " -") if "yea-" in l.lower().replace(" ", ""): 
currently_counting = "yes" elif "nay-" in l.lower().replace(" ", ""): currently_counting = "no" elif "excused-" in l.lower().replace(" ", ""): currently_counting = "excused" elif "notvoting-" in l.lower().replace(" ", ""): currently_counting = "not voting" elif currently_counting == "": pass elif re.search(r'v\. \d\.\d', l): # this gets rid of the version number # which is often found at the bottom of the doc pass else: voters = l.split(" ") for v in voters: if v.strip(): vote.vote(currently_counting, v.strip()) yield vote
def parse_vote_pdf(self, vote_url, bill):
    """Download a roll-call PDF and return it as a ``VoteEvent``.

    The PDF text is expected to start with a "Calendar Date:" line, contain
    one line with both "Yeas" and "Nays" that carries the counts, and list
    voter names below that line grouped by vote type.

    Args:
        vote_url: URL of the PDF; the chamber is "upper" when it contains
            "Senate", otherwise "lower".
        bill: Bill the vote belongs to; its ``identifier`` is part of the
            vote's ``pupa_id``.

    Returns:
        The populated ``VoteEvent``.

    Raises:
        ValueError: If the counts line is not found, or if the number of
            names extracted per vote type differs from the stated counts.
    """
    # NOTE(review): the downloaded file is not removed here — confirm whether
    # the caller or urlretrieve is expected to clean it up.
    filename, response = self.urlretrieve(vote_url)
    text = convert_pdf(filename, type="text").decode()
    lines = text.splitlines()

    if "Senate" in vote_url:
        chamber = "upper"
    else:
        chamber = "lower"

    date_string = lines[0].split("Calendar Date:")[1].strip()
    date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

    # Locate the first line holding the totals ("... Yeas   ... Nays ...").
    page_index = None
    for index, line in enumerate(lines):
        if "Yeas" in line and "Nays" in line:
            page_index = index
            break

    vote_counts = 5 * [0]
    vote_types = ["yes", "no", "not voting", "excused", "absent"]

    # NOTE(review): a match on line 0 is also treated as "not found" because
    # 0 is falsy; line 0 is read as the date line above, so this is presumably
    # unreachable — confirm.
    if page_index:
        # Each column reads "<number> <label>"; columns are separated by
        # two or more whitespace characters.
        counts = re.split(r"\s{2,}", lines[page_index].strip())
        for index, count in enumerate(counts):
            number, string = count.split(" ", 1)
            number = int(number)
            vote_counts[index] = number
    else:
        raise ValueError("Vote Counts Not found at %s" % vote_url)

    passed = vote_counts[0] > vote_counts[1]

    # Consent calendar votes address multiple bills in one VoteEvent
    # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
    is_consent_calendar = any(
        ["Consent Calendar" in line for line in lines[:page_index]]
    )
    consent_calendar_bills = None
    motion = ""
    if is_consent_calendar:
        motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
        consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
        assert (
            consent_calendar_bills
        ), "Could not find bills for consent calendar vote"

    motion_keywords = [
        "favorable",
        "reading",
        "amendment",
        "motion",
        "introduced",
        "bill pass",
        "committee",
    ]
    motion_lines = [
        3,
        2,
        4,
        5,
    ]  # Relative LineNumbers to be checked for existence of motion

    # Walk the candidate lines above the totals until the current motion
    # text contains one of the keywords.  The ``else`` branch runs only when
    # the loop finishes without a break.
    for i in motion_lines:
        if any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            break
        motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
    else:
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # This condition covers for the bad formatting in SB 1260
            motion = lines[page_index - 3]
        if not any(
            motion_keyword in motion.lower() for motion_keyword in motion_keywords
        ):
            # Check this one for SB 747
            motion = "No motion given"
            self.warning("No motion given")

    vote = VoteEvent(
        bill=bill,
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="pass" if passed else "fail",
    )

    # Include bill ID to avoid duplication for consent calendars
    vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

    for index, vote_type in enumerate(vote_types):
        vote.set_count(vote_type, vote_counts[index])
    # Skip past the totals line to where the name lists start.
    page_index = page_index + 2

    # Keywords for identifying where names are located in the pdf
    show_stoppers = [
        "Voting Nay",
        "Not Voting",
        "COPY",
        "Excused",
        "indicates vote change",
        "Indicates Vote Change",
    ]
    # Index into vote_types of the group currently being read; each
    # show-stopper line advances it by one.
    vote_index = 0

    # For matching number of names extracted with vote counts(extracted independently)
    vote_name_counts = 5 * [0]

    while page_index < len(lines):
        current_line = lines[page_index].strip()

        # Blank lines and the "Voting Yea" heading carry no names.
        if not current_line or "Voting Yea" in current_line:
            page_index += 1
            continue

        if any(show_stopper in current_line for show_stopper in show_stoppers):
            page_index += 1
            vote_index = vote_index + 1
            continue

        names = re.split(r"\s{2,}", current_line)

        vote_name_counts[vote_index] += len(names)

        for name in names:
            vote.vote(vote_types[vote_index], name)
        page_index += 1

    if vote_counts != vote_name_counts:
        raise ValueError("Votes Count and Number of Names don't match")

    return vote
def scrape_vote(self, bill, name, url):
    """Yield a ``Vote`` parsed from a vote tally page.

    House pages (URL containing ``VOTE/h``) use four name columns per row;
    all other pages are treated as Senate pages with two.  Pages containing
    "BUDGET ADDRESS" produce nothing.

    Args:
        bill: Bill the vote belongs to; its ``legislative_session`` supplies
            the year for the vote date.
        name: Motion text for the vote.
        url: URL of the tally page.

    Yields:
        One populated ``Vote``.
    """
    if "VOTE/h" in url:
        vote_chamber = 'lower'
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = 'upper'
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    # Connecticut's SSL is causing problems with Scrapelib, so use Requests
    # NOTE(review): verify=False disables TLS certificate verification;
    # kept as is because it is a deliberate workaround, but worth revisiting.
    page = requests.get(url, verify=False).text

    if 'BUDGET ADDRESS' in page:
        return

    page = lxml.html.fromstring(page)

    yes_count = page.xpath(
        "string(//span[contains(., 'Those voting Yea')])")
    yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1))

    no_count = page.xpath(
        "string(//span[contains(., 'Those voting Nay')])")
    no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1))

    other_count = page.xpath(
        "string(//span[contains(., 'Those absent')])")
    other_count = int(
        re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1))

    need_count = page.xpath(
        "string(//span[contains(., 'Necessary for')])")
    need_count = int(
        re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1))

    date = page.xpath("string(//span[contains(., 'Taken on')])")
    date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
    date = date.replace(' ', '')
    # The page shows only month/day; the year comes from the session.
    date = datetime.datetime.strptime(date + " " + bill.legislative_session,
                                      "%m/%d %Y").date()

    # not sure about classification.
    # "Necessary for ..." is the number of yeas required, so reaching it
    # exactly is enough to pass.
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                result='pass' if yes_count >= need_count else 'fail',
                classification='passage',
                bill=bill
                )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = page.xpath("//table")[0]
    for row in table.xpath("tr"):
        for i in cols:
            # Distinct local name so the ``name`` (motion) parameter is not
            # shadowed.
            voter = row.xpath("string(td[%d])" % (
                i + name_offset)).strip()

            if not voter or voter == 'VACANT':
                continue

            if "Y" in row.xpath("string(td[%d])" %
                                (i + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" %
                                  (i + no_offset)):
                vote.no(voter)
            else:
                vote.vote('other', voter)

    yield vote
def parse_vote(self, bill, link):
    """Fetch a roll-call page and yield the vote it describes.

    Args:
        bill: Bill the vote is attached to.
        link: URL of the roll-call page; also used as the vote's
            ``pupa_id`` and source.

    Yields:
        One ``VoteEvent`` when the page has count headings (any ``<h3>``
        after the first inside ``#main_content``).  Nothing is yielded,
        and a warning is logged, for throttled, missing or vote-less pages.
    """
    # Server sometimes sends proper error headers,
    # sometimes not
    # NOTE(review): requests.get() does not raise HTTPError by itself
    # (only raise_for_status() does), so this handler is likely never hit;
    # error pages are caught by the text checks below instead.
    try:
        self.info("Get {}".format(link))
        text = requests.get(link).text
    except requests.exceptions.HTTPError as err:
        self.warning("{} fetching vote {}, skipping".format(err, link))
        return

    if "Varnish cache server" in text:
        self.warning("Scrape rate is too high, try re-scraping with "
                     "The --rpm set to a lower number")
        return

    if "Page Not Found" in text or "Page Unavailable" in text:
        self.warning("missing vote, skipping")
        return

    member_doc = lxml.html.fromstring(text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    chamber_date_line = "".join(
        member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
    chamber_date_line_words = chamber_date_line.split()
    # First word is the chamber, last word the date; the words from the
    # third up to (not including) the last two form the status text.
    vote_chamber = chamber_date_line_words[0]
    vote_date = datetime.datetime.strptime(
        chamber_date_line_words[-1], "%m/%d/%Y")
    vote_status = " ".join(chamber_date_line_words[2:-2])

    opinions = member_doc.xpath(
        "//div[@id='main_content']/h3[position() > 1]/text()")
    if not opinions:
        self.warning("No Votes for: %s", link)
        return

    # Fall back to the <h4> motion text when the heading has no status.
    vote_status = vote_status if vote_status.strip() else motion[0]
    vote_chamber = "upper" if vote_chamber == "Senate" else "lower"

    # Every tally defaults to 0 so that a page missing one of the headings
    # no longer fails with UnboundLocalError further down.
    yes_count = no_count = p_count = a_count = 0
    for heading in opinions:
        try:
            count = int(heading[heading.find("(") + 1:heading.find(")")])
        except ValueError:
            # This is likely not a vote-count text chunk
            # It's probably '`On roll call the vote was:`
            continue

        lowered = heading.lower()
        if "yea" in lowered:
            yes_count = count
        elif "nay" in lowered:
            no_count = count
        elif "present" in lowered:
            p_count = count
        elif "absent" in lowered:
            a_count = count

    vote = VoteEvent(
        bill=bill,
        start_date=vote_date.strftime("%Y-%m-%d"),
        chamber=vote_chamber,
        motion_text=vote_status,
        result="pass" if yes_count > no_count else "fail",
        classification="passage",
    )
    vote.pupa_id = link
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("abstain", p_count)
    vote.set_count("absent", a_count)
    vote.add_source(link)

    a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
    # The first link text is skipped.  The remaining ones are assigned by
    # position: the first `yes_count` are yeas, the next `no_count` are
    # nays, and everything after that is recorded as "other".
    for position, link_text in enumerate(a_links[1:], start=1):
        voter = re.sub(",", "", link_text).split()[0]
        if position <= yes_count:
            vote.vote("yes", voter)
        elif no_count != 0 and yes_count < position <= yes_count + no_count:
            vote.vote("no", voter)
        else:
            vote.vote("other", voter)
    yield vote
def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date, action_text):
    """Download one roll call and yield it as a ``VoteEvent``.

    The results page is read as an alternating sequence of text cells
    (member name, then that member's vote code).  Codes are bucketed into
    ``Y``, ``N``, ``P`` and ``A``; ``P`` and ``A`` are both reported as
    "other".  Where the page lists "Total ..." figures, they are asserted
    to equal the number of names collected.

    Args:
        bill: Bill the vote is attached to.
        vote_chamber: Value for the ``BODY`` query parameter; its first
            character is looked up in ``self.CHAMBERS``.
        bill_id: Value for the ``INST`` query parameter.
        vote_id: Value for the ``VOTE`` query parameter.
        vote_date: Passed through as the vote's ``start_date``.
        action_text: Passed through as the vote's ``motion_text``.

    Yields:
        A single ``VoteEvent``.

    Raises:
        AssertionError: if a listed total disagrees with the names found.
        KeyError: if a vote code other than Y/N/P/A is encountered.
    """
    url = ('http://alisondb.legislature.state.al.us/Alison/'
           'GetRollCallVoteResults.aspx?'
           'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.format(
               vote_id, vote_chamber, bill_id, self.session_id))
    doc = lxml.html.fromstring(self.get(url=url).text)

    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    cells = doc.xpath('//table/tr/td/font/text()')
    # Even positions hold names, odd positions the matching vote code; a
    # trailing name without a code is ignored.
    for member, choice in zip(cells[0::2], cells[1::2]):
        is_placeholder = (member.endswith(", Vacant") or
                          member.startswith("Total ") or
                          not member.strip())
        if is_placeholder:
            continue
        voters[choice].append(member)

    def listed_total(label):
        # Number after the last ":" of the first "Total <label>" text
        # node, or None when the page has no such node.
        found = doc.xpath(
            '//*[starts-with(text(), "Total %s")]/text()' % label)
        if not found:
            return None
        return int(found[0].split(":")[-1])

    # Check name counts against totals listed on the site
    total_yea = listed_total("Yea")
    if total_yea is None:
        total_yea = len(voters['Y'])
    else:
        assert total_yea == len(voters['Y']), "Yea count incorrect"

    total_nay = listed_total("Nay")
    if total_nay is None:
        total_nay = len(voters['N'])
    else:
        assert total_nay == len(voters['N']), "Nay count incorrect"

    listed_absent = listed_total("Absent")
    if listed_absent is not None:
        assert listed_absent == len(voters['A']), "Absent count incorrect"

    total_other = len(voters['P']) + len(voters['A'])

    vote = VoteEvent(
        chamber=self.CHAMBERS[vote_chamber[0]],
        start_date=vote_date,
        motion_text=action_text,
        result='pass' if total_yea > total_nay else 'fail',
        classification='passage',
        bill=bill,
    )
    vote.set_count('yes', total_yea)
    vote.set_count('no', total_nay)
    vote.set_count('other', total_other)
    vote.add_source(url)

    grouped = (
        ('yes', voters['Y']),
        ('no', voters['N']),
        ('other', voters['A'] + voters['P']),
    )
    for option, members in grouped:
        for member in members:
            vote.vote(option, member)

    yield vote
def parse_roll_call(self, bill, link, chamber, date):
    """Build a ``VoteEvent`` from a roll-call page.

    Args:
        bill: Bill the vote is attached to; ``bill.identifier`` is
            appended to the vote's ``pupa_id``.
        link: Anchor element pointing at the roll-call page.  Its ``href``
            is fetched and its text is used as the motion when the page's
            own motion is not a recognised one.
        chamber: Chamber recorded on the vote.
        date: Naive datetime of the vote; localized with ``tz``.

    Returns:
        The populated ``VoteEvent``.

    Raises:
        Exception: if a voter element's class matches none of the known
            markers (yea / nay / nvote / lve).
    """
    url = link.attrib['href']
    page = lxml.html.fromstring(self.get(url).text)

    raw_motion = page.xpath(
        'string(//div[@class="Column-OneFourth"]/div[3])')
    motion = re.sub(r'\s+', ' ', raw_motion.strip())

    if motion == 'FP':
        motion = 'FINAL PASSAGE'

    if motion == 'FINAL PASSAGE':
        classification = 'passage'
    elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
        classification = 'amendment'
    else:
        # Unrecognised motion: fall back to the text of the link itself.
        classification = 'other'
        motion = link.text_content()

    def tally(label):
        # The count sits in the element right after the label <div>.
        return int(
            page.xpath("//div[text() = '%s']" % label)[0].getnext().text)

    yeas = tally('YEAS')
    nays = tally('NAYS')
    lve = tally('LVE')
    nv = tally('N/V')
    other = lve + nv

    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=classification,
        result='pass' if yeas > (nays + other) else 'fail',
        bill=bill,
    )
    # pupa_id situation here is a bit weird, same vote can be used for
    # multiple bills see:
    # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11 # noqa
    # so we toss the bill id onto the end of the URL
    vote.pupa_id = url + '#' + bill.identifier
    vote.add_source(url)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.set_count('other', other)

    # Checked in this order; the first marker found in the class wins.
    class_markers = (
        ('yea', 'yes'),
        ('nay', 'no'),
        ('nvote', 'other'),
        ('lve', 'other'),
    )
    for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = div.text_content().strip()
        name = re.sub(r'^[\s,]+', '', name)
        name = re.sub(r'[\s,]+$', '', name)
        class_attr = div.attrib['class'].lower()
        for marker, voteval in class_markers:
            if marker in class_attr:
                break
        else:
            raise Exception('Unrecognized vote val: %s' % class_attr)
        vote.vote(voteval, name)

    return vote
def scrape_votes(self, session, zip_url):
    """Yield one ``Vote`` per roll call found in the downloaded archive.

    ``tblrollcallsummary.txt`` supplies one pipe-delimited row per roll
    call (totals and motion); ``tblrollcallhistory.txt`` supplies one row
    per member per roll call.  Only rows for ``session`` whose bill id is
    in ``self.bills_by_id`` are used.

    Args:
        session: Session identifier compared against the first field of
            every row.
        zip_url: Recorded as the source of every vote.

    Yields:
        ``Vote`` objects, after all member rows have been applied.
    """
    # NOTE(review): rows are handled as text (compared with "" and split
    # on '|'); this assumes self.zf.open() yields str lines - confirm.
    votes = {}
    last_line = []

    for line in self.zf.open('tblrollcallsummary.txt'):
        if line.strip() == "":
            continue

        line = line.split('|')
        if len(line) < 14:
            # A short row is either the tail of a row broken across two
            # physical lines (then the remembered first part is reused) or
            # the first part of such a row (then it is remembered).
            if len(last_line + line[1:]) == 14:
                line = last_line
                self.warning('used bad vote line')
            else:
                last_line = line
                self.warning('bad vote line %s' % '|'.join(line))
        session_yr = line[0]
        body = line[1]
        vote_num = line[2]
        timestamp = line[3]
        bill_id = line[4].strip()
        yeas = int(line[5])
        nays = int(line[6])
        # present = int(line[7])
        # absent = int(line[8])
        motion = line[11].strip() or '[not available]'

        if session_yr == session and bill_id in self.bills_by_id:
            actor = 'lower' if body == 'H' else 'upper'
            time = dt.datetime.strptime(timestamp,
                                        '%m/%d/%Y %I:%M:%S %p')
            # TODO: stop faking passed somehow
            passed = yeas > nays
            vote = Vote(
                chamber=actor,
                start_date=time.strftime("%Y-%m-%d"),
                motion_text=motion,
                result='pass' if passed else 'fail',
                classification='passage',
                bill=self.bills_by_id[bill_id],
            )
            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.add_source(zip_url)
            votes[body + vote_num] = vote

    # Running number of non-yea/non-nay members per roll call.  This used
    # to be a single local reset on every row, so the stored 'other' count
    # could only ever be 0 or 1.
    other_counts = {}

    for line in self.zf.open('tblrollcallhistory.txt'):
        # 2012 | H | 2 | 330795 | HB309 | Yea |1/4/2012 8:27:03 PM
        session_yr, body, v_num, employee, bill_id, vote, date \
            = line.split('|')

        if not bill_id:
            continue

        if session_yr != session or bill_id.strip() not in self.bills_by_id:
            continue

        try:
            leg = self.legislators[employee]['name']
        except KeyError:
            self.warning("Error, can't find person %s" % employee)
            continue

        vote = vote.strip()
        vote_key = body + v_num
        if vote_key not in votes:
            self.warning("Skipping processing this vote:")
            self.warning("Bad ID: %s" % vote_key)
            continue

        roll_call = votes[vote_key]
        other_counts.setdefault(vote_key, 0)

        # code = self.legislators[employee]['seat']
        if vote == 'Yea':
            roll_call.yes(leg)
        elif vote == 'Nay':
            roll_call.no(leg)
        else:
            roll_call.other(leg)
            other_counts[vote_key] += 1

        roll_call.set_count('other', other_counts[vote_key])

    for vote in votes.values():
        yield vote
def scrape(self, session=None):
    """Scrape every bill and resolution of a session, with its votes.

    For each entry returned by the three listing endpoints this builds a
    ``Bill``, reads sponsors and text versions from the bill's status
    page, then (when the page exposes an internal bill id) loads the
    bill's actions and roll calls from further JSON endpoints.

    Args:
        session: Session to scrape; defaults to ``self.latest_session()``.

    Yields:
        A ``VoteEvent`` for each roll call of a bill, followed by the
        ``Bill`` itself.

    Raises:
        AssertionError: for an unknown bill-number prefix, an amendment
            without an H/S body, an action without a known chamber, or a
            roll call whose outcome cannot be read from its status text.
    """
    HTML_TAGS_RE = r'<.*?>'

    # (bill-number prefix, classification, chamber).  Order matters: the
    # longer resolution prefixes must be tried before plain 'H.' / 'S.'.
    # A chamber of None means it is read from the record's 'Body' field.
    bill_kinds = (
        ('J.R.H.', 'joint resolution', 'lower'),
        ('J.R.S.', 'joint resolution', 'upper'),
        ('H.C.R.', 'concurrent resolution', 'lower'),
        ('S.C.R.', 'concurrent resolution', 'upper'),
        ('H.R.', 'resolution', 'lower'),
        ('S.R.', 'resolution', 'upper'),
        ('PR.', 'constitutional amendment', None),
        ('H.', 'bill', 'lower'),
        ('S.', 'bill', 'upper'),
    )

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []

    # From here on `bills_url` is the "introduced" listing; that is the
    # URL later recorded as the source of every non-resolution bill.
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings; non-string values (e.g. null)
        # are kept as they are instead of crashing on .strip().
        info = {k: (v.strip() if isinstance(v, str) else v)
                for k, v in info.items()}

        # Identify the bill type and chamber
        for prefix, bill_type, bill_chamber in bill_kinds:
            if info['BillNumber'].startswith(prefix):
                break
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".
                format(info['BillNumber'])
            )
        if bill_chamber is None:
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")

        bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
        # put one space back in between type and number
        bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everything after this marker entry is a cosponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue

            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip a five-character entry starting with "Less".  The old
            # test compared a 5-character slice with the 4-character
            # "Less" while also requiring length 5, so it never matched.
            # NOTE(review): presumably a "Less" + one-character toggle
            # link in the sponsor list - confirm against the page.
            is_less_toggle = (sponsor_name[:4] == "Less" and
                              len(sponsor_name) == 5)
            if sponsor_name and not is_less_toggle:
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )

        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            ).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
            yield bill
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            action = {k: v for k, v in action.items() if v is not None}
            status = action['FullStatus']
            # 'keywords' may have been dropped by the None filter above,
            # so it is read with a default rather than by index.
            keywords = [x.lower()
                        for x in action.get('keywords', '').split(';')]

            if "Signed by Governor" in status:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in status:
                # assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif "Vetoed by the Governor" in status:
                action_type = 'executive-veto'
            elif "Read first time" in status or "Read 1st time" in status:
                action_type = 'introduction'
            elif "Reported favorably" in status:
                action_type = 'committee-passage-favorable'
            elif actor == 'lower' and any(
                    x.startswith('aspassed') for x in keywords):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(
                    x.startswith(' aspassed') or x.startswith('aspassed')
                    for x in keywords):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", status),
                date=datetime.datetime.strptime(
                    action['StatusDate'], '%m/%d/%Y'
                ).strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type
            )

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Names look like "<name> of <district>"; keep the part
                # before the first " of ".  partition() copes with names
                # that contain the separator zero or several times, where
                # tuple-unpacking a split() raised ValueError.
                member_name = member['MemberName'].partition(" of ")[0]
                member_name = member_name.strip()

                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote['FullStatus'] or
                    "Veto of Governor overridden" in vote['FullStatus']):
                did_pass = True
            elif ("Failed -- " in vote['FullStatus'] or
                    'Veto of the Governor sustained' in vote['FullStatus']):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = VoteEvent(
                bill=bill,
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                start_date=datetime.datetime.strptime(
                    vote['StatusDate'], '%m/%d/%Y'
                ).strftime('%Y-%m-%d'),
                motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)

            # Yea/nay totals come from the status text; the not-voting
            # total is the number of members collected above.
            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)

            yield vote_to_add

        # Capture extra information- Not yet implemented
        # Witnesses:
        #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        yield bill
def scrape_action_page(self, bill, page):
    """Record every action row on ``bill`` and yield any votes found.

    Each ``//tbody/tr`` row of ``page`` becomes one action on ``bill``.
    Rows whose text contains "Supplement" additionally produce a
    lower-chamber vote, and rows containing "Roll Call" an upper-chamber
    vote; member-level detail is filled in by ``self.scrape_house_vote``
    and ``self.scrape_senate_vote``.

    Args:
        bill: Bill that receives the actions and owns the votes.
        page: Parsed action page.

    Yields:
        ``VoteEvent`` objects for the vote-bearing rows.
    """
    # Starts as None so that a first row without an actor cell no longer
    # fails with UnboundLocalError.  On later rows without an actor cell
    # the previous row's value is reused, as before.
    action_actor = None

    for row in page.xpath('//tbody/tr'):
        action_date = row.xpath('td[1]/text()')[0]
        action_date = datetime.strptime(action_date, '%m/%d/%Y')
        action_year = action_date.year
        action_date = action_date.strftime('%Y-%m-%d')

        actor_cells = row.xpath('td[2]/text()')
        if actor_cells:
            action_actor = self.chamber_map_reverse[actor_cells[0].strip()]

        action_name = row.xpath('string(td[3])')

        # House votes
        if "Supplement" in action_name:
            actor = "lower"
            stripped_name = action_name.strip()
            vote_action = action_name.split(' -')[0]
            # Yeas sit between the first '-' and 'YEAS', nays between
            # 'YEAS to' and 'NAYS'.
            y = int(stripped_name.split('-')[1].split('YEAS')[0])
            n = int(stripped_name.split('YEAS to')[1].split('NAYS')[0])

            # get supplement number
            n_supplement = int(stripped_name.split('No. ')[1].split(')')[0])
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            housevote_pdf = 'http://www.mass.gov/legis/journal/combined{}RCs.pdf'.format(
                action_year
            )
            # note: 2014-2015 different format and no data on website for years prior to 2014
            self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
            cached_vote.add_source(housevote_pdf)

            # One PDF holds the whole year's roll calls, so the supplement
            # number is appended to make the id distinct per vote.
            cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)

            yield cached_vote

        # Senate votes
        if "Roll Call" in action_name:
            actor = "upper"
            # placeholder
            vote_action = action_name.split(' -')[0]
            lowered_name = action_name.lower()
            # Raw string: the pattern previously relied on the invalid
            # escape sequence '\d' in a normal string literal.
            paired = re.search(r'(\d+) yeas .*? (\d+) nays', lowered_name)
            if paired is not None:
                y, n = (int(value) for value in paired.groups())
            else:
                # Alternative wording with the numbers after the labels.
                y = int(re.search(r"yeas\s*(\d*)", lowered_name).group(1))
                n = int(re.search(r"nays\s*(\d*)", lowered_name).group(1))

            # TODO: other count isn't included, set later
            cached_vote = VoteEvent(
                chamber=actor,
                start_date=action_date,
                motion_text=vote_action,
                result='pass' if y > n else 'fail',
                classification='passage',
                bill=bill,
            )
            cached_vote.set_count('yes', y)
            cached_vote.set_count('no', n)

            rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
            self.scrape_senate_vote(cached_vote, rollcall_pdf)
            cached_vote.add_source(rollcall_pdf)
            yield cached_vote

        attrs = self.categorizer.categorize(action_name)
        action = bill.add_action(
            action_name.strip(),
            action_date,
            chamber=action_actor,
            classification=attrs['classification'],
        )
        for com in attrs.get('committees', []):
            action.add_related_entity(com, entity_type='organization')