def record_votes(root, session, chamber):
    """Yield a VoteEvent for every vote div found under *root*.

    Divs matching ``vote_selectors`` are wrapped in MaybeVote; invalid
    ones (per ``MaybeVote.is_valid``) are skipped.
    """
    for el in root.xpath('//div{}'.format(''.join(vote_selectors))):
        mv = MaybeVote(el)
        if not mv.is_valid:
            continue
        v = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text='passage' if mv.passed else 'other',
            result='pass' if mv.passed else 'fail',
            classification='passage' if mv.passed else 'other',
            # NOTE(review): only the first two characters of the session
            # string are used -- confirm this matches the session naming.
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber
        )
        # Missing tallies come back falsy; default them to 0.
        v.set_count('yes', mv.yeas or 0)
        v.set_count('no', mv.nays or 0)
        v.set_count('not voting', mv.present or 0)
        for each in mv.votes['yeas']:
            v.yes(each)
        for each in mv.votes['nays']:
            v.no(each)
        # 'present' voters are recorded as 'not voting', matching the count above.
        for each in mv.votes['present']:
            v.vote('not voting', each)
        for each in mv.votes['absent']:
            v.vote('absent', each)
        yield v
def scrape_committee_vote(self, bill, actor, date, motion, page, url, uniqid):
    """Yield a committee Vote parsed from the first vote table on *page*.

    Fixes: removed a stray debug ``print(motion)``; normalized a possibly
    None yes-count to 0 before the pass/fail comparison.
    """
    votes = page.xpath("//table")[0]
    rows = votes.xpath(".//tr")[0]
    if rows[0].text_content() == 'Votes:':
        # New website layout: the vote cells are two rows further down.
        rows = votes.xpath(".//tr")[2]
    yno = rows.xpath(".//td")
    if len(yno) < 3:
        # Only a yes column is present.
        yes = yno[0]
        no, other = None, None
    else:
        yes, no, other = rows.xpath(".//td")[:3]

    def proc_block(obj, typ):
        # Parse one cell of <br>-separated voter names; a missing cell
        # yields an empty block with count None.
        if obj is None:
            return {"type": None, "count": None, "votes": []}
        votes = []
        for vote in obj.xpath(".//br"):
            if vote.tail:
                vote = vote.tail.strip()
                if vote:
                    votes.append(vote)
        return {"type": typ, "count": len(votes), "votes": votes}

    vote_dict = {
        "yes": proc_block(yes, 'yes'),
        "no": proc_block(no, 'no'),
        "other": proc_block(other, 'other'),
    }
    # Missing cells produce a count of None; normalize all to 0.
    yes_count = vote_dict['yes']['count'] or 0
    no_count = vote_dict['no']['count'] or 0
    other_count = vote_dict['other']['count'] or 0
    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                identifier=str(uniqid),
                result='pass' if (yes_count > no_count) else 'fail',
                classification='passage',
                bill=bill)
    vote.extras = {'_vote_id': uniqid}
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    for key in vote_dict:
        for voter in vote_dict[key]['votes']:
            vote.vote(key, voter)
    yield vote
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    """Construct a passage VoteEvent from a parsed *vote_record* dict.

    *vote_record* maps 'yes'/'no'/'excused'/'absent'/'other' to voter-name
    lists and 'date' to a datetime; result is decided by yes vs. no tallies.
    """
    # When they vote on a substitute they mark it as XHB; normalize to HB.
    bill_id = bill_id.replace('XHB', 'HB')
    did_pass = len(vote_record['yes']) > len(vote_record['no'])
    event = VoteEvent(
        result='pass' if did_pass else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        # Senate bills start with 'S'; everything else is a House bill.
        bill_chamber='upper' if bill_id[0] == 'S' else 'lower'
    )
    # Deduplicate on the source URL.
    event.pupa_id = url
    for option in ('yes', 'no', 'excused', 'absent', 'other'):
        voters = vote_record[option]
        event.set_count(option, len(voters))
        for voter in voters:
            event.vote(option, voter)
    event.add_source(url)
    return event
def scrape_senate_vote(self, bill, url, date):
    """Yield senate VoteEvents parsed from the vote PDF at *url*.

    Three-column PDFs are delegated to ``scrape_senate_vote_3col``.

    Fixes: ``filter()`` returns an iterator on Python 3 with no ``.pop()``,
    so it is materialized into a list; regex patterns are raw strings so
    ``\\s``/``\\d`` are real escapes.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # Placeholder; the real result is computed from the tallies below.
        result='fail',
        classification='passage',
        bill=bill
    )
    vote.add_source(url)
    text = convert_pdf(filename, 'text').decode('utf-8')
    os.remove(filename)
    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return
    # Split into alternating (section label, names) chunks, reversed so the
    # loop can pop label/values pairs off the end.
    data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
    # BUG FIX: materialize -- filter() is a one-shot iterator in Python 3.
    data = list(filter(None, data))
    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {'yes': 0, 'no': 0, 'other': 0}
    while data:
        vote_val = data.pop()
        key = keymap.get(vote_val.lower(), 'other')
        values = data.pop()
        # Names are separated by ", and " or by runs of whitespace/commas.
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # Sanity check: per-label tally must match per-option tally.
        assert actual_vote[vote_val] == vote_count[key]
    for key, value in vote_count.items():
        vote.set_count(key, value)
    # Update the placeholder result now that the counts are known.
    vote.result = 'pass' if vote_count['yes'] > (
        vote_count['no'] + vote_count['other']) else 'fail'
    yield vote
def get_vote_event(self, bill, act, votes, result):
    """Build a VoteEvent for *bill* from a Legistar action, its votes and result."""
    org = json.loads(act['organization_id'].lstrip('~'))
    event = VoteEvent(legislative_session=bill.legislative_session,
                      motion_text=act['description'],
                      organization=org,
                      classification=None,
                      start_date=act['date'],
                      result=result,
                      bill=bill)
    # bill.sources is (web page, API endpoint), in that order.
    web_url, api_url = (source['url'] for source in bill.sources)
    event.add_source(web_url)
    event.add_source(api_url + '/histories')
    for ballot in votes:
        option = ballot['VoteValueName'].lower()
        # Suspended votes are not recorded.
        if option == 'suspended':
            continue
        event.vote(self.VOTE_OPTIONS.get(option, option),
                   ballot['VotePersonName'].strip())
    return event
def parse_roll_call(self, bill, link, chamber, date):
    """Fetch and parse a roll-call page linked from a bill page; return a VoteEvent.

    Fixes: renamed local ``type`` (shadowed the builtin) to
    ``classification``; merged the two equivalent 'other' class branches.
    """
    url = link.attrib['href']
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
    motion = page.xpath(xpath).strip()
    motion = re.sub(r'\s+', ' ', motion)
    if motion == 'FP':
        motion = 'FINAL PASSAGE'
    # Classify the vote from the motion code.
    if motion == 'FINAL PASSAGE':
        classification = 'passage'
    elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
        classification = 'amendment'
    else:
        classification = 'other'
    # The recorded motion text comes from the link itself.
    motion = link.text_content()
    yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
    nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
    lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
    nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
    # Leave and not-voting are lumped together as "other".
    other = lve + nv
    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=classification,
        result='pass' if yeas > (nays + other) else 'fail',
        bill=bill,
    )
    vote.add_source(url)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.set_count('other', other)
    for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = div.text_content().strip()
        name = re.sub(r'^[\s,]+', '', name)
        name = re.sub(r'[\s,]+$', '', name)
        class_attr = div.attrib['class'].lower()
        if 'yea' in class_attr:
            voteval = 'yes'
        elif 'nay' in class_attr:
            voteval = 'no'
        elif 'nvote' in class_attr or 'lve' in class_attr:
            voteval = 'other'
        else:
            raise Exception('Unrecognized vote val: %s' % class_attr)
        vote.vote(voteval, name)
    return vote
def scrape_vote(self, bill, vote_id, session):
    """Yield a Vote fetched from Delaware's roll-call JSON endpoint for *vote_id*."""
    vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
    form = {
        'rollCallId': vote_id,
        'sort': '',
        'group': '',
        'filter': '',
    }
    page = self.post(url=vote_url, data=form, allow_redirects=True).json()
    if page:
        roll = page['Model']
        vote_chamber = self.chamber_map[roll['ChamberName']]
        # "7/1/16 01:00 AM"
        vote_date = dt.datetime.strptime(roll['TakenAtDateTime'],
                                         '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')
        # TODO: What does this code mean?
        vote_motion = roll['RollCallVoteType']
        vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
        # Everything that is not a yes/no ballot is lumped into "other".
        other_count = (int(roll['NotVotingCount']) +
                       int(roll['VacantVoteCount']) +
                       int(roll['AbsentVoteCount']) +
                       int(roll['ConflictVoteCount'])
                       )
        vote = Vote(chamber=vote_chamber,
                    start_date=vote_date,
                    motion_text=vote_motion,
                    result=vote_passed,
                    classification='other',
                    bill=bill.identifier,
                    legislative_session=session
                    )
        vote.add_source(vote_url)
        vote.set_count('yes', roll['YesVoteCount'])
        vote.set_count('no', roll['NoVoteCount'])
        vote.set_count('other', other_count)
        for row in roll['AssemblyMemberVotes']:
            # AssemblyMemberId looks like it should work here,
            # but for some sessions it's bugged to only return session
            try:
                voter = self.legislators_by_short[str(row['ShortName'])]
                name = voter['DisplayName']
            except KeyError:
                # Fall back to the short name when lookup fails.
                self.warning('could not find legislator short name %s',
                             row['ShortName'])
                name = row['ShortName']
            if row['SelectVoteTypeCode'] == 'Y':
                vote.yes(name)
            elif row['SelectVoteTypeCode'] == 'N':
                vote.no(name)
            else:
                vote.vote('other', name)
        # bill.add_vote_event(vote)
        yield vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Build a VoteEvent from an NY Senate API vote record.

    Raises ValueError on an unknown ``voteType``.

    Fix: the EXC/ABS/ABD loop now guards on the presence of the 'items'
    key (like the AYE/NAY branches) instead of indexing it unconditionally,
    so a roll entry without 'items' can no longer raise KeyError.
    """
    vote_datetime = datetime.datetime.strptime(
        vote_data['voteDate'], '%Y-%m-%d')
    if vote_data['voteType'] == 'FLOOR':
        motion = 'Floor Vote'
    elif vote_data['voteType'] == 'COMMITTEE':
        motion = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')
    vote = VoteEvent(
        chamber='upper',
        start_date=vote_datetime.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        # Placeholder; the result is recomputed from the tallies below.
        result='fail',
        bill=bill,
    )
    vote.add_source(url)
    vote_rolls = vote_data['memberVotes']['items']
    yes_count, no_count, other_count = 0, 0, 0
    # Count all yea votes (plain AYE plus "aye with reservations").
    for yes_type in ('AYE', 'AYEWR'):
        if 'items' in vote_rolls.get(yes_type, {}):
            for legislator in vote_rolls[yes_type]['items']:
                vote.yes(legislator['fullName'])
                yes_count += 1
    # Count all nay votes.
    if 'items' in vote_rolls.get('NAY', {}):
        for legislator in vote_rolls['NAY']['items']:
            vote.no(legislator['fullName'])
            no_count += 1
    # Count excused/absent/abstained as "other".
    for vote_type in ('EXC', 'ABS', 'ABD'):
        if 'items' in vote_rolls.get(vote_type, {}):
            for legislator in vote_rolls[vote_type]['items']:
                vote.vote('other', legislator['fullName'])
                other_count += 1
    vote.result = 'pass' if yes_count > no_count else 'fail'
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    return vote
def scrape_vote(self, bill, date, motion, url):
    """Yield a VoteEvent parsed from the vote page at *url*, if usable.

    Fixes: the start date now comes from the *date* parameter instead of a
    hard-coded '2017-03-04' placeholder; the two bare ``except:`` clauses
    are narrowed so real bugs are no longer silently swallowed.
    """
    try:
        page = self.get(url).text
    except Exception:
        # sometimes the link is there but is dead
        return
    if 'not yet official' in page:
        # Sometimes they link to vote pages before they go live
        return
    page = lxml.html.fromstring(page)
    actor = 'upper' if url.endswith('Senate') else 'lower'
    # Order matches the order of the vote tables further down the page.
    votevals = ['yes', 'no', 'not voting', 'other']
    count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"
    try:
        yes_count = int(page.xpath(count_path % "Yeas").split()[-1])
        no_count = int(page.xpath(count_path % "Nays").split()[-1])
        not_voting_count = int(page.xpath(count_path % "Non Voting").split()[-1])
        other_count = int(page.xpath(count_path % "Present").split()[-1])
    except (IndexError, ValueError):
        # Malformed or partial vote page; nothing usable to record.
        return
    passed = yes_count > no_count + not_voting_count + other_count
    # BUG FIX: use the vote's actual date, not a hard-coded placeholder.
    start_date = date.strftime('%Y-%m-%d') if hasattr(date, 'strftime') else date
    vote = VoteEvent(start_date=start_date,
                     motion_text=motion,
                     result='pass' if passed else 'fail',
                     classification='passage',
                     chamber=actor,
                     bill=bill)
    # The Excused column is not always present.
    try:
        excused_count = int(page.xpath(count_path % "Excused").split()[-1])
        vote.set_count('excused', excused_count)
        votevals.append('excused')
    except (IndexError, ValueError):
        pass
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', not_voting_count)
    vote.set_count('other', other_count)
    vote.add_source(url)
    xpath = ('//*[contains(@class, "ms-standardheader")]/'
             'following-sibling::table')
    divs = page.xpath(xpath)
    # Voter-name tables appear in the same order as votevals.
    for voteval, div in zip(votevals, divs):
        for a in div.xpath('.//a'):
            name = a.text_content().strip()
            if name:
                vote.vote(voteval, name)
    yield vote
def parse_bill_actions_table(self, bill, action_table, bill_id, session, url,
                             bill_chamber):
    """Record each row of *action_table* as a bill action; yield any votes found."""
    # First row is the header.
    for action in action_table.xpath('*')[1:]:
        date = action[0].text_content()
        date = dt.datetime.strptime(date, "%m/%d/%Y").strftime('%Y-%m-%d')
        actor = action[1].text_content().upper()
        string = action[2].text_content()
        actor = {
            "S": "upper",
            "H": "lower",
            "D": "legislature",  # "Data Systems",
            "$": "Appropriation measure",
            "CONAM": "Constitutional Amendment"
        }[actor]
        act_type, committees = categorize_action(string)
        # XXX: Translate short-code to full committee name for the
        # matcher.
        real_committees = []
        if committees:
            for committee in committees:
                try:
                    committee = self.short_ids[committee]['name']
                    real_committees.append(committee)
                except KeyError:
                    # Unknown short code; skip it.
                    pass
        act = bill.add_action(string, date, chamber=actor,
                              classification=act_type)
        for committee in real_committees:
            act.add_related_entity(name=committee,
                                   entity_type="organization")
        # parse_vote returns None for non-vote actions.
        vote = self.parse_vote(string)
        if vote:
            v, motion = vote
            vote = VoteEvent(start_date=date,
                             chamber=actor,
                             bill=bill_id,
                             bill_chamber=bill_chamber,
                             legislative_session=session,
                             motion_text=motion,
                             result='pass' if 'passed' in string.lower() else 'fail',
                             classification='passage')
            vote.add_source(url)
            vote.set_count('yes', int(v['n_yes'] or 0))
            vote.set_count('no', int(v['n_no'] or 0))
            # NOTE(review): the excused tally is recorded as 'not voting' here.
            vote.set_count('not voting', int(v['n_excused'] or 0))
            for voter in split_specific_votes(v['yes']):
                vote.yes(voter)
            # "Yes with reservations" also counts as a yes.
            for voter in split_specific_votes(v['yes_resv']):
                vote.yes(voter)
            for voter in split_specific_votes(v['no']):
                vote.no(voter)
            for voter in split_specific_votes(v['excused']):
                vote.vote('not voting', voter)
            yield vote
def handle_page(self):
    """Yield a committee VoteEvent parsed from the page in ``self.doc``."""
    (date, ) = self.doc.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
    date = datetime.datetime.strptime(date, '%m/%d/%Y %I:%M:%S %p'
                                      ).isoformat().replace('T', ' ')
    # Totals live in the last nested table; collapse whitespace before matching.
    totals = self.doc.xpath('//table//table')[-1].text_content()
    totals = re.sub(r'(?mu)\s+', " ", totals).strip()
    (yes_count, no_count, other_count) = [int(x) for x in re.search(
        r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
        'Total Missed:\s+(\d+)', totals).groups()]
    result = 'pass' if yes_count > no_count else 'fail'
    (committee, ) = self.doc.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
    (action, ) = self.doc.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
    motion = "{} ({})".format(action, committee)
    vote = VoteEvent(start_date=date,
                     bill=self.kwargs['bill'],
                     chamber='lower',
                     motion_text=motion,
                     result=result,
                     classification='committee',
                     )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    # "Missed" totals are recorded as 'not voting'.
    vote.set_count('not voting', other_count)
    for member_vote in self.doc.xpath('//table//table//table//td'):
        if not member_vote.text_content().strip():
            continue
        (member, ) = member_vote.xpath('span[2]//text()')
        # Rebinds the cell element to its vote-letter text.
        (member_vote, ) = member_vote.xpath('span[1]//text()')
        if member_vote == "Y":
            vote.yes(member)
        elif member_vote == "N":
            vote.no(member)
        elif member_vote == "-":
            vote.vote('not voting', member)
        # Parenthetical votes appear to not be counted in the
        # totals for Yea, Nay, _or_ Missed
        elif re.search(r'\([YN]\)', member_vote):
            continue
        else:
            raise ValueError("Unknown vote type found: {}".format(member_vote))
    yield vote
def parse_vote(self, bill, actor, date, motion, url, uniqid):
    """Yield a Vote parsed from a YEAS/NAYS/ABSENT text page at *url*.

    Fix: the regex patterns are now raw strings -- the non-raw forms
    contain invalid escape sequences (``\\s``, ``\\d``) on modern Python.
    """
    page = self.get(url).text
    bill.add_source(url)
    vote_re = re.compile(r'YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)'
                         r'(.*)ABSENT( OR NOT VOTING)? -?\s?'
                         r'(\d+)(.*)',
                         re.MULTILINE | re.DOTALL)
    match = vote_re.search(page)
    yes_count = int(match.group(1))
    no_count = int(match.group(3))
    other_count = int(match.group(6))
    passed = yes_count > no_count
    # Only record a chamber when the actor is one.
    vote_chamber = actor if actor in ('upper', 'lower') else ''
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                identifier=str(uniqid),
                classification='passage',
                bill=bill)
    vote.add_source(url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    # Names are separated by runs of two or more spaces.
    yes_votes = re.split(r'\s{2,}', match.group(2).strip())
    no_votes = re.split(r'\s{2,}', match.group(4).strip())
    other_votes = re.split(r'\s{2,}', match.group(7).strip())
    for name in yes_votes:
        if name:
            vote.yes(name)
    for name in no_votes:
        if name:
            vote.no(name)
    for name in other_votes:
        if name:
            vote.vote('other', name)
    yield vote
def scrape_votes(self, bill):
    """Yield roll-call Votes for *bill* from the WA legislature web service."""
    bill_num = bill.identifier.split()[1]
    url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
           "GetRollCalls?billNumber=%s&biennium=%s" % (
               bill_num, self.biennium))
    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    for rc in xpath(page, "//wa:RollCall"):
        motion = xpath(rc, "string(wa:Motion)")
        seq_no = xpath(rc, "string(wa:SequenceNumber)")
        date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
        date = datetime.datetime.strptime(date, "%Y-%m-%d").date()
        yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
        no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
        abs_count = int(
            xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
        ex_count = int(
            xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))
        # Absent and excused are lumped together as "other".
        other_count = abs_count + ex_count
        agency = xpath(rc, "string(wa:Agency)")
        chamber = {'House': 'lower', 'Senate': 'upper'}[agency]
        vote = Vote(chamber=chamber, start_date=date,
                    motion_text='{} (#{})'.format(motion, seq_no),
                    result='pass' if yes_count > (no_count + other_count) else 'fail',
                    classification='other', bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        for sv in xpath(rc, "wa:Votes/wa:Vote"):
            name = xpath(sv, "string(wa:Name)")
            # NOTE(review): "wa:VOte" capitalization looks like a typo but may
            # match the service's actual element name -- confirm before changing.
            vtype = xpath(sv, "string(wa:VOte)")
            if vtype == 'Yea':
                vote.yes(name)
            elif vtype == 'Nay':
                vote.no(name)
            else:
                vote.vote('other', name)
        yield vote
def parse_committee_votes(self, bill, url):
    """Yield committee VoteEvents for *bill* from a vote-summary listing page."""
    bill.add_source(url)
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
    committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
    for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):
        # Date -- the page uses both slash- and dash-separated formats.
        # NOTE(review): if neither format matches, `date` remains a string
        # and tz.localize() below will fail.
        for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
            date = link.xpath('../../td')[0].text_content()
            try:
                date = datetime.datetime.strptime(date, fmt)
            except ValueError:
                continue
            break
        # Motion
        motion = link.text_content().split(' - ')[-1].strip()
        motion = 'Committee vote (%s): %s' % (committee, motion)
        # Roll call
        vote_url = link.attrib['href']
        rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)
        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification='other',
            result='pass' if rollcall['passed'] else 'fail',
            bill=bill,
        )
        # Deduplicate on the roll-call URL.
        vote.pupa_id = vote_url
        vote.set_count('yes', rollcall['yes_count'])
        vote.set_count('no', rollcall['no_count'])
        vote.set_count('other', rollcall['other_count'])
        for voteval in ('yes', 'no', 'other'):
            for name in rollcall.get(voteval + '_votes', []):
                vote.vote(voteval, name)
        vote.add_source(url)
        vote.add_source(vote_url)
        yield vote
def scrape_chamber_votes(self, chamber, session):
    """Yield VoteEvents for every vote date of the given RI chamber/session."""
    url = {
        "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
        "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
    }[chamber]
    action = "%s/%s" % (url, "votes.asp")
    dates = self.get_vote_dates(url, session)
    for date in dates:
        votes = self.parse_vote_page(self.post_to(action, date), url, session)
        for vote_dict in votes:
            for vote in vote_dict.values():
                count = vote['count']
                # NOTE(review): rebinds the `chamber` argument from the
                # vote's own metadata for the rest of the loop body.
                chamber = {
                    "H": "lower",
                    "S": "upper"
                }[vote['meta']['chamber']]
                try:
                    bill_id = self._bill_id_by_type[(chamber, vote['meta']['bill'])]
                except KeyError:
                    self.warning('no such bill_id %s %s',
                                 chamber, vote['meta']['bill'])
                    continue
                v = VoteEvent(
                    chamber=chamber,
                    start_date=vote['time'].strftime('%Y-%m-%d'),
                    motion_text=vote['meta']['extra']['motion'],
                    result='pass' if count['passage'] else 'fail',
                    classification='passage',
                    legislative_session=session,
                    bill=bill_id,
                    bill_chamber=chamber,
                )
                v.set_count('yes', int(count['YEAS']))
                v.set_count('no', int(count['NAYS']))
                v.set_count('other', int(count['NOT VOTING']))
                v.add_source(vote['source'])
                # Deduplicate on the source URL.
                v.pupa_id = vote['source']
                for vt in vote['votes']:
                    # Anything that is not Y/N is recorded as "other".
                    key = {
                        'Y': 'yes',
                        'N': 'no',
                    }.get(vt['vote'], 'other')
                    v.vote(key, vt['name'])
                yield v
def scrape_votes(self, bill, page):
    """Yield floor-action VoteEvents for an AZ bill-status JSON *page*."""
    base_url = 'https://apps.azleg.gov/api/BillStatusFloorAction'
    for header in page['FloorHeaders']:
        params = {
            'billStatusId': page['BillId'],
            'billStatusActionId': header['BillStatusActionId'],
            'includeVotes': 'true',
        }
        resp = self.get(base_url, params=params)
        actions = json.loads(resp.content.decode('utf-8'))
        for action in actions:
            if action['Action'] == 'No Action':
                continue
            action_date = datetime.datetime.strptime(action['ReportDate'],
                                                     '%Y-%m-%dT%H:%M:%S')
            vote = VoteEvent(
                chamber={
                    'S': 'upper',
                    'H': 'lower',
                }[header['LegislativeBody']],
                motion_text=action['Action'],
                classification='passage',
                # Unanimous adoptions count as a pass regardless of tallies.
                result=(
                    'pass'
                    if action['UnanimouslyAdopted'] or action['Ayes'] > action['Nays']
                    else 'fail'
                ),
                start_date=action_date.strftime('%Y-%m-%d'),
                bill=bill,
            )
            vote.add_source(resp.url)
            # Counts may be null in the API; default each to 0.
            vote.set_count('yes', action['Ayes'] or 0)
            vote.set_count('no', action['Nays'] or 0)
            vote.set_count('other', (action['Present'] or 0))
            vote.set_count('absent', (action['Absent'] or 0))
            vote.set_count('excused', (action['Excused'] or 0))
            vote.set_count('not voting', (action['NotVoting'] or 0))
            for v in action['Votes']:
                vote_type = {
                    'Y': 'yes',
                    'N': 'no',
                }.get(v['Vote'], 'other')
                vote.vote(vote_type, v['Legislator']['FullName'])
            # Deduplicate on URL plus referral number.
            vote.pupa_id = resp.url + str(action['ReferralNumber'])
            yield vote
def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
    """
    takes the actor, date and row element and returns a Vote object
    """
    spans = row.xpath('.//span')
    motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
    motion = motion if motion else "passage"
    # The first span's text carries outcome and tallies; split from the
    # right into (outcome, yes, no, other).
    passed, yes_count, no_count, other_count = \
        spans[0].text_content().rsplit('-', 3)
    yes_votes = self.get_names(spans[1].tail)
    no_votes = self.get_names(spans[2].tail)
    other_votes = []
    for span in spans[3:]:
        if span.text.startswith(('Absent', 'Excused')):
            other_votes += self.get_names(span.tail)
    # Normalize the outcome text to pass/fail.
    for key, val in {'adopted': 'pass',
                     'passed': 'pass',
                     'failed': 'fail'}.items():
        if key in passed.lower():
            passed = val
            break
    vote = VoteEvent(chamber=actor,
                     start_date=date,
                     motion_text=motion,
                     bill=bill_id,
                     bill_chamber=bill_chamber,
                     result=passed,
                     classification="passage",
                     legislative_session=session)
    vote.add_source(source)
    vote.set_count('yes', int(yes_count))
    vote.set_count('no', int(no_count))
    # Absent/Excused voters are all recorded under 'absent'.
    vote.set_count('absent', int(other_count))
    for name in yes_votes:
        if name and name != 'None':
            vote.yes(name)
    for name in no_votes:
        if name and name != 'None':
            vote.no(name)
    for name in other_votes:
        if name and name != 'None':
            vote.vote('absent', name)
    yield vote
def asvote(self):
    """Materialize this parsed roll call as a passage VoteEvent."""
    vote = VoteEvent(
        chamber=self.chamber(),
        start_date=self.date(),
        motion_text=self.motion(),
        result='pass' if self.passed() else 'fail',
        classification='passage',
        bill=self.bill,
    )
    vote.set_count('yes', self.yes_count())
    vote.set_count('no', self.no_count())
    vote.set_count('other', self.other_count())
    for name in self.yes_votes():
        vote.yes(name)
    for name in self.no_votes():
        vote.no(name)
    for name in self.other_votes():
        vote.vote('other', name)
    vote.add_source(self.url)
    return vote
def scrape_vote(self, bill, motion, url):
    """Yield a VoteEvent parsed from the roll-call page at *url*."""
    page = self.get(url, retry_on_404=True).text
    page = lxml.html.fromstring(page)
    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))
    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))
    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))
    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))
    # Absent and excused are lumped together as "other".
    other_count = abs_count + ex_count
    # NOTE(review): if the URL names neither chamber, `chamber` is unbound
    # and the VoteEvent call below raises NameError.
    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'
    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    # Dates appear in both full and abbreviated month formats.
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        date = datetime.datetime.strptime(date, "%b. %d, %Y")
    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")
    vote = VoteEvent(
        chamber=chamber,
        start_date=date.strftime('%Y-%m-%d'),
        motion_text=motion,
        result='pass' if outcome == 'PREVAILS' else 'fail',
        classification='passage',
        bill=bill,
    )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)
    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]
        vtype = row.xpath("string(td[4])")
        if vtype == 'Y':
            vote.vote('yes', name)
        elif vtype == 'N':
            vote.vote('no', name)
        elif vtype == 'X' or vtype == 'E':
            vote.vote('other', name)
    yield vote
def scrape_vote(self, bill, vote_json, session):
    """Yield a VoteEvent built from a WY vote JSON record."""
    amendment = vote_json['amendmentNumber']
    if amendment:
        motion = '{}: {}'.format(amendment, vote_json['action'])
    else:
        motion = vote_json['action']
    passed = vote_json['yesVotesCount'] > vote_json['noVotesCount']
    event = VoteEvent(
        chamber=self.chamber_abbrev_map[vote_json['chamber']],
        start_date=self.parse_local_date(vote_json['voteDate']),
        motion_text=motion,
        result='pass' if passed else 'fail',
        legislative_session=session,
        bill=bill,
        classification='other',
    )
    # Conflict votes are recorded under 'other'.
    for option, count_key in (('yes', 'yesVotesCount'),
                              ('no', 'noVotesCount'),
                              ('absent', 'absentVotesCount'),
                              ('excused', 'excusedVotesCount'),
                              ('other', 'conflictVotesCount')):
        event.set_count(option, vote_json[count_key])
    for voter in vote_json['yesVotes'].split(','):
        if voter.strip():
            event.yes(voter.strip())
    for voter in vote_json['noVotes'].split(','):
        if voter.strip():
            event.no(voter.strip())
    # Remaining rolls record the raw, unstripped name string.
    # Valid options include 'yes', 'no', 'absent', 'abstain',
    # 'not voting', 'paired', 'excused'.
    for roll_key, option in (('absentVotes', 'absent'),
                             ('excusedVotes', 'excused'),
                             ('conflictVotes', 'other')):
        for voter in vote_json[roll_key].split(','):
            if voter.strip():
                event.vote(option=option, voter=voter)
    event.add_source('http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, vote_json['billNumber']))
    yield event
def parse_vote(self, bill, link):
    """Yield a VoteEvent parsed from the member-vote page at *link*.

    Fixes: all four tallies are initialized to 0, so a missing heading can
    no longer leave one unbound (NameError); the bare ``except`` around the
    tally parse is narrowed to ValueError and a failed parse now skips the
    heading instead of silently reusing a stale count.
    """
    member_doc = lxml.html.fromstring(self.get(link).text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
    if len(opinions) > 0:
        temp = opinions[0].split()
        vote_chamber = temp[0]
        vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
        vote_status = " ".join(temp[2:-2])
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        yes_count = no_count = p_count = a_count = 0
        for i in opinions:
            # Tallies appear parenthesized in each heading, e.g. "(25)".
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                continue
            if "yea" in i.lower():
                yes_count = count
            elif "nay" in i.lower():
                no_count = count
            elif "present" in i.lower():
                p_count = count
            elif "absent" in i.lower():
                a_count = count
        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime('%Y-%m-%d'),
            chamber=vote_chamber,
            motion_text=vote_status,
            result='pass' if yes_count > no_count else 'fail',
            classification='passage',
        )
        # Deduplicate on the page URL.
        vote.pupa_id = link
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('abstain', p_count)
        vote.set_count('absent', a_count)
        vote.add_source(link)
        # Voter names follow the first link, listed in yes/no/other order.
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_vote(self, bill, motion, url):
    """Yield a VoteEvent parsed from the roll-call page at *url*.

    Same parsing as the single-quoted variant elsewhere in this file, with
    a pupa_id set for deduplication.
    """
    page = self.get(url, retry_on_404=True).text
    page = lxml.html.fromstring(page)
    yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
    yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))
    nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
    no_count = int(nays_cell.xpath("string(following-sibling::td)"))
    abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
    abs_count = int(abs_cell.xpath("string(following-sibling::td)"))
    ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
    ex_count = int(ex_cell.xpath("string(following-sibling::td)"))
    # Absent and excused are lumped together as "other".
    other_count = abs_count + ex_count
    # NOTE(review): if the URL names neither chamber, `chamber` is unbound
    # and the VoteEvent call below raises NameError.
    if "chamber=House" in url:
        chamber = "lower"
    elif "chamber=Senate" in url:
        chamber = "upper"
    date_cell = page.xpath("//td[text() = 'Date:']")[0]
    date = date_cell.xpath("string(following-sibling::td)")
    # Dates appear in both full and abbreviated month formats.
    try:
        date = datetime.datetime.strptime(date, "%B %d, %Y")
    except ValueError:
        date = datetime.datetime.strptime(date, "%b. %d, %Y")
    outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
    outcome = outcome_cell.xpath("string(following-sibling::td)")
    vote = VoteEvent(
        chamber=chamber,
        start_date=date.strftime("%Y-%m-%d"),
        motion_text=motion,
        result="pass" if outcome == "PREVAILS" else "fail",
        classification="passage",
        bill=bill,
    )
    vote.set_count("yes", yes_count)
    vote.set_count("no", no_count)
    vote.set_count("other", other_count)
    vote.add_source(url)
    # Deduplicate on the page URL.
    vote.pupa_id = url
    member_cell = page.xpath("//td[text() = 'Member']")[0]
    for row in member_cell.xpath("../../tr")[1:]:
        name = row.xpath("string(td[2])")
        # name = name.split(" of ")[0]
        vtype = row.xpath("string(td[4])")
        if vtype == "Y":
            vote.vote("yes", name)
        elif vtype == "N":
            vote.vote("no", name)
        elif vtype == "X" or vtype == "E":
            vote.vote("other", name)
    yield vote
def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):
    """Yield Assembly floor VoteEvents from the bill's floor-votes page."""
    # parse the bill data page, finding the latest html text
    url = assembly_url + "&Floor%26nbspVotes=Y"
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)
    if "Votes:" in doc.text_content():
        # Track repeated motions so each vote gets a unique motion_text.
        vote_motions = []
        additional_votes_on_motion = 2
        for table in doc.xpath("//table"):
            date = table.xpath('caption/span[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, "%m/%d/%Y")
            date = eastern.localize(date)
            date = date.isoformat()
            spanText = table.xpath("caption/span/text()")
            motion = spanText[2].strip() + spanText[3].strip()
            if motion in vote_motions:
                motion = motion + f" - Vote {additional_votes_on_motion}"
                additional_votes_on_motion += 1
            else:
                vote_motions.append(motion)
            # The caption tally after ':' reads "<yes>/<no>".
            votes = (table.xpath("caption/span/span")[0].text.split(":")
                     [1].split("/"))
            yes_count, no_count = map(int, votes)
            passed = yes_count > no_count
            vote = VoteEvent(
                chamber="lower",
                start_date=date,
                motion_text=motion,
                bill=bill,
                result="pass" if passed else "fail",
                classification="passage",
            )
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            absent_count = 0
            excused_count = 0
            tds = table.xpath("tr/td/text()")
            # Cells alternate name, vote-code; pair them up.
            votes = [tds[i:i + 2] for i in range(0, len(tds), 2)]
            vote_dictionary = {
                "Y": "yes",
                "NO": "no",
                "ER": "excused",
                "AB": "absent",
                "NV": "not voting",
                "EL": "other"
            }
            for vote_pair in votes:
                name, vote_val = vote_pair
                vote.vote(vote_dictionary[vote_val], name)
                if vote_val == "AB":
                    absent_count += 1
                elif vote_val == "ER":
                    excused_count += 1
            vote.set_count("absent", absent_count)
            vote.set_count("excused", excused_count)
            vote.add_source(url)
            # Deduplicate on URL + motion + the caption's second span.
            vote.pupa_id = url + motion + spanText[1]
            yield vote
def process_vote(self, votes, url, base_url, bill, legislators,
                 chamber_dict, vote_results):
    """Yield a VoteEvent for each vote item in the SOLAR JSON payload.

    :param votes: JSON dict with an "items" list of vote records
    :param url: source URL recorded on each VoteEvent
    :param base_url: API root used to follow nested vote links
    :param legislators: maps voter ids to display names
    :param chamber_dict: maps API chamber names to 'upper'/'lower'
    :param vote_results: maps 'passed'/'failed' (lowercased) to a boolean
    """
    for v in votes["items"]:
        try:
            v["yeas"]
        except KeyError:
            # sometimes the actual vote is buried a second layer deep
            v = self.get(base_url + v["link"]).json()
            try:
                v["yeas"]
            except KeyError:
                self.logger.warning("No vote info available, skipping")
                continue

        try:
            chamber = chamber_dict[v["chamber"]]
        except KeyError:
            chamber = "lower" if "house" in v["apn"] else "upper"

        # the date lives under either "date" or "occurred"
        try:
            date = self._tz.localize(
                datetime.datetime.strptime(v["date"], "%m/%d/%y"))
            date = "{:%Y-%m-%d}".format(date)
        except KeyError:
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                self.logger.warning("No date found for vote, skipping")
                continue

        try:
            motion = v["action"]
        except KeyError:
            motion = v["motiontype"]

        # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
        if (not motion and isinstance(v['yeas'], str)
                and isinstance(v['nays'], str)):
            warning_text = 'Malformed JSON found for vote ("revno" of {}); skipping'
            self.warning(warning_text.format(v['revno']))
            continue

        result = v.get("results") or v.get("passed")
        if result is None:
            # no explicit result: infer from the roll-call sizes
            if len(v['yeas']) > len(v['nays']):
                result = "passed"
            else:
                result = "failed"
        passed = vote_results[result.lower()]

        # NOTE(review): classification='passed' is inconsistent with the
        # 'passage' value used by every other scraper in this file -- confirm
        # against the VoteEvent classification vocabulary before changing.
        if "committee" in v:
            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                # organization=v["committee"],
                bill=bill,
                classification='passed')
        else:
            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification='passed',
                             bill=bill)
        vote.pupa_id = str(v['revno'])

        # the yea and nay counts are not displayed, but vote totals are
        # and passage status is.
        yes_count = 0
        no_count = 0
        absent_count = 0
        excused_count = 0
        for voter_id in v["yeas"]:
            vote.yes(legislators[voter_id])
            yes_count += 1
        for voter_id in v["nays"]:
            vote.no(legislators[voter_id])
            no_count += 1
        if "absent" in v:
            for voter_id in v["absent"]:
                vote.vote('absent', legislators[voter_id])
                absent_count += 1
        if "excused" in v:
            for voter_id in v["excused"]:
                vote.vote('excused', legislators[voter_id])
                excused_count += 1

        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('absent', absent_count)
        vote.set_count('excused', excused_count)

        # check to see if there are any other things that look
        # like vote categories, throw a warning if so
        for key, val in v.items():
            if (isinstance(val, list) and len(val) > 0
                    and key not in ["yeas", "nays", "absent", "excused"]):
                if val[0] in legislators:
                    self.logger.warning(
                        "{k} looks like a vote type that's not being counted."
                        " Double check it?".format(k=key))

        vote.add_source(url)
        yield vote
def parse_roll_call(self, bill, link, chamber, date):
    """Parse a PA roll-call page linked from a bill action.

    Returns (does not yield) a populated VoteEvent; callers are expected
    to save it themselves.
    """
    url = link.attrib['href']
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
    motion = page.xpath(xpath).strip()
    motion = re.sub(r'\s+', ' ', motion)

    if motion == 'FP':
        motion = 'FINAL PASSAGE'

    # classify from the abbreviated motion code...
    # (renamed from `type`, which shadowed the builtin)
    if motion == 'FINAL PASSAGE':
        classification = 'passage'
    elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
        classification = 'amendment'
    else:
        classification = 'other'

    # ...but display the full link text as the motion
    motion = link.text_content()

    yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
    nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
    # leaves (LVE) and non-votes are treated together as "other"
    lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
    nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
    other = lve + nv

    vote = VoteEvent(
        chamber=chamber,
        start_date=tz.localize(date),
        motion_text=motion,
        classification=classification,
        result='pass' if yeas > (nays + other) else 'fail',
        bill=bill,
    )
    # pupa_id situation here is a bit weird, same vote can be used for
    # multiple bills see:
    # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11  # noqa
    # so we toss the bill id onto the end of the URL
    vote.pupa_id = url + '#' + bill.identifier
    vote.add_source(url)
    vote.set_count('yes', yeas)
    vote.set_count('no', nays)
    vote.set_count('other', other)

    for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
        name = div.text_content().strip()
        name = re.sub(r'^[\s,]+', '', name)
        name = re.sub(r'[\s,]+$', '', name)
        class_attr = div.attrib['class'].lower()
        if 'yea' in class_attr:
            voteval = 'yes'
        elif 'nay' in class_attr:
            voteval = 'no'
        elif 'nvote' in class_attr:
            voteval = 'other'
        elif 'lve' in class_attr:
            voteval = 'other'
        else:
            msg = 'Unrecognized vote val: %s' % class_attr
            raise Exception(msg)
        vote.vote(voteval, name)

    return vote
def scrape_house_vote(self, bill, url):
    """Download a House vote PDF, parse its text, and yield a VoteEvent."""
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, 'text')
    os.remove(filename)

    lines = text.splitlines()
    vote_type = None
    votes = collections.defaultdict(list)
    # date doubles as the "did we parse anything" flag at the bottom
    date = None

    for idx, line in enumerate(lines):
        line = line.rstrip().decode('utf-8')

        # trailing mm/dd/yyyy marks the vote date
        match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        # totals line; motion text sits two lines above it
        # NOTE(review): motion/ yes_count / passed stay unbound if this line
        # never matches but a date was found -- NameError below; confirm
        # every vote PDF contains a YEAS/NAYS totals line.
        match = re.match(
            r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
        if match:
            motion = (lines[idx - 2].strip()).decode('utf-8')
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"
            yes_count, no_count, other_count = [
                int(g) for g in match.groups()
            ]
            exc_match = re.search(r'EXCUSED: (\d+)', line)
            if exc_match:
                other_count += int(exc_match.group(1))
            if line.endswith('ADOPTED') or line.endswith('PASSED'):
                passed = True
            else:
                passed = False
            continue

        # section headers switch which bucket subsequent names land in
        match = re.match(
            r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
        if match:
            vote_type = {
                'YEAS': 'yes',
                'NAYS': 'no',
                'NOT VOTING': 'other',
                'EXCUSED': 'other',
                'PAIRED': 'paired'
            }[match.group(1)]
            continue

        if vote_type == 'paired':
            for part in line.split(' '):
                part = part.strip()
                if not part:
                    continue
                # NOTE(review): this matches against `line`, not `part`, so
                # every part re-parses the same first "Name(YEA)" -- looks
                # like it should be re.match(..., part); confirm.
                name, pair_type = re.match(r'([^\(]+)\((YEA|NAY)\)',
                                           line).groups()
                name = name.strip()
                if pair_type == 'YEA':
                    votes['yes'].append(name)
                elif pair_type == 'NAY':
                    votes['no'].append(name)
        elif vote_type:
            for name in line.split(' '):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date:
        vote = VoteEvent(chamber='lower',
                         start_date=date.strftime("%Y-%m-%d"),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification='passage',
                         bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        vote.pupa_id = url
        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)
        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def handle_page(self):
    """Parse one floor-vote PDF page (self.lines) and yield a VoteEvent.

    The motion, totals, and first member line sit at known offsets; the
    offsets shift when the motion is missing or wraps extra lines, so the
    three indices are adjusted in lockstep.
    """
    MOTION_INDEX = 4
    TOTALS_INDEX = 6
    VOTE_START_INDEX = 9

    if len(self.lines) < 2:
        self.scraper.warning("Bad PDF! " + self.url)
        return

    motion = self.lines[MOTION_INDEX].strip()
    # Sometimes there is no motion name, only "Passage" in the line above
    if (not motion and
            not self.lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
        motion = self.lines[MOTION_INDEX - 1]
        MOTION_INDEX -= 1
        TOTALS_INDEX -= 1
        VOTE_START_INDEX -= 1
    else:
        assert motion, "Floor vote's motion name appears to be empty"

    # the motion may wrap onto up to two additional lines; each extra line
    # pushes the totals and member sections down by one
    for _extra_motion_line in range(2):
        MOTION_INDEX += 1
        if self.lines[MOTION_INDEX].strip():
            motion = "{}, {}".format(motion,
                                     self.lines[MOTION_INDEX].strip())
            TOTALS_INDEX += 1
            VOTE_START_INDEX += 1
        else:
            break

    (yes_count, no_count, nv_count) = [
        int(x) for x in re.search(
            r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
            self.lines[TOTALS_INDEX]).groups()
    ]
    result = 'pass' if yes_count > no_count else 'fail'

    vote = VoteEvent(
        start_date=self.kwargs['date'],
        chamber=self.kwargs['chamber'],
        bill=self.kwargs['bill'],
        motion_text=motion,
        result=result,
        classification='passage',
    )
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('not voting', nv_count)

    for line in self.lines[VOTE_START_INDEX:]:
        if not line.strip():
            break
        # strip leadership titles so the name regex lines up
        if " President " in line:
            line = line.replace(" President ", " ")
        elif " Speaker " in line:
            line = line.replace(" Speaker ", " ")
        # Votes follow the pattern of:
        # [vote code] [member name]-[district number]
        for vtype, member in re.findall(
                r'\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*', line):
            vtype = {
                'Y': 'yes',
                'N': 'no',
                'EX': 'excused',
                'AV': 'abstain'
            }[vtype]
            vote.vote(vtype, member)

    # check totals line up: subtract each recorded voter from the
    # published counts; yes/no must net to zero exactly
    yes_count = no_count = nv_count = 0
    for vc in vote.counts:
        if vc['option'] == 'yes':
            yes_count = vc['value']
        elif vc['option'] == 'no':
            no_count = vc['value']
        else:
            nv_count += vc['value']
    for vr in vote.votes:
        if vr['option'] == 'yes':
            yes_count -= 1
        elif vr['option'] == 'no':
            no_count -= 1
        else:
            nv_count -= 1
    if yes_count != 0 or no_count != 0:
        raise ValueError('vote count incorrect: ' + self.url)

    if nv_count != 0:
        # On a rare occasion, a member won't have a vote code,
        # which indicates that they didn't vote. The totals reflect
        # this.
        self.scraper.info(
            "Votes don't add up; looking for additional ones")
        for line in self.lines[VOTE_START_INDEX:]:
            if not line.strip():
                break
            # deeply-indented bare names are the code-less members
            for member in re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                     line):
                vote.vote('not voting', member)
    yield vote
def scrape_vote(self, bill, motion, url):
    """Yield the VoteEvent recorded on one roll-call page for *bill*."""
    doc = lxml.html.fromstring(self.get(url, retry_on_404=True).text)

    def count_after(label):
        # Integer in the cell immediately following the labelled <td>.
        cell = doc.xpath("//td[text() = '{}']".format(label))[0]
        return int(cell.xpath("string(following-sibling::td)"))

    yes_count = count_after('Yeas (Y):')
    no_count = count_after('Nays (N):')
    # absent and excused members are reported together as "other"
    other_count = count_after('Absent (X):') + count_after('Excused (E):')

    if 'chamber=House' in url:
        chamber = 'lower'
    elif 'chamber=Senate' in url:
        chamber = 'upper'

    date_text = doc.xpath("//td[text() = 'Date:']")[0].xpath(
        "string(following-sibling::td)")
    try:
        when = datetime.datetime.strptime(date_text, "%B %d, %Y")
    except ValueError:
        # the site switches between full and abbreviated month names
        when = datetime.datetime.strptime(date_text, "%b. %d, %Y")

    outcome = doc.xpath("//td[text()='Outcome:']")[0].xpath(
        "string(following-sibling::td)")

    vote = VoteEvent(
        chamber=chamber,
        start_date=when.strftime('%Y-%m-%d'),
        motion_text=motion,
        result='pass' if outcome == 'PREVAILS' else 'fail',
        classification='passage',
        bill=bill,
    )
    for option, tally in (('yes', yes_count), ('no', no_count),
                          ('other', other_count)):
        vote.set_count(option, tally)
    vote.add_source(url)
    vote.pupa_id = url  # one page per roll call

    # member table: td[2] holds the name, td[4] the vote code
    code_to_option = {'Y': 'yes', 'N': 'no', 'X': 'other', 'E': 'other'}
    header_cell = doc.xpath("//td[text() = 'Member']")[0]
    for row in header_cell.xpath("../../tr")[1:]:
        member = row.xpath("string(td[2])").split(" of ")[0]
        option = code_to_option.get(row.xpath("string(td[4])"))
        if option is not None:
            vote.vote(option, member)

    yield vote
def scrape_vote(self, bill, vote_url, chamber, date):
    """Scrape one vote-summary page and yield a VoteEvent for *bill*.

    Skips withdrawn motions; handles both tabulated roll calls and
    "passed without objection" pages (everyone listed counts as a yes).
    """
    page = self.lxmlize(vote_url)

    try:
        motion = page.xpath(
            '//td/b/font[text()="MOTION:"]/../../following-sibling::td/font/text()')[0]
    except IndexError:
        # narrowed from a bare ``except:`` -- only the [0] index on a
        # missing MOTION cell can fail here
        self.warning("Vote Summary Page Broken ")
        return

    if 'withdrawn' not in motion:
        # Every table row after the one with VOTE in a td/div/b/font
        rolls = page.xpath(
            '//tr[preceding-sibling::tr/td/div/b/font/text()="VOTE"]')
        count_row = rolls[-1]
        yes_count = count_row.xpath(
            './/b/font[normalize-space(text())="YES:"]'
            '/../following-sibling::font[1]/text()')[0]
        no_count = count_row.xpath(
            './/b/font[normalize-space(text())="NO:"]'
            '/../following-sibling::font[1]/text()')[0]
        exc_count = count_row.xpath(
            './/b/font[normalize-space(text())="EXC:"]'
            '/../following-sibling::font[1]/text()')[0]
        nv_count = count_row.xpath(
            './/b/font[normalize-space(text())="ABS:"]'
            '/../following-sibling::font[1]/text()')[0]

        if count_row.xpath(
                './/b/font[normalize-space(text())="FINAL ACTION:"]'
                '/../following-sibling::b[1]/font/text()'):
            final = count_row.xpath(
                './/b/font[normalize-space(text())="FINAL ACTION:"]'
                '/../following-sibling::b[1]/font/text()')[0]
            passed = ('pass' in final.lower() or
                      int(yes_count) > int(no_count))
        elif 'passed without objection' in motion.lower():
            passed = True
            # no tabulated yeses; every roll row counts as a yes
            yes_count = int(len(rolls[:-2]))
        else:
            self.warning("No vote breakdown found for %s" % vote_url)
            return

        vote = VoteEvent(chamber=chamber,
                         start_date=self._tz.localize(date),
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         bill=bill,
                         classification='passage',
                         )
        vote.pupa_id = vote_url
        vote.set_count('yes', int(yes_count))
        vote.set_count('no', int(no_count))
        vote.set_count('excused', int(exc_count))
        vote.set_count('not voting', int(nv_count))
        vote.add_source(vote_url)

        # last two rows are totals/metadata, not members
        for roll in rolls[:-2]:
            voter = roll.xpath('td[2]/div/font')[0].text_content()
            voted = roll.xpath('td[3]/div/font')[0].text_content().strip()
            if voted:
                if 'Yes' in voted:
                    vote.yes(voter)
                elif 'No' in voted:
                    vote.no(voter)
                elif 'Excused' in voted:
                    vote.vote('excused', voter)
                else:
                    vote.vote("other", voter)
            elif 'passed without objection' in motion.lower() and voter:
                vote.yes(voter)
        yield vote
def scrape_votes(self, bill, doc):
    """Yield a VoteEvent for each row of the bill page's vote card.

    Follows each row's result link to collect the member-level roll call
    (listed by party, Dem/Rep columns).
    """
    vote_tr_path = ('//h6[@id="vote-header"]'
                    '/ancestor::div[contains(@class, "gray-card")]'
                    '//div[contains(@class, "card-body")]'
                    '//div[@class="row"]')
    for vote_row in doc.xpath(vote_tr_path):
        entries = [
            each.text_content() for each in vote_row.xpath('div')[1:-1:2]
        ]
        # renamed `abs` -> `absent`: the original unpack target shadowed
        # the builtin abs()
        date, subject, rcs, aye, no, nv, absent, exc, total = entries
        result = vote_row.xpath('div/a')[0]
        result_text = result.text
        result_link = result.get('href')

        # roll-call series code encodes the chamber
        if 'H' in rcs:
            chamber = 'lower'
        elif 'S' in rcs:
            chamber = 'upper'

        date = eastern.localize(
            dt.datetime.strptime(date.replace('.', ''),
                                 "%m/%d/%Y %H:%M %p"))
        date = date.isoformat()

        ve = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=subject,
            result='pass' if 'PASS' in result_text else 'fail',
            bill=bill,
            classification='passage',  # TODO: classify votes
        )
        ve.set_count('yes', int(aye))
        ve.set_count('no', int(no))
        ve.set_count('not voting', int(nv))
        ve.set_count('absent', int(absent))
        ve.set_count('excused', int(exc))
        ve.add_source(result_link)

        data = self.get(result_link).text
        vdoc = lxml.html.fromstring(data)

        # only one table that looks like this
        vote_table, = vdoc.xpath('//table[@cellpadding="5"]')

        # skip party row
        for row in vote_table.xpath('tr')[1:]:
            vote_type, dems, reps = row.xpath('td')
            vote_type = vote_type.text_content()
            if 'Ayes' in vote_type:
                vote_type = 'yes'
            elif 'Noes' in vote_type:
                vote_type = 'no'
            elif 'Not Voting' in vote_type:
                vote_type = 'not voting'
            elif 'Exc. Absence' in vote_type:
                vote_type = 'absent'
            elif 'Exc. Vote' in vote_type:
                vote_type = 'excused'
            else:
                raise ValueError('unknown vote type: ' + vote_type)
            for name in (vote_list_to_names(dems.text_content()) +
                         vote_list_to_names(reps.text_content())):
                ve.vote(vote_type, name)
        yield ve
def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
    """Parse the journal-style vote blob for one chamber; yield VoteEvents.

    Fixes vs. the original: regex literals are now raw strings (``'\\w'``,
    ``'\\d'`` etc. in plain literals are invalid escape sequences -- a
    DeprecationWarning today, a SyntaxError in future Pythons), matching
    the modernized duplicate of this method elsewhere in the file; and the
    ampersand un-escape was a no-op ``replace('&', '&')`` -- per its own
    comment it must target the HTML entity.
    """
    raw_vote_data = re.split(r'\w+? by [\w ]+?\s+-', vote_data.strip())[1:]
    motion_count = 1
    for raw_vote in raw_vote_data:
        # fields are separated by runs of ten non-breaking spaces
        raw_vote = raw_vote.split(
            u'\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0')
        motion = raw_vote[0]

        vote_date = re.search(r'(\d+/\d+/\d+)', motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(),
                                                   '%m/%d/%Y')

        passed = ('Passed' in motion or
                  'Recommended for passage' in motion or
                  'Rec. for pass' in motion or
                  'Adopted' in raw_vote[1])

        vote_regex = re.compile(r'\d+$')
        aye_regex = re.compile(r'^.+voting aye were: (.+) -')
        no_regex = re.compile(r'^.+voting no were: (.+) -')
        not_voting_regex = re.compile(
            r'^.+present and not voting were: (.+) -')

        yes_count = 0
        no_count = 0
        not_voting_count = 0
        ayes = []
        nos = []
        not_voting = []

        for v in raw_vote[1:]:
            v = v.strip()
            if v.startswith('Ayes...') and vote_regex.search(v):
                yes_count = int(vote_regex.search(v).group())
            elif v.startswith('Noes...') and vote_regex.search(v):
                no_count = int(vote_regex.search(v).group())
            elif v.startswith(
                    'Present and not voting...') and vote_regex.search(v):
                not_voting_count += int(vote_regex.search(v).group())
            elif aye_regex.search(v):
                ayes = aye_regex.search(v).groups()[0].split(', ')
            elif no_regex.search(v):
                nos = no_regex.search(v).groups()[0].split(', ')
            elif not_voting_regex.search(v):
                not_voting += not_voting_regex.search(v).groups()[0].split(
                    ', ')

        motion = motion.strip()
        motion = motion.replace('&amp;', '&')  # un-escape ampersands
        # disambiguate repeated motion names with a counter suffix
        if motion in self._seen_votes:
            motion = '{} ({})'.format(motion, motion_count)
            motion_count += 1
        self._seen_votes.add(motion)

        vote = VoteEvent(
            motion_text=motion,
            start_date=vote_date.strftime('%Y-%m-%d') if vote_date else None,
            classification='passage',
            result='pass' if passed else 'fail',
            chamber=chamber,
            bill=bill,
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('not voting', not_voting_count)
        vote.add_source(link)

        # de-duplicate members who appear in more than one list
        seen = set()
        for a in ayes:
            if a in seen:
                continue
            vote.yes(a)
            seen.add(a)
        for n in nos:
            if n in seen:
                continue
            vote.no(n)
            seen.add(n)
        for n in not_voting:
            if n in seen:
                continue
            vote.vote('not voting', n)
            seen.add(n)

        yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one SD vote page and yield a VoteEvent for *bill*."""
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    # header looks like "<bill>, <location>, <motion...>"
    header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

    if 'No Bill Action' in header:
        self.warning("bad vote header -- skipping")
        return
    location = header.split(', ')[1]

    if location.startswith('House'):
        chamber = 'lower'
    elif location.startswith('Senate'):
        chamber = 'upper'
    elif location.startswith('Joint'):
        chamber = 'joint'
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    # committee = ' '.join(location.split(' ')[1:]).strip()
    # if not committee or committee.startswith('of Representatives'):
    #     committee = None

    motion = ', '.join(header.split(', ')[2:]).strip()
    if motion:
        # If we can't detect a motion, skip this vote
        yes_count = int(
            page.xpath("string(//td[contains(@id, 'tdAyes')])"))
        no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//td[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//td[contains(@id, 'tdAbsent')])"))

        passed = yes_count > no_count

        # NOTE(review): local name `type` shadows the builtin; left as-is
        # here since this is a documentation-only pass.
        if motion.startswith('Do Pass'):
            type = 'passage'
        elif motion == 'Concurred in amendments':
            type = 'amendment'
        elif motion == 'Veto override':
            type = 'veto_override'
        else:
            type = 'other'

        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=motion,
                         result='pass' if passed else 'fail',
                         classification=type,
                         bill=bill)
        vote.add_source(url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)

        # each vote cell's previous sibling holds the member name
        for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
            if td.text in ('Aye', 'Yea'):
                vote.yes(td.getprevious().text.strip())
            elif td.text == 'Nay':
                vote.no(td.getprevious().text.strip())
            elif td.text == 'Excused':
                vote.vote('excused', td.getprevious().text.strip())
            elif td.text == 'Absent':
                vote.vote('absent', td.getprevious().text.strip())

        yield vote
def scrape_votes(self, bill, url):
    """Parse OK journal HTML: one VoteEvent per OKLAHOMA HOUSE/SENATE header."""
    page = lxml.html.fromstring(self.get(url).text.replace(u'\xa0', ' '))

    seen_rcs = set()

    # EXSLT regex extension lets us match the chamber banner paragraphs
    re_ns = "http://exslt.org/regular-expressions"
    path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
    for header in page.xpath(path, namespaces={'re': re_ns}):
        bad_vote = False

        # Each chamber has the motion name on a different line of the file
        if 'HOUSE' in header.xpath("string()"):
            chamber = 'lower'
            motion_index = 8
        else:
            chamber = 'upper'
            motion_index = 13

        motion = header.xpath("string(following-sibling::p[%d])" %
                              motion_index).strip()
        motion = re.sub(r'\s+', ' ', motion)
        if not motion.strip():
            self.warning("Motion text not found")
            return
        match = re.match(r'^(.*) (PASSED|FAILED)$', motion)
        if match:
            motion = match.group(1)
            passed = match.group(2) == 'PASSED'
        else:
            passed = None

        rcs_p = header.xpath(
            "following-sibling::p[contains(., 'RCS#')]")[0]
        rcs_line = rcs_p.xpath("string()").replace(u'\xa0', ' ')
        rcs = re.search(r'RCS#\s+(\d+)', rcs_line).group(1)

        # the same roll call can appear more than once in the journal
        if rcs in seen_rcs:
            continue
        else:
            seen_rcs.add(rcs)

        date_line = rcs_p.getnext().xpath("string()")
        date = re.search(r'\d+/\d+/\d+', date_line).group(0)
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vtype = None
        counts = collections.defaultdict(int)
        votes = collections.defaultdict(list)

        # ignore anything before the YEAS section (e.g. earlier RCS noise)
        seen_yes = False

        for sib in header.xpath("following-sibling::p")[13:]:
            line = sib.xpath("string()").replace('\r\n', ' ').strip()
            if "*****" in line:
                break
            regex = (r'(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL '
                     r'PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)')
            match = re.match(regex, line)
            if match:
                if match.group(1) == 'YEAS' and 'RCS#' not in line:
                    vtype = 'yes'
                    seen_yes = True
                elif match.group(1) == 'NAYS' and seen_yes:
                    vtype = 'no'
                elif match.group(1) == 'VACANT':
                    continue  # skip these
                elif seen_yes:
                    vtype = 'other'
                # trailing text after the count means the layout broke
                if seen_yes and match.group(3).strip():
                    self.warning("Bad vote format, skipping.")
                    bad_vote = True
                counts[vtype] += int(match.group(2))
            elif seen_yes:
                # NOTE(review): this split literal straddled a mangled line
                # break in the original source; reconstructed as a
                # double-space delimiter -- confirm against upstream.
                for name in line.split('  '):
                    if not name:
                        continue
                    if 'HOUSE' in name or 'SENATE ' in name:
                        continue
                    votes[vtype].append(name.strip())

        if bad_vote:
            continue

        if passed is None:
            passed = counts['yes'] > (counts['no'] + counts['other'])

        vote = Vote(chamber=chamber,
                    start_date=date.strftime('%Y-%m-%d'),
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    bill=bill,
                    classification='passage')
        vote.set_count('yes', counts['yes'])
        vote.set_count('no', counts['no'])
        vote.set_count('other', counts['other'])
        # URL + roll-call number uniquely identifies this vote
        vote.pupa_id = url + '#' + rcs
        vote.add_source(url)

        for name in votes['yes']:
            vote.yes(name)
        for name in votes['no']:
            if ':' in name:
                raise Exception(name)
            vote.no(name)
        for name in votes['other']:
            vote.vote('other', name)

        yield vote
def handle_page(self):
    """Parse one committee-vote PDF page (self.lines) and yield a VoteEvent.

    Members' votes are column-positional: an X/VA/VC marker under the Yea
    columns is a yes, under the Nay columns a no; members with no marker
    are recorded as "other".

    Fix vs. the original: the ``'(\\s+X)+'`` literal is now a raw string --
    ``\\s`` in a plain literal is an invalid escape sequence
    (DeprecationWarning, future SyntaxError).
    """
    (_, motion) = self.lines[5].split("FINAL ACTION:")
    motion = motion.strip()
    if not motion:
        self.scraper.warning("Vote appears to be empty")
        return

    # the header row repeating "Yea  Nay" pairs fixes the column layout
    vote_top_row = [
        self.lines.index(x) for x in self.lines
        if re.search(r'^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$', x)
    ][0]
    yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
    nay_columns_begin = self.lines[vote_top_row].index("Nay")

    votes = {'yes': [], 'no': [], 'other': []}
    for line in self.lines[(vote_top_row + 1):]:
        if line.strip():
            member = re.search(
                r'''(?x)
                ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                ([A-Z][a-z]+            # Name must have lower-case characters
                [\w\-\s]+)              # Continue looking for the rest of the name
                (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                (?:\s{2,}.*)?           # Name ends when many spaces are seen
                ''', line).group(1)
            # sometimes members have trailing X's from other motions in the
            # vote sheet we aren't collecting
            member = re.sub(r'(\s+X)+', '', member)

            # Usually non-voting members won't even have a code listed
            # Only a couple of codes indicate an actual vote:
            # "VA" (vote after roll call) and "VC" (vote change)
            did_vote = bool(
                re.search(r'^\s+(X|VA|VC)\s+[A-Z][a-z]', line))
            if did_vote:
                # Check where the "X" or vote code is on the page
                vote_column = len(line) - len(line.lstrip())
                if vote_column <= yea_columns_end:
                    votes['yes'].append(member)
                elif vote_column >= nay_columns_begin:
                    votes['no'].append(member)
                else:
                    raise ValueError(
                        "Unparseable vote found for {0} in {1}:\n{2}".
                        format(member, self.url, line))
            else:
                votes['other'].append(member)

        # End loop as soon as no more members are found
        else:
            break

    totals = re.search(r'(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS',
                       self.text).groups()
    yes_count = int(totals[0])
    no_count = int(totals[1])
    result = 'pass' if (yes_count > no_count) else 'fail'

    vote = VoteEvent(start_date=self.kwargs['date'],
                     bill=self.kwargs['bill'],
                     chamber='upper',
                     motion_text=motion,
                     classification='committee',
                     result=result)
    vote.add_source(self.url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', len(votes['other']))

    # set voters
    for vtype, voters in votes.items():
        for voter in voters:
            vote.vote(vtype, voter)

    yield vote
def parse_html_vote(self, bill, actor, date, motion, url, uniqid):
    """Parse a UT HTML vote page; yield a Vote (delegating committee votes).

    :param uniqid: site-assigned vote id, stored as identifier and extras
    """
    try:
        page = self.get(url).text
    except scrapelib.HTTPError:
        self.warning("A vote page not found for bill {}".format(
            bill.identifier))
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    descr = page.xpath("//b")[0].text_content()
    if descr == '':
        # New page method
        descr = page.xpath("//div[@id='content']/center")[0].text

    if "on voice vote" in descr:
        return

    # committee roll calls use a different layout; hand off and stop
    if "committee" in descr.lower():
        yield from self.scrape_committee_vote(bill, actor, date, motion,
                                              page, url, uniqid)
        return

    passed = None
    if "Passed" in descr:
        passed = True
    elif "Failed" in descr:
        passed = False
    elif "UTAH STATE LEGISLATURE" in descr:
        return
    elif descr.strip() == '-':
        return
    else:
        self.warning(descr)
        raise NotImplementedError("Can't see if we passed or failed")

    # each bold heading ("Yeas - N") is paired with the table below it
    headings = page.xpath("//b")[1:]
    votes = page.xpath("//table")
    sets = zip(headings, votes)
    vdict = {}
    for (typ, votes) in sets:
        txt = typ.text_content()
        arr = [x.strip() for x in txt.split("-", 1)]
        if len(arr) != 2:
            continue
        v_txt, count = arr
        v_txt = v_txt.strip()
        count = int(count)
        people = [
            x.text_content().strip()
            for x in votes.xpath(".//font[@face='Arial']")
        ]
        vdict[v_txt] = {"count": count, "people": people}

    vote = Vote(chamber=actor,
                start_date=date,
                motion_text=motion,
                result='pass' if passed else 'fail',
                bill=bill,
                classification='passage',
                identifier=str(uniqid))
    vote.set_count('yes', vdict['Yeas']['count'])
    vote.set_count('no', vdict['Nays']['count'])
    vote.set_count('other', vdict['Absent or not voting']['count'])
    vote.add_source(url)

    for person in vdict['Yeas']['people']:
        vote.yes(person)
    for person in vdict['Nays']['people']:
        vote.no(person)
    for person in vdict['Absent or not voting']['people']:
        vote.vote('other', person)

    yield vote
def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                action_text):
    """Fetch an AL roll-call result page and yield a VoteEvent for *bill*."""
    url = ('http://alisondb.legislature.state.al.us/Alison/'
           'GetRollCallVoteResults.aspx?'
           'VOTE={0}&BODY={1}&INST={2}&SESS={3}'.
           format(vote_id, vote_chamber, bill_id, self.session_id))
    doc = lxml.html.fromstring(self.get(url=url).text)

    voters = {'Y': [], 'N': [], 'P': [], 'A': []}

    # text nodes alternate: member name, then that member's vote code
    voters_and_votes = doc.xpath('//table/tr/td/font/text()')
    capture_vote = False
    name = ''
    for item in voters_and_votes:
        if capture_vote:
            capture_vote = False
            if name:
                voters[item].append(name)
        else:
            capture_vote = True
            name = item
            # skip vacancies, totals rows, and blank cells
            if (name.endswith(", Vacant") or
                    name.startswith("Total ") or
                    not name.strip()):
                name = ''

    # Check name counts against totals listed on the site
    # NOTE(review): `assert` is stripped under python -O; these sanity
    # checks silently vanish in optimized runs.
    total_yea = doc.xpath('//*[starts-with(text(), "Total Yea")]/text()')
    if total_yea:
        total_yea = int(total_yea[0].split(":")[-1])
        assert total_yea == len(voters['Y']), "Yea count incorrect"
    else:
        total_yea = len(voters['Y'])

    total_nay = doc.xpath('//*[starts-with(text(), "Total Nay")]/text()')
    if total_nay:
        total_nay = int(total_nay[0].split(":")[-1])
        assert total_nay == len(voters['N']), "Nay count incorrect"
    else:
        total_nay = len(voters['N'])

    total_absent = doc.xpath(
        '//*[starts-with(text(), "Total Absent")]/text()')
    if total_absent:
        total_absent = int(total_absent[0].split(":")[-1])
        assert total_absent == len(voters['A']), "Absent count incorrect"
    # P (pass/present) and A (absent) are merged into "other"
    total_other = len(voters['P']) + len(voters['A'])

    vote = VoteEvent(
        chamber=self.CHAMBERS[vote_chamber[0]],
        start_date=vote_date,
        motion_text=action_text,
        result='pass' if total_yea > total_nay else 'fail',
        classification='passage',
        bill=bill,
    )
    vote.set_count('yes', total_yea)
    vote.set_count('no', total_nay)
    vote.set_count('other', total_other)
    vote.add_source(url)
    for member in voters['Y']:
        vote.vote('yes', member)
    for member in voters['N']:
        vote.vote('no', member)
    for member in (voters['A'] + voters['P']):
        vote.vote('other', member)

    yield vote
def scrape_votes_for_chamber(self, chamber, vote_data, bill, link):
    """Parse the journal-style vote blob for one chamber; yield VoteEvents.

    Fix vs. the original: ``motion.replace("&", "&")`` was a no-op; per its
    own "un-escape ampersands" comment it must target the HTML entity
    ``&amp;``.
    """
    raw_vote_data = re.split(r"\w+? by [\w ]+?\s+-", vote_data.strip())[1:]
    motion_count = 1
    for raw_vote in raw_vote_data:
        # fields are separated by runs of ten non-breaking spaces
        raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
        motion = raw_vote[0]

        vote_date = re.search(r"(\d+/\d+/\d+)", motion)
        if vote_date:
            vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")

        passed = (
            "Passed" in motion
            or "Recommended for passage" in motion
            or "Rec. for pass" in motion
            or "Adopted" in raw_vote[1]
        )

        vote_regex = re.compile(r"\d+$")
        aye_regex = re.compile(r"^.+voting aye were: (.+) -")
        no_regex = re.compile(r"^.+voting no were: (.+) -")
        not_voting_regex = re.compile(r"^.+present and not voting were: (.+) -")

        yes_count = 0
        no_count = 0
        not_voting_count = 0
        ayes = []
        nos = []
        not_voting = []

        for v in raw_vote[1:]:
            v = v.strip()
            if v.startswith("Ayes...") and vote_regex.search(v):
                yes_count = int(vote_regex.search(v).group())
            elif v.startswith("Noes...") and vote_regex.search(v):
                no_count = int(vote_regex.search(v).group())
            elif v.startswith("Present and not voting...") and vote_regex.search(v):
                not_voting_count += int(vote_regex.search(v).group())
            elif aye_regex.search(v):
                ayes = aye_regex.search(v).groups()[0].split(", ")
            elif no_regex.search(v):
                nos = no_regex.search(v).groups()[0].split(", ")
            elif not_voting_regex.search(v):
                not_voting += not_voting_regex.search(v).groups()[0].split(", ")

        motion = motion.strip()
        motion = motion.replace("&amp;", "&")  # un-escape ampersands
        # disambiguate repeated motion names with a counter suffix
        if motion in self._seen_votes:
            motion = "{} ({})".format(motion, motion_count)
            motion_count += 1
        self._seen_votes.add(motion)

        vote = VoteEvent(
            motion_text=motion,
            start_date=vote_date.strftime("%Y-%m-%d") if vote_date else None,
            classification="passage",
            result="pass" if passed else "fail",
            chamber=chamber,
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", not_voting_count)
        vote.add_source(link)

        # de-duplicate members who appear in more than one list
        seen = set()
        for a in ayes:
            if a in seen:
                continue
            vote.yes(a)
            seen.add(a)
        for n in nos:
            if n in seen:
                continue
            vote.no(n)
            seen.add(n)
        for n in not_voting:
            if n in seen:
                continue
            vote.vote("not voting", n)
            seen.add(n)

        yield vote
def scrape_bill(self, chamber, session, bill_id):
    """Scrape a MI bill page: Bill, sponsors, actions, votes, versions, docs.

    Yields VoteEvents (for roll-call actions) followed by the Bill itself.

    Fixes vs. the original: the three regex literals containing ``\\d`` are
    now raw strings (invalid escape sequences in plain literals are a
    DeprecationWarning today, a SyntaxError in future Pythons).
    """
    # try and get bill for current year
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # if first page isn't found, try second year
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        html = self.get('http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            return

    doc = lxml.html.fromstring(html)

    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        if len(sponsors) > 1:
            # tail text after the link marks the primary sponsor
            classification = (
                'primary'
                if sponsor.tail and 'primary' in sponsor.tail
                else 'cosponsor'
            )
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(
            datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                results = self.parse_roll_call(vote_url, rc_num)

                vote = VoteEvent(
                    start_date=date,
                    chamber=actor,
                    bill=bill,
                    motion_text=action,
                    result='pass' if len(results['yes']) >
                           len(results['no']) else 'fail',
                    classification='passage',
                )

                # check the expected counts vs actual
                count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(results['yes']):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(results['yes'])))
                count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(results['no']):
                    self.warning(
                        'vote count mismatch for %s %s, %d != %d' %
                        (bill_id, action, count, len(results['no'])))

                vote.set_count('yes', len(results['yes']))
                vote.set_count('no', len(results['no']))
                vote.set_count('other', len(results['other']))
                for name in results['yes']:
                    vote.yes(name)
                for name in results['no']:
                    vote.no(name)
                for name in results['other']:
                    vote.vote('other', name)

                vote.add_source(vote_url)
                yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath(
            '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            if url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif url.endswith('.htm'):
                mimetype = 'text/html'
            # NOTE(review): mimetype is unbound (NameError) for any other
            # extension -- confirm version links only end in .pdf/.htm
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def scrape(self, window=28) :
    """Scrape bills (and their votes) from the Legistar API.

    :param window: look-back window in days; only matters introduced or
        updated within the last `window` days are scraped.
    :yields: VoteEvent objects (one per action with a recorded vote),
        then the Bill itself.
    """
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    for matter in self.matters(n_days_ago) :
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        # Skip matters missing any of the fields required to build a Bill.
        if not all((date, title, identifier)) :
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # Identifiers prefixed with 'S' are stripped of the prefix; the
        # original form is kept as an alternate identifier.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name":"Board of Directors"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        # Actions; a recorded vote on an action becomes a VoteEvent.
        for action, vote in self.actions(matter_id) :
            act = bill.add_action(**action)

            if action['description'] == 'Referred' :
                body_name = matter['MatterBodyName']
                act.add_related_entity(body_name,
                                       'organization',
                                       entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result :
                vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                       motion_text=action['description'],
                                       organization=action['organization'],
                                       classification=None,
                                       start_date=action['date'],
                                       result=result,
                                       bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes :
                    raw_option = vote['VoteValueName'].lower()
                    # Normalize the vote option via VOTE_OPTIONS, falling
                    # back to the raw (lowercased) value.
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id) :
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id) :
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and
                # the 'MatterIntroDate' (i.e., to determine its legislative session).
                # Sometimes, the related bill does not yet exist: in this
                # case, the API raises, and we skip the relation.
                related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                # Currently, the relation type for bills can be one of a few
                # possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')

        bill.add_version_link('Board Report',
                              'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                              media_type="application/pdf")

        for attachment in self.attachments(matter_id) :
            if attachment['MatterAttachmentName'] :
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type="application/pdf")

        bill.extras = {'local_classification' : matter['MatterTypeName']}

        text = self.text(matter_id)
        if text :
            if text['MatterTextPlain'] :
                bill.extras['plain_text'] = text['MatterTextPlain']
            if text['MatterTextRtf'] :
                # NUL characters are not valid in stored text; strip them.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
def scrape_house_vote(self, bill, url):
    """Parse a House roll-call PDF at `url` and yield a Vote for `bill`.

    Fixes relative to the previous revision:
    - `motion`, `passed`, and the counts are now initialized before the
      parsing loop; previously a PDF that contained a date but no
      "YEAS:/NAYS:" count line raised NameError at Vote construction.
    - The "paired" branch matched against the whole `line` instead of the
      current `part`, so every part produced the same first pair; it now
      parses each part individually.
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return
    text = convert_pdf(filename, 'text')
    os.remove(filename)

    lines = text.splitlines()

    vote_type = None
    votes = collections.defaultdict(list)
    date = None
    # Initialize so a document that never yields a count line cannot
    # raise NameError when the Vote is built below.
    motion = None
    passed = None
    yes_count = no_count = other_count = 0

    for idx, line in enumerate(lines):
        line = line.rstrip().decode('utf-8')

        # Vote date, e.g. "... 03/14/2017" at end of line.
        match = re.search(r'(\d+)/(\d+)/(\d{4,4})$', line)
        if match:
            date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
            continue

        # Count line, e.g. "  YEAS: 90  NAYS: 60  NOT VOTING: 10 ... PASSED"
        match = re.match(
            r'\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)', line)
        if match:
            # Motion text conventionally sits two lines above the counts.
            motion = (lines[idx - 2].strip()).decode('utf-8')
            if not motion:
                self.warning("No motion text found for vote")
                motion = "PASSAGE"

            yes_count, no_count, other_count = [
                int(g) for g in match.groups()]

            exc_match = re.search(r'EXCUSED: (\d+)', line)
            if exc_match:
                other_count += int(exc_match.group(1))

            passed = line.endswith('ADOPTED') or line.endswith('PASSED')
            continue

        # Section header switching which bucket subsequent names go into.
        match = re.match(
            r'(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$', line)
        if match:
            vote_type = {'YEAS': 'yes', 'NAYS': 'no',
                         'NOT VOTING': 'other',
                         'EXCUSED': 'other',
                         'PAIRED': 'paired'}[match.group(1)]
            continue

        if vote_type == 'paired':
            for part in line.split(' '):
                part = part.strip()
                if not part:
                    continue
                # Each part looks like "Name(YEA)" / "Name(NAY)".
                # (Bug fix: previously matched against `line`, not `part`.)
                name, pair_type = re.match(
                    r'([^\(]+)\((YEA|NAY)\)', part).groups()
                name = name.strip()
                if pair_type == 'YEA':
                    votes['yes'].append(name)
                elif pair_type == 'NAY':
                    votes['no'].append(name)
        elif vote_type:
            for name in line.split(' '):
                name = name.strip()
                if not name:
                    continue
                votes[vote_type].append(name)

    if date and passed is not None:
        vote = Vote(chamber='lower',
                    start_date=date.strftime("%Y-%m-%d"),
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    classification='passage',
                    bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        for key, values in votes.items():
            for value in values:
                vote.vote(key, value)

        yield vote
    else:
        self.warning("Syntax Error/Warning using 'convert_pdf'")
def scrape_journal(self, url, chamber, session, date):
    """Parse a chamber journal PDF and yield a VoteEvent per roll call.

    Fixes relative to the previous revision: removed a leftover debug
    `print(line)`, hoisted the redundant in-loop `bill_id is None` guard
    out of the replace loop, and replaced the dead `result = ""` init
    with a conditional expression. Behavior is otherwise unchanged.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)
    all_text = convert_pdf(filename, type="text")

    lines = all_text.split(b'\n')
    lines = [line.decode('utf-8') for line in lines]
    # Normalize the odd dash/quote glyphs the PDF extractor emits.
    lines = [line.
             strip().
             replace('–', '-').
             replace('―', '"').
             replace('‖', '"').
             replace('“', '"').
             replace('”', '"')
             for line in lines]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter([line for line in lines if not(
        line == "" or
        re.match(header_date_re, line) or
        re.match(header_journal_re, line))])

    for line in lines:
        # Go through with vote parse if any of
        # these conditions match.
        if not line.startswith("On the question") or \
                "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*\?"?(\s{})?\s*'.format(bill_re)

        # Motions wrap across physical lines; accumulate until complete.
        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning("This motion did not pertain to legislation: {}".
                         format(line))
            continue

        # Get the motion text
        motion_re = r'''
            ^On\sthe\squestion\s                # Precedes any motion
            "+                                  # Motion is preceded by a quote mark (or two)
            (Shall\s.+?\??)                     # The motion text begins with "Shall"
            \s*"\s+                             # Motion is followed by a quote mark
            (?:{})?                             # If the vote regards a bill, its number is listed
            {}                                  # Senate has trailing text
            \s*$
            '''.format(
            bill_re,
            r',?.*?the\svote\swas:' if chamber == 'upper' else ''
        )
        motion = re.search(motion_re, line,
                           re.VERBOSE | re.IGNORECASE).group(1)

        if bill_id is None:
            return
        # Normalize e.g. "Senate File 123" -> "SF 123" -> "SF123".
        for word, letter in (('Senate', 'S'),
                             ('House', 'H'),
                             ('File', 'F')):
            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace('.', '')

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        self.current_id = bill_id
        votes, passed = self.parse_votes(lines)

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if not ((passed == (votes['yes_count'] > votes['no_count'])) or (not passed)):
            self.error("The bill passed without a majority?")
            raise ValueError('invalid vote')

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and votes['yes_count'] > votes['no_count']:
            self.logger.warning("The bill got a majority but did not pass. "
                                "Could be worth confirming.")

        result = "pass" if passed else "fail"

        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=re.sub('\xad', '-', motion),
                         result=result,
                         classification='passage',
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=bill_chamber
                         )
        # add votes and counts
        for vtype in ('yes', 'no', 'absent', 'abstain'):
            vcount = votes['{}_count'.format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes['{}_votes'.format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)
        yield vote
def scrape_journal(self, url, chamber, session, date):
    """Parse a chamber journal PDF and yield a VoteEvent per roll call.

    Fixes relative to the previous revision: removed a leftover debug
    `print(line)`, hoisted the redundant in-loop `bill_id is None` guard
    out of the replace loop, and replaced the dead `result = ""` init
    with a conditional expression. Behavior is otherwise unchanged.
    """
    filename, response = self.urlretrieve(url)
    self.logger.info('Saved journal to %r' % filename)
    all_text = convert_pdf(filename, type="text")

    lines = all_text.split(b'\n')
    lines = [line.decode('utf-8') for line in lines]
    # Normalize the odd dash/quote glyphs the PDF extractor emits.
    lines = [
        line.strip().replace('–', '-').replace('―', '"').replace(
            '‖', '"').replace('“', '"').replace('”', '"') for line in lines
    ]

    # Do not process headers or completely empty lines
    header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
    header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
    lines = iter([
        line for line in lines
        if not (line == "" or re.match(header_date_re, line)
                or re.match(header_journal_re, line))
    ])

    for line in lines:
        # Go through with vote parse if any of
        # these conditions match.
        if not line.startswith("On the question") or \
                "shall" not in line.lower():
            continue

        # Get the bill_id
        bill_id = None
        bill_re = r'\(\s*([A-Z\.]+\s\d+)\s*\)'

        # The Senate ends its motion text with a vote announcement
        if chamber == "upper":
            end_of_motion_re = r'.* the vote was:\s*'
        # The House may or may not end motion text with a bill name
        elif chamber == "lower":
            end_of_motion_re = r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(
                bill_re)

        # Motions wrap across physical lines; accumulate until complete.
        while not re.match(end_of_motion_re, line, re.IGNORECASE):
            line += " " + next(lines)

        try:
            bill_id = re.search(bill_re, line).group(1)
        except AttributeError:
            self.warning(
                "This motion did not pertain to legislation: {}".format(
                    line))
            continue

        # Get the motion text
        motion_re = r'''
            ^On\sthe\squestion\s                # Precedes any motion
            "+                                  # Motion is preceded by a quote mark (or two)
            (Shall\s.+?\??)                     # The motion text begins with "Shall"
            \s*(?:\?"?|"|’)\s+                  # Motion is followed by a question mark and/or a quote mark
            (?:{})?                             # If the vote regards a bill, its number is listed
            {}                                  # Senate has trailing text
            \s*$
            '''.format(
            bill_re, r',?.*?the\svote\swas:' if chamber == 'upper' else '')
        motion = re.search(motion_re, line,
                           re.VERBOSE | re.IGNORECASE).group(1)

        if bill_id is None:
            return
        # Normalize e.g. "Senate File 123" -> "SF 123" -> "SF123".
        for word, letter in (('Senate', 'S'),
                             ('House', 'H'),
                             ('File', 'F')):
            bill_id = bill_id.replace(word, letter)
        bill_id = bill_id.replace('.', '')

        bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
        votes, passed = self.parse_votes(lines)

        # at the very least, there should be a majority
        # for the bill to have passed, so check that,
        # but if the bill didn't pass, it could still be OK if it got a majority
        # eg constitutional amendments
        if not ((passed == (votes['yes_count'] > votes['no_count']))
                or (not passed)):
            self.error("The bill passed without a majority?")
            raise ValueError('invalid vote')

        # also throw a warning if the bill failed but got a majority
        # it could be OK, but is probably something we'd want to check
        if not passed and votes['yes_count'] > votes['no_count']:
            self.logger.warning(
                "The bill got a majority but did not pass. "
                "Could be worth confirming.")

        result = "pass" if passed else "fail"

        vote = VoteEvent(chamber=chamber,
                         start_date=date,
                         motion_text=re.sub('\xad', '-', motion),
                         result=result,
                         classification='passage',
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=bill_chamber)

        # add votes and counts
        for vtype in ('yes', 'no', 'absent', 'abstain'):
            vcount = votes['{}_count'.format(vtype)] or 0
            vote.set_count(vtype, vcount)
            for voter in votes['{}_votes'.format(vtype)]:
                vote.vote(vtype, voter)

        vote.add_source(url)
        yield vote
def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                     committee_abbr_regex=get_committee_name_regex()):
    """Scrape all CA bills of one type for a session from the mirrored DB.

    Yields a VoteEvent for each recorded vote, then the Bill itself.

    Fixes relative to the previous revision:
    - `committees = filter(None, committees)` produced a one-shot iterator
      that the `len(list(committees))` assertion exhausted, so the
      subsequent `zip(committees, matched_abbrs)` was always empty and
      committee abbreviations were never expanded in the action string.
      It is now materialized as a list.
    - Debug `print` on a bad id/chamber pair is now `self.warning`.
    - Removed a dead `type_ = []` rebinding inside the action loop.
    """
    bills = self.session.query(CABill).filter_by(
        session_year=session).filter_by(
        measure_type=type_abbr)

    for bill in bills:
        bill_session = session
        if bill.session_num != '0':
            bill_session += ' Special Session %s' % bill.session_num

        bill_id = bill.short_bill_id

        fsbill = Bill(bill_id, session, title='', chamber=chamber)
        if ((bill_id.startswith('S') and chamber == 'lower') or
                (bill_id.startswith('A') and chamber == 'upper')):
            self.warning("Bad ID/chamber pair for %s" % bill)
            continue

        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]

        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)

        # Construct a fake source url
        source_url = ('http://leginfo.legislature.ca.gov/faces/'
                      'billNavClient.xhtml?bill_id=%s') % bill.bill_id

        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type='text/html')

        title = ''
        type_ = ['bill']
        subject = ''
        all_titles = set()

        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = '//caml:DigestText/xhtml:p'
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r'\s+', ' ', t)
                # Ensure a space after every closing paren.
                t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                chunks.append(t)
            summary = '\n\n'.join(chunks)

        for version in bill.versions:
            if not version.bill_xml:
                continue

            version_date = self._tz.localize(version.bill_version_action_date)

            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime('%m/%d/%y')
            version_name = "{} - {}".format(
                version_date_human, version.bill_version_action)

            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)

            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type='application/pdf',
                date=version_date.date())

            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ('AB', 'SB'):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(version.short_title) and \
                        not version.title.lower().startswith('an act'):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)

            if title:
                all_titles.add(title)

            type_ = [bill_type]

            if version.appropriation == 'Yes':
                type_.append('appropriation')

            tags = []
            if version.fiscal_committee == 'Yes':
                tags.append('fiscal committee')
            if version.local_program == 'Yes':
                tags.append('local program')
            if version.urgency == 'Yes':
                tags.append('urgency')
            if version.taxlevy == 'Yes':
                tags.append('tax levy')

            if version.subject:
                subject = clean_title(version.subject)

        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue

        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note='summary')
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras['impact_clause'] = impact_clause
        fsbill.extras['tags'] = tags

        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)

        # Sponsors come from the last version seen in the loop above.
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == 'Y',
                entity_type='person',
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
            if match:
                actor = {'Assembly': 'lower',
                         'Senate': 'upper'}[match.group(1)]
            elif actor.startswith('Governor'):
                actor = 'executive'
            else:
                def replacer(matchobj):
                    if matchobj:
                        return {'Assembly': 'lower',
                                'Senate': 'upper'}[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

            act_str = action.action
            act_str = re.sub(r'\s+', ' ', act_str)

            attrs = self.categorizer.categorize(act_str)

            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)

            if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                msg = 'Failed to extract committee abbr from %r.'
                self.logger.warning(msg % action.action)

            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ('Mapping contains no committee name for '
                               'abbreviation %r. Action text was %r.')
                        args = (abbr, action.action)
                        raise KeyError(msg % args)

                # Materialize as a list: a bare filter() iterator would be
                # exhausted by the len() assertion below, leaving zip() empty.
                committees = list(filter(None, committees))
                kwargs['committees'] = committees

                code = re.search(r'C[SXZ]\d+', actor)
                if code is not None:
                    code = code.group()
                    kwargs['actor_info'] = {'committee_code': code}

                assert len(committees) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace('Coms. on ', '')
                    act_str = act_str.replace('Com. on ' + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith('.'):
                        act_str = act_str + '.'

            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ['upper', 'lower', 'legislature']:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = 'legislature'

            if actor != action.actor:
                actor_info = kwargs.get('actor_info', {})
                actor_info['details'] = action.actor
                kwargs['actor_info'] = actor_info

            # Add strings for related legislators, if any.
            rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs['legislators'] = legislators

            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue

            kwargs.update(self.categorizer.categorize(act_str))

            action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'),
                                       chamber=actor,
                                       classification=kwargs['classification'])
            for committee in kwargs.get('committees', []):
                action.add_related_entity(
                    committee, entity_type='organization')
            seen_actions.add((actor, act_str, date))

        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == '(PASS)':
                result = True
            else:
                result = False

            if not vote.location:
                continue

            full_loc = vote.location.description
            first_part = full_loc.split(' ')[0].lower()
            if first_part in ['asm', 'assembly']:
                vote_chamber = 'lower'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith('sen'):
                vote_chamber = 'upper'
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)

            if vote.motion:
                motion = vote.motion.motion_text or ''
            else:
                motion = ''

            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = 'passage'
            elif "Do Pass" in motion:
                vtype = 'passage'
            else:
                vtype = 'other'

            motion = motion.strip()

            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                re.IGNORECASE).sub('', motion)
            motion = re.compile(r'^(Senate|Assembly) ',
                                re.IGNORECASE).sub('', motion)
            motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ',
                            '', motion)
            motion = re.sub(r' \(\w+\)$', '', motion)
            motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                            '', motion)
            motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                            r'Urgency Clause$',
                            '(Urgency Clause)', motion)
            motion = re.sub(r'\s+', ' ', motion)

            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue

            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }

            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result='pass' if result else 'fail',
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {'threshold': vote.threshold}

            source_url = (
                'http://leginfo.legislature.ca.gov/faces'
                '/billVotesClient.xhtml?bill_id={}'
            ).format(fsbill.identifier)
            fsvote.add_source(source_url)
            fsvote.pupa_id = source_url + '#' + str(vote_num)

            rc = {'yes': [], 'no': [], 'other': []}
            for record in vote.votes:
                if record.vote_code == 'AYE':
                    rc['yes'].append(record.legislator_name)
                elif record.vote_code.startswith('NO'):
                    rc['no'].append(record.legislator_name)
                else:
                    rc['other'].append(record.legislator_name)

            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))

            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))

            yield fsvote

        yield fsbill
    self.session.expire_all()
def scrape_vote(self, bill, name, url):
    """Parse a CT roll-call page at `url` and yield a Vote for `bill`.

    House vote pages ("VOTE/h" in the URL) lay names out in four column
    groups; Senate pages use two. `name` is used as the motion text.
    """
    if "VOTE/h" in url:
        vote_chamber = 'lower'
        cols = (1, 5, 9, 13)
        name_offset = 3
        yes_offset = 0
        no_offset = 1
    else:
        vote_chamber = 'upper'
        cols = (1, 6)
        name_offset = 4
        yes_offset = 1
        no_offset = 2

    raw = self.get(url, verify=False).text
    if 'BUDGET ADDRESS' in raw:
        return
    doc = lxml.html.fromstring(raw)

    def span_number(label):
        # First integer inside the <span> whose text contains `label`.
        blob = doc.xpath("string(//span[contains(., '%s')])" % label)
        return int(re.match(r'[^\d]*(\d+)[^\d]*', blob).group(1))

    yes_count = span_number('Those voting Yea')
    no_count = span_number('Those voting Nay')
    other_count = span_number('Those absent')
    need_count = span_number('Necessary for')

    # The page only gives month/day; the year comes from the session.
    taken_on = doc.xpath("string(//span[contains(., 'Taken on')])")
    day_part = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', taken_on).group(1)
    day_part = day_part.replace(' ', '')
    date = datetime.datetime.strptime(
        day_part + " " + bill.legislative_session, "%m/%d %Y").date()

    # not sure about classification.
    vote = Vote(chamber=vote_chamber,
                start_date=date,
                motion_text=name,
                result='pass' if yes_count > need_count else 'fail',
                classification='passage',
                bill=bill
                )
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('other', other_count)
    vote.add_source(url)

    table = doc.xpath("//table")[0]
    for row in table.xpath("tr"):
        for col in cols:
            voter = row.xpath("string(td[%d])" % (col + name_offset)).strip()
            if not voter or voter == 'VACANT':
                continue
            if "Y" in row.xpath("string(td[%d])" % (col + yes_offset)):
                vote.yes(voter)
            elif "N" in row.xpath("string(td[%d])" % (col + no_offset)):
                vote.no(voter)
            else:
                vote.vote('other', voter)

    yield vote
def _parse_votes(self, url, vote, bill):
    '''Given a vote url and a vote object, extract the voters and
    the vote counts from the vote page and update the vote object.

    Returns a VoteEvent (or the result of PDFCommitteeVote.asvote() for
    PDF documents), or None when the document is missing/unparseable.
    `vote` is a dict with at least 'action', 'chamber', 'date' and
    'vote_url' keys — TODO confirm full expected schema against caller.
    '''
    if url.lower().endswith('.pdf'):

        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion.
    try:
        motion = doc.xpath('//br')[-1].tail.strip()
    except IndexError:
        # Some of them mysteriously have no motion listed.
        motion = vote['action']

    if not motion:
        motion = vote['action']

    vote['motion'] = motion

    action = vote['action']
    vote_url = vote['vote_url']

    # Rebinds `vote` from the input dict to the VoteEvent being built.
    vote = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder, fixed up from `action` below
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote.pupa_id = vote_url    # URL contains sequence number
    vote.add_source(vote_url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    # Voter rows look like "<code>\xa0<name>", e.g. "Y\xa0Kary, Douglas".
    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue
        v, name = filter(None, text.split(u'\xa0'))
        # Considering Name is brackets as short name
        regex = re.compile(r".*?\((.*?)\)")
        short_name = re.findall(regex, name)
        if len(short_name) > 0:
            note = 'Short Name: ' + short_name[0]
        else:
            note = ''
        # Name without brackets like 'Kary, Douglas'
        name = re.sub(r"[\(\[].*?[\)\]]", "", name)
        if v == 'Y':
            vote.yes(name, note=note)
        elif v == 'N':
            vote.no(name, note=note)
        elif v == 'E':
            vote.vote('excused', name, note=note)
        elif v == 'A':
            vote.vote('absent', name, note=note)

    # code to deterimine value of `passed`
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation:  originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote.result = 'pass' if passed else 'fail'
    return vote
def _parse_votes(self, url, vote, bill):
    '''Given a vote url and a vote object, extract the voters and
    the vote counts from the vote page and update the vote object.

    Returns a VoteEvent (or the result of PDFCommitteeVote.asvote() for
    PDF documents), or None when the document is missing/unparseable.

    Fixes relative to the previous revision: the bare `except:` around
    the motion lookup is narrowed to the exceptions that lookup can
    actually raise (IndexError for an empty `//br` result, AttributeError
    for a missing/None tail), matching the sibling implementation; regex
    literals are raw strings (the old `"\\("`-style escapes are invalid
    escape sequences in Python 3); log-message typo fixed.
    '''
    if url.lower().endswith('.pdf'):

        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Couldn't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion.
    try:
        motion = doc.xpath('//br')[-1].tail.strip()
    except (IndexError, AttributeError):
        # Some of them mysteriously have no motion listed.
        motion = vote['action']

    if not motion:
        motion = vote['action']

    vote['motion'] = motion

    action = vote['action']
    vote_url = vote['vote_url']

    # Rebinds `vote` from the input dict to the VoteEvent being built.
    vote = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder, fixed up from `action` below
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote.pupa_id = vote_url    # URL contains sequence number
    vote.add_source(vote_url)
    vote.set_count('yes', yes_count)
    vote.set_count('no', no_count)
    vote.set_count('excused', excused_count)
    vote.set_count('absent', absent_count)

    # Voter rows look like "<code>\xa0<name>", e.g. "Y\xa0Kary, Douglas".
    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue
        v, name = filter(None, text.split(u'\xa0'))
        # Considering Name is brackets as short name
        regex = re.compile(r".*?\((.*?)\)")
        short_name = re.findall(regex, name)
        if len(short_name) > 0:
            note = 'Short Name: ' + short_name[0]
        else:
            note = ''
        # Name without brackets like 'Kary, Douglas'
        name = re.sub(r"[\(\[].*?[\)\]]", "", name)
        if v == 'Y':
            vote.yes(name, note=note)
        elif v == 'N':
            vote.no(name, note=note)
        elif v == 'E':
            vote.vote('excused', name, note=note)
        elif v == 'A':
            vote.vote('absent', name, note=note)

    # code to determine value of `passed`
    passed = None

    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    for i in vote_passage_indicators:
        if i in action:
            passed = True
            break
    for i in vote_failure_indicators:
        if i in action and passed:
            # a quick explanation:  originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # ounumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
                break
            else:
                raise Exception("passage and failure indicator"
                                "both present at: %s" % url)
        if i in action and passed is None:
            passed = False
            break
    for i in vote_ambiguous_indicators:
        if i in action:
            passed = yes_count > no_count
            break
    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote.result = 'pass' if passed else 'fail'
    return vote
def scrape(self):
    """Scrape NYC Council legislation from Legistar.

    Yields a VoteEvent for each action with a recorded vote, then the
    Bill itself. The bill's legislative session is inferred from its
    earliest action date (falling back to the first known session).
    """
    for leg_summary in self.legislation(
            created_after=datetime.datetime(2014, 1, 1)):
        leg_type = BILL_TYPES[leg_summary['Type']]

        # NOTE: Legistar column headers use non-breaking spaces, hence
        # the \xa0 in keys like 'File\xa0#'.
        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,
                    classification=leg_type,
                    from_organization={"name": "New York City Council"})
        bill.add_source(leg_summary['url'], note='web')

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

        history = list(history)

        if history:
            # Session is derived from the earliest action date.
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue

            action_class = ACTION_CLASSIFICATION[action_description]
            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            # Town Hall Meeting actions are not recorded on the bill.
            if responsible_org == 'Town Hall Meeting':
                continue
            else:
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

            if 'url' in action['Action\xa0Details']:
                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral':
                    action_details = self.actionDetails(action_detail_url)
                    # Committee name is the trailing part of e.g.
                    # "... referred to the Committee on Finance".
                    referred_committee = action_details[
                        'Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if result and votes:
                    action_vote = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action_description,
                        organization={'name': responsible_org},
                        classification=action_class,
                        start_date=action_date,
                        result=result,
                        bill=bill)
                    action_vote.add_source(action_detail_url, note='web')
                    for option, voter in votes:
                        action_vote.vote(option, voter)

                    yield action_vote

        text = self.text(leg_summary['url'])

        if text:
            bill.extras = {
                'local_classification': leg_summary['Type'],
                'full_text': text
            }
        else:
            bill.extras = {'local_classification': leg_summary['Type']}

        yield bill
def scrape_pdf_for_votes(self, session, actor, date, motion, href):
    """Parse a roll-call PDF at *href* into a VoteEvent.

    Returns the populated VoteEvent, or ``False`` when the PDF could not
    be fetched/split into lines. ``actor`` is an organization dict whose
    ``classification`` distinguishes chamber behavior (see pass/fail
    fallback below).
    """
    warned = False
    # vote indicator, a few spaces, a name, newline or multiple spaces
    # VOTE_RE = re.compile('(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')
    # Tally line, e.g. "112 YEAS 0 NAYS 3 PRESENT" with an optional
    # trailing "N NOT VOTING" group.
    COUNT_RE = re.compile(
        r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$'
    )
    # NOTE(review): 'PREVAILED' mapping to 'fail' looks surprising —
    # presumably tied to the specific motions this appears on; confirm
    # against sample PDFs before changing.
    PASS_FAIL_WORDS = {
        'PASSED': 'pass',
        'PREVAILED': 'fail',
        'ADOPTED': 'pass',
        'CONCURRED': 'pass',
        'FAILED': 'fail',
        'LOST': 'fail',
    }
    pdflines = self.fetch_pdf_lines(href)
    if not pdflines:
        return False
    yes_count = no_count = present_count = 0
    yes_votes = []
    no_votes = []
    present_votes = []
    excused_votes = []
    not_voting = []
    absent_votes = []
    passed = None
    counts_found = False
    vote_lines = []
    for line in pdflines:
        # consider pass/fail as a document property instead of a result of the vote count
        # extract the vote count from the document instead of just using counts of names
        if not line.strip():
            continue
        elif line.strip() in PASS_FAIL_WORDS:
            # Crash on duplicate pass/fail status that differs from previous status
            if passed is not None and passed != PASS_FAIL_WORDS[
                    line.strip()]:
                raise Exception("Duplicate pass/fail matches in [%s]" % href)
            passed = PASS_FAIL_WORDS[line.strip()]
        elif COUNT_RE.match(line):
            # not_voting_count is captured here but never used below.
            (yes_count, no_count, present_count,
             not_voting_count) = COUNT_RE.match(line).groups()
            yes_count = int(yes_count)
            no_count = int(no_count)
            present_count = int(present_count)
            counts_found = True
        elif counts_found:
            # Individual vote lines only appear after the tally line;
            # keep any line starting with a known vote code.
            for value in VOTE_VALUES:
                if re.search(r'^\s*({})\s+\w'.format(value), line):
                    vote_lines.append(line)
                    break
    # Column-aware parser returns {name: vote_code}.
    votes = find_columns_and_parse(vote_lines)
    for name, vcode in votes.items():
        if name == 'Mr. Speaker':
            name = session_details[session]['speaker']
        elif name == 'Mr. President':
            name = session_details[session]['president']
        else:
            # Converts "Davis,William" to "Davis, William".
            name = re.sub(r'\,([a-zA-Z])', r', \1', name)
        if vcode == 'Y':
            yes_votes.append(name)
        elif vcode == 'N':
            no_votes.append(name)
        elif vcode == 'P':
            present_votes.append(name)
        elif vcode == 'E':
            excused_votes.append(name)
        elif vcode == 'NV':
            not_voting.append(name)
        elif vcode == 'A':
            absent_votes.append(name)
    # fake the counts
    if yes_count == 0 and no_count == 0 and present_count == 0:
        # No tally line found: fall back to counting parsed names.
        yes_count = len(yes_votes)
        no_count = len(no_votes)
    else:  # audit
        if yes_count != len(yes_votes):
            self.warning("Mismatched yes count [expect: %i] [have: %i]" %
                         (yes_count, len(yes_votes)))
            warned = True
        if no_count != len(no_votes):
            self.warning("Mismatched no count [expect: %i] [have: %i]" %
                         (no_count, len(no_votes)))
            warned = True
    if passed is None:
        if actor['classification'] == 'lower':
            # senate doesn't have these lines
            self.warning(
                "No pass/fail word found; fall back to comparing yes and no vote."
            )
            warned = True
        passed = 'pass' if yes_count > no_count else 'fail'
    classification, _ = _categorize_action(motion)
    vote_event = VoteEvent(legislative_session=session,
                           motion_text=motion,
                           classification=classification,
                           organization=actor,
                           start_date=date,
                           result=passed)
    for name in yes_votes:
        vote_event.yes(name)
    for name in no_votes:
        vote_event.no(name)
    for name in present_votes:
        vote_event.vote('other', name)
    for name in excused_votes:
        vote_event.vote('excused', name)
    for name in not_voting:
        vote_event.vote('not voting', name)
    for name in absent_votes:
        vote_event.vote('absent', name)
    vote_event.set_count('yes', yes_count)
    vote_event.set_count('no', no_count)
    # 'present' votes are recorded under the 'other' count bucket.
    vote_event.set_count('other', present_count)
    vote_event.set_count('excused', len(excused_votes))
    vote_event.set_count('absent', len(absent_votes))
    vote_event.set_count('not voting', len(not_voting))
    vote_event.add_source(href)
    # for distinguishing between votes with the same id and on same day
    vote_event.pupa_id = href
    if warned:
        self.warning("Warnings were issued. Best to check %s" % href)
    return vote_event
def scrape(self, session=None, chamber=None):
    """Yield Bill and VoteEvent objects from the Georgia SOAP services.

    ``session`` defaults to the jurisdiction's latest session; ``chamber``
    is accepted for interface compatibility but the chamber is actually
    derived per-bill from the document-type prefix.
    """
    # First letter of the bill prefix → bill classification suffix.
    bill_type_map = {
        "B": "bill",
        "R": "resolution",
        "JR": "joint resolution",
        "CR": "concurrent resolution",
    }
    # First letter of a document/action code → OCD chamber.
    chamber_map = {
        "H": "lower",
        "S": "upper",
        "J": "joint",
        "E": "legislature",  # Effective date
    }
    # Upstream action codes → OCD action classifications (None = no
    # classification assigned).
    action_code_map = {
        "HI": None,
        "SI": None,
        "HH": None,
        "SH": None,
        "HPF": ["introduction"],
        "HDSAS": None,
        "SPF": ["introduction"],
        "HSR": ["reading-2"],
        "SSR": ["reading-2"],
        "HFR": ["reading-1"],
        "SFR": ["reading-1"],
        "HRECM": ["withdrawal", "referral-committee"],
        "SRECM": ["withdrawal", "referral-committee"],
        "SW&C": ["withdrawal", "referral-committee"],
        "HW&C": ["withdrawal", "referral-committee"],
        "HRA": ["passage"],
        "SRA": ["passage"],
        "HPA": ["passage"],
        "HRECO": None,
        "SPA": ["passage"],
        "HTABL": None,  # 'House Tabled' - what is this?
        "SDHAS": None,
        "HCFR": ["committee-passage-favorable"],
        "SCFR": ["committee-passage-favorable"],
        "HRAR": ["referral-committee"],
        "SRAR": ["referral-committee"],
        "STR": ["reading-3"],
        "SAHAS": None,
        "SE": ["passage"],
        "SR": ["referral-committee"],
        "HTRL": ["reading-3", "failure"],
        "HTR": ["reading-3"],
        "S3RLT": ["reading-3", "failure"],
        "HASAS": None,
        "S3RPP": None,
        "STAB": None,
        "SRECO": None,
        "SAPPT": None,
        "HCA": None,
        "HNOM": None,
        "HTT": None,
        "STT": None,
        "SRECP": None,
        "SCRA": None,
        "SNOM": None,
        "S2R": ["reading-2"],
        "H2R": ["reading-2"],
        "SENG": ["passage"],
        "HENG": ["passage"],
        "HPOST": None,
        "HCAP": None,
        "SDSG": ["executive-signature"],
        "SSG": ["executive-receipt"],
        "Signed Gov": ["executive-signature"],
        "HDSG": ["executive-signature"],
        "HSG": ["executive-receipt"],
        "EFF": None,
        "HRP": None,
        "STH": None,
        "HTS": None,
    }
    if not session:
        session = self.latest_session()
        self.info("no session specified, using %s", session)
    sid = SESSION_SITE_IDS[session]
    # backoff() retries the flaky SOAP calls.
    legislation = backoff(self.lservice.GetLegislationForSession,
                          sid)["LegislationIndex"]
    for leg in legislation:
        lid = leg["Id"]
        instrument = backoff(self.lservice.GetLegislationDetail, lid)
        history = [x for x in instrument["StatusHistory"][0]]
        # Upstream history is newest-first; reverse to chronological.
        # NOTE: reversed() yields a one-shot iterator, consumed once below.
        actions = reversed([{
            "code": x["Code"],
            "action": x["Description"],
            "_guid": x["Id"],
            "date": x["Date"],
        } for x in history])
        guid = instrument["Id"]
        # A little bit hacky.
        # Chamber and type are both encoded in the prefix, e.g. "HB".
        bill_prefix = instrument["DocumentType"]
        bill_chamber = chamber_map[bill_prefix[0]]
        bill_type = bill_type_map[bill_prefix[1:]]
        bill_id = "%s %s" % (bill_prefix, instrument["Number"])
        if instrument["Suffix"]:
            bill_id += instrument["Suffix"]
        title = instrument["Caption"]
        description = instrument["Summary"]
        if title is None:
            # Bills without a caption are skipped entirely.
            continue
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_abstract(description, note="description")
        bill.extras = {"guid": guid}
        if instrument["Votes"]:
            for vote_ in instrument["Votes"]:
                # Each entry is a (label, payload) pair; we need payload.
                _, vote_ = vote_
                vote_ = backoff(self.vservice.GetVote, vote_[0]["VoteId"])
                vote = VoteEvent(
                    start_date=vote_["Date"].strftime("%Y-%m-%d"),
                    motion_text=vote_["Caption"] or "Vote on Bill",
                    chamber={
                        "House": "lower",
                        "Senate": "upper"
                    }[vote_["Branch"]],
                    result="pass"
                    if vote_["Yeas"] > vote_["Nays"] else "fail",
                    classification="passage",
                    bill=bill,
                )
                vote.set_count("yes", vote_["Yeas"])
                vote.set_count("no", vote_["Nays"])
                vote.set_count("other",
                               vote_["Excused"] + vote_["NotVoting"])
                vote.add_source(self.vsource)
                methods = {"Yea": "yes", "Nay": "no"}
                if vote_["Votes"] is not None:
                    for vdetail in vote_["Votes"][0]:
                        whom = vdetail["Member"]
                        how = vdetail["MemberVoted"]
                        if whom["Name"] == "VACANT":
                            continue
                        # Member names look like "Smith, John (23rd)";
                        # the regex splits name and district.
                        name, district = vote_name_pattern.search(
                            whom["Name"]).groups()
                        vote.vote(methods.get(how, "other"),
                                  name,
                                  note=district)
                yield vote
        # Map chamber → committee names for committee-related actions.
        ccommittees = defaultdict(list)
        committees = instrument["Committees"]
        if committees:
            for committee in committees[0]:
                ccommittees[{
                    "House": "lower",
                    "Senate": "upper"
                }[committee["Type"]]].append(committee["Name"])
        for action in actions:
            action_chamber = chamber_map[action["code"][0]]
            try:
                action_types = action_code_map[action["code"]]
            except KeyError:
                error_msg = ("Code {code} for action {action} not "
                             "recognized.".format(code=action["code"],
                                                  action=action["action"]))
                self.logger.warning(error_msg)
                action_types = None
            committees = []
            if action_types and any(
                    ("committee" in x for x in action_types)):
                committees = [
                    str(x) for x in ccommittees.get(action_chamber, [])
                ]
            act = bill.add_action(
                action["action"],
                action["date"].strftime("%Y-%m-%d"),
                classification=action_types,
                chamber=action_chamber,
            )
            for committee in committees:
                act.add_related_entity(committee, "organization")
            act.extras = {"code": action["code"], "guid": action["_guid"]}
        sponsors = []
        if instrument["Authors"]:
            sponsors = instrument["Authors"]["Sponsorship"]
        if "Sponsors" in instrument and instrument["Sponsors"]:
            sponsors += instrument["Sponsors"]["Sponsorship"]
        sponsors = [(x["Type"], self.get_member(x["MemberId"]))
                    for x in sponsors]
        for typ, sponsor in sponsors:
            name = "{First} {Last}".format(**dict(sponsor["Name"]))
            bill.add_sponsorship(
                name,
                entity_type="person",
                classification="primary" if "Author" in typ else "secondary",
                primary="Author" in typ,
            )
        for version in instrument["Versions"]["DocumentDescription"]:
            name, url, doc_id, version_id = [
                version[x]
                for x in ["Description", "Url", "Id", "Version"]
            ]
            link = bill.add_version_link(name,
                                         url,
                                         media_type="application/pdf")
            link["extras"] = {
                "_internal_document_id": doc_id,
                "_version_id": version_id,
            }
        bill.add_source(self.msource)
        bill.add_source(self.lsource)
        bill.add_source(
            SOURCE_URL.format(**{
                "session": session,
                "bid": guid
            }))
        yield bill
def parse_vote(self, bill, link):
    """Parse a Kansas roll-call page at *link* and yield a VoteEvent.

    Skips (returns without yielding) on fetch errors, rate-limit pages,
    and missing-page responses. Yields nothing when the page carries no
    vote-count headings.
    """
    # Server sometimes sends proper error headers,
    # sometimes not
    try:
        self.info("Get {}".format(link))
        text = requests.get(link).text
    except requests.exceptions.HTTPError as err:
        self.warning("{} fetching vote {}, skipping".format(err, link))
        return
    if 'Varnish cache server' in text:
        self.warning("Scrape rate is too high, try re-scraping with "
                     "The --rpm set to a lower number")
        return
    if 'Page Not Found' in text or 'Page Unavailable' in text:
        self.warning("missing vote, skipping")
        return
    member_doc = lxml.html.fromstring(text)
    motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
    # First h3 looks like: "<Chamber> ... <status words> ... <date>".
    chamber_date_line = ''.join(
        member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
    chamber_date_line_words = chamber_date_line.split()
    vote_chamber = chamber_date_line_words[0]
    vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                           '%m/%d/%Y')
    vote_status = " ".join(chamber_date_line_words[2:-2])
    # Subsequent h3 headings carry per-category counts, e.g. "Yeas (21)".
    opinions = member_doc.xpath(
        "//div[@id='main_content']/h3[position() > 1]/text()")
    if len(opinions) > 0:
        vote_status = vote_status if vote_status.strip() else motion[0]
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        # BUGFIX: initialize all counts so a missing category heading
        # (e.g. no "Absent" line) no longer raises UnboundLocalError
        # when building the VoteEvent below.
        yes_count = 0
        no_count = 0
        p_count = 0
        a_count = 0
        for i in opinions:
            try:
                count = int(i[i.find("(") + 1:i.find(")")])
            except ValueError:
                # This is likely not a vote-count text chunk
                # It's probably '`On roll call the vote was:`
                pass
            else:
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count
        vote = VoteEvent(
            bill=bill,
            start_date=vote_date.strftime('%Y-%m-%d'),
            chamber=vote_chamber,
            motion_text=vote_status,
            result='pass' if yes_count > no_count else 'fail',
            classification='passage',
        )
        vote.pupa_id = link  # contains vote id and bill identifier
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        # "present" is recorded as 'abstain' per upstream convention.
        vote.set_count('abstain', p_count)
        vote.set_count('absent', a_count)
        vote.add_source(link)
        # Voter names are listed as links in category order:
        # yeas first, then nays, then everyone else. Index 0 is skipped
        # (it is not a voter link).
        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for i in range(1, len(a_links)):
            if i <= yes_count:
                vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
            elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
            else:
                vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
        yield vote
    else:
        self.warning("No Votes for: %s", link)
def scrape_bill_type(
    self,
    chamber,
    session,
    bill_type,
    type_abbr,
    committee_abbr_regex=get_committee_name_regex(),
):
    """Yield Bill and VoteEvent objects for one CA bill type in *session*.

    Reads bills of ``measure_type == type_abbr`` from the local capublic
    mirror (SQLAlchemy session ``self.session``), builds versions, titles,
    sponsorships, actions (with committee-name expansion), and floor votes.
    """
    bills = (self.session.query(CABill).filter_by(
        session_year=session).filter_by(measure_type=type_abbr))
    for bill in bills:
        bill_session = session
        if bill.session_num != "0":
            bill_session += " Special Session %s" % bill.session_num
        bill_id = bill.short_bill_id
        fsbill = Bill(bill_id, session, title="", chamber=chamber)
        # Sanity check: Senate bills must be scraped as 'upper',
        # Assembly bills as 'lower'.
        if (bill_id.startswith("S")
                and chamber == "lower") or (bill_id.startswith("A")
                                            and chamber == "upper"):
            print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
            continue
        # # Construct session for web query, going from '20092010' to '0910'
        # source_session = session[2:4] + session[6:8]
        # # Turn 'AB 10' into 'ab_10'
        # source_num = "%s_%s" % (bill.measure_type.lower(),
        #                         bill.measure_num)
        # Construct a fake source url
        source_url = ("http://leginfo.legislature.ca.gov/faces/"
                      "billNavClient.xhtml?bill_id=%s") % bill.bill_id
        fsbill.add_source(source_url)
        fsbill.add_version_link(bill_id, source_url, media_type="text/html")
        title = ""
        type_ = ["bill"]
        subject = ""
        all_titles = set()
        # Get digest test (aka "summary") from latest version.
        if bill.versions:
            version = bill.versions[-1]
            nsmap = version.xml.nsmap
            xpath = "//caml:DigestText/xhtml:p"
            els = version.xml.xpath(xpath, namespaces=nsmap)
            chunks = []
            for el in els:
                t = etree_text_content(el)
                t = re.sub(r"\s+", " ", t)
                # Ensure a space after every closing paren.
                t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                chunks.append(t)
            summary = "\n\n".join(chunks)
        for version in bill.versions:
            if not version.bill_xml:
                continue
            version_date = self._tz.localize(
                version.bill_version_action_date)
            # create a version name to match the state's format
            # 02/06/17 - Enrolled
            version_date_human = version_date.strftime("%m/%d/%y")
            version_name = "{} - {}".format(version_date_human,
                                            version.bill_version_action)
            version_base = "https://leginfo.legislature.ca.gov/faces"
            version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                version_base, version.bill_id, version.bill_version_id)
            fsbill.add_version_link(
                version_name,
                version_url_pdf,
                media_type="application/pdf",
                date=version_date.date(),
            )
            # CA is inconsistent in that some bills have a short title
            # that is longer, more descriptive than title.
            if bill.measure_type in ("AB", "SB"):
                impact_clause = clean_title(version.title)
                title = clean_title(version.short_title)
            else:
                impact_clause = None
                if len(version.title) < len(
                        version.short_title) and not version.title.lower(
                        ).startswith("an act"):
                    title = clean_title(version.short_title)
                else:
                    title = clean_title(version.title)
            if title:
                all_titles.add(title)
            type_ = [bill_type]
            if version.appropriation == "Yes":
                type_.append("appropriation")
            tags = []
            if version.fiscal_committee == "Yes":
                tags.append("fiscal committee")
            if version.local_program == "Yes":
                tags.append("local program")
            if version.urgency == "Yes":
                tags.append("urgency")
            if version.taxlevy == "Yes":
                tags.append("tax levy")
            if version.subject:
                subject = clean_title(version.subject)
        if not title:
            self.warning("Couldn't find title for %s, skipping" % bill_id)
            continue
        fsbill.title = title
        if summary:
            fsbill.add_abstract(summary, note="summary")
        fsbill.classification = type_
        fsbill.subject = [subject] if subject else []
        fsbill.extras["impact_clause"] = impact_clause
        fsbill.extras["tags"] = tags
        # We don't want the current title in alternate_titles
        all_titles.remove(title)
        for title in all_titles:
            fsbill.add_title(title)
        for author in version.authors:
            fsbill.add_sponsorship(
                author.name,
                classification=SPONSOR_TYPES[author.contribution],
                primary=author.primary_author_flg == "Y",
                entity_type="person",
            )
            # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}
        seen_actions = set()
        for action in bill.actions:
            if not action.action:
                # NULL action text seems to be an error on CA's part,
                # unless it has some meaning I'm missing
                continue
            actor = action.actor or chamber
            actor = actor.strip()
            match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
            if match:
                actor = {
                    "Assembly": "lower",
                    "Senate": "upper"
                }[match.group(1)]
            elif actor.startswith("Governor"):
                actor = "executive"
            else:

                def replacer(matchobj):
                    if matchobj:
                        return {
                            "Assembly": "lower",
                            "Senate": "upper"
                        }[matchobj.group()]
                    else:
                        return matchobj.group()

                actor = re.sub(r"^(Assembly|Senate)", replacer, actor)
            type_ = []
            act_str = action.action
            act_str = re.sub(r"\s+", " ", act_str)
            attrs = self.categorizer.categorize(act_str)
            # Add in the committee strings of the related committees, if any.
            kwargs = attrs
            matched_abbrs = committee_abbr_regex.findall(action.action)
            if re.search(r"Com[s]?. on", action.action) and not matched_abbrs:
                msg = "Failed to extract committee abbr from %r."
                self.logger.warning(msg % action.action)
            if matched_abbrs:
                committees = []
                for abbr in matched_abbrs:
                    try:
                        name = self.committee_abbr_to_name(chamber, abbr)
                        committees.append(name)
                    except KeyError:
                        msg = ("Mapping contains no committee name for "
                               "abbreviation %r. Action text was %r.")
                        args = (abbr, action.action)
                        raise KeyError(msg % args)
                # BUGFIX: materialize the filter into a list. filter()
                # returns a one-shot iterator in Python 3, so the
                # len(list(...)) assert below used to exhaust it, leaving
                # the zip() empty (committee names were never substituted
                # into act_str) and kwargs["committees"] empty downstream.
                committees = list(filter(None, committees))
                kwargs["committees"] = committees
                code = re.search(r"C[SXZ]\d+", actor)
                if code is not None:
                    code = code.group()
                    kwargs["actor_info"] = {"committee_code": code}
                assert len(list(committees)) == len(matched_abbrs)
                for committee, abbr in zip(committees, matched_abbrs):
                    act_str = act_str.replace("Coms. on ", "")
                    act_str = act_str.replace("Com. on " + abbr, committee)
                    act_str = act_str.replace(abbr, committee)
                    if not act_str.endswith("."):
                        act_str = act_str + "."
            # Determine which chamber the action originated from.
            changed = False
            for committee_chamber in ["upper", "lower", "legislature"]:
                if actor.startswith(committee_chamber):
                    actor = committee_chamber
                    changed = True
                    break
            if not changed:
                actor = "legislature"
            if actor != action.actor:
                actor_info = kwargs.get("actor_info", {})
                actor_info["details"] = action.actor
                kwargs["actor_info"] = actor_info
            # Add strings for related legislators, if any.
            rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
            legislators = re.findall(rgx, action.action, re.I)
            if legislators:
                kwargs["legislators"] = legislators
            date = action.action_date
            date = self._tz.localize(date)
            date = date.date()
            if (actor, act_str, date) in seen_actions:
                continue
            # Re-categorize against the committee-expanded action string.
            kwargs.update(self.categorizer.categorize(act_str))
            action = fsbill.add_action(
                act_str,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=kwargs["classification"],
            )
            for committee in kwargs.get("committees", []):
                action.add_related_entity(committee,
                                          entity_type="organization")
            seen_actions.add((actor, act_str, date))
        for vote_num, vote in enumerate(bill.votes):
            if vote.vote_result == "(PASS)":
                result = True
            else:
                result = False
            if not vote.location:
                continue
            full_loc = vote.location.description
            first_part = full_loc.split(" ")[0].lower()
            if first_part in ["asm", "assembly"]:
                vote_chamber = "lower"
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            elif first_part.startswith("sen"):
                vote_chamber = "upper"
                # vote_location = ' '.join(full_loc.split(' ')[1:])
            else:
                raise ScrapeError("Bad location: %s" % full_loc)
            if vote.motion:
                motion = vote.motion.motion_text or ""
            else:
                motion = ""
            if "Third Reading" in motion or "3rd Reading" in motion:
                vtype = "passage"
            elif "Do Pass" in motion:
                vtype = "passage"
            else:
                vtype = "other"
            motion = motion.strip()
            # Why did it take until 2.7 to get a flags argument on re.sub?
            motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                re.IGNORECASE).sub("", motion)
            motion = re.compile(r"^(Senate|Assembly) ",
                                re.IGNORECASE).sub("", motion)
            motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? ", "",
                            motion)
            motion = re.sub(r" \(\w+\)$", "", motion)
            motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                            motion)
            motion = re.sub(
                r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                r"Urgency Clause$",
                "(Urgency Clause)",
                motion,
            )
            motion = re.sub(r"\s+", " ", motion)
            if not motion:
                self.warning("Got blank motion on vote for %s" % bill_id)
                continue
            # XXX this is responsible for all the CA 'committee' votes, not
            # sure if that's a feature or bug, so I'm leaving it as is...
            # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
            # org = {
            #     'name': vote_location,
            #     'classification': vote_classification
            # }
            fsvote = VoteEvent(
                motion_text=motion,
                start_date=self._tz.localize(vote.vote_date_time),
                result="pass" if result else "fail",
                classification=vtype,
                # organization=org,
                chamber=vote_chamber,
                bill=fsbill,
            )
            fsvote.extras = {"threshold": vote.threshold}
            source_url = ("http://leginfo.legislature.ca.gov/faces"
                          "/billVotesClient.xhtml?bill_id={}").format(
                              fsbill.identifier)
            fsvote.add_source(source_url)
            # for distinguishing between votes with the same id and on same day
            fsvote.pupa_id = source_url + "#" + str(vote_num)
            rc = {"yes": [], "no": [], "other": []}
            for record in vote.votes:
                if record.vote_code == "AYE":
                    rc["yes"].append(record.legislator_name)
                elif record.vote_code.startswith("NO"):
                    rc["no"].append(record.legislator_name)
                else:
                    rc["other"].append(record.legislator_name)
            # Handle duplicate votes
            for key in rc.keys():
                rc[key] = list(set(rc[key]))
            for key, voters in rc.items():
                for voter in voters:
                    fsvote.vote(key, voter)
                # Set counts by summed votes for accuracy
                fsvote.set_count(key, len(voters))
            yield fsvote
        yield fsbill
    self.session.expire_all()
def scrape(self, session=None):
    """Yield Bill and VoteEvent objects for a Vermont session.

    Pulls bill/resolution lists from the legislature's JSON endpoints,
    then, per bill, scrapes the status page for sponsors and versions
    and the detailed-status/roll-call endpoints for actions and votes.
    """
    HTML_TAGS_RE = r'<.*?>'
    if session is None:
        session = self.latest_session()
    # e.g. session "2017-2018" → year_slug "2018" — slug is the tail of
    # the session string.
    year_slug = session[5:]
    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])
    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])
    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}
        # Identify the bill type and chamber
        # NOTE: prefix checks must run longest-first (e.g. 'J.R.H.'
        # before 'H.') — order matters here.
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError("Unknown bill type found: '{}'".format(
                info['BillNumber']))
        # Create the bill using its basic information
        bill = Bill(identifier=info['BillNumber'],
                    legislative_session=session,
                    chamber=bill_chamber,
                    title=info['Title'],
                    classification=bill_type)
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)
        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)
        # Capture sponsors
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li')
        sponsor_type = 'primary'
        for sponsor in sponsors:
            # Everything after the "Additional Sponsors" marker row is a
            # cosponsor.
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue
            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the literal "Less" expander link.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(name=sponsor_name,
                                     classification=sponsor_type,
                                     entity_type='person',
                                     primary=(sponsor_type == 'primary'))
        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a')
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf')
        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')).group(1)
        except AttributeError:
            self.warning("Bill {} appears to have no activity".format(
                info['BillNumber']))
            yield bill
            continue
        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)
        chambers_passed = set()
        for action in actions:
            action = {k: v.strip() for k, v in action.items()}
            if "Signed by Governor" in action['FullStatus']:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")
            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # NOTE(review): assert used for data validation — it is
                # stripped under python -O; presumably intentional here.
                assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif actor == 'lower' and any(
                    x.lower().startswith('aspassed')
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(
                    x.lower().startswith(' aspassed')
                    or x.lower().startswith('aspassed')
                    for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None
            bill.add_action(description=re.sub(HTML_TAGS_RE, "",
                                               action['FullStatus']),
                            date=datetime.datetime.strftime(
                                datetime.datetime.strptime(
                                    action['StatusDate'], '%m/%d/%Y'),
                                '%Y-%m-%d'),
                            chamber=actor,
                            classification=action_type)
        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)
        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']
            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Names come as "Jane Smith of Burlington".
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()
                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)
            if "Passed -- " in vote['FullStatus']:
                did_pass = True
            elif ("Failed -- " in vote['FullStatus']
                  or 'Veto of the Governor sustained'
                  in vote['FullStatus']):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")
            # Check vote counts
            yea_count = int(
                re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(
                re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))
            vote_to_add = VoteEvent(
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                start_date=datetime.datetime.strftime(
                    datetime.datetime.strptime(vote['StatusDate'],
                                               '%m/%d/%Y'), '%Y-%m-%d'),
                motion_text=re.sub(HTML_TAGS_RE, "",
                                   vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
                bill=info['BillNumber'],
                bill_chamber=bill_chamber)
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))
            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)
            yield vote_to_add
        # Capture extra information- Not yet implemented
        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
        yield bill
def scrape(self, session=None):
    """Scrape Vermont bills, resolutions, actions, and roll-call votes.

    Pulls bill lists from the legislature's private JSON API for the given
    session (defaulting to the latest session), then yields Bill objects and
    any associated VoteEvent objects.
    """
    HTML_TAGS_RE = r'<.*?>'

    if session is None:
        session = self.latest_session()
    year_slug = self.jurisdiction.get_year_slug(session)

    # Load all bills and resolutions via the private API
    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills = json.loads(bills_json)['data'] or []

    bills_url = \
        'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
        format(year_slug)
    bills_json = self.get(bills_url).text
    bills.extend(json.loads(bills_json)['data'] or [])

    resolutions_url = \
        'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
        format(year_slug)
    resolutions_json = self.get(resolutions_url).text
    bills.extend(json.loads(resolutions_json)['data'] or [])

    # Parse the information from each bill
    for info in bills:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Identify the bill type and chamber from the identifier prefix.
        # Order matters: longer prefixes (e.g. 'J.R.H.') must be tested
        # before their shorter substrings (e.g. 'H.').
        if info['BillNumber'].startswith('J.R.H.'):
            bill_type = 'joint resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('J.R.S.'):
            bill_type = 'joint resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.C.R.'):
            bill_type = 'concurrent resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('H.R.'):
            bill_type = 'resolution'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.R.'):
            bill_type = 'resolution'
            bill_chamber = 'upper'
        elif info['BillNumber'].startswith('PR.'):
            bill_type = 'constitutional amendment'
            if info['Body'] == 'H':
                bill_chamber = 'lower'
            elif info['Body'] == 'S':
                bill_chamber = 'upper'
            else:
                raise AssertionError("Amendment not tied to chamber")
        elif info['BillNumber'].startswith('H.'):
            bill_type = 'bill'
            bill_chamber = 'lower'
        elif info['BillNumber'].startswith('S.'):
            bill_type = 'bill'
            bill_chamber = 'upper'
        else:
            raise AssertionError(
                "Unknown bill type found: '{}'".
                format(info['BillNumber'])
            )

        bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
        # put one space back in between type and number
        bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

        # Create the bill using its basic information
        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=info['Title'],
            classification=bill_type
        )
        if 'resolution' in bill_type:
            bill.add_source(resolutions_url)
        else:
            bill.add_source(bills_url)

        # Load the bill's information page to access its metadata
        bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
            format(year_slug, info['BillNumber'])
        doc = self.lxmlize(bill_url)
        bill.add_source(bill_url)

        # Capture sponsors. Sponsors before the 'Additional Sponsors'
        # marker are primary; the rest are cosponsors.
        sponsors = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
            'following-sibling::dd[1]/ul/li'
        )
        sponsor_type = 'primary'
        for sponsor in sponsors:
            if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                sponsor_type = 'cosponsor'
                continue
            sponsor_name = sponsor.xpath('a/text()')[0].\
                replace("Rep.", "").replace("Sen.", "").strip()
            # Skip the site's "Less" expander link masquerading as a name.
            if sponsor_name and not \
                    (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                bill.add_sponsorship(
                    name=sponsor_name,
                    classification=sponsor_type,
                    entity_type='person',
                    primary=(sponsor_type == 'primary')
                )

        # Capture bill text versions
        # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
        # so leave in the old and new positions
        versions = doc.xpath(
            '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
            'following-sibling::dd[1]/ul/li/a |'
            '//ul[@class="bill-path"]//a'
        )
        for version in versions:
            if version.xpath('text()'):
                bill.add_version_link(
                    note=version.xpath('text()')[0],
                    url=version.xpath('@href')[0].replace(' ', '%20'),
                    media_type='application/pdf'
                )

        # Identify the internal bill ID, used for actions and votes
        # If there is no internal bill ID, then it has no extra information
        try:
            internal_bill_id = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            ).group(1)
        except AttributeError:
            # re.search returned None -- no detailed-status link on the page.
            self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
            yield bill
            continue

        # Capture actions
        actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
            format(year_slug, internal_bill_id)
        actions_json = self.get(actions_url).text
        actions = json.loads(actions_json)['data']
        bill.add_source(actions_url)

        chambers_passed = set()
        for action in actions:
            # Drop null-valued keys so string tests below don't explode.
            action = {k: v for k, v in action.items() if v is not None}

            if "Signed by Governor" in action['FullStatus']:
                actor = 'executive'
            elif action['ChamberCode'] == 'H':
                actor = 'lower'
            elif action['ChamberCode'] == 'S':
                actor = 'upper'
            else:
                raise AssertionError("Unknown actor for bill action")

            # Categorize action
            if "Signed by Governor" in action['FullStatus']:
                # assert chambers_passed == set("HS")
                action_type = 'executive-signature'
            elif "Vetoed by the Governor" in action['FullStatus']:
                action_type = 'executive-veto'
            elif "Read first time" in action['FullStatus'] \
                    or "Read 1st time" in action['FullStatus']:
                action_type = 'introduction'
            elif "Reported favorably" in action['FullStatus']:
                action_type = 'committee-passage-favorable'
            elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                          for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("H")
            elif actor == 'upper' and any(x.lower().startswith(' aspassed') or
                                          x.lower().startswith('aspassed')
                                          for x in action['keywords'].split(';')):
                action_type = 'passage'
                chambers_passed.add("S")
            else:
                action_type = None

            bill.add_action(
                description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                date=datetime.datetime.strftime(
                    datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                chamber=actor,
                classification=action_type
            )

        # Capture votes
        votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
            year_slug, internal_bill_id)
        votes_json = self.get(votes_url).text
        votes = json.loads(votes_json)['data']
        bill.add_source(votes_url)

        for vote in votes:
            roll_call_id = vote['VoteHeaderID']
            roll_call_url = ('http://legislature.vermont.gov/bill/'
                             'loadBillRollCallDetails/{0}/{1}'.format(
                                 year_slug, roll_call_id))
            roll_call_json = self.get(roll_call_url).text
            roll_call = json.loads(roll_call_json)['data']

            roll_call_yea = []
            roll_call_nay = []
            roll_call_not_voting = []
            for member in roll_call:
                # Member names come as "Name of District".
                (member_name, _district) = member['MemberName'].split(" of ")
                member_name = member_name.strip()
                if member['MemberVote'] == "Yea":
                    roll_call_yea.append(member_name)
                elif member['MemberVote'] == "Nay":
                    roll_call_nay.append(member_name)
                else:
                    roll_call_not_voting.append(member_name)

            if ("Passed -- " in vote['FullStatus'] or
                    "Veto of Governor overridden" in vote['FullStatus']):
                did_pass = True
            elif ("Failed -- " in vote['FullStatus'] or
                    'Veto of the Governor sustained' in vote['FullStatus']):
                did_pass = False
            else:
                raise AssertionError("Roll call vote result is unclear")

            # Check vote counts
            yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
            nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

            vote_to_add = VoteEvent(
                bill=bill,
                chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                start_date=datetime.datetime.strftime(
                    datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                    '%Y-%m-%d'
                ),
                motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                result='pass' if did_pass else 'fail',
                classification='passage',
                legislative_session=session,
            )
            vote_to_add.add_source(roll_call_url)
            vote_to_add.set_count('yes', yea_count)
            vote_to_add.set_count('no', nay_count)
            vote_to_add.set_count('not voting', len(roll_call_not_voting))

            for member in roll_call_yea:
                vote_to_add.yes(member)
            for member in roll_call_nay:
                vote_to_add.no(member)
            for member in roll_call_not_voting:
                vote_to_add.vote('not voting', member)

            yield vote_to_add

        # Capture extra information- Not yet implemented
        # Witnesses:
        # http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
        # Conference committee members:
        # http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
        # Committee meetings:
        # http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

        yield bill
def scrape_vote(self, bill, vote_url, chamber, date):
    """Scrape one Colorado vote-summary page and yield a VoteEvent.

    Skips pages with a broken summary or a withdrawn motion.
    """
    page = self.lxmlize(vote_url)

    try:
        motion = page.xpath("//font/text()")[2]
    except IndexError:
        self.warning("Vote Summary Page Broken ")
        return

    # Some pages put a timestamp where the motion text should be;
    # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
    if ("AM" in motion or "PM" in motion) and "/" in motion:
        motion = "Motion not given."

    if "withdrawn" not in motion:
        # Count cells appear in fixed positions relative to labeled
        # sibling cells: [0]=Aye, [2]=No for the 'Aye' row; [0]=Absent,
        # [2]=Excused for the 'Absent' row.
        yes_no_counts = page.xpath(
            "//tr/td[preceding-sibling::td/descendant::"
            "font[contains(text(),'Aye')]]/font/text()"
        )
        other_counts = page.xpath(
            "//tr/td[preceding-sibling::td/descendant::"
            "font[contains(text(),'Absent')]]/font/text()"
        )
        abstain_counts = page.xpath(
            "//tr/td[preceding-sibling::td/descendant::"
            "font[contains(text(),'17C')]]/font/text()"
        )
        yes_count = int(yes_no_counts[0])
        no_count = int(yes_no_counts[2])
        exc_count = int(other_counts[2])
        absent_count = int(other_counts[0])
        abstain_count = 0
        if abstain_counts:
            abstain_count = int(abstain_counts[0])

        # fix for
        # http://leg.colorado.gov/content/hb19-1029vote65e72e
        if absent_count == -1:
            absent_count = 0

        passed = yes_count > no_count
        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            bill=bill,
            classification="passage",
        )
        vote.pupa_id = vote_url
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", exc_count)
        vote.set_count("absent", absent_count)
        vote.set_count("abstain", abstain_count)
        vote.add_source(vote_url)

        rolls = page.xpath(
            "//tr[preceding-sibling::tr/descendant::"
            "td/div/b/font[contains(text(),'Vote')]]"
        )
        # Map the page's one-letter vote codes to pupa vote options.
        vote_abrv = {
            "Y": "yes",
            "N": "no",
            "E": "excused",
            "A": "absent",
            "-": "absent",
            "17C": "abstain",
        }
        for roll in rolls:
            if len(roll.xpath(".//td/div/font/text()")) > 0:
                voted = roll.xpath(".//td/div/font/text()")[0].strip()
                voter = roll.xpath(".//td/font/text()")[0].strip()
                # "V" marks the vacant-seat placeholder row.
                if voted == "V":
                    continue
                vote.vote(vote_abrv[voted], voter)
        yield vote
def scrape(self):
    """Scrape Chicago City Council legislation from Legistar.

    Yields Bill objects (with sponsors, subjects, attachments, and actions)
    and VoteEvent objects for actions that carry roll-call details.
    Bills whose detail pages are unreachable are yielded bare and their
    URLs collected in ``unreachable_urls`` (printed at the end).
    """
    unreachable_urls = []
    for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)):
        title = leg_summary['Title'].strip()
        if not title or not leg_summary['Intro\xa0Date']:
            continue
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            # doesn't have an intro date

        bill_type = BILL_TYPES[leg_summary['Type']]
        bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
        bill = Bill(identifier=leg_summary['Record #'],
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Chicago City Council"})

        bill.add_source(leg_summary['url'])

        try:
            leg_details = self.legDetails(leg_summary['url'])
        except IndexError:
            unreachable_urls.append(leg_summary['url'])
            yield bill
            continue

        for related_bill in leg_details.get('Related files', []):
            lower_title = title.lower()
            # BUG FIX: the omnibus check previously tested the original-case
            # `title`, so capitalized "Sundry"/"Miscellaneous" titles were
            # silently skipped. Compare against the lowercased title instead.
            if "sundry" in lower_title or "miscellaneous" in lower_title:
                # these are omnibus
                bill.add_related_bill(identifier=related_bill['label'],
                                      legislative_session=bill.legislative_session,
                                      relation_type='replaces')
            # for now we're skipping related bills if they
            # don't contain words that make us think they're
            # in an omnibus relationship with each other

        for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
            # The first listed sponsor is the primary sponsor.
            if i == 0:
                primary = True
                sponsorship_type = "Primary"
            else:
                primary = False
                sponsorship_type = "Regular"

            sponsor_name = sponsor['label']

            # Does the Mayor/Clerk introduce legisislation as
            # individuals role holders or as the OFfice of City
            # Clerk and the Office of the Mayor?
            entity_type = 'person'
            if sponsor_name.startswith(('City Clerk',
                                        'Mendoza, Susana')):
                sponsor_name = 'Office of the City Clerk'
                entity_type = 'organization'
            elif sponsor_name.startswith(('Emanuel, Rahm',)):
                sponsor_name = 'Office of the Mayor'
                entity_type = 'organization'

            if not sponsor_name.startswith(('Misc. Transmittal',
                                            'No Sponsor',
                                            'Dept./Agency')):
                bill.add_sponsorship(sponsor_name,
                                     sponsorship_type,
                                     entity_type,
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor_name))

        if 'Topic' in leg_details:
            for subject in leg_details[u'Topic'].split(','):
                bill.add_subject(subject)

        for attachment in leg_details.get('Attachments', []):
            if attachment['label']:
                bill.add_version_link(attachment['label'],
                                      attachment['url'],
                                      media_type="application/pdf")

        for action in self.history(leg_summary['url']):
            action_description = action['Action']
            try:
                action_date = self.toTime(action['Date']).date().isoformat()
            except AttributeError:
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                continue

            if action_description:
                try:
                    responsible_org = action['Action\xa0By']['label']
                except TypeError:
                    responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'Chicago City Council'

                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=ACTION_CLASSIFICATION[action_description])

                if action_description == 'Referred':
                    # The controlling body may be a single dict or a list.
                    try:
                        leg_details['Current Controlling Legislative Body']['label']
                        controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                    except TypeError:
                        controlling_bodies = leg_details['Current Controlling Legislative Body']
                    if controlling_bodies:
                        for controlling_body in controlling_bodies:
                            body_name = controlling_body['label']
                            if body_name.startswith("Joint Committee"):
                                act.add_related_entity(body_name, 'organization')
                            else:
                                act.add_related_entity(body_name,
                                                       'organization',
                                                       entity_id=_make_pseudo_id(name=body_name))

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    result, votes = self.extractVotes(action_detail_url)

                    if votes and result:
                        # see https://github.com/datamade/municipal-scrapers-us/issues/15
                        action_vote = VoteEvent(legislative_session=bill.legislative_session,
                                                motion_text=action_description,
                                                organization={'name': responsible_org},
                                                classification=None,
                                                start_date=action_date,
                                                result=result,
                                                bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

        bill.extras = {'local_classification': leg_summary['Type']}

        yield bill
    print(unreachable_urls)
def process_vote(self, vote, bill, member_ids):
    """Process one vote record, yielding a VoteEvent and recording a bill action.

    :param vote: dict parsed from the legislature's JSON (motion, result,
        member votes, and possibly an attached document)
    :param bill: the Bill the vote belongs to
    :param member_ids: mapping of numeric member ID -> member name
    :returns: the constructed VoteEvent, or None if the record is unusable

    Fixes over the previous version:
    - ``status`` is now always 'pass'/'fail' (the postponed/tabled branch
      used to set the bool ``True``, which leaked into VoteEvent.result).
    - amendment classification used ``if status:`` which is truthy for both
      'pass' and 'fail', so 'amendment-failure' and 'amendment-deferral'
      were unreachable; the branches are now ordered and compared correctly.
    - ``if "amendment" in t`` raised TypeError when ``t`` was None; the
      post-action bookkeeping is now guarded.
    """
    try:
        motion = vote["ReadingDescription"]
    except KeyError:
        self.logger.warning(
            "Can't even figure out what we're voting on. Skipping.")
        return

    if "VoteResult" not in vote:
        if "postponed" in motion.lower():
            result = "Postponed"
            # the motion (to postpone) carried, even though the underlying
            # question did not -- record the vote as passing
            status = 'pass'
        elif "tabled" in motion.lower():
            result = "Tabled"
            status = 'pass'
        else:
            self.logger.warning("Could not find result of vote, skipping.")
            return
    else:
        result = vote["VoteResult"].strip().lower()
        statuses = {
            "approved": 'pass',
            "disapproved": 'fail',
            "failed": 'fail',
            "declined": 'fail',
            "passed": 'pass'
        }
        try:
            status = statuses[result]
        except KeyError:
            self.logger.warning(
                "Unexpected vote result '{result},' skipping vote.".format(
                    result=result))
            return

    date = self.date_format(vote["DateOfVote"])

    leg_votes = vote["MemberVotes"]
    v = VoteEvent(chamber='legislature',
                  start_date=date,
                  motion_text=motion,
                  result=status,
                  classification='passage',
                  bill=bill)
    yes_count = no_count = other_count = 0
    for leg_vote in leg_votes:
        mem_name = member_ids[int(leg_vote["MemberId"])]
        if leg_vote["Vote"] == "1":
            yes_count += 1
            v.yes(mem_name)
        elif leg_vote["Vote"] == "2":
            no_count += 1
            v.no(mem_name)
        else:
            other_count += 1
            v.vote('other', mem_name)

    v.set_count('yes', yes_count)
    v.set_count('no', no_count)
    v.set_count('other', other_count)

    # the documents for the readings are inside the vote
    # level in the json, so we'll deal with them here
    # and also add relevant actions
    if "amendment" in motion.lower():
        if result in ("Tabled", "Postponed"):
            # deferral must be tested first: in those branches status is
            # 'pass' (the procedural motion carried)
            t = "amendment-deferral"
        elif status == 'pass':
            t = "amendment-passage"
        else:
            t = "amendment-failure"
    elif "first reading" in motion.lower() or "1st reading" in motion.lower():
        t = "reading-1"
    elif "second reading" in motion.lower() or "2nd reading" in motion.lower():
        t = "reading-2"
    elif ("third reading" in motion.lower() or
          "3rd reading" in motion.lower() or
          "final reading" in motion.lower()):
        t = "reading-3"
    else:
        t = None

    bill.add_action(motion, date, classification=t)

    # Guard: t may legitimately be None (uncategorized motions).
    if t is not None:
        if "amendment" in t:
            vote["type"] = "amendment"
        elif "reading" in t:
            vote["type"] = t.replace("bill:", "")

    # some documents/versions are hiding in votes.
    if "AttachmentPath" in vote:
        is_version = False
        try:
            if vote["DocumentType"] in [
                "enrollment", "engrossment", "introduction"
            ]:
                is_version = True
        except KeyError:
            pass
        if motion in ["enrollment", "engrossment", "introduction"]:
            is_version = True

        self.add_documents(vote["AttachmentPath"], bill, is_version)

    return v
def _scrape_upper_chamber(self, session):
    """Scrape Missouri Senate roll-call votes from journal PDFs.

    Downloads every journal linked from the session's journal-list page,
    converts each PDF to text, and walks it line by line as a small state
    machine (in_vote flag, running per-category name lists and counts),
    yielding a VoteEvent for each completed roll call.
    """
    if int(session[:4]) >= 2016:
        if len(session) == 4:
            # regular session
            url = 'http://www.senate.mo.gov/%sinfo/jrnlist/default.aspx' % (
                session[-2:], )
        else:
            # special session
            url = 'http://www.senate.mo.gov/%sinfo/jrnlist/%sJournals.aspx' % (
                session[-4:-2], session[-2:])
    else:
        url = 'http://www.senate.mo.gov/%sinfo/jrnlist/journals.aspx' % (
            session[-2:])

    # Journal section headers mapped to pupa vote options.
    vote_types = {
        'YEAS': 'yes',
        'NAYS': 'no',
        'Absent with leave': 'other',
        'Absent': 'other',
        'Vacancies': 'other',
    }

    page = self.lxmlize(url)
    journs = page.xpath("//table")[0].xpath(".//a")
    for a in journs:
        pdf_url = a.attrib['href']
        data = self._get_pdf(pdf_url).decode()
        lines = data.split("\n")

        # Parser state for this journal.
        in_vote = False
        cur_date = None
        vote_type = 'other'
        cur_bill = ''
        cur_motion = ''
        bc = None
        vote = {}          # option -> list of voter names
        counts = collections.defaultdict(int)

        for line in lines:
            line = line.strip()

            # The first date-looking line fixes the journal's date.
            if cur_date is None:
                matches = re.findall(date_re, line)
                if matches != []:
                    date = matches[0]
                    date = "%s, %s %s, %s" % date
                    date = dt.datetime.strptime(date, "%A, %B %d, %Y")
                    cur_date = date

            # A motion line containing "vote" opens a roll-call block and
            # identifies the bill and its chamber from the bill prefix.
            matches = re.findall(motion_re, line)
            if matches != []:
                cont = False
                for x in matches:
                    if "vote" in x.lower():
                        cur_motion = x
                        bill = re.findall(bill_re, x)
                        if bill != []:
                            bc = {
                                'H': 'lower',
                                'S': 'upper',
                                'J': 'legislature'
                            }[bill[0][0]]
                            cur_bill = "%s%s%s %s" % bill[0]
                        in_vote = True
                        cont = True
                if cont:
                    continue

            if in_vote:
                # End of the roll call: emit the VoteEvent and reset.
                if is_vote_end(line):
                    in_vote = False
                    yes, no, other = counts['yes'], counts['no'], counts[
                        'other']
                    if bc is None:
                        continue
                    v = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        motion_text=cur_motion,
                        result='pass' if yes > no else 'fail',
                        legislative_session=session,
                        classification='passage',
                        bill=cur_bill,
                        bill_chamber=bc,
                    )
                    v.add_source(url)
                    v.add_source(pdf_url)
                    v.set_count('yes', yes)
                    v.set_count('no', no)
                    v.set_count('other', other)
                    for key in vote:
                        for person in vote[key]:
                            v.vote(key, person)
                    yield v
                    vote = {}
                    counts = collections.defaultdict(int)
                    continue

                # Skip running page headers inside the vote block.
                if "Journal of the Senate" in line:
                    continue
                if re.match(
                        r".*(Monday|Tuesday|Wednesday|Thursday|Friday|"
                        "Saturday|Sunday), .* \d+, \d+.*", line):
                    continue

                # Detect a category header ("YEAS--Senators ..."), possibly
                # with names on the same line (captured in rl).
                found = False
                rl = None
                for vote_type in list(vote_types):
                    if line.lower().startswith(vote_type.lower()):
                        if "none" in line.lower():
                            continue
                        if "Senator" in line and "Senators" not in line:
                            line = self._clean_line(line)
                            line = line[len(vote_type):]
                            line = line.replace("-Senator ", "")
                            rl = line
                        vote_category = vote_types[vote_type]
                        found = True
                        if vote_category not in vote:
                            vote[vote_category] = []
                if found and rl is None:
                    continue
                elif rl:
                    line = rl

                # NOTE(review): if a name line is seen before any category
                # header, `vote_category` is unbound here -- presumably the
                # journals always emit a header first; confirm against data.
                names = [self._clean_line(x) for x in line.strip().split()]
                if names == []:
                    continue

                # A trailing "Name-<count>" token carries the tally.
                lname = names[-1]
                lname = lname.rsplit("-", 1)
                if len(lname) > 1:
                    person, count = lname
                    if count.isdigit() is False:
                        continue
                    names.pop(-1)
                    names.append(person)
                    counts[vote_category] += int(count)

                for name in names:
                    vote[vote_category].append(name)
def scrape(self, window=28, matter_ids=None):
    '''By default, scrape board reports updated in the last 28 days.
    Optionally specify a larger or smaller window of time from which to
    scrape updates, or specific matters to scrape.
    Note that passing a value for :matter_ids supercedes the value of
    :window, such that the given matters will be scraped regardless of
    when they were updated.

    Optional parameters
    :window (numeric) - Amount of time for which to scrape updates, e.g.
    a window of 7 will scrape legislation updated in the last week. Pass
    a window of 0 to scrape all legislation.
    :matter_ids (str) - Comma-separated list of matter IDs to scrape
    '''
    if matter_ids:
        matters = [self.matter(matter_id) for matter_id in matter_ids.split(',')]
        matters = filter(None, matters)  # Skip matters that are not yet in Legistar
    elif float(window):  # Support for partial days, i.e., window=0.15
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        matters = self.matters(n_days_ago)
    else:
        # Scrape all matters, including those without a last-modified date
        matters = self.matters()

    # NOTE(review): this recomputation of n_days_ago appears unused below --
    # presumably left over from a refactor; confirm before removing.
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))

    for matter in matters:
        # Skip this bill, until Metro cleans up duplicate in Legistar API
        if matter['MatterFile'] == '2017-0447':
            continue

        matter_id = matter['MatterId']
        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        # Do not scrape private bills introduced before this timestamp.
        if self._is_restricted(matter) and (date < self.START_DATE_PRIVATE_SCRAPE):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        # A leading 'S' marks a substitute; keep the original as an
        # alternate identifier and strip the prefix for the primary one.
        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        # The Metro scraper scrapes private bills.
        # However, we do not want to capture significant data about private bills,
        # other than the value of the helper function `_is_restricted` and a last modified timestamp.
        # We yield private bills early, wipe data from previously imported once-public bills,
        # and include only data *required* by the pupa schema.
        # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
        bill.extras = {'restrict_view': self._is_restricted(matter)}

        # Add API source early.
        # Private bills should have this url for debugging.
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
        bill.add_source(legistar_api, note='api')

        if self._is_restricted(matter):
            # required fields
            bill.title = 'Restricted View'

            # wipe old data
            bill.extras['plain_text'] = ''
            bill.extras['rtf_text'] = ''
            bill.sponsorships = []
            bill.related_bills = []
            bill.versions = []
            bill.documents = []
            bill.actions = []

            yield bill
            continue

        legistar_web = matter['legistar_url']
        bill.add_source(legistar_web, note='web')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(legislative_session=bill.legislative_session,
                                       motion_text=action['description'],
                                       organization=action['organization'],
                                       classification=None,
                                       start_date=action['date'],
                                       result=result,
                                       bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    try:
                        raw_option = vote['VoteValueName'].lower()
                    except AttributeError:
                        raw_option = None
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill.
                # Then, we can find the 'MatterFile' (i.e., identifier) and the
                # 'MatterIntroDate' (i.e., to determine its legislative session).
                # Sometimes, the related bill does not yet exist: in this case,
                # throw an error, and continue.
                related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')
                # Currently, the relation type for bills can be one of a few possibilites:
                # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(attachment['MatterAttachmentName'],
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type="application/pdf")

        bill.extras['local_classification'] = matter['MatterTypeName']

        matter_version_value = matter['MatterVersion']
        text = self.text(matter_id, matter_version_value)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                # Strip NUL characters, which the database layer rejects.
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
    """Process Indiana roll-call PDFs and yield a VoteEvent for each.

    Downloads each roll call through the proxy, converts the PDF to text,
    and parses the fixed-position header lines (chamber, date, result,
    motion, counts) followed by per-option voter name sections.
    """
    # Result keywords (upper-cased) mapped to pass/fail.
    result_types = {
        'FAILED': False,
        'DEFEATED': False,
        'PREVAILED': True,
        'PASSED': True,
        'SUSTAINED': True,
        'NOT SECONDED': False,
        'OVERRIDDEN': True,
        'ADOPTED': True,
    }

    for r in rollcalls:
        proxy_link = proxy["url"] + r["link"]

        (path, resp) = self.urlretrieve(proxy_link)
        text = convert_pdf(path, 'text').decode("utf-8")
        lines = text.split("\n")
        os.remove(path)

        # Line 0 names the chamber; lines 1-2 carry the date and time.
        chamber = "lower" if "house of representatives" in lines[0].lower() else "upper"
        date_parts = lines[1].strip().split()[-3:]
        date_str = " ".join(date_parts).title() + " " + lines[2].strip()
        vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
        vote_date = pytz.timezone('America/Indiana/Indianapolis').localize(vote_date)
        vote_date = vote_date.isoformat()

        passed = None
        for res, val in result_types.items():
            # We check multiple lines now because the result of the
            # roll call vote as parsed can potentially be split.
            # PDF documents suck.
            # NOTE(review): the `break` only exits the inner loop, so if
            # several keywords match, the last dict entry checked wins.
            for line in lines[3:5]:
                if res in line.upper():
                    passed = val
                    break

        if passed is None:
            raise AssertionError("Missing bill passage type")

        # Line 4 is "<motion text> <label> <count>"; drop the last two
        # tokens for the motion, and read the counts from lines 4-7.
        motion = " ".join(lines[4].split()[:-2])
        try:
            yeas = int(lines[4].split()[-1])
            nays = int(lines[5].split()[-1])
            excused = int(lines[6].split()[-1])
            not_voting = int(lines[7].split()[-1])
        except ValueError:
            self.logger.warning("Vote format is weird, skipping")
            continue

        vote = VoteEvent(chamber=chamber,
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=original_chamber,
                         start_date=vote_date,
                         motion_text=motion,
                         result="pass" if passed else "fail",
                         classification="passage")

        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('excused', excused)
        vote.set_count('not voting', not_voting)
        vote.add_source(proxy_link)

        currently_counting = ""

        possible_vote_lines = lines[8:]
        for l in possible_vote_lines:
            # Normalize mojibake non-breaking spaces from PDF extraction.
            l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
            l = l.replace("\xc2\xa0", " -")
            if "yea-" in l.lower().replace(" ", ""):
                currently_counting = "yes"
            elif "nay-" in l.lower().replace(" ", ""):
                currently_counting = "no"
            elif "excused-" in l.lower().replace(" ", ""):
                currently_counting = "excused"
            elif "notvoting-" in l.lower().replace(" ", ""):
                currently_counting = "not voting"
            elif currently_counting == "":
                # Not inside any section yet; ignore the line.
                pass
            elif re.search(r'v\. \d\.\d', l):
                # this gets rid of the version number
                # which is often found at the bottom of the doc
                pass
            else:
                voters = l.split(" ")
                for v in voters:
                    if v.strip():
                        vote.vote(currently_counting, v.strip())

        yield vote
def scrape_vote(self, bill, date, url):
    """Scrape one South Dakota vote page and yield a VoteEvent.

    :param bill: the Bill the vote belongs to
    :param date: the vote's start date
    :param url: the vote page URL (unique per roll call)

    Returns without yielding when the page reports "No Bill Action" or no
    motion text can be extracted from the header.

    Fixes over the previous version: the local ``type`` no longer shadows
    the builtin, and empty roll-call cells (``td.text is None``) no longer
    raise AttributeError.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)

    # Header looks like "<bill>, <location>, <motion...>".
    header = page.xpath("string(//h3[contains(@id, 'hdVote')])")

    if "No Bill Action" in header:
        self.warning("bad vote header -- skipping")
        return
    location = header.split(", ")[1]

    if location.startswith("House"):
        chamber = "lower"
    elif location.startswith("Senate"):
        chamber = "upper"
    elif location.startswith("Joint"):
        chamber = "legislature"
    else:
        raise ScrapeError("Bad chamber: %s" % location)

    motion = ", ".join(header.split(", ")[2:]).strip()
    if motion:
        # If we can't detect a motion, skip this vote
        yes_count = int(
            page.xpath("string(//span[contains(@id, 'tdAyes')])"))
        no_count = int(
            page.xpath("string(//span[contains(@id, 'tdNays')])"))
        excused_count = int(
            page.xpath("string(//span[contains(@id, 'tdExcused')])"))
        absent_count = int(
            page.xpath("string(//span[contains(@id, 'tdAbsent')])"))

        passed = yes_count > no_count

        # Classify the motion (renamed from `type`, which shadowed the builtin).
        if motion.startswith("Do Pass"):
            vote_type = "passage"
        elif motion == "Concurred in amendments":
            vote_type = "amendment"
        elif motion == "Veto override":
            vote_type = "veto_override"
        else:
            vote_type = "other"

        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            motion_text=motion,
            result="pass" if passed else "fail",
            classification=vote_type,
            bill=bill,
        )
        # The vote page URL has a unique ID
        # However, some votes are "consent calendar" events,
        # and relate to the passage of _multiple_ bills
        # These can't be modeled yet in Pupa, but for now we can
        # append a bill ID to the URL that forms the `pupa_id`
        # https://github.com/opencivicdata/pupa/issues/308
        vote.pupa_id = "{}#{}".format(url, bill.identifier.replace(" ", ""))

        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)

        for td in page.xpath("//table[@id='tblVoteTotals']/tbody/tr/td"):
            # Empty cells have td.text == None; treat them as blank instead
            # of crashing on .strip().
            option_or_person = (td.text or "").strip()
            if option_or_person in ("Aye", "Yea"):
                vote.yes(td.getprevious().text.strip())
            elif option_or_person == "Nay":
                vote.no(td.getprevious().text.strip())
            elif option_or_person == "Excused":
                vote.vote("excused", td.getprevious().text.strip())
            elif option_or_person == "Absent":
                vote.vote("absent", td.getprevious().text.strip())

        yield vote