def test_vote_event_identifier_dedupe():
    """Re-importing by identifier yields insert, noop, update; a new identifier inserts."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    scraped = ScrapeVoteEvent(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        motion_text='a vote on something',
        identifier='Roll Call No. 1',
    )
    mock_importer = DumbMockImporter()
    bill_importer = BillImporter('jid', mock_importer, mock_importer)

    def run_import(item):
        # Build a fresh importer for every pass and report only the action taken.
        importer = VoteEventImporter('jid', mock_importer, mock_importer, bill_importer)
        _, action = importer.import_item(item.as_dict())
        return action

    # first sighting of the vote event
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    assert run_import(scraped) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    scraped.result = 'failed'
    assert run_import(scraped) == 'update'
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    scraped.identifier = 'Roll Call 2'
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 2
def scrape_senate_vote(self, bill, url, date):
    """Download a senate roll-call PDF and yield the vote parsed from it.

    Args:
        bill: passed through to ``Vote(bill=...)`` and to the 3-column parser.
        url: location of the vote PDF; also recorded as the vote's source.
        date: object with ``strftime``; formatted as ``%Y-%m-%d`` for the
            vote's start date.

    Yields:
        The populated ``Vote``, or whatever ``scrape_senate_vote_3col``
        yields when the text contains the ``Yea: n  Nay: n  Absent: n``
        summary line. Nothing is yielded when the download fails with
        ``scrapelib.HTTPError`` (a warning is logged instead).
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = Vote(
        chamber='upper',
        start_date=date.strftime("%Y-%m-%d"),
        motion_text='Passage',
        # setting 'fail' for now.
        result='fail',
        classification='passage',
        bill=bill
    )
    vote.add_source(url)

    # Remove the downloaded file even when the PDF conversion blows up.
    try:
        text = convert_pdf(filename, 'text').decode('utf-8')
    finally:
        os.remove(filename)

    if re.search(r'Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Splitting on a capturing group keeps the labels in the result. The list
    # is reversed so pop() hands the pieces back in document order.
    # filter() is lazy on Python 3 (no pop(), always truthy), so build a
    # real list here.
    # NOTE(review): the label/names pairing below assumes the text starts
    # with a label (empty preamble) -- confirm against real PDFs.
    pieces = [
        piece
        for piece in reversed(re.split(r'(Yea|Nay|Absent)s?:', text))
        if piece
    ]

    keymap = dict(yea='yes', nay='no')
    actual_vote = collections.defaultdict(int)
    vote_count = {
        'yes': 0,
        'no': 0,
        'other': 0
    }

    while pieces:
        vote_val = pieces.pop()
        # Anything that is not Yea/Nay is tallied as 'other'.
        key = keymap.get(vote_val.lower(), 'other')
        values = pieces.pop()
        for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
            if name.lower().strip() == 'none.':
                continue
            name = name.replace('..', '')
            name = re.sub(r'\.$', '', name)
            # Drop stray counts, dashes and whitespace around the name.
            name = name.strip('-1234567890 \n')
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # Sanity check: each label's tally must match its mapped option.
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        'pass'
        if vote_count['yes'] > (vote_count['no'] + vote_count['other'])
        else 'fail'
    )

    yield vote
def scrape_senate_vote(self, bill, url, date):
    """Download a senate roll-call PDF and yield the vote event parsed from it.

    Args:
        bill: passed through to ``VoteEvent(bill=...)`` and to the 3-column
            parser.
        url: location of the vote PDF; recorded as the event's source and
            used as its ``pupa_id``.
        date: object with ``strftime``; formatted as ``%Y-%m-%d`` for the
            event's start date.

    Yields:
        The populated ``VoteEvent``, or whatever ``scrape_senate_vote_3col``
        yields when the text contains the ``Yea: n  Nay: n  Absent: n``
        summary line. Nothing is yielded when the download fails with
        ``scrapelib.HTTPError`` (a warning is logged instead).
    """
    try:
        filename, resp = self.urlretrieve(url)
    except scrapelib.HTTPError:
        self.warning("missing vote file %s" % url)
        return

    vote = VoteEvent(
        chamber="upper",
        start_date=date.strftime("%Y-%m-%d"),
        motion_text="Passage",
        # setting 'fail' for now.
        result="fail",
        classification="passage",
        bill=bill,
    )
    vote.add_source(url)
    vote.pupa_id = url

    # Remove the downloaded file even when the PDF conversion blows up.
    try:
        text = convert_pdf(filename, "text").decode("utf-8")
    finally:
        os.remove(filename)

    if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
        yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
        return

    # Splitting on a capturing group keeps the labels in the result. The list
    # is reversed so pop() hands the pieces back in document order.
    # filter() is lazy on Python 3 (no pop(), always truthy), so build a
    # real list here.
    # NOTE(review): the label/names pairing below assumes the text starts
    # with a label (empty preamble) -- confirm against real PDFs.
    pieces = [
        piece
        for piece in reversed(re.split(r"(Yea|Nay|Absent)s?:", text))
        if piece
    ]

    keymap = dict(yea="yes", nay="no")
    actual_vote = collections.defaultdict(int)
    vote_count = {"yes": 0, "no": 0, "other": 0}

    while pieces:
        vote_val = pieces.pop()
        # Anything that is not Yea/Nay is tallied as "other".
        key = keymap.get(vote_val.lower(), "other")
        values = pieces.pop()
        for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
            if name.lower().strip() == "none.":
                continue
            name = name.replace("..", "")
            name = re.sub(r"\.$", "", name)
            # Drop stray counts, dashes and whitespace around the name.
            name = name.strip("-1234567890 \n")
            if not name:
                continue
            vote.vote(key, name)
            actual_vote[vote_val] += 1
            vote_count[key] += 1
        # Sanity check: each label's tally must match its mapped option.
        assert actual_vote[vote_val] == vote_count[key]

    for key, value in vote_count.items():
        vote.set_count(key, value)

    # updating result with actual value
    vote.result = (
        "pass"
        if vote_count["yes"] > (vote_count["no"] + vote_count["other"])
        else "fail"
    )

    yield vote
def test_vote_event_identifier_dedupe():
    """Re-importing by identifier yields insert, noop, update; a new identifier inserts."""
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(
        id='org-id',
        name='Legislature',
        classification='legislature',
        jurisdiction=jurisdiction,
    )

    scraped = ScrapeVoteEvent(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        motion_text='a vote on something',
        identifier='Roll Call No. 1',
    )
    mock_importer = DumbMockImporter()
    org_importer = OrganizationImporter('jid')
    bill_importer = BillImporter('jid', mock_importer, org_importer)

    def run_import(item):
        # Build a fresh importer for every pass and report only the action taken.
        importer = VoteEventImporter('jid', mock_importer, org_importer, bill_importer)
        _, action = importer.import_item(item.as_dict())
        return action

    # first sighting of the vote event
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    assert run_import(scraped) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    scraped.result = 'failed'
    assert run_import(scraped) == 'update'
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    scraped.identifier = 'Roll Call 2'
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 2
def _parse_senate_votes(self, vote_data, bill, url):
    """Build an upper-chamber VoteEvent from one vote record.

    Reads ``voteDate``, ``voteType``, ``version``, ``memberVotes`` (and
    ``committee`` for committee votes) from ``vote_data``. Raises
    ValueError for a vote type other than FLOOR or COMMITTEE. The result
    is "pass" only when yes votes strictly outnumber no votes.
    """
    voted_on = datetime.datetime.strptime(vote_data["voteDate"], "%Y-%m-%d")

    vote_type = vote_data["voteType"]
    if vote_type == "FLOOR":
        motion = "Floor Vote"
    elif vote_type == "COMMITTEE":
        motion = "{} Vote".format(vote_data["committee"]["name"])
    else:
        raise ValueError("Unknown vote type encountered.")

    version = vote_data["version"]
    if version:
        motion = motion + " - Version: " + version

    vote = VoteEvent(
        chamber="upper",
        start_date=voted_on.strftime("%Y-%m-%d"),
        motion_text=motion,
        classification="passage",
        result="fail",
        bill=bill,
    )
    vote.add_source(url)

    rolls = vote_data["memberVotes"]["items"]
    tally = {"yes": 0, "no": 0, "other": 0}

    # Yea votes: both AYE and AYEWR are recorded as yes.
    for code in ("AYE", "AYEWR"):
        if "items" in rolls.get(code, {}):
            for member in rolls[code]["items"]:
                vote.yes(member["fullName"])
                tally["yes"] += 1

    # Nay votes.
    if "items" in rolls.get("NAY", {}):
        for member in rolls["NAY"]["items"]:
            vote.no(member["fullName"])
            tally["no"] += 1

    # Everything else is recorded under "other".
    for code in ("EXC", "ABS", "ABD"):
        if rolls.get(code, []):
            for member in rolls[code]["items"]:
                vote.vote("other", member["fullName"])
                tally["other"] += 1

    vote.result = "pass" if tally["yes"] > tally["no"] else "fail"
    for option in ("yes", "no", "other"):
        vote.set_count(option, tally[option])

    return vote
def test_vote_event_bill_id_dedupe():
    """Re-importing for the same bill yields insert, noop, update; another bill inserts."""
    jurisdiction = create_jurisdiction()
    session = jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(
        id='org-id',
        name='House',
        classification='lower',
        jurisdiction=jurisdiction,
    )
    first_bill = Bill.objects.create(
        id='bill-1',
        identifier='HB 1',
        legislative_session=session,
        from_organization=house,
    )
    second_bill = Bill.objects.create(
        id='bill-2',
        identifier='HB 2',
        legislative_session=session,
        from_organization=house,
    )

    def scrape_vote_for(bill_identifier):
        # Identical vote events that differ only in the bill they point at.
        return ScrapeVoteEvent(
            legislative_session='1900',
            start_date='2013',
            classification='anything',
            result='passed',
            motion_text='a vote on something',
            bill=bill_identifier,
            bill_chamber='lower',
            chamber='lower',
        )

    mock_importer = DumbMockImporter()
    org_importer = OrganizationImporter('jid')
    bill_importer = BillImporter('jid', mock_importer, org_importer)

    def run_import(item):
        # Build a fresh importer for every pass and report only the action taken.
        importer = VoteEventImporter('jid', mock_importer, org_importer, bill_importer)
        _, action = importer.import_item(item.as_dict())
        return action

    scraped = scrape_vote_for(first_bill.identifier)

    # first sighting of the vote event
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    assert run_import(scraped) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    scraped.result = 'failed'
    assert run_import(scraped) == 'update'
    assert VoteEvent.objects.count() == 1

    # new vote event, insert
    scraped = scrape_vote_for(second_bill.identifier)
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 2
def _parse_senate_votes(self, vote_data, bill, url):
    """Build an upper-chamber VoteEvent from one vote record.

    Reads ``voteDate``, ``voteType``, ``memberVotes`` (and ``committee``
    for committee votes) from ``vote_data``. Raises ValueError for a vote
    type other than FLOOR or COMMITTEE. The result is 'pass' only when
    yes votes strictly outnumber no votes.
    """
    voted_on = datetime.datetime.strptime(vote_data['voteDate'], '%Y-%m-%d')

    kind = vote_data['voteType']
    if kind == 'FLOOR':
        motion = 'Floor Vote'
    elif kind == 'COMMITTEE':
        motion = '{} Vote'.format(vote_data['committee']['name'])
    else:
        raise ValueError('Unknown vote type encountered.')

    vote = VoteEvent(
        chamber='upper',
        start_date=voted_on.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='fail',
        bill=bill,
    )
    vote.add_source(url)

    rolls = vote_data['memberVotes']['items']
    ayes = nays = others = 0

    # Yea votes: both AYE and AYEWR are recorded as yes.
    for code in ('AYE', 'AYEWR'):
        if 'items' not in rolls.get(code, {}):
            continue
        for member in rolls[code]['items']:
            vote.yes(member['fullName'])
            ayes += 1

    # Nay votes.
    if 'items' in rolls.get('NAY', {}):
        for member in rolls['NAY']['items']:
            vote.no(member['fullName'])
            nays += 1

    # Everything else is recorded under 'other'.
    for code in ('EXC', 'ABS', 'ABD'):
        if not rolls.get(code, []):
            continue
        for member in rolls[code]['items']:
            vote.vote('other', member['fullName'])
            others += 1

    if ayes > nays:
        vote.result = 'pass'
    else:
        vote.result = 'fail'

    vote.set_count('yes', ayes)
    vote.set_count('no', nays)
    vote.set_count('other', others)

    return vote
def _parse_senate_votes(self, vote_data, bill, url):
    """Turn one senate vote record into a VoteEvent for the upper chamber.

    ``vote_data`` supplies ``voteDate``, ``voteType``, ``memberVotes`` and,
    for committee votes, ``committee``. An unrecognised vote type raises
    ValueError. A tie or a no majority is recorded as 'fail'.
    """
    start = datetime.datetime.strptime(vote_data['voteDate'], '%Y-%m-%d')

    if vote_data['voteType'] == 'FLOOR':
        motion = 'Floor Vote'
    elif vote_data['voteType'] == 'COMMITTEE':
        committee_name = vote_data['committee']['name']
        motion = '{} Vote'.format(committee_name)
    else:
        raise ValueError('Unknown vote type encountered.')

    vote = VoteEvent(
        chamber='upper',
        start_date=start.strftime('%Y-%m-%d'),
        motion_text=motion,
        classification='passage',
        result='fail',
        bill=bill,
    )
    vote.add_source(url)

    by_code = vote_data['memberVotes']['items']
    counts = {'yes': 0, 'no': 0, 'other': 0}

    # Yea votes: both AYE and AYEWR are recorded as yes.
    for code in ('AYE', 'AYEWR'):
        if 'items' in by_code.get(code, {}):
            for legislator in by_code[code]['items']:
                vote.yes(legislator['fullName'])
                counts['yes'] += 1

    # Nay votes.
    if 'items' in by_code.get('NAY', {}):
        for legislator in by_code['NAY']['items']:
            vote.no(legislator['fullName'])
            counts['no'] += 1

    # Everything else is recorded under 'other'.
    for code in ('EXC', 'ABS', 'ABD'):
        if by_code.get(code, []):
            for legislator in by_code[code]['items']:
                vote.vote('other', legislator['fullName'])
                counts['other'] += 1

    vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

    vote.set_count('yes', counts['yes'])
    vote.set_count('no', counts['no'])
    vote.set_count('other', counts['other'])

    return vote
def test_vote_event_pupa_identifier_dedupe():
    """With a pupa_id set, identifier changes update in place; a new pupa_id inserts."""
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(
        id='org-id',
        name='Legislature',
        classification='legislature',
        jurisdiction=jurisdiction,
    )

    scraped = ScrapeVoteEvent(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        motion_text='a vote on something',
        identifier='Roll Call No. 1',
    )
    scraped.pupa_id = 'foo'

    mock_importer = DumbMockImporter()
    org_importer = OrganizationImporter('jid')
    bill_importer = BillImporter('jid', mock_importer, org_importer)

    def run_import(item):
        # Build a fresh importer for every pass and report only the action taken.
        importer = VoteEventImporter('jid', mock_importer, org_importer, bill_importer)
        _, action = importer.import_item(item.as_dict())
        return action

    # first sighting of the vote event
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    assert run_import(scraped) == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    scraped.result = 'failed'
    assert run_import(scraped) == 'update'
    assert VoteEvent.objects.count() == 1

    # new identifier but same pupa identifier, update
    scraped.identifier = 'First Roll Call'
    assert run_import(scraped) == 'update'
    assert VoteEvent.objects.count() == 1

    # new pupa identifier, insert
    scraped.pupa_id = 'bar'
    assert run_import(scraped) == 'insert'
    assert VoteEvent.objects.count() == 2
def _parse_votes(self, url, vote, bill):
    '''Given a vote url and a vote mapping, extract the voters and
    the vote counts from the vote page and build a VoteEvent.

    ``vote`` is read for its 'action', 'vote_url', 'chamber' and 'date'
    keys; its 'motion' key is set as a side effect.

    Returns the populated VoteEvent (for PDF urls, whatever
    ``PDFCommitteeVote.asvote()`` returns). Returns None when the PDF is
    missing or cannot be parsed, or when the page has no tally table.

    Raises Exception when the action text gives no usable pass/fail
    signal, or gives both a passage and a failure signal while the yeas
    outnumber the nays.
    '''
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion. Some pages mysteriously have no motion listed: there
    # may be no <br> at all, or the last <br> may have no trailing text
    # (lxml reports that as tail=None). Fall back to the action either way.
    try:
        tail = doc.xpath('//br')[-1].tail
    except IndexError:
        tail = None
    motion = (tail or '').strip() or vote['action']

    vote['motion'] = motion
    action = vote['action']
    vote_url = vote['vote_url']

    # Use a separate name so the incoming ``vote`` mapping is not shadowed.
    vote_event = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote_event.pupa_id = vote_url  # URL contains sequence number
    vote_event.add_source(vote_url)
    vote_event.set_count('yes', yes_count)
    vote_event.set_count('no', no_count)
    vote_event.set_count('excused', excused_count)
    vote_event.set_count('absent', absent_count)

    # Compiled once, outside the loop; raw strings keep the backslashes
    # as regex escapes rather than (invalid) string escapes.
    short_name_re = re.compile(r".*?\((.*?)\)")
    bracketed_re = re.compile(r"[\(\[].*?[\)\]]")

    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue

        # Each cell is "<code>\xa0...\xa0<name>"; drop the empty pieces.
        v, name = filter(None, text.split(u'\xa0'))

        # Considering Name is brackets as short name
        short_name = short_name_re.findall(name)
        note = ('Short Name: ' + short_name[0]) if short_name else ''

        # Name without brackets like 'Kary, Douglas'
        name = bracketed_re.sub("", name)

        if v == 'Y':
            vote_event.yes(name, note=note)
        elif v == 'N':
            vote_event.no(name, note=note)
        elif v == 'E':
            vote_event.vote('excused', name, note=note)
        elif v == 'A':
            vote_event.vote('absent', name, note=note)

    # code to determine value of `passed`
    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    passed = None
    if any(i in action for i in vote_passage_indicators):
        passed = True

    if any(i in action for i in vote_failure_indicators):
        if passed:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # outnumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
            else:
                raise Exception("passage and failure indicator "
                                "both present at: %s" % url)
        else:
            passed = False

    if any(i in action for i in vote_ambiguous_indicators):
        passed = yes_count > no_count

    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote_event.result = 'pass' if passed else 'fail'

    return vote_event
def _parse_votes(self, url, vote, bill):
    '''Given a vote url and a vote mapping, extract the voters and
    the vote counts from the vote page and build a VoteEvent.

    ``vote`` is read for its 'action', 'vote_url', 'chamber' and 'date'
    keys; its 'motion' key is set as a side effect.

    Returns the populated VoteEvent (for PDF urls, whatever
    ``PDFCommitteeVote.asvote()`` returns). Returns None when the PDF is
    missing or cannot be parsed, or when the page has no tally table.

    Raises Exception when the action text gives no usable pass/fail
    signal, or gives both a passage and a failure signal while the yeas
    outnumber the nays.
    '''
    if url.lower().endswith('.pdf'):
        try:
            resp = self.get(url)
        except HTTPError:
            # This vote document wasn't found.
            msg = 'No document found at url %r' % url
            self.logger.warning(msg)
            return

        try:
            v = PDFCommitteeVote(url, resp.content, bill)
            return v.asvote()
        except PDFCommitteeVoteParseError:
            # Warn and skip.
            self.warning("Could't parse committee vote at %r" % url)
            return

    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # Yes, no, excused, absent.
    try:
        vals = doc.xpath('//table')[1].xpath('tr/td/text()')
    except IndexError:
        # Most likely was a bogus link lacking vote data.
        return

    yes_count, no_count, excused_count, absent_count = map(int, vals)

    # Get the motion. Some pages mysteriously have no motion listed: there
    # may be no <br> at all, or the last <br> may have no trailing text
    # (lxml reports that as tail=None). Fall back to the action either way.
    try:
        tail = doc.xpath('//br')[-1].tail
    except IndexError:
        tail = None
    motion = (tail or '').strip() or vote['action']

    vote['motion'] = motion
    action = vote['action']
    vote_url = vote['vote_url']

    # Use a separate name so the incoming ``vote`` mapping is not shadowed.
    vote_event = VoteEvent(
        chamber=vote['chamber'],
        start_date=vote['date'],
        motion_text=vote['motion'],
        result='fail',  # placeholder
        classification='passage',
        bill=bill,
        bill_action=vote['action'],
    )
    vote_event.pupa_id = vote_url  # URL contains sequence number
    vote_event.add_source(vote_url)
    vote_event.set_count('yes', yes_count)
    vote_event.set_count('no', no_count)
    vote_event.set_count('excused', excused_count)
    vote_event.set_count('absent', absent_count)

    # Compiled once, outside the loop, instead of on every table cell.
    short_name_re = re.compile(r".*?\((.*?)\)")
    bracketed_re = re.compile(r"[\(\[].*?[\)\]]")

    for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
        if not text.strip(u'\xa0'):
            continue

        # Each cell is "<code>\xa0...\xa0<name>"; drop the empty pieces.
        v, name = filter(None, text.split(u'\xa0'))

        # Considering Name is brackets as short name
        short_name = short_name_re.findall(name)
        note = ('Short Name: ' + short_name[0]) if short_name else ''

        # Name without brackets like 'Kary, Douglas'
        name = bracketed_re.sub("", name)

        if v == 'Y':
            vote_event.yes(name, note=note)
        elif v == 'N':
            vote_event.no(name, note=note)
        elif v == 'E':
            vote_event.vote('excused', name, note=note)
        elif v == 'A':
            vote_event.vote('absent', name, note=note)

    # code to determine value of `passed`
    # some actions take a super majority, so we aren't just
    # comparing the yeas and nays here.
    passed = None
    if any(i in action for i in vote_passage_indicators):
        passed = True

    if any(i in action for i in vote_failure_indicators):
        if passed:
            # a quick explanation: originally an exception was
            # thrown if both passage and failure indicators were
            # present because I thought that would be a bug in my
            # lists. Then I found 2007 HB 160.
            # Now passed = False if the nays outnumber the yays..
            # I won't automatically mark it as passed if the yays
            # outnumber the nays because I don't know what requires
            # a supermajority in MT.
            if no_count >= yes_count:
                passed = False
            else:
                raise Exception("passage and failure indicator "
                                "both present at: %s" % url)
        else:
            passed = False

    if any(i in action for i in vote_ambiguous_indicators):
        passed = yes_count > no_count

    if passed is None:
        raise Exception("Unknown passage at: %s" % url)

    vote_event.result = 'pass' if passed else 'fail'

    return vote_event