def test_vote_event_bill_clearing():
    """Re-importing an edited vote event must not leave the stale one behind.

    Two vote events are imported for the same bill, then the motion text of
    the first is changed and both are imported again. The total number of
    stored vote events has to stay at two: the edit must not make it look
    like the bill has a third, distinct vote event.
    """
    jurisdiction = create_jurisdiction()
    session = jurisdiction.legislative_sessions.create(
        name="1900", identifier="1900"
    )
    house = Organization.objects.create(
        id="org-id",
        name="House",
        classification="lower",
        jurisdiction=jurisdiction,
    )

    # Only the first bill is voted on; the second one just exists alongside it.
    bill = Bill.objects.create(
        id="bill-1",
        identifier="HB 1",
        legislative_session=session,
        from_organization=house,
    )
    Bill.objects.create(
        id="bill-2",
        identifier="HB 2",
        legislative_session=session,
        from_organization=house,
    )

    org_importer = OrganizationImporter("jid")
    mock_importer = DumbMockImporter()
    bill_importer = BillImporter("jid", mock_importer, org_importer)

    vote_event1 = ScrapeVoteEvent(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        motion_text="a vote on somthing",  # typo intentional
        bill=bill.identifier,
        bill_chamber="lower",
        chamber="lower",
    )
    vote_event2 = ScrapeVoteEvent(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        motion_text="a vote on something else",
        bill=bill.identifier,
        bill_chamber="lower",
        chamber="lower",
    )

    def import_both():
        # import_data is used (with a fresh importer each time) so that
        # postimport is called.
        importer = VoteEventImporter(
            "jid", mock_importer, org_importer, bill_importer
        )
        importer.import_data([vote_event1.as_dict(), vote_event2.as_dict()])

    import_both()
    assert VoteEvent.objects.count() == 2

    # The typo gets fixed upstream; we don't want 3 vote events now.
    vote_event1.motion_text = "a vote on something"
    import_both()
    assert VoteEvent.objects.count() == 2
def handle_page(self):
    """Populate ``self.obj`` from a bill detail page and yield its votes.

    Four sections of ``self.doc`` are read, each located by its ``<h4>``
    heading:

    * SUMMARY   -> added to the bill as an abstract (note ``"summary"``).
    * FULL TEXT -> each link is added as a version, except budget links
      (those containing ``"+men+"``), which are only logged.
    * AMENDMENTS -> links whose text says adopted/engrossed (and not
      "not adopted"/"not engrossed") are added as versions.
    * HISTORY   -> every entry is added as an action unless it is a
      parenthesised note, an actor-only line, or classified as ``SKIP``.
      Entries whose text matches ``self.vote_strip_re`` also produce a
      ``VoteEvent``.

    Yields:
        The items produced by ``add_pupa_id(vote)`` for each vote found in
        the history section.

    Raises:
        AssertionError: if a history entry has no ``": "`` separator and
            does not start with one of the ``self.actor_map`` keys
            followed by a colon.
    """
    log = logging.getLogger("va")

    summary = self.doc.xpath("/".join([
        '//h4[starts-with(text(), "SUMMARY")]',
        "/following-sibling::p",
        "text()",
    ]))
    if summary and summary[0].strip():
        self.obj.add_abstract(abstract=summary[0].strip(), note="summary")

    # versions
    for va in self.doc.xpath(
            '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):
        # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
        date, desc = va.text.split(" \xa0")
        # NOTE(review): the previous code evaluated desc.rsplit(" ", 1)[0]
        # ("chop off last part") but discarded the result, so the trailing
        # document number has always been part of the version name. The
        # no-op is removed and the full description is kept on purpose, so
        # that existing version names do not change.
        link = va.get("href")
        if "http" not in link:
            link = "{}{}".format(BASE_URL, link)
        date = datetime.datetime.strptime(date, "%m/%d/%y").date()

        # budget bills in VA are searchable but no full text available
        if "+men+" in link:
            log.warning("not adding budget version, bill text not available")
        else:
            # VA duplicates reprinted bills, lets keep the original name
            self.obj.add_version_link(desc, link, date=date,
                                      media_type="text/html",
                                      on_duplicate="ignore")

    # amendments
    for va in self.doc.xpath(
            '//h4[text()="AMENDMENTS"]/following-sibling::ul[1]/li/a[1]'):
        version_name = va.xpath("string(.)")
        lowered = version_name.lower()
        is_accepted = "adopted" in lowered or "engrossed" in lowered
        is_rejected = "not adopted" in lowered or "not engrossed" in lowered
        if is_accepted and not is_rejected:
            version_url = va.xpath("@href")[0]
            self.obj.add_version_link(
                version_name,
                version_url,
                media_type="text/html",
                on_duplicate="ignore",
            )

    # actions
    seen_next = False
    history = self.doc.xpath(
        '//h4[text()="HISTORY"]/following-sibling::ul[1]/li')
    for ali, next_ali in pairwise(history):
        # If we've used this action text before, we don't need to parse it again
        if seen_next:
            seen_next = False
            continue

        date, action = ali.text_content().split(" \xa0")
        try:
            actor, action = action.split(": ", 1)
        except ValueError:
            # No "actor: text" separator. This is only acceptable when the
            # line is a bare "Actor:" prefix with nothing after it. Raised
            # explicitly (rather than via ``assert``) so the check still
            # runs under ``python -O``.
            if not any(action.startswith("{}:".format(x))
                       for x in self.actor_map):
                raise AssertionError(
                    "Unparseable action text found: '{}'".format(action))
            log.warning(
                "Skipping apparently-null action: '{}'".format(action))
            continue

        # Bill history entries purely in parentheses tend to be
        # notes and not actions, so we'll skip them.
        if action.startswith("(") and action.endswith(")"):
            continue

        actor = self.actor_map[actor]
        date = datetime.datetime.strptime(date.strip(), "%m/%d/%y").date()

        # Detect a trailing vote tally of the form (##-Y ##-N).
        vrematch = self.vote_strip_re.match(action)
        # The following conditional logic is messy to handle
        # Virginia's crazy and inconsistently formatted bill
        # histories. Someone less harried and tired than me
        # could probably make this much cleaner. - alo
        if vrematch:
            vote_action, y, n, o = vrematch.groups()
            y = int(y)
            n = int(n)
            # Set default count for "other" votes to 0. We have to
            # do this explicitly as it's excluded from the action
            # text when there were no abstentions (the only type of
            # "other" vote encountered thus far).
            o = int(o) if o else 0

            vote_url = ali.xpath("a/@href")

            # Finds relevant information from the current action if
            # vote count encountered, then searches for the presence
            # of identical counts in the next entry (we assume that
            # it's probably there). If matching votes are found, it
            # merges data in both to create a unified vote record.
            #
            # This is because Virginia usually publishes two lines
            # of history data for a single vote, without guaranteed
            # order, so we unsafely attempt to match on identical
            # vote counts in the next line.
            vote = VoteEvent(
                start_date=date,
                chamber=actor,
                motion_text=vote_action.strip(),
                result="pass" if y > n else "fail",
                classification="passage",
                bill=self.obj,
            )
            vote.set_count("yes", y)
            vote.set_count("no", n)
            vote.set_count("other", o)

            try:
                next_action = (
                    next_ali.text_content().split(" \xa0")[1].split(
                        ": ", 1)[1])
            except (AttributeError, IndexError, ValueError):
                # AttributeError: next_ali has no text_content() (presumably
                # pairwise pads the last pair with None - confirm against
                # the helper). IndexError: the next line lacks the date
                # separator or the "actor: text" separator; indexing [1]
                # on the split result is what fails in that case, so it
                # must be caught here or one odd line aborts the scrape.
                next_action = ""
            vrematch_next = self.vote_strip_re.match(next_action)
            if vrematch_next:
                vote_action_next, y_next, n_next, o_next = (
                    vrematch_next.groups())
                y_next = int(y_next)
                n_next = int(n_next)
                o_next = int(o_next) if o_next else 0

                vote_url_next = next_ali.xpath("a/@href")

                # Check that the vote counts match and that only one action
                # has a URL (otherwise, they're probably different votes).
                if ([y_next, n_next, o_next] == [y, n, o]
                        and len(vote_url) != len(vote_url_next)):
                    seen_next = True
                    if not vote_url:
                        vote_url = vote_url_next
                    else:
                        # The current line carries the link, so the next
                        # line supplies the motion and action text.
                        vote.motion_text = vote_action_next.strip()
                        action = next_action

            if vote_url:
                # Consume the generator for its effect on ``vote``; the
                # items themselves are not used here.
                list(
                    self.scrape_page_items(VotePage, url=vote_url[0],
                                           obj=vote))
                vote.add_source(vote_url[0])
            else:
                vote.add_source(self.url)

            yield from add_pupa_id(vote)

        # categorize actions: first matching pattern wins, None if no match
        for pattern, atype in ACTION_CLASSIFIERS:
            if re.match(pattern, action):
                break
        else:
            atype = None

        # a classifier that maps to SKIP means the action is not recorded
        if atype != SKIP:
            self.obj.add_action(action, date, chamber=actor,
                                classification=atype)