示例#1
0
def test_vote_event_bill_clearing():
    """Re-importing an edited vote event must not leave a stale copy behind.

    Two vote events are imported for the same bill, then the first one's
    motion text is corrected and both are imported again. The total number
    of stored vote events has to remain two after the second import.
    """
    jurisdiction = create_jurisdiction()
    session = jurisdiction.legislative_sessions.create(
        name="1900", identifier="1900")
    house = Organization.objects.create(
        id="org-id",
        name="House",
        classification="lower",
        jurisdiction=jurisdiction,
    )
    bill = Bill.objects.create(
        id="bill-1",
        identifier="HB 1",
        legislative_session=session,
        from_organization=house,
    )
    # A second bill in the same session that none of the votes refer to.
    Bill.objects.create(
        id="bill-2",
        identifier="HB 2",
        legislative_session=session,
        from_organization=house,
    )

    org_importer = OrganizationImporter("jid")
    mock_importer = DumbMockImporter()
    bill_importer = BillImporter("jid", mock_importer, org_importer)

    # Everything except the motion text is identical between the two votes.
    shared_fields = dict(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        bill=bill.identifier,
        bill_chamber="lower",
        chamber="lower",
    )
    # The misspelling in the first motion text is deliberate.
    first_vote = ScrapeVoteEvent(motion_text="a vote on somthing",
                                 **shared_fields)
    second_vote = ScrapeVoteEvent(motion_text="a vote on something else",
                                  **shared_fields)

    def import_both_votes():
        # import_data (rather than a lower-level call) is needed so that
        # postimport runs.
        importer = VoteEventImporter("jid", mock_importer, org_importer,
                                     bill_importer)
        importer.import_data([first_vote.as_dict(), second_vote.as_dict()])

    import_both_votes()
    assert VoteEvent.objects.count() == 2

    # Correct the spelling and import again: still two vote events, not three.
    first_vote.motion_text = "a vote on something"
    import_both_votes()
    assert VoteEvent.objects.count() == 2
    def handle_page(self):
        """Fill ``self.obj`` from the bill page in ``self.doc`` and yield votes.

        Four sections of the page are read, each located by its ``<h4>``
        heading:

        * SUMMARY: the first matching text node, if non-blank, is added as
          an abstract with note ``"summary"``.
        * FULL TEXT: every first link of each list item becomes a version
          link, except budget links (those containing ``"+men+"``).
        * AMENDMENTS: links whose text mentions "adopted" or "engrossed"
          (and not "not adopted" / "not engrossed") become version links.
        * HISTORY: every entry is classified and added as an action; entries
          matching ``self.vote_strip_re`` also produce a ``VoteEvent``.

        Yields:
            Whatever ``add_pupa_id(vote)`` yields for each vote built from
            the history section.

        Raises:
            AssertionError: a history entry has no ``": "`` separator and
                does not start with ``"<actor>:"`` for any key of
                ``self.actor_map``.
            ValueError: a version or history line does not split into
                exactly a date and a description on ``" \\xa0"``, or the
                date is not in ``%m/%d/%y`` form.
        """
        summary = self.doc.xpath("/".join([
            '//h4[starts-with(text(), "SUMMARY")]',
            "/following-sibling::p",
            "text()",
        ]))
        if summary and summary[0].strip():
            self.obj.add_abstract(abstract=summary[0].strip(), note="summary")

        # versions
        for va in self.doc.xpath(
                '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):

            # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
            date, desc = va.text.split(u" \xa0")
            # NOTE(review): a bare `desc.rsplit(" ", 1)[0]` expression used to
            # sit here with its result discarded, so the trailing token was
            # never chopped off and the full description has always been the
            # version name. Behaviour is kept as-is; actually chopping it
            # would rename stored versions -- confirm before changing.
            link = va.get("href")
            if "http" not in link:
                link = "{}{}".format(BASE_URL, link)
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            # budget bills in VA are searchable but no full text available
            if "+men+" in link:
                logging.getLogger("va").warning(
                    "not adding budget version, bill text not available")
            else:
                # VA duplicates reprinted bills, lets keep the original name
                self.obj.add_version_link(desc,
                                          link,
                                          date=date,
                                          media_type="text/html",
                                          on_duplicate="ignore")

        # amendments
        for va in self.doc.xpath(
                '//h4[text()="AMENDMENTS"]/following-sibling::ul[1]/li/a[1]'):
            version_name = va.xpath("string(.)")
            lowered_name = version_name.lower()
            if (("adopted" in lowered_name or "engrossed" in lowered_name)
                    and "not adopted" not in lowered_name
                    and "not engrossed" not in lowered_name):
                version_url = va.xpath("@href")[0]
                self.obj.add_version_link(
                    version_name,
                    version_url,
                    media_type="text/html",
                    on_duplicate="ignore",
                )

        # actions
        # seen_next is set when the *next* history entry has already been
        # folded into the current vote, so that entry is skipped when the
        # loop reaches it.
        seen_next = False
        for ali, next_ali in pairwise(
                self.doc.xpath(
                    '//h4[text()="HISTORY"]/following-sibling::ul[1]/li')):
            # If we've used this action text before, we don't need to parse it again
            if seen_next:
                seen_next = False
                continue
            date, action = ali.text_content().split(u" \xa0")
            try:
                actor, action = action.split(": ", 1)
            except ValueError:
                # No ": " separator: only tolerated for a bare "<actor>:"
                # prefix with nothing after it.
                assert any(
                    action.startswith("{}:".format(x))
                    for x in self.actor_map.keys()
                ), "Unparseable action text found: '{}'".format(action)
                logging.getLogger("va").warning(
                    "Skipping apparently-null action: '{}'".format(action))
                continue

            # Bill history entries purely in parentheses tend to be
            # notes and not actions, so we'll skip them.
            if action.startswith("(") and action.endswith(")"):
                continue

            actor = self.actor_map[actor]
            date = datetime.datetime.strptime(date.strip(), "%m/%d/%y").date()

            # if action ends in (##-Y ##-N) remove that part
            vrematch = self.vote_strip_re.match(action)
            # The following conditional logic is messy to handle
            # Virginia's crazy and inconsistently formatted bill
            # histories. Someone less harried and tired than me
            # could probably make this much cleaner. - alo
            if vrematch:
                vote_action, y, n, o = vrematch.groups()
                y = int(y)
                n = int(n)
                # Set default count for "other" votes to 0. We have to
                # do this explicitly as it's excluded from the action
                # text when there were no abstentions (the only type of
                # "other" vote encountered thus far).
                o = int(o) if o else 0

                vote_url = ali.xpath("a/@href")

                # Finds relevant information from the current action if
                # vote count encountered, then searches for the presence
                # of identical counts in the next entry (we assume that
                # it's probably there). If matching votes are found, it
                # merges data in both to create a unified vote record.
                #
                # This is because Virginia usually publishes two lines
                # of history data for a single vote, without guaranteed
                # order, so we unsafely attempt to match on identical
                # vote counts in the next line.
                vote = VoteEvent(
                    start_date=date,
                    chamber=actor,
                    motion_text=vote_action.strip(),
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=self.obj,
                )
                vote.set_count("yes", y)
                vote.set_count("no", n)
                vote.set_count("other", o)

                try:
                    next_action = (
                        next_ali.text_content().split(" \xa0")[1].split(
                            ": ", 1)[1])
                except (AttributeError, IndexError, ValueError):
                    # AttributeError: next_ali has no text_content()
                    # (presumably None at the end of the list -- depends on
                    # the pairwise helper). IndexError: the next entry lacks
                    # the " \xa0" or ": " separator, which is what indexing
                    # [1] on a one-element split result raises; it was
                    # previously uncaught and aborted the scrape.
                    next_action = ""

                vrematch_next = self.vote_strip_re.match(next_action)
                if vrematch_next:
                    vote_action_next, y_next, n_next, o_next = vrematch_next.groups(
                    )
                    y_next = int(y_next)
                    n_next = int(n_next)
                    o_next = int(o_next) if o_next else 0
                    vote_url_next = next_ali.xpath("a/@href")
                    # Check that the vote counts match and that only one action
                    # has a URL (otherwise, they're probably different votes).
                    if [y_next, n_next, o_next
                        ] == [y, n, o] and len(vote_url) != len(vote_url_next):
                        seen_next = True
                        if not vote_url:
                            vote_url = vote_url_next
                        else:
                            # The current entry carries the URL; take the
                            # motion and action text from the next entry.
                            vote.motion_text = vote_action_next.strip()
                            action = next_action

                if vote_url:
                    # Drain the generator so the vote page is processed
                    # against `vote`; its items are not used here.
                    list(
                        self.scrape_page_items(VotePage,
                                               url=vote_url[0],
                                               obj=vote))
                    vote.add_source(vote_url[0])
                else:
                    vote.add_source(self.url)

                yield from add_pupa_id(vote)

            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, action):
                    break
            else:
                atype = None

            # if the matched classifier is SKIP, don't add the action
            if atype != SKIP:
                self.obj.add_action(action,
                                    date,
                                    chamber=actor,
                                    classification=atype)