Exemplo n.º 1
0
def test_vote_event_pupa_identifier_dedupe():
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")
    Organization.objects.create(id="org-id",
                                name="Legislature",
                                classification="legislature",
                                jurisdiction=j)

    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        start_date="2013",
        classification="anything",
        result="passed",
        motion_text="a vote on something",
        identifier="Roll Call No. 1",
    )
    vote_event.pupa_id = "foo"

    dmi = DumbMockImporter()
    oi = OrganizationImporter("jid")
    bi = BillImporter("jid", dmi, oi)

    _, what = VoteEventImporter("jid", dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == "insert"
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter("jid", dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == "noop"
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = "failed"
    _, what = VoteEventImporter("jid", dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == "update"
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    vote_event.identifier = "First Roll Call"
    _, what = VoteEventImporter("jid", dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == "update"
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    vote_event.pupa_id = "bar"
    _, what = VoteEventImporter("jid", dmi, oi,
                                bi).import_item(vote_event.as_dict())
    assert what == "insert"
    assert VoteEvent.objects.count() == 2
Exemplo n.º 2
0
def test_vote_event_bill_actions_two_stage():
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", chamber="lower")

    bill.add_action(description="passage", date="1900-04-02", chamber="lower")

    ve1 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        chamber="lower",
    )
    ve2 = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-02",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        bill_action="passage",
        chamber="lower",
    )
    # disambiguate them
    ve1.pupa_id = "one"
    ve2.pupa_id = "two"

    bi = BillImporter("jid")
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter("jid", bi).import_data([ve1.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter("jid", bi).import_data([ve1.as_dict(), ve2.as_dict()])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
    def add_vote(self, bill, chamber, date, text, url):
        votes = re.findall(r"Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)", text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        vtype = "other"
        for regex, type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = type
                break

        v = VoteEvent(
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=text,
            result="pass" if yes > no else "fail",
            classification=vtype,
            bill=bill,
        )
        v.pupa_id = url.split("/")[-1]
        v.set_count("yes", yes)
        v.set_count("no", no)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if "av" in url:
                self.add_house_votes(v, url)
            elif "sv" in url:
                self.add_senate_votes(v, url)

        return v
Exemplo n.º 4
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    # When they vote in a substitute they mark it as XHB
    bill_id = bill_id.replace("XHB", "HB")
    passed = len(vote_record["yes"]) > len(vote_record["no"])
    vote_event = VoteEvent(
        result="pass" if passed else "fail",
        chamber=chamber,
        start_date=vote_record["date"].strftime("%Y-%m-%d"),
        motion_text=motion_text,
        classification="passage",
        legislative_session=session,
        bill=bill_id,
        bill_chamber="upper" if bill_id[0] == "S" else "lower",
    )
    vote_event.pupa_id = url
    vote_event.set_count("yes", len(vote_record["yes"]))
    vote_event.set_count("no", len(vote_record["no"]))
    vote_event.set_count("excused", len(vote_record["excused"]))
    vote_event.set_count("absent", len(vote_record["absent"]))
    vote_event.set_count("other", len(vote_record["other"]))
    for vote_type in ["yes", "no", "excused", "absent", "other"]:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Exemplo n.º 5
0
    def parse_vote_page(self, vote_url, bill):
        vote_html = self.get(vote_url).text
        doc = lxml.html.fromstring(vote_html)
        # chamber
        if "senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        # date in the following format: Mar 23, 2009
        date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
        date = date.replace(u"\xa0", " ")
        date = datetime.datetime.strptime(date[18:], "%b %d, %Y")

        # motion
        motion = "".join(x.text_content() for x in doc.xpath('//td[@colspan="23"]'))
        if motion == "":
            motion = "No motion given"  # XXX: Double check this. See SJ 3.
        motion = motion.replace(u"\xa0", " ")

        # totals
        tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
        totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
        yes_count = int(totals[0].split()[-1])
        no_count = int(totals[1].split()[-1])
        other_count = int(totals[2].split()[-1])
        other_count += int(totals[3].split()[-1])
        other_count += int(totals[4].split()[-1])
        passed = yes_count > no_count

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )
        vote.pupa_id = vote_url  # contains sequence number
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)

        # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
        func = None
        for td in doc.xpath("//td/text()"):
            td = td.replace(u"\xa0", " ")
            if td.startswith("Voting Yea"):
                func = vote.yes
            elif td.startswith("Voting Nay"):
                func = vote.no
            elif td.startswith("Not Voting"):
                func = vote.other
            elif td.startswith("Excused"):
                func = vote.other
            elif func:
                td = td.rstrip("*")
                func(td)

        return vote
Exemplo n.º 6
0
    def scrape_senate_vote(self, bill, url, date):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = VoteEvent(
            chamber="upper",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text="Passage",
            # setting 'fail' for now.
            result="fail",
            classification="passage",
            bill=bill,
        )
        vote.add_source(url)
        vote.pupa_id = url

        text = convert_pdf(filename, "text").decode("utf-8")
        os.remove(filename)

        if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url,
                                                    date)
            return

        data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
        data = filter(None, data)
        keymap = dict(yea="yes", nay="no")
        actual_vote = collections.defaultdict(int)
        vote_count = {"yes": 0, "no": 0, "other": 0}
        while True:
            if not data:
                break
            vote_val = data.pop()
            key = keymap.get(vote_val.lower(), "other")
            values = data.pop()
            for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
                if name.lower().strip() == "none.":
                    continue
                name = name.replace("..", "")
                name = re.sub(r"\.$", "", name)
                name = name.strip("-1234567890 \n")
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # updating result with actual value
        vote.result = ("pass" if vote_count["yes"] >
                       (vote_count["no"] + vote_count["other"]) else "fail")

        yield vote
Exemplo n.º 7
0
    def scrape_votes(self, bill, bill_page, chamber):
        vote_links = bill_page.xpath(
            '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
        )
        for vote_link in vote_links:
            vote_url = vote_link.attrib["href"]
            date_td, motion_td, *_ = vote_link.xpath("ancestor::tr/td")
            date = datetime.strptime(date_td.text, "%b %d, %Y")
            motion_text = motion_td.text_content()
            vote_page = self.lxmlize(vote_url)
            passed = "Passed" in motion_text or "Advanced" in motion_text
            cells = vote_page.xpath(
                '//div[contains(@class,"table-responsive")]/table//td')
            vote = VoteEvent(
                bill=bill,
                chamber=chamber,
                start_date=TIMEZONE.localize(date),
                motion_text=motion_text,
                classification="passage",
                result="pass" if passed else "fail",
            )

            yes_count = self.process_count(vote_page, "Yes:")
            no_count = self.process_count(vote_page, "No:")
            exc_count = self.process_count(vote_page, "Excused - Not Voting:")
            absent_count = self.process_count(vote_page,
                                              "Absent - Not Voting:")
            present_count = self.process_count(vote_page,
                                               "Present - Not Voting:")

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", exc_count)
            vote.set_count("absent", absent_count)
            vote.set_count("abstain", present_count)

            query_params = urllib.parse.parse_qs(
                urllib.parse.urlparse(vote_url).query)
            vote.pupa_id = query_params["KeyID"][0]
            vote.add_source(vote_url)
            for chunk in range(0, len(cells), 2):
                name = cells[chunk].text
                vote_type = cells[chunk + 1].text
                if name and vote_type:
                    vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), "other"),
                              name)
            yield vote
Exemplo n.º 8
0
    def parse_committee_votes(self, bill, url):
        bill.add_source(url)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        chamber = "upper" if "Senate" in doc.xpath("string(//h1)") else "lower"
        committee = tuple(doc.xpath("//h2")[0].itertext())[-2].strip()
        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                date = link.xpath("../../td")[0].text_content()
                try:
                    date = datetime.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # Motion
            motion = link.text_content().split(" - ")[-1].strip()
            motion = "Committee vote (%s): %s" % (committee, motion)

            # Roll call
            vote_url = link.attrib["href"]
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification=[],
                result="pass" if rollcall["passed"] else "fail",
                bill=bill,
            )
            vote.pupa_id = vote_url
            vote.set_count("yes", rollcall["yes_count"])
            vote.set_count("no", rollcall["no_count"])
            vote.set_count("other", rollcall["other_count"])

            for voteval in ("yes", "no", "other"):
                for name in rollcall.get(voteval + "_votes", []):
                    vote.vote(voteval, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Exemplo n.º 9
0
    def scrape_vote(self, chamber, session, bill_id, vote_url):
        try:
            resp = self.get(vote_url)
            html = resp.text
        except scrapelib.HTTPError:
            return

        doc = lxml.html.fromstring(html)
        motion = doc.xpath("//p[1]//b[1]/text()")[-1].strip()
        if len(motion) == 0:
            print(motion)
            motion = doc.xpath("//h2[1]/text()")[0].strip()

        vote_count = (
            doc.xpath("//h3[contains(text(),'YEA and ')]/text()")[0].strip().split()
        )
        yeas = int(vote_count[0])
        nays = int(vote_count[3])

        date = doc.xpath("//b[contains(text(),'Date:')]/../text()")[1].strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        vote = VoteEvent(
            chamber="lower",
            start_date=date,
            motion_text=motion,
            result="pass" if yeas > nays else "fail",
            classification="passage",
            legislative_session=session,
            bill=bill_id,
            bill_chamber=chamber,
        )
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.add_source(vote_url)
        vote.pupa_id = vote_url

        # first table has YEAs
        for name in doc.xpath("//table[1]//font/text()"):
            vote.yes(name.strip())

        # second table is nays
        for name in doc.xpath("//table[2]//font/text()"):
            vote.no(name.strip())

        yield vote
Exemplo n.º 10
0
    def scrape_chamber_votes(self, chamber, session):
        url = {
            "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
            "lower": "%s/%s" % (RI_URL_BASE, "HVotes"),
        }[chamber]
        action = "%s/%s" % (url, "votes.asp")
        dates = self.get_vote_dates(url, session)
        for date in dates:
            votes = self.parse_vote_page(self.post_to(action, date), url, session)
            for vote_dict in votes:
                for vote in vote_dict.values():
                    count = vote["count"]
                    chamber = {"H": "lower", "S": "upper"}[vote["meta"]["chamber"]]

                    try:
                        bill_id = self._bill_id_by_type[(chamber, vote["meta"]["bill"])]
                    except KeyError:
                        self.warning(
                            "no such bill_id %s %s", chamber, vote["meta"]["bill"]
                        )
                        continue

                    v = VoteEvent(
                        chamber=chamber,
                        start_date=vote["time"].strftime("%Y-%m-%d"),
                        motion_text=vote["meta"]["extra"]["motion"],
                        result="pass" if count["passage"] else "fail",
                        classification="passage",
                        legislative_session=session,
                        bill=bill_id,
                        bill_chamber=chamber,
                    )
                    v.set_count("yes", int(count["YEAS"]))
                    v.set_count("no", int(count["NAYS"]))
                    v.set_count("other", int(count["NOT VOTING"]))
                    v.add_source(vote["source"])
                    v.pupa_id = vote["source"]

                    for vt in vote["votes"]:
                        key = {"Y": "yes", "N": "no"}.get(vt["vote"], "other")
                        v.vote(key, vt["name"])
                    yield v
Exemplo n.º 11
0
    def scrape_votes(self, bill, page):
        base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
        for header in page["FloorHeaders"]:
            params = {
                "billStatusId": page["BillId"],
                "billStatusActionId": header["BillStatusActionId"],
                "includeVotes": "true",
            }
            resp = self.get(base_url, params=params)
            actions = json.loads(resp.content.decode("utf-8"))

            for action in actions:
                if action["Action"] == "No Action":
                    continue
                action_date = datetime.datetime.strptime(
                    action["ReportDate"], "%Y-%m-%dT%H:%M:%S")
                vote = VoteEvent(
                    chamber={
                        "S": "upper",
                        "H": "lower"
                    }[header["LegislativeBody"]],
                    motion_text=action["Action"],
                    classification="passage",
                    result=("pass" if action["UnanimouslyAdopted"]
                            or action["Ayes"] > action["Nays"] else "fail"),
                    start_date=action_date.strftime("%Y-%m-%d"),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count("yes", action["Ayes"] or 0)
                vote.set_count("no", action["Nays"] or 0)
                vote.set_count("other", (action["Present"] or 0))
                vote.set_count("absent", (action["Absent"] or 0))
                vote.set_count("excused", (action["Excused"] or 0))
                vote.set_count("not voting", (action["NotVoting"] or 0))

                for v in action["Votes"]:
                    vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                    vote.vote(vote_type, v["Legislator"]["FullName"])
                vote.pupa_id = resp.url + str(action["ReferralNumber"])
                yield vote
Exemplo n.º 12
0
    def asvote(self):
        v = VoteEvent(
            chamber=self.chamber(),
            start_date=self.date(),
            motion_text=self.motion(),
            result="pass" if self.passed() else "fail",
            classification="passage",
            bill=self.bill,
        )
        v.pupa_id = self.url  # URL contains sequence number
        v.set_count("yes", self.yes_count())
        v.set_count("no", self.no_count())
        v.set_count("other", self.other_count())

        for voter in self.yes_votes():
            v.yes(voter)
        for voter in self.no_votes():
            v.no(voter)
        for voter in self.other_votes():
            v.vote("other", voter)
        v.add_source(self.url)
        return v
Exemplo n.º 13
0
    def parse_vote(self, bill, link):
        # Server sometimes sends proper error headers,
        # sometimes not
        try:
            self.info("Get {}".format(link))
            text = requests.get(link).text
        except requests.exceptions.HTTPError as err:
            self.warning("{} fetching vote {}, skipping".format(err, link))
            return

        if "Varnish cache server" in text:
            self.warning("Scrape rate is too high, try re-scraping with "
                         "The --rpm set to a lower number")
            return

        if "Page Not Found" in text or "Page Unavailable" in text:
            self.warning("missing vote, skipping")
            return
        member_doc = lxml.html.fromstring(text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        chamber_date_line = "".join(
            member_doc.xpath("//div[@id='main_content']/h3[1]//text()"))
        chamber_date_line_words = chamber_date_line.split()
        vote_chamber = chamber_date_line_words[0]
        vote_date = datetime.datetime.strptime(chamber_date_line_words[-1],
                                               "%m/%d/%Y")
        vote_status = " ".join(chamber_date_line_words[2:-2])
        opinions = member_doc.xpath(
            "//div[@id='main_content']/h3[position() > 1]/text()")
        if len(opinions) > 0:
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = "upper" if vote_chamber == "Senate" else "lower"

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except ValueError:
                    # This is likely not a vote-count text chunk
                    # It's probably '`On roll call the vote was:`
                    pass
                else:
                    if "yea" in i.lower():
                        yes_count = count
                    elif "nay" in i.lower():
                        no_count = count
                    elif "present" in i.lower():
                        p_count = count
                    elif "absent" in i.lower():
                        a_count = count

            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime("%Y-%m-%d"),
                chamber=vote_chamber,
                motion_text=vote_status,
                result="pass" if yes_count > no_count else "fail",
                classification="passage",
            )
            vote.pupa_id = link

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("abstain", p_count)
            vote.set_count("absent", a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote("yes", re.sub(",", "", a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote("no", re.sub(",", "", a_links[i]).split()[0])
                else:
                    vote.vote("other", re.sub(",", "", a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Exemplo n.º 14
0
    def scrape_vote(self, bill, motion, url):
        page = self.get(url, retry_on_404=True).text
        page = lxml.html.fromstring(page)

        yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
        yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

        nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
        no_count = int(nays_cell.xpath("string(following-sibling::td)"))

        abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
        abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

        ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
        ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

        other_count = abs_count + ex_count

        if "chamber=House" in url:
            chamber = "lower"
        elif "chamber=Senate" in url:
            chamber = "upper"

        date_cell = page.xpath("//td[text() = 'Date:']")[0]
        date = date_cell.xpath("string(following-sibling::td)")
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
        outcome = outcome_cell.xpath("string(following-sibling::td)")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if outcome == "PREVAILS" else "fail",
            classification="passage",
            bill=bill,
        )
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url

        member_cell = page.xpath("//td[text() = 'Member']")[0]
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")
            # name = name.split(" of ")[0]

            vtype = row.xpath("string(td[4])")
            if vtype == "Y":
                vote.vote("yes", name)
            elif vtype == "N":
                vote.vote("no", name)
            elif vtype == "X" or vtype == "E":
                vote.vote("other", name)

        yield vote
Exemplo n.º 15
0
    def scrape_vote(self, url, session):
        fname, _ = self.urlretrieve(url)
        text = convert_pdf(fname, type="text").decode()
        lines = text.splitlines()

        chamber = "upper" if "senate" in url else "lower"
        if "Maryland" not in text:
            self.warning(f"empty vote from {url}")
            return
        date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

        section = "preamble"
        motion = None
        bill_id = None
        how = None
        voters = defaultdict(list)

        for line in lines:
            if section == "preamble":
                possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
                if possible_bill_id:
                    bill_id = possible_bill_id[0]

                # preamble has metadata, then motion, then counts.  our process then is to
                # store the last line as the motion, but if the last line looks like a
                # continuation, append it to the prior line

                line = line.strip()
                counts = re.findall(
                    r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                    line,
                )
                if counts:
                    yes_count, no_count, nv_count, excused_count, absent_count = counts[
                        0
                    ]
                    yes_count = int(yes_count)
                    no_count = int(no_count)
                    nv_count = int(nv_count)
                    excused_count = int(excused_count)
                    absent_count = int(absent_count)
                    section = "votes"
                elif line and line != "(Const)":
                    # questions seem to be split across two lines
                    if line.endswith("?"):
                        motion = motion + " " + line
                    else:
                        motion = line
            elif section == "votes":
                if line.startswith("Voting Yea"):
                    how = "yes"
                elif line.startswith("Voting Nay"):
                    how = "no"
                elif line.startswith("Not Voting"):
                    how = "not voting"
                elif line.startswith("Excused from Voting"):
                    how = "excused"
                elif line.startswith("Excused (Absent)"):
                    how = "absent"
                elif how:
                    names = re.split(r"\s{2,}", line)
                    voters[how].extend(names)

        if not bill_id and not motion:
            return
        elif bill_id and not motion:
            self.warning(f"got {bill_id} but no motion, not registering as a vote")
        elif motion and not bill_id:
            self.warning(f"got {motion} but no bill_id, not registering as a vote")
            return

        # bleh - result not indicated anywhere
        result = "pass" if yes_count > no_count else "fail"
        bill_chamber = "upper" if bill_id.startswith("S") else "lower"
        date = datetime.datetime.strptime(date, "%b %d, %Y").strftime("%Y-%m-%d")
        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            result=result,
            classification="passage",
            motion_text=motion,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )
        # URL includes sequence ID, will be unique
        vote.pupa_id = url
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)
        for how, names in voters.items():
            for name in names:
                name = name.strip().replace("*", "")
                if name and "COPY" not in name and "Indicates Vote Change" not in name:
                    vote.vote(how, name)
        check_counts(vote, raise_error=True)
        return vote
Exemplo n.º 16
0
    def scrape_votes(self, session):
        votes = {}
        other_counts = defaultdict(int)
        last_line = []
        vote_url = "http://gencourt.state.nh.us/dynamicdatafiles/RollCallSummary.txt"
        lines = self.get(vote_url).content.decode("utf-8").splitlines()

        for line in lines:

            if len(line) < 2:
                continue

            if line.strip() == "":
                continue

            line = line.split("|")
            if len(line) < 14:
                if len(last_line + line[1:]) == 14:
                    line = last_line
                    self.warning("used bad vote line")
                else:
                    last_line = line
                    self.warning("bad vote line %s" % "|".join(line))
            session_yr = line[0].replace("\xef\xbb\xbf", "")
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or "[not available]"

            if session_yr == session and bill_id in self.bills_by_id:
                actor = "lower" if body == "H" else "upper"
                time = dt.datetime.strptime(timestamp, "%m/%d/%Y %I:%M:%S %p")
                time = pytz.timezone("America/New_York").localize(
                    time).isoformat()
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(
                    chamber=actor,
                    start_date=time,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=self.bills_by_id[bill_id],
                )
                vote.set_count("yes", yeas)
                vote.set_count("no", nays)
                vote.add_source(vote_url)
                vote.pupa_id = session_yr + body + vote_num  # unique ID for vote
                votes[body + vote_num] = vote

        for line in (self.get(
                "http://gencourt.state.nh.us/dynamicdatafiles/RollCallHistory.txt"
        ).content.decode("utf-8").splitlines()):
            if len(line) < 2:
                continue

            # 2016|H|2|330795||Yea|
            # 2012    | H   | 2    | 330795  | 964 |  HB309  | Yea | 1/4/2012 8:27:03 PM
            session_yr, body, v_num, _, employee, bill_id, vote, date = line.split(
                "|")

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = " ".join(self.legislators[employee]["name"].split())
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                if body + v_num not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % (body + v_num))
                    continue
                # code = self.legislators[employee]['seat']

                if vote == "Yea":
                    votes[body + v_num].yes(leg)
                elif vote == "Nay":
                    votes[body + v_num].no(leg)
                else:
                    votes[body + v_num].vote("other", leg)
                    # hack-ish, but will keep the vote count sync'd
                    other_counts[body + v_num] += 1
                    votes[body + v_num].set_count("other",
                                                  other_counts[body + v_num])
        for vote in votes.values():
            yield vote
Exemplo n.º 17
0
    def scrape_action_page(self, bill, page):
        action_rows = page.xpath("//tbody/tr")
        for row in action_rows:
            action_date = row.xpath("td[1]/text()")[0]
            action_date = datetime.strptime(action_date, "%m/%d/%Y")
            action_year = action_date.year
            action_date = action_date.strftime("%Y-%m-%d")

            if row.xpath("td[2]/text()"):
                action_actor = row.xpath("td[2]/text()")[0]
                action_actor = self.chamber_map_reverse[action_actor.strip()]

            action_name = row.xpath("string(td[3])")

            # House votes
            if "Supplement" in action_name:
                actor = "lower"
                vote_action = re.findall(r"(.+)-\s*\d+\s*YEAS", action_name)[0].strip()

                y = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
                n = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])

                # get supplement number
                n_supplement = int(re.findall(r"No\.\s*(\d+)", action_name)[0])
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=bill,
                )
                cached_vote.set_count("yes", y)
                cached_vote.set_count("no", n)

                housevote_pdf = "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                    bill.legislative_session, action_year
                )
                self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
                cached_vote.add_source(housevote_pdf)

                cached_vote.pupa_id = "{}#{}".format(housevote_pdf, n_supplement)

                # XXX: disabled house votes on 8/1 to try to get MA importing again
                # will leaving this in and commented out once we resolve the ID issue
                # yield cached_vote

            # Senate votes
            if "Roll Call" in action_name:
                actor = "upper"
                # placeholder
                vote_action = action_name.split(" -")[0]
                # 2019 H86 Breaks our regex,
                # Ordered to a third reading --
                # see Senate   Roll Call #25 and House Roll Call 56
                if "yeas" in action_name and "nays" in action_name:
                    try:
                        y, n = re.search(
                            r"(\d+) yeas .*? (\d+) nays", action_name.lower()
                        ).groups()
                        y = int(y)
                        n = int(n)
                    except AttributeError:
                        y = int(
                            re.search(r"yeas\s+(\d+)", action_name.lower()).group(1)
                        )
                        n = int(
                            re.search(r"nays\s+(\d+)", action_name.lower()).group(1)
                        )

                    # TODO: other count isn't included, set later
                    cached_vote = VoteEvent(
                        chamber=actor,
                        start_date=action_date,
                        motion_text=vote_action,
                        result="pass" if y > n else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    cached_vote.set_count("yes", y)
                    cached_vote.set_count("no", n)

                    rollcall_pdf = "http://malegislature.gov" + row.xpath(
                        "string(td[3]/a/@href)"
                    )
                    self.scrape_senate_vote(cached_vote, rollcall_pdf)
                    cached_vote.add_source(rollcall_pdf)
                    cached_vote.pupa_id = rollcall_pdf
                    # XXX: also disabled, see above note
                    # yield cached_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs["classification"],
            )
            for com in attrs.get("committees", []):
                com = com.strip()
                action.add_related_entity(com, entity_type="organization")
Exemplo n.º 18
0
    def scrape_house_vote(self, bill, url):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, "text")
        os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None

        for idx, line in enumerate(lines):
            line = line.rstrip().decode("utf-8")
            match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
            if match:
                motion = (lines[idx - 2].strip()).decode("utf-8")
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                exc_match = re.search(r"EXCUSED: (\d+)", line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                if line.endswith("ADOPTED") or line.endswith("PASSED"):
                    passed = True
                else:
                    passed = False

                continue

            match = re.match(
                r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
            if match:
                vote_type = {
                    "YEAS": "yes",
                    "NAYS": "no",
                    "NOT VOTING": "other",
                    "EXCUSED": "other",
                    "PAIRED": "paired",
                }[match.group(1)]
                continue

            if vote_type == "paired":
                for part in line.split("   "):
                    part = part.strip()
                    if not part:
                        continue
                    name, pair_type = re.match(r"([^\(]+)\((YEA|NAY)\)",
                                               line).groups()
                    name = name.strip()
                    if pair_type == "YEA":
                        votes["yes"].append(name)
                    elif pair_type == "NAY":
                        votes["no"].append(name)
            elif vote_type:
                for name in line.split("   "):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)
        if date:
            vote = VoteEvent(
                chamber="lower",
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
                bill=bill,
            )

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("other", other_count)
            vote.add_source(url)
            vote.pupa_id = url

            for key, values in votes.items():
                for value in values:
                    if "Committee" in value:
                        continue
                    if "*" in value:
                        value = value.replace("*", "")
                    vote.vote(key, value)

            yield vote
        else:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
Exemplo n.º 19
0
    def scrape_vote_history(self, bill, vurl):
        """
         Obtain the information on a vote and link it to the related Bill
        :param bill: related bill
        :param vurl: source for the voteEvent information.
        :return: voteEvent object
        """
        html = self.get(vurl).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(vurl)

        # skip first two rows
        for row in doc.xpath("//table/tr")[2:]:
            tds = row.getchildren()
            if len(tds) != 11:
                self.warning("irregular vote row: %s" % vurl)
                continue
            (
                timestamp,
                motion,
                vote,
                yeas,
                nays,
                nv,
                exc,
                pres,
                abst,
                total,
                result,
            ) = tds

            timestamp = timestamp.text.replace(u"\xa0", " ")
            timestamp = datetime.datetime.strptime(timestamp,
                                                   "%m/%d/%Y %H:%M %p")

            yeas = int(yeas.text)
            nays = int(nays.text)
            others = int(nv.text) + int(exc.text) + int(abst.text) + int(
                pres.text)
            assert yeas + nays + others == int(total.text)

            if result.text == "Passed":
                passed = "pass"
            else:
                passed = "fail"

            vote_link = vote.xpath("a")[0]
            if "[H]" in vote_link.text:
                chamber = "lower"
            else:
                chamber = "upper"

            vote = VoteEvent(
                chamber=chamber,  # 'upper' or 'lower'
                start_date=timestamp.strftime(
                    "%Y-%m-%d"),  # 'YYYY-MM-DD' format
                motion_text=motion.text,
                result=passed,
                classification="passage",  # Can also be 'other'
                # Provide a Bill instance to link with the VoteEvent...
                bill=bill,
            )

            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.set_count("other", others)

            vote.add_source(vurl)

            # obtain vote rollcall from pdf and add it to the VoteEvent object
            rollcall_pdf = vote_link.get("href")
            self.scrape_rollcall(vote, rollcall_pdf)
            vote.add_source(rollcall_pdf)
            if rollcall_pdf in self._seen_vote_ids:
                self.warning("duplicate usage of %s, skipping", rollcall_pdf)
                continue
            else:
                self._seen_vote_ids.add(rollcall_pdf)
            vote.pupa_id = rollcall_pdf  # distinct KEY for each one

            yield vote
Exemplo n.º 20
0
    def process_vote(self, votes, url, base_url, bill, legislators,
                     chamber_dict, vote_results):
        for v in votes["items"]:
            try:
                v["yeas"]
            except KeyError:
                # sometimes the actual vote is buried a second layer deep
                v = self.get(base_url + v["link"]).json()
                try:
                    v["yeas"]
                except KeyError:
                    self.logger.warning("No vote info available, skipping")
                    continue

            try:
                chamber = chamber_dict[v["chamber"]]
            except KeyError:
                chamber = "lower" if "house" in v["apn"] else "upper"
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["date"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                try:
                    date = self._tz.localize(
                        datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                    date = "{:%Y-%m-%d}".format(date)
                except KeyError:
                    self.logger.warning("No date found for vote, skipping")
                    continue
            try:
                motion = v["action"]
            except KeyError:
                motion = v["motiontype"]

            if motion in self._vote_motion_dict:
                motion_text = self._vote_motion_dict[motion]
            else:
                self.warning(
                    "Unknown vote code {}, please add to _vote_motion_dict".
                    format(motion))
                motion_text = v["results"]

            # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
            if not motion and isinstance(v["yeas"], str) and isinstance(
                    v["nays"], str):
                waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
                self.warning(waringText.format(v["revno"]))
                continue

            result = v.get("results") or v.get("passed")
            if result is None:
                if len(v["yeas"]) > len(v["nays"]):
                    result = "passed"
                else:
                    result = "failed"

            passed = vote_results[result.lower()]
            if "committee" in v:
                vote = VoteEvent(
                    chamber=chamber,
                    start_date=date,
                    motion_text=motion_text,
                    result="pass" if passed else "fail",
                    # organization=v["committee"],
                    bill=bill,
                    classification="committee-passage",
                )
            else:
                vote = VoteEvent(
                    chamber=chamber,
                    start_date=date,
                    motion_text=motion_text,
                    result="pass" if passed else "fail",
                    classification="passage",
                    bill=bill,
                )
            # Concatenate the bill identifier and vote identifier to avoid collisions
            vote.pupa_id = "{}:{}".format(bill.identifier.replace(" ", ""),
                                          v["revno"])
            # the yea and nay counts are not displayed, but vote totals are
            # and passage status is.
            yes_count = 0
            no_count = 0
            absent_count = 0
            excused_count = 0
            for voter_id in v["yeas"]:
                vote.yes(legislators[voter_id])
                yes_count += 1
            for voter_id in v["nays"]:
                vote.no(legislators[voter_id])
                no_count += 1
            if "absent" in v:
                for voter_id in v["absent"]:
                    vote.vote("absent", legislators[voter_id])
                    absent_count += 1
            if "excused" in v:
                for voter_id in v["excused"]:
                    vote.vote("excused", legislators[voter_id])
                    excused_count += 1

            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("absent", absent_count)
            vote.set_count("excused", excused_count)
            # check to see if there are any other things that look
            # like vote categories, throw a warning if so
            for key, val in v.items():
                if (type(val) == list and len(val) > 0
                        and key not in ["yeas", "nays", "absent", "excused"]):
                    if val[0] in legislators:
                        self.logger.warning(
                            "{k} looks like a vote type that's not being counted."
                            " Double check it?".format(k=key))
            vote.add_source(url)

            yield vote
Exemplo n.º 21
0
    def parse_roll_call(self, bill, link, chamber, date):
        url = link.attrib["href"]
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
        motion = page.xpath(xpath).strip()
        motion = re.sub(r"\s+", " ", motion)

        if motion == "FP":
            motion = "FINAL PASSAGE"

        if motion == "FINAL PASSAGE":
            type = "passage"
        elif re.match(r"CONCUR(RENCE)? IN \w+ AMENDMENTS", motion):
            type = "amendment"
        else:
            type = []
            motion = link.text_content()

        # Looks like for "YEAS" and "NAYS" counts, PA has multiple HTML
        # formats: one where the "YEAS" text node is nested within a span
        # element, and another where the text node is a direct child of the div
        # element
        yeas_elements = page.xpath("//div/span[text() = 'YEAS']/..")
        if len(yeas_elements) == 0:
            yeas_elements = page.xpath(
                "//div[text()[normalize-space() = 'YEAS']]")
        yeas = int(yeas_elements[0].getnext().text)

        nays_elements = page.xpath("//div/span[text() = 'NAYS']/..")
        if len(nays_elements) == 0:
            nays_elements = page.xpath(
                "//div[text()[normalize-space() = 'NAYS']]")
        nays = int(nays_elements[0].getnext().text)

        # "LVE" and "N/V" have been moved up as direct children of the div
        # element
        other = 0
        lve_elements = page.xpath('//div[text()[normalize-space() = "LVE"]]')
        if lve_elements:
            other += int(lve_elements[0].getnext().text)
        nv_elements = page.xpath('//div[text()[normalize-space() = "N/V"]]')
        if nv_elements:
            other += int(nv_elements[0].getnext().text)

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=type,
            result="pass" if yeas > (nays + other) else "fail",
            bill=bill,
        )
        # pupa_id situation here is a bit weird, same vote can be used for
        # multiple bills see:
        # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11       # noqa
        # so we toss the bill id onto the end of the URL
        vote.pupa_id = url + "#" + bill.identifier
        vote.add_source(url)
        vote.set_count("yes", yeas)
        vote.set_count("no", nays)
        vote.set_count("other", other)

        for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
            name = div[0].tail.strip()
            name = re.sub(r"^[\s,]+", "", name)
            name = re.sub(r"[\s,]+$", "", name)
            class_attr = div.attrib["class"].lower()
            if "yea" in class_attr:
                voteval = "yes"
            elif "nay" in class_attr:
                voteval = "no"
            elif "nvote" in class_attr:
                voteval = "other"
            elif "lve" in class_attr:
                voteval = "other"
            else:
                msg = "Unrecognized vote val: %s" % class_attr
                raise Exception(msg)
            vote.vote(voteval, name)

        return vote
Exemplo n.º 22
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        vote_pdf, resp = self.urlretrieve(url)
        text = convert_pdf(vote_pdf, "text")
        os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        precursors = (
            ("yeas--", yes_votes),
            ("nays--", no_votes),
            ("absent or those not voting--", absent_votes),
            ("absent and those not voting--", absent_votes),
            ("not voting--", not_voting_votes),
            ("voting present--", other_votes),
            ("present--", other_votes),
            ("disclaimer", None),
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split("\n"))

        for line in lines:

            # check if the line starts with a precursor, switch to that array
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    line = line.replace(pc, "")

            # split names
            for name in line.split(","):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if "None." in name:
                    cur_array = None

                match = re.match(r"(.+?)\. Total--.*", name)
                if match:
                    cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = False
                for junk in (
                        "on final passage",
                        "Necessary",
                        "who would have",
                        "being a tie",
                        "therefore",
                        "Vacancies",
                        "a pair",
                        "Total-",
                        "ATTORNEY",
                        "on final passage",
                        "SPEAKER",
                        "BOARD",
                        "TREASURER",
                        "GOVERNOR",
                        "ARCHIVES",
                        "SECRETARY",
                ):
                    if junk in name:
                        junk_in_name = True
                        break
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == ".":
                        name = name[:-1]
                    name = self.clean_voter_name(name)
                    cur_array.append(name)

        # return vote object
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        absent_count = len(absent_votes)
        not_voting_count = len(not_voting_votes)
        other_count = len(other_votes)

        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.pupa_id = url + "#" + bill.identifier

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("absent", absent_count)
        vote.set_count("not voting", not_voting_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        for yes_vote in yes_votes:
            vote.vote("yes", self.clean_voter_name(yes_vote))
        for no_vote in no_votes:
            vote.vote("no", self.clean_voter_name(no_vote))
        for absent_vote in absent_votes:
            vote.vote("absent", self.clean_voter_name(absent_vote))
        for not_voting_vote in not_voting_votes:
            vote.vote("not voting", self.clean_voter_name(not_voting_vote))
        for other_vote in other_votes:
            vote.vote("other", self.clean_voter_name(other_vote))
        yield vote
Exemplo n.º 23
0
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.access_to_csv("MainBill")

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec["IdenticalBillNumber"].strip():
                bill.add_related_bill(
                    rec["IdenticalBillNumber"].split()[0],
                    legislative_session=session,
                    relation_type="companion",
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.access_to_csv("BillSpon")

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in sponsor database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == "P":
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(
                name,
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Documents
        bill_document_csv = self.access_to_csv("BillWP")

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in document database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split("\\")
            document = document[-2] + "/" + document[-1]

            # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = "http://www.njleg.state.nj.us/{}/Bills/{}".format(
                year_abr, document.replace(".DOC", ".HTM"))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec["DocType"]]
            except KeyError:
                raise Exception("unknown doctype %s on %s" %
                                (rec["DocType"], bill_id))
            if rec["Comment"]:
                doc_name += " " + rec["Comment"]

            # Clean HTMX links.
            if htm_url.endswith("HTMX"):
                htm_url = re.sub("X$", "", htm_url)

            if rec["DocType"] in self._version_types:
                if htm_url.lower().endswith("htm"):
                    mimetype = "text/html"
                elif htm_url.lower().endswith("wpd"):
                    mimetype = "application/vnd.wordperfect"
                try:
                    bill.add_version_link(doc_name,
                                          htm_url,
                                          media_type=mimetype)
                except ValueError:
                    self.warning(
                        "Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            "A%s" % year_abr,
            "A%s" % next_year,
            "S%s" % year_abr,
            "S%s" % next_year,
            "CA%s-%s" % (year_abr, next_year),
            "CS%s-%s" % (year_abr, next_year),
        ]
        # keep votes clean globally, a few votes show up in multiple files
        votes = {}

        for filename in vote_info_list:
            s_vote_url = "ftp://www.njleg.state.nj.us/votes/%s.zip" % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning("could not find %s" % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile, "r"),
                                                 encoding="latin-1")
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)
                if filename.startswith("A") or filename.startswith("CA"):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith("C"):
                    vote_file_type = "committee"
                else:
                    vote_file_type = "chamber"

                for rec in vdict_file:
                    if vote_file_type == "chamber":
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = "%s%s" % (rec["Bill_Type"],
                                            rec["Bill_Number"])
                        leg = rec["Name"]
                        # drop time portion
                        date = rec["Agenda_Date"].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec["BillAction"]]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec["LegislatorVote"][0:1]
                        committee = rec["Committee_House"]
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = "_".join(vote_parts).replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification="passage",
                            result=None,
                            bill=bill_dict[bill_id],
                        )
                        votes[vote_id].pupa_id = vote_id
                    if leg_vote == "Y":
                        votes[vote_id].vote("yes", leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote("no", leg)
                    else:
                        votes[vote_id].vote("other", leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count["option"]] += 1
                vote.set_count("yes", counts["yes"])
                vote.set_count("no", counts["no"])
                vote.set_count("other", counts["other"])

                # Veto override.
                if vote.motion_text == "OVERRIDE":
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if "lower" in vote.bill:
                        vote.result = "pass" if counts["yes"] >= 54 else "fail"
                    elif "upper" in vote.bill:
                        vote.result = "pass" if counts["yes"] >= 27 else "fail"
                else:
                    # Regular vote.
                    vote.result = "pass" if counts["yes"] > counts[
                        "no"] else "fail"

                vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield vote

        # Actions
        bill_action_csv = self.access_to_csv("BillHist")
        actor_map = {"A": "lower", "G": "executive", "S": "upper"}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in action database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += " " + comment
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.access_to_csv("BillSubj")
        for rec in subject_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in subject database" % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec["SubjectKey"])
            else:
                self.warning("invalid bill id in BillSubj: %s" % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning("probable phony bill detected %s",
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield bill

        if phony_bill_count:
            self.warning("%s total phony bills detected", phony_bill_count)
Exemplo n.º 24
0
    def _parse_votes(self, url, vote, bill):
        """Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.
        """
        if url.lower().endswith(".pdf"):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = "No document found at url %r" % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath("//table")[1].xpath("tr/td/text()")
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion.
        try:
            motion = doc.xpath("//br")[-1].tail.strip()
        except IndexError:
            # Some of them mysteriously have no motion listed.
            motion = vote["action"]

        if not motion:
            motion = vote["action"]

        vote["motion"] = motion

        action = vote["action"]
        vote_url = vote["vote_url"]

        vote = VoteEvent(
            chamber=vote["chamber"],
            start_date=vote["date"],
            motion_text=vote["motion"],
            result="fail",  # placeholder
            classification="passage",
            bill=bill,
            bill_action=vote["action"],
        )
        vote.pupa_id = vote_url  # URL contains sequence number
        vote.add_source(vote_url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)

        for text in doc.xpath("//table")[2].xpath("tr/td/text()"):
            if not text.strip(u"\xa0"):
                continue
            v, name = filter(None, text.split(u"\xa0"))
            # Considering Name is brackets as short name
            regex = re.compile(r".*?\((.*?)\)")
            short_name = re.findall(regex, name)
            if len(short_name) > 0:
                note = "Short Name: " + short_name[0]
            else:
                note = ""
            # Name without brackets like 'Kary, Douglas'
            name = re.sub(r"[\(\[].*?[\)\]]", "", name)
            if v == "Y":
                vote.yes(name, note=note)
            elif v == "N":
                vote.no(name, note=note)
            elif v == "E":
                vote.vote("excused", name, note=note)
            elif v == "A":
                vote.vote("absent", name, note=note)

        # code to deterimine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break
        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # ounumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception("passage and failure indicator"
                                    "both present at: %s" % url)
            if i in action and passed is None:
                passed = False
                break
        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote.result = "pass" if passed else "fail"

        return vote
Exemplo n.º 25
0
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        archive_year = int(session[0:4])
        not_archive_year = archive_year >= 2009

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id
            if bill_id.strip() == "SB77" and session == "20052006":
                continue

            fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()
            summary = ""

            # Get digest test (aka "summary") from latest version.
            if bill.versions and not_archive_year:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            self.warning(msg % args)

                    committees = filter(None, committees)
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}
                    if not_archive_year:
                        assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            source_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

            # Votes for non archived years
            if archive_year > 2009:
                for vote_num, vote in enumerate(bill.votes):
                    if vote.vote_result == "(PASS)":
                        result = True
                    else:
                        result = False

                    if not vote.location:
                        continue

                    full_loc = vote.location.description
                    first_part = full_loc.split(" ")[0].lower()
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    else:
                        # raise ScrapeError("Bad location: %s" % full_loc) # To uncomment
                        continue

                    if vote.motion:
                        motion = vote.motion.motion_text or ""
                    else:
                        motion = ""

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"

                    motion = motion.strip()
                    motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.compile(r"^(Senate|Assembly) ",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ",
                                    "", motion)
                    motion = re.sub(r" \(\w+\)$", "", motion)
                    motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                    motion)
                    motion = re.sub(
                        r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                        r"Urgency Clause$",
                        "(Urgency Clause)",
                        motion,
                    )
                    motion = re.sub(r"\s+", " ", motion)

                    if not motion:
                        self.warning("Got blank motion on vote for %s" %
                                     bill_id)
                        continue

                    # XXX this is responsible for all the CA 'committee' votes, not
                    # sure if that's a feature or bug, so I'm leaving it as is...
                    # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                    # org = {
                    # 'name': vote_location,
                    # 'classification': vote_classification
                    # }

                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=self._tz.localize(vote.vote_date_time),
                        result="pass" if result else "fail",
                        classification=vtype,
                        # organization=org,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.extras = {"threshold": vote.threshold}

                    fsvote.add_source(source_url)
                    fsvote.pupa_id = source_url + "#" + str(vote_num)

                    rc = {"yes": [], "no": [], "other": []}
                    for record in vote.votes:
                        if record.vote_code == "AYE":
                            rc["yes"].append(record.legislator_name)
                        elif record.vote_code.startswith("NO"):
                            rc["no"].append(record.legislator_name)
                        else:
                            rc["other"].append(record.legislator_name)

                    # Handle duplicate votes
                    for key in rc.keys():
                        rc[key] = list(set(rc[key]))

                    for key, voters in rc.items():
                        for voter in voters:
                            fsvote.vote(key, voter)
                        # Set counts by summed votes for accuracy
                        fsvote.set_count(key, len(voters))

                    yield fsvote
            if len(bill.votes) > 0 and archive_year <= 2009:
                vote_page_url = (
                    "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
                )
                vote_page_url += (
                    f"bill_id={session}{bill.session_num}{fsbill.identifier}")

                # parse the bill data page, finding the latest html text
                data = self.get(vote_page_url).content
                doc = html.fromstring(data)
                doc.make_links_absolute(vote_page_url)
                num_of_votes = len(doc.xpath("//div[@class='status']"))
                for vote_section in range(1, num_of_votes + 1):
                    lines = doc.xpath(
                        f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                    )
                    date, result, motion, vtype, location = "", "", "", "", ""
                    votes = {}
                    for line in lines:
                        line = line.text_content().split()
                        if line[0] == "Date":
                            date = line[1]
                            date = datetime.datetime.strptime(date, "%m/%d/%y")
                            date = self._tz.localize(date)
                        elif line[0] == "Result":
                            result = "pass" if "PASS" in line[1] else "fail"
                        elif line[0] == "Motion":
                            motion = " ".join(line[1:])
                        elif line[0] == "Location":
                            location = " ".join(line[1:])
                        elif len(line) > 1:
                            if line[0] == "Ayes" and line[1] != "Count":
                                votes["yes"] = line[1:]
                            elif line[0] == "Noes" and line[1] != "Count":
                                votes["no"] = line[1:]
                            elif line[0] == "NVR" and line[1] != "Count":
                                votes["not voting"] = line[1:]
                    # Determine chamber based on location
                    first_part = location.split(" ")[0].lower()
                    vote_chamber = ""
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"
                    if len(motion) > 0:
                        fsvote = VoteEvent(
                            motion_text=motion,
                            start_date=date,
                            result=result,
                            classification=vtype,
                            chamber=vote_chamber,
                            bill=fsbill,
                        )
                        fsvote.add_source(vote_page_url)
                        fsvote.pupa_id = vote_page_url + "#" + str(
                            vote_section)

                        for how_voted, voters in votes.items():
                            for voter in voters:
                                voter = voter.replace(",", "")
                                fsvote.vote(how_voted, voter)
                        yield fsvote

            yield fsbill
            self.session.expire_all()
Exemplo n.º 26
0
    def scrape_assembly_votes(self, session, bill, assembly_url, bill_id):

        # parse the bill data page, finding the latest html text
        url = assembly_url + "&Floor%26nbspVotes=Y"

        data = self.get(url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)

        if "Votes:" in doc.text_content():
            vote_motions = []
            additional_votes_on_motion = 2
            for table in doc.xpath("//table"):

                date = table.xpath('caption/span[contains(., "DATE:")]')
                date = next(date[0].itersiblings()).text
                date = datetime.datetime.strptime(date, "%m/%d/%Y")
                date = eastern.localize(date)
                date = date.isoformat()

                spanText = table.xpath("caption/span/text()")
                motion = spanText[2].strip() + spanText[3].strip()
                if motion in vote_motions:
                    motion = motion + f" - Vote {additional_votes_on_motion}"
                    additional_votes_on_motion += 1
                else:
                    vote_motions.append(motion)

                votes = (table.xpath("caption/span/span")[0].text.split(":")
                         [1].split("/"))
                yes_count, no_count = map(int, votes)
                passed = yes_count > no_count
                vote = VoteEvent(
                    chamber="lower",
                    start_date=date,
                    motion_text=motion,
                    bill=bill,
                    result="pass" if passed else "fail",
                    classification="passage",
                )

                vote.set_count("yes", yes_count)
                vote.set_count("no", no_count)
                absent_count = 0
                excused_count = 0
                tds = table.xpath("tr/td/text()")
                votes = [tds[i:i + 2] for i in range(0, len(tds), 2)]

                vote_dictionary = {
                    "Y": "yes",
                    "NO": "no",
                    "ER": "excused",
                    "AB": "absent",
                    "NV": "not voting",
                    "EL": "other",
                }

                for vote_pair in votes:
                    name, vote_val = vote_pair
                    vote.vote(vote_dictionary[vote_val], name)
                    if vote_val == "AB":
                        absent_count += 1
                    elif vote_val == "ER":
                        excused_count += 1

                vote.set_count("absent", absent_count)
                vote.set_count("excused", excused_count)
                vote.add_source(url)
                vote.pupa_id = url + motion + spanText[1]

                yield vote
Exemplo n.º 27
0
    def scrape_vote(self, session, bill, vote_url, chamber, date):
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        if ("AM" in motion or "PM" in motion) and "/" in motion:
            motion = "Motion not given."

        if "withdrawn" not in motion:
            yes_no_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Aye')]]/font/text()"
            )
            other_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Absent')]]/font/text()"
            )
            abstain_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'17C')]]/font/text()"
            )
            yes_count = int(yes_no_counts[0])
            no_count = int(yes_no_counts[2])
            exc_count = int(other_counts[2])
            absent_count = int(other_counts[0])
            abstain_count = 0
            if abstain_counts:
                abstain_count = int(abstain_counts[0])

            # fix for
            # http://leg.colorado.gov/content/hb19-1029vote65e72e
            if absent_count == -1:
                absent_count = 0

            passed = yes_count > no_count
            vote = VoteEvent(
                chamber=chamber,
                start_date=self._tz.localize(date),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.pupa_id = vote_url
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", exc_count)
            vote.set_count("absent", absent_count)
            vote.set_count("abstain", abstain_count)
            vote.add_source(vote_url)

            rolls = page.xpath(
                "//tr[preceding-sibling::tr/descendant::"
                "td/div/b/font[contains(text(),'Vote')]]"
            )

            vote_abrv = {
                "Y": "yes",
                "N": "no",
                "E": "excused",
                "A": "absent",
                "-": "absent",
                "17C": "abstain",
            }
            for roll in rolls:
                if len(roll.xpath(".//td/div/font/text()")) > 0:
                    voted = roll.xpath(".//td/div/font/text()")[0].strip()
                    voter = roll.xpath(".//td/font/text()")[0].strip()
                    if voted == "V":
                        continue
                    vote.vote(vote_abrv[voted], voter)
            yield vote
Exemplo n.º 28
0
    def scrape_votes(self, bill, url):
        page = lxml.html.fromstring(self.get(url).text.replace(u"\xa0", " "))

        seen_rcs = set()

        re_ns = "http://exslt.org/regular-expressions"
        path = r"//p[re:test(text(), 'OKLAHOMA\s+(HOUSE|STATE\s+SENATE)')]"
        for header in page.xpath(path, namespaces={"re": re_ns}):
            bad_vote = False
            # Each chamber has the motion name on a different line of the file
            if "HOUSE" in header.xpath("string()"):
                chamber = "lower"
                motion_index = 8
            else:
                chamber = "upper"
                motion_index = 13

            motion = header.xpath("string(following-sibling::p[%d])" %
                                  motion_index).strip()
            motion = re.sub(r"\s+", " ", motion)
            if not motion.strip():
                self.warning("Motion text not found")
                return
            match = re.match(r"^(.*) (PASSED|FAILED)$", motion)
            if match:
                motion = match.group(1)
                passed = match.group(2) == "PASSED"
            else:
                passed = None

            rcs_p = header.xpath(
                "following-sibling::p[contains(., 'RCS#')]")[0]
            rcs_line = rcs_p.xpath("string()").replace(u"\xa0", " ")
            rcs = re.search(r"RCS#\s+(\d+)", rcs_line).group(1)

            if rcs in seen_rcs:
                continue
            else:
                seen_rcs.add(rcs)

            date_line = rcs_p.getnext().xpath("string()")
            date = re.search(r"\d+/\d+/\d+", date_line).group(0)
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            vtype = None
            counts = collections.defaultdict(int)
            votes = collections.defaultdict(list)

            seen_yes = False

            for sib in header.xpath("following-sibling::p")[13:]:
                line = sib.xpath("string()").replace("\r\n", " ").strip()
                if "*****" in line:
                    break
                regex = (r"(YEAS|NAYS|EXCUSED|VACANT|CONSTITUTIONAL "
                         r"PRIVILEGE|NOT VOTING|N/V)\s*:\s*(\d+)(.*)")
                match = re.match(regex, line)
                if match:
                    if match.group(1) == "YEAS" and "RCS#" not in line:
                        vtype = "yes"
                        seen_yes = True
                    elif match.group(1) == "NAYS" and seen_yes:
                        vtype = "no"
                    elif match.group(1) == "VACANT":
                        continue  # skip these
                    elif seen_yes:
                        vtype = "other"
                    if seen_yes and match.group(3).strip():
                        self.warning("Bad vote format, skipping.")
                        bad_vote = True
                    counts[vtype] += int(match.group(2))
                elif seen_yes:
                    for name in line.split("   "):
                        if not name:
                            continue
                        if "HOUSE" in name or "SENATE " in name:
                            continue
                        votes[vtype].append(name.strip())

            if bad_vote:
                continue

            if passed is None:
                passed = counts["yes"] > (counts["no"] + counts["other"])

            vote = Vote(
                chamber=chamber,
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.set_count("yes", counts["yes"])
            vote.set_count("no", counts["no"])
            vote.set_count("other", counts["other"])
            vote.pupa_id = url + "#" + rcs

            vote.add_source(url)

            for name in votes["yes"]:
                vote.yes(name)
            for name in votes["no"]:
                if ":" in name:
                    raise Exception(name)
                vote.no(name)
            for name in votes["other"]:
                vote.vote("other", name)

            yield vote
Exemplo n.º 29
0
    def parse_vote_pdf(self, vote_url, bill):

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type="text").decode()
        lines = text.splitlines()

        if "Senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        date_string = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        page_index = None
        for index, line in enumerate(lines):
            if "Yeas" in line and "Nays" in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ["yes", "no", "not voting", "excused", "absent"]

        if page_index:

            counts = re.split(r"\s{2,}", lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(" ", 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ["Consent Calendar" in line for line in lines[:page_index]]
        )
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
            assert (
                consent_calendar_bills
            ), "Could not find bills for consent calendar vote"

        motion_keywords = [
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        ]
        motion_lines = [
            3,
            2,
            4,
            5,
        ]  # Relative LineNumbers to be checked for existence of motion

        for i in motion_lines:
            if any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                break
            motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
        else:
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            if not current_line or "Voting Yea" in current_line:
                page_index += 1
                continue

            if any(show_stopper in current_line for show_stopper in show_stoppers):
                page_index += 1
                vote_index = vote_index + 1
                continue

            names = re.split(r"\s{2,}", current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Exemplo n.º 30
0
    def scrape_vote(self, bill, vote_id, session):
        vote_url = (
            "https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId"
        )
        form = {"rollCallId": vote_id, "sort": "", "group": "", "filter": ""}

        self.info("Fetching vote {} for {}".format(vote_id, bill.identifier))
        page = self.post(url=vote_url, data=form, allow_redirects=True).json()
        if page:
            roll = page["Model"]
            vote_chamber = self.chamber_map[roll["ChamberName"]]
            # "7/1/16 01:00 AM"
            vote_date = dt.datetime.strptime(
                roll["TakenAtDateTime"],
                "%m/%d/%y %I:%M %p").strftime("%Y-%m-%d")

            # TODO: What does this code mean?
            vote_motion = roll["RollCallVoteType"]

            vote_passed = "pass" if roll[
                "RollCallStatus"] == "Passed" else "fail"
            other_count = (int(roll["NotVotingCount"]) +
                           int(roll["VacantVoteCount"]) +
                           int(roll["AbsentVoteCount"]) +
                           int(roll["ConflictVoteCount"]))
            vote = VoteEvent(
                chamber=vote_chamber,
                start_date=vote_date,
                motion_text=vote_motion,
                result=vote_passed,
                bill=bill,
                legislative_session=session,
                classification=[],
            )
            vote_pdf_url = ("https://legis.delaware.gov"
                            "/json/RollCallController/GenerateRollCallPdf"
                            "?rollCallId={}&chamberId={}".format(
                                vote_id, self.chamber_codes[vote_chamber]))
            # Vote URL is just a generic search URL with POSTed data,
            # so provide a different link
            vote.add_source(vote_pdf_url)
            vote.pupa_id = vote_pdf_url
            vote.set_count("yes", roll["YesVoteCount"])
            vote.set_count("no", roll["NoVoteCount"])
            vote.set_count("other", other_count)

            for row in roll["AssemblyMemberVotes"]:
                # AssemblyMemberId looks like it should work here,
                # but for some sessions it's bugged to only return session
                try:
                    voter = self.legislators_by_short[str(row["ShortName"])]
                    name = voter["DisplayName"]
                except KeyError:
                    self.warning("could not find legislator short name %s",
                                 row["ShortName"])
                    name = row["ShortName"]
                if row["SelectVoteTypeCode"] == "Y":
                    vote.yes(name)
                elif row["SelectVoteTypeCode"] == "N":
                    vote.no(name)
                else:
                    vote.vote("other", name)

            yield vote