예제 #1
0
 def scrape_vote_text(self, filelocation, local=False):
     """Convert a vote PDF into XML, downloading it first unless it is local.

     Args:
         filelocation: URL of the PDF, or a filesystem path when ``local``
             is true.
         local: When true, ``filelocation`` is used directly as a file path
             instead of being downloaded.

     Returns:
         The result of ``convert_pdf(..., type="xml")``, or ``None`` when
         the request fails with ``scrapelib.HTTPError``.

     Note:
         The PDF file is deleted after conversion in both modes, including
         a caller-supplied local file.
     """
     if local:
         vote_text = convert_pdf(filelocation, type="xml")
         os.remove(filelocation)
         return vote_text

     try:
         filename, _response = self.urlretrieve(url=filelocation)
         try:
             vote_text = convert_pdf(filename, type="xml")
         finally:
             # Always discard the downloaded copy, even if conversion
             # raises, so failed conversions do not pile up temp files.
             os.remove(filename)
     except scrapelib.HTTPError:
         self.warning("Request failed: {}".format(filelocation))
         return None
     return vote_text
예제 #2
0
    def _fix_house_text(self, filename):
        """
        Return repaired text for the 2015 House committee list PDF.

        Short version: drop the badly extracted part of the default
        extraction and append the same column taken from a second
        extraction done with different settings.

        With the default (no-layout) `pdftotext` output, the second and
        third columns of page two of the 2015 House list are interleaved,
        which makes them very hard to parse. The layout-preserving output
        does not have that problem, but the no-layout output is easier to
        work with everywhere else, so only this one column is borrowed
        from it.

        The third and fourth columns are joint committees that are scraped
        from the Senate document, so only the second column needs to be
        spliced in here.
        """

        # Default extraction: keep everything before the garbled region.
        text = convert_pdf(filename, type="text-nolayout")
        assert "Revised: January 23, 2015" in text, (
            "House committee list has changed; check that the special-case"
            " fix is still necessary, and that the result is still correct")
        # DOTALL makes `.*` run to the end of the text, so everything from
        # the first "Appropriations/F&C" onwards is discarded.
        text = re.sub(r"(?sm)Appropriations/F&C.*$", "", text)

        # Second extraction: locate the single row holding the column header.
        alternate_lines = convert_pdf(filename, type="text").split("\n")
        column_header = "State Administration (cont.)      "
        header_rows = [row for row in alternate_lines if column_header in row]
        (header_row, ) = header_rows  # exactly one match is required
        header_row_number = alternate_lines.index(header_row)

        # The column spans the header's width, starting one character
        # to the left of where the header text begins.
        column_start = header_row.index(column_header) - 1
        column_end = column_start + len(column_header)

        replacement_rows = []
        for row in alternate_lines[header_row_number + 1:]:
            replacement_rows.append(row[column_start:column_end])

        return text + "\n".join(replacement_rows)
예제 #3
0
    def __init__(self, url, resp, bill):
        """Convert the PDF content in ``resp`` to text and keep it on the instance.

        Args:
            url: Source URL of the PDF; stored as ``self.url``.
            resp: Raw PDF content (bytes-like); written to a temporary file
                so that ``convert_pdf`` can read it.
            bill: Stored as ``self.bill``.

        Raises:
            PDFCommitteeVoteParseError: If the PDF cannot be converted, or
                the converted text is empty or whitespace only.
        """
        self.url = url
        self.bill = bill

        # Write the document to a temp file. The descriptor returned by
        # mkstemp is reused (rather than reopening the path) and the file
        # is removed in `finally`, so nothing leaks if conversion fails.
        fd, filename = tempfile.mkstemp()
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(resp)

            # Convert it to text.
            try:
                text = convert_pdf(filename, type="text")
            except Exception as exc:
                msg = "couldn't convert pdf."
                raise PDFCommitteeVoteParseError(msg) from exc
        finally:
            # Get rid of the temp file.
            os.remove(filename)

        if not text.strip():
            msg = "PDF file was empty."
            raise PDFCommitteeVoteParseError(msg)

        # Keep only non-empty lines, decoded from bytes to str.
        self.text = "\n".join(
            line.decode() for line in text.splitlines() if line)
예제 #4
0
    def _scrape_from_pdf(self):
        """Yield groups of consecutive non-blank lines from the officials PDF.

        The PDF text is treated as two columns split at character 40: the
        left halves of all lines are read first, followed by the right
        halves. Each run of non-empty (stripped) entries is yielded as a
        list of strings; blank entries separate the runs.

        Lines containing the "ELECTED REPRESENTATIVES FOR NEW YORK STATE"
        banner are dropped together with the two lines that follow them.
        """
        # FIXME: change for other years (2019 URL still valid for 2020)
        pdf_url = (
            "https://www.elections.ny.gov/NYSBOE/Elections/2019/ElectedOfficials.pdf"
        )
        filename, response = self.urlretrieve(pdf_url)
        text = convert_pdf(filename, type="text")
        columns = []
        lines = iter(text.decode().split("\n"))
        for ln in lines:
            if "ELECTED REPRESENTATIVES FOR NEW YORK STATE" in ln:
                # Skip the two lines after the banner. A default is given
                # because a bare next() that hits the end would raise
                # StopIteration inside this generator, which Python 3.7+
                # converts into RuntimeError (PEP 479).
                next(lines, None)
                next(lines, None)
            else:
                columns.append(ln)

        serial = chain((ln[:40].strip() for ln in columns),
                       (ln[40:].strip() for ln in columns))

        for ln in serial:
            if not ln:
                continue
            section = []
            while ln:
                section.append(ln)
                # An empty default ends the section cleanly when the data
                # runs out, so the final section is still yielded.
                ln = next(serial, "")
            yield section
예제 #5
0
    def scrape_senate_vote(self, bill, url, date):
        """Yield the Senate passage vote parsed from the roll-call PDF at ``url``.

        Args:
            bill: Bill the vote is attached to.
            url: Location of the vote PDF; also used as source and pupa_id.
            date: Object with ``strftime`` giving the vote date.

        Yields:
            A ``VoteEvent``. Nothing is yielded when the download fails
            with ``scrapelib.HTTPError``. When the text contains a
            "Yea: N  Nay: N  Absent: N" summary, parsing is delegated to
            ``self.scrape_senate_vote_3col``.
        """
        try:
            filename, _resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = VoteEvent(
            chamber="upper",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text="Passage",
            # setting 'fail' for now.
            result="fail",
            classification="passage",
            bill=bill,
        )
        vote.add_source(url)
        vote.pupa_id = url

        text = convert_pdf(filename, "text").decode("utf-8")
        os.remove(filename)

        if re.search(r"Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+", text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url,
                                                    date)
            return

        # Split into alternating heading / name-list chunks; reversed so
        # that pop() consumes them in document order.
        data = re.split(r"(Yea|Nay|Absent)s?:", text)[::-1]
        # Must be a real list: on Python 3 filter() returns a lazy iterator
        # that is always truthy and has no pop(), so the loop below could
        # neither terminate nor consume anything.
        data = [chunk for chunk in data if chunk]
        keymap = dict(yea="yes", nay="no")
        actual_vote = collections.defaultdict(int)
        vote_count = {"yes": 0, "no": 0, "other": 0}
        while data:
            vote_val = data.pop()
            # Any heading other than Yea/Nay is counted as "other".
            key = keymap.get(vote_val.lower(), "other")
            values = data.pop()
            for name in re.split(r"(?:[\s,]+and\s|[\s,]{2,})", values):
                if name.lower().strip() == "none.":
                    continue
                # Strip dot leaders, a trailing period, and surrounding
                # digits/dashes/whitespace from the name.
                name = name.replace("..", "")
                name = re.sub(r"\.$", "", name)
                name = name.strip("-1234567890 \n")
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # updating result with actual value
        vote.result = ("pass" if vote_count["yes"] >
                       (vote_count["no"] + vote_count["other"]) else "fail")

        yield vote
예제 #6
0
 def get_house_pdf(self, vurl):
     """Return the text of the House PDF at ``vurl``, converting it at most once.

     House PDFs are published per year, so the converted text is kept in
     ``self.house_pdf_cache`` keyed by URL.
     """
     cache = self.house_pdf_cache
     if vurl in cache:
         return cache[vurl]

     path, _resp = self.urlretrieve(vurl)
     raw_text = convert_pdf(path, "text")
     os.remove(path)

     # Decode and replace the right single quotation mark with an
     # ASCII apostrophe before caching.
     cache[vurl] = raw_text.decode("utf-8").replace("\u2019", "'")
     return cache[vurl]
예제 #7
0
 def fetch_pdf_lines(self, href):
     """Download the PDF at ``href`` and return its text as decoded lines.

     Returns:
         A list of ``str`` lines, or ``False`` when the request fails
         with a ``scrapelib.HTTPError`` whose first argument mentions
         "404". Any other HTTP error trips the assertion.
     """
     try:
         # download the file
         fname, resp = self.urlretrieve(href)
         raw_text = convert_pdf(fname, "text")
         decoded = []
         for raw_line in raw_text.splitlines():
             decoded.append(raw_line.decode("utf-8"))
         os.remove(fname)
     except scrapelib.HTTPError as e:
         assert "404" in e.args[0], "File not found: {}".format(e)
         self.warning("404 error for vote; skipping vote")
         return False
     return decoded
예제 #8
0
    def _load_emails_from_directory_pdf(self):
        """
        Download the House directory PDF and collect the member email
        addresses it contains - needed because the addresses are gone
        from the website.

        The PDF is converted to XML with ``convert_pdf`` and decoded as
        latin1; the addresses are then extracted from that text with a
        regular expression.

        Returns a set of address strings.
        """
        with tempfile.NamedTemporaryFile() as temp:
            self.scraper.urlretrieve(self.directory_pdf_url, temp.name)
            directory = convert_pdf(temp.name, "xml").decode('latin1')

        # pull out member email addresses from the XML salad produced
        # above - there's no obvious way to match these to names, but
        # fortunately they have names in them
        #
        # NOTE(review): the previous pattern appears to have been mangled
        # by an e-mail obfuscation filter ("[email protected]") and, as
        # written, matched arbitrary two-character sequences. The domain
        # below is assumed to be the intended one - confirm.
        return set(re.findall(r'[\w.]+@myfloridahouse\.gov', directory))
예제 #9
0
    def scrape_rollcall(self, vote, vurl):
        """
        Get text information from the pdf, containing the vote roll call
        and add the information obtained to the related voteEvent object.

        Section heading lines (YEAS/AYES, NAYS, EXCUSED, NOT VOTING,
        ABSTAIN, PAIRED) select how the following name lines are recorded;
        EXCUSED, NOT VOTING and ABSTAIN are all recorded as "excused".
        Name lines are split on runs of three or more whitespace characters.

        :param vote:  related voteEvent object
        :param vurl:  pdf source url
        """
        (path, resp) = self.urlretrieve(vurl)
        pdflines = convert_pdf(path, "text")
        os.remove(path)

        current_vfunc = None
        option = None

        for line in pdflines.split(b"\n"):
            line = line.strip().decode()

            # change what is being recorded
            #
            # `option` is set together with the recorder on every heading.
            # Previously it was never cleared, so a YEAS/NAYS section that
            # followed EXCUSED/PAIRED would call vote.yes / vote.no with
            # option=/voter= keyword arguments.
            if line.startswith(("YEAS", "AYES")):
                current_vfunc, option = vote.yes, None
            elif line.startswith("NAYS"):
                current_vfunc, option = vote.no, None
            elif line.startswith(("EXCUSED", "NOT VOTING", "ABSTAIN")):
                current_vfunc, option = vote.vote, "excused"
            elif line.startswith("PAIRED"):
                current_vfunc, option = vote.vote, "paired"

            # skip these
            elif not line or line.startswith("Page "):
                continue

            # if a vfunc is active
            elif current_vfunc:
                # split names apart by 3 or more spaces
                for name in re.split(r"\s{3,}", line):
                    if not name:
                        continue
                    if option:
                        current_vfunc(option=option, voter=name.strip())
                    else:
                        current_vfunc(name.strip())
예제 #10
0
    def _load_emails_from_directory_pdf(self):
        """
        Fetch the House directory PDF, parse its XML conversion with lxml,
        and return the set of text nodes that contain a House email
        address - needed because the addresses are gone from the website.
        """
        with tempfile.NamedTemporaryFile() as temp:
            self.scraper.urlretrieve(self.directory_pdf_url, temp.name)
            xml_payload = convert_pdf(temp.name, "xml")
            directory = lxml.etree.fromstring(xml_payload)

        # The converted XML gives no obvious way to tie an address to a
        # member, but the addresses themselves contain the names, so the
        # matching text nodes are simply collected.
        email_nodes = directory.xpath(
            '//text[contains(text(), "@myfloridahouse.gov")]/text()')
        return set(email_nodes)
예제 #11
0
    def scrape_senate_vote(self, vote, vurl):
        """Read the Senate roll-call PDF at ``vurl`` and record each name on ``vote``.

        Heading lines starting with YEAS, NAYS or "ABSENT OR" choose the
        current section; every other non-empty line is split on runs of
        three spaces into names. Names seen before any heading are ignored.
        """
        # download file to server
        (path, resp) = self.urlretrieve(vurl)
        raw_text = convert_pdf(path, "text")
        os.remove(path)

        # "y", "n" or "o" once a heading has been seen
        section = None

        for raw_line in raw_text.splitlines():
            # normalise the Unicode minus sign to an ASCII hyphen
            line = raw_line.strip().decode("utf-8").replace("\u2212", "-")
            if line == "":
                continue

            # heading lines only switch the section
            if line.startswith("YEAS"):
                section = "y"
                continue
            if line.startswith("NAYS"):
                section = "n"
                continue
            if line.startswith("ABSENT OR"):
                section = "o"
                continue

            # anything else is a line of names
            for chunk in line.split("   "):
                chunk = chunk.strip()
                if chunk == "":
                    continue

                # handles vote count lines: a trailing "-<digits>" part
                # is dropped and the remaining parts are glued together
                pieces = chunk.split("-")
                if pieces[-1].strip(" .").isdigit():
                    voter = "".join(pieces[:-1])
                else:
                    voter = chunk

                # update vote object with names
                if section == "y":
                    vote.yes(voter)
                elif section == "n":
                    vote.no(voter)
                elif section == "o":
                    vote.vote("other", voter)
예제 #12
0
    def scrape_votes(self, url, motion, date, chamber, bill):
        """Parse the roll-call PDF at ``url`` and yield one ``VoteEvent``.

        The PDF text is scanned for section markers (see ``precursors``);
        the comma-separated names that follow a marker are collected into
        the list for that section. The motion text and the pass/fail result
        are looked up in ``self._vote_mapping`` using the incoming
        ``motion`` as key; the result is not derived from the tallies.

        Nothing is yielded when the download fails with
        ``scrapelib.HTTPError``. A ``motion`` missing from
        ``self._vote_mapping`` raises ``KeyError``.
        """
        try:
            vote_pdf, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("Can't find vote file {}, skipping".format(url))
            return

        text = convert_pdf(vote_pdf, "text")
        os.remove(vote_pdf)

        # this way we get a key error on a missing vote type
        motion, passed = self._vote_mapping[motion]

        yes_votes = []
        no_votes = []
        other_votes = []
        absent_votes = []
        not_voting_votes = []
        # point at array to add names to
        cur_array = None

        # (lower-case marker, destination list); a destination of None
        # means names after that marker are not collected.
        precursors = (
            ("yeas--", yes_votes),
            ("nays--", no_votes),
            ("absent or those not voting--", absent_votes),
            ("absent and those not voting--", absent_votes),
            ("not voting--", not_voting_votes),
            ("voting present--", other_votes),
            ("present--", other_votes),
            ("disclaimer", None),
        )

        # split lines on newline, recombine lines that don't end in punctuation
        lines = _combine_lines(text.decode().split("\n"))

        for line in lines:

            # check if the line starts with a precursor, switch to that array
            # Every marker is tested, so when several occur in one line the
            # last one in `precursors` order wins.
            # NOTE(review): the membership test is case-insensitive but the
            # replace() below is case-sensitive, so a marker written in
            # another case stays in the line - confirm this is intended.
            for pc, arr in precursors:
                if pc in line.lower():
                    cur_array = arr
                    line = line.replace(pc, "")

            # split names
            for name in line.split(","):
                name = name.strip()

                # move on if that's all there was
                if not name:
                    continue

                # None or a Total indicate the end of a section
                if "None." in name:
                    cur_array = None

                # "<name>. Total--..." carries the last name of a section.
                # NOTE(review): cur_array can be None at this point (e.g.
                # right after "None." above), in which case append() would
                # raise AttributeError - confirm such input cannot occur.
                match = re.match(r"(.+?)\. Total--.*", name)
                if match:
                    cur_array.append(match.groups()[0])
                    cur_array = None

                # append name if it looks ok
                junk_in_name = False
                for junk in (
                        "on final passage",
                        "Necessary",
                        "who would have",
                        "being a tie",
                        "therefore",
                        "Vacancies",
                        "a pair",
                        "Total-",
                        "ATTORNEY",
                        "on final passage",
                        "SPEAKER",
                        "BOARD",
                        "TREASURER",
                        "GOVERNOR",
                        "ARCHIVES",
                        "SECRETARY",
                ):
                    if junk in name:
                        junk_in_name = True
                        break
                if cur_array is not None and not junk_in_name:
                    # strip trailing .
                    if name[-1] == ".":
                        name = name[:-1]
                    name = self.clean_voter_name(name)
                    cur_array.append(name)

        # return vote object
        # Counts are the lengths of the collected name lists.
        yes_count = len(yes_votes)
        no_count = len(no_votes)
        absent_count = len(absent_votes)
        not_voting_count = len(not_voting_votes)
        other_count = len(other_votes)

        vote = VoteEvent(
            chamber=chamber,
            start_date=self._tz.localize(date),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )
        vote.dedupe_key = url + "#" + bill.identifier

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("absent", absent_count)
        vote.set_count("not voting", not_voting_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        # Names are passed through clean_voter_name again here; most were
        # already cleaned when collected above.
        for yes_vote in yes_votes:
            vote.vote("yes", self.clean_voter_name(yes_vote))
        for no_vote in no_votes:
            vote.vote("no", self.clean_voter_name(no_vote))
        for absent_vote in absent_votes:
            vote.vote("absent", self.clean_voter_name(absent_vote))
        for not_voting_vote in not_voting_votes:
            vote.vote("not voting", self.clean_voter_name(not_voting_vote))
        for other_vote in other_votes:
            vote.vote("other", self.clean_voter_name(other_vote))
        yield vote
예제 #13
0
    def scrape_committees_pdf(self, year, chamber, filename, url):
        """Yield committee ``Organization`` objects parsed from a committee-list PDF.

        Args:
            year: Year as a string; "2015" together with chamber "lower"
                selects the special-case text repair in
                ``self._fix_house_text``.
            chamber: Chamber passed to each ``Organization``.
            filename: Path of the already-downloaded PDF.
            url: Added as the source of every committee.

        Yields:
            Each committee that ended up with related members
            (``comm._related`` is non-empty). Parsing stops at the first
            line containing "Subcommittees".
        """
        if chamber == "lower" and year == "2015":
            text = self._fix_house_text(filename).decode()
        else:
            text = convert_pdf(filename, type="text-nolayout").decode()

        # Re-join committee names that the PDF extraction split across
        # lines or padded with extra whitespace.
        for hotgarbage, replacement in (
            (
                r"Judicial Branch, Law Enforcement,\s+and\s+Justice",
                "Judicial Branch, Law Enforcement, and Justice",
            ),
            (
                r"Natural Resources and\s+Transportation",
                "Natural Resources and Transportation",
            ),
            (
                r"(?u)Federal Relations, Energy,?\s+and\s+Telecommunications",
                "Federal Relations, Energy, and Telecommunications",
            ),
        ):
            text = re.sub(hotgarbage, replacement, text)

        lines = iter(text.splitlines())

        # Drop any lines before the ag committee.
        lines = dropwhile(lambda s: "Agriculture" not in s, lines)

        comm = None
        for line in lines:
            # Replace Unicode variants with ASCII equivalents.
            # The space variant is written as an explicit escape so it is
            # visible in the source.
            # NOTE(review): assumed to be U+00A0 (no-break space) - confirm.
            line = line.replace("\u00a0", " ").replace("‐", "-")

            if "Subcommittees" in line:
                self.warning("Currently, we're skipping subcommittees")
                # https://github.com/openstates/openstates/issues/2099
                break
            if is_committee_name(line):
                # A new committee starts: emit the previous one first.
                if comm and comm._related:
                    yield comm

                committee = line.strip()
                comm = Organization(name=committee,
                                    chamber=chamber,
                                    classification="committee")

                comm.add_source(url)

            elif is_legislator_name(line):
                name, party = line.rsplit("(", 1)
                name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
                # The role marker follows a space inside the parenthesised
                # part of the line.
                if " Ch" in party:
                    role = "chair"
                elif " VCh" in party:
                    role = "vice chair"
                elif " MVCh" in party:
                    role = "minority vice chair"
                else:
                    role = "member"
                comm.add_member(name, role)

        # `comm` is still None when no committee line was found at all;
        # guard so that case yields nothing instead of raising
        # AttributeError.
        if comm and comm._related:
            yield comm
예제 #14
0
    def parse_vote_pdf(self, vote_url, bill):
        """Download a roll-call PDF and build a ``VoteEvent`` from its text.

        The chamber is taken from the URL, the date from the first line,
        the five vote totals from the first line mentioning both "Yeas"
        and "Nays", and the motion from lines at fixed offsets above that
        totals line. Voter names are read from the lines after the totals.

        Raises:
            ValueError: If no totals line is found, or if the number of
                names read for each vote type differs from the totals.
        """

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type="text").decode()
        lines = text.splitlines()

        # Any URL without "Senate" in it is treated as the lower chamber.
        if "Senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        # The first line must contain "Calendar Date:" followed by the date.
        date_string = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        # Index of the totals line (first line with both "Yeas" and "Nays").
        page_index = None
        for index, line in enumerate(lines):
            if "Yeas" in line and "Nays" in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ["yes", "no", "not voting", "excused", "absent"]

        # A totals line at index 0 is also treated as "not found".
        if page_index:

            # Each chunk looks like "<number> <label>"; the numbers are
            # assigned to vote_types by position, not by label.
            counts = re.split(r"\s{2,}", lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(" ", 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        # Passed means strictly more yes than no totals.
        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ["Consent Calendar" in line for line in lines[:page_index]]
        )
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            # Motion sits four lines above the totals, the bill list one
            # line above.
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}", lines[page_index - 1].strip())
            assert (
                consent_calendar_bills
            ), "Could not find bills for consent calendar vote"

        motion_keywords = [
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        ]
        motion_lines = [
            3,
            2,
            4,
            5,
        ]  # Relative LineNumbers to be checked for existence of motion

        # Try each offset in turn until the current candidate contains a
        # motion keyword. The check runs before each reassignment, so the
        # candidate taken from the last offset is only checked in the
        # `else` branch, which runs when the loop ends without `break`.
        for i in motion_lines:
            if any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                break
            motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
        else:
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # This condition covers for the bad formating in SB 1260
                # (falls back to the whole, unsplit line three above the totals)
                motion = lines[page_index - 3]
            if not any(
                motion_keyword in motion.lower() for motion_keyword in motion_keywords
            ):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        # Name parsing starts two lines below the totals line.
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        # Each line containing one of these advances to the next vote type.
        show_stoppers = [
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            # Blank lines and the "Voting Yea" heading carry no names.
            if not current_line or "Voting Yea" in current_line:
                page_index += 1
                continue

            if any(show_stopper in current_line for show_stopper in show_stoppers):
                page_index += 1
                vote_index = vote_index + 1
                continue

            # Names on a line are separated by two or more whitespace characters.
            names = re.split(r"\s{2,}", current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        # Cross-check the names read against the totals line.
        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
예제 #15
0
    def scrape_journal(self, url, chamber, session, date):
        """Yield a ``VoteEvent`` for each bill vote found in a journal PDF.

        A vote starts at a line beginning with "On the question" that
        also contains "shall". Following lines are appended until the
        motion text is complete, the bill id and motion are extracted, and
        ``self.parse_votes`` then reads the tallies from the same line
        iterator.

        Args:
            url: Location of the journal PDF; added as the vote source.
            chamber: "upper" or "lower"; selects how the end of a motion
                is recognised.
            session: Legislative session passed to each ``VoteEvent``.
            date: Start date passed to each ``VoteEvent``.

        Raises:
            ValueError: If a vote is reported as passed without more yes
                than no votes.
        """

        filename, response = self.urlretrieve(url)
        self.logger.info("Saved journal to %r", filename)
        all_text = convert_pdf(filename, type="text")

        lines = all_text.split(b"\n")
        lines = [line.decode("utf-8") for line in lines]
        # Normalise dash and quote variants produced by the PDF extraction.
        lines = [
            line.strip()
            .replace("–", "-")
            .replace("―", '"')
            .replace("‖", '"')
            .replace("“", '"')
            .replace("”", '"')
            for line in lines
        ]

        # Do not process headers or completely empty lines
        header_date_re = r"\d+\w{2} Day\s+\w+DAY, \w+ \d{1,2}, \d{4}\s+\d+"
        header_journal_re = r"\d+\s+JOURNAL OF THE \w+\s+\d+\w{2} Day"
        lines = iter(
            [
                line
                for line in lines
                if not (
                    line == ""
                    or re.match(header_date_re, line)
                    or re.match(header_journal_re, line)
                )
            ]
        )

        # bill_id -> motion -> count
        motions_per_bill = collections.defaultdict(collections.Counter)

        for line in lines:
            # Go through with vote parse if any of
            # these conditions match.
            if not line.startswith("On the question") or "shall" not in line.lower():
                continue

            # Get the bill_id
            bill_id = None
            bill_re = r"\(\s*([A-Z\.]+\s\d+)\s*\)"

            # The Senate ends its motion text with a vote announcement
            if chamber == "upper":
                end_of_motion_re = r".* the vote was:\s*"
            # The House may or may not end motion text with a bill name
            elif chamber == "lower":
                end_of_motion_re = r'.*Shall.*(?:\?"?|")(\s{})?\s*'.format(bill_re)

            # Keep appending lines until the motion text is complete.
            while not re.match(end_of_motion_re, line, re.IGNORECASE):
                following = next(lines, None)
                if following is None:
                    # A bare next() here would raise StopIteration inside
                    # this generator, which Python 3.7+ turns into
                    # RuntimeError (PEP 479). End the scrape instead.
                    self.warning(
                        "Journal ended in the middle of a motion: {}".format(line)
                    )
                    return
                line += " " + following

            try:
                bill_id = re.search(bill_re, line).group(1)
            except AttributeError:
                self.warning(
                    "This motion did not pertain to legislation: {}".format(line)
                )
                continue

            # Get the motion text
            motion_re = r"""
                    ^On\sthe\squestion\s  # Precedes any motion
                    "+  # Motion is preceded by a quote mark (or two)
                    (Shall\s.+?\??)  # The motion text begins with "Shall"
                    \s*(?:\?"?|"|’)\s+  # Motion is followed by a question mark and/or a quote mark
                    (?:{})?  # If the vote regards a bill, its number is listed
                    {}  # Senate has trailing text
                    \s*$
                    """.format(
                # in at least one case [SF 457 from 2020] the bill number is followed by )0
                # seemingly just a typo, this gets around that
                bill_re,
                r",?.*?the\svote\swas:" if chamber == "upper" else r"\d?",
            )
            motion = re.search(motion_re, line, re.VERBOSE | re.IGNORECASE)
            if motion is None:
                # Without this guard the None would reach re.sub() below
                # and fail with an unhelpful TypeError.
                self.warning("Could not parse motion text: {}".format(line))
                continue
            motion = motion.group(1)

            # Abbreviate the bill id, e.g. "Senate File 12" -> "SF 12".
            for word, letter in (("Senate", "S"), ("House", "H"), ("File", "F")):
                bill_id = bill_id.replace(word, letter)

            bill_id = bill_id.replace(".", "")

            # The bill's own chamber comes from the first letter of its id.
            bill_chamber = dict(h="lower", s="upper")[bill_id.lower()[0]]
            votes, passed = self.parse_votes(lines)

            # at the very least, there should be a majority
            # for the bill to have passed, so check that,
            # but if the bill didn't pass, it could still be OK if it got a majority
            # eg constitutional amendments
            if not (
                (passed == (votes["yes_count"] > votes["no_count"])) or (not passed)
            ):
                self.error("The bill passed without a majority?")
                raise ValueError("invalid vote")

            # also throw a warning if the bill failed but got a majority
            # it could be OK, but is probably something we'd want to check
            if not passed and votes["yes_count"] > votes["no_count"]:
                self.logger.warning(
                    "The bill got a majority but did not pass. "
                    "Could be worth confirming."
                )

            result = "pass" if passed else "fail"

            # check for duplicate motions and number second and up if needed
            motion_text = re.sub("\xad", "-", motion)
            motions_per_bill[bill_id][motion_text] += 1
            new_count = motions_per_bill[bill_id][motion_text]
            if new_count > 1:
                motion_text += f" #{new_count}"

            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=motion_text,
                result=result,
                classification="passage",
                legislative_session=session,
                bill=bill_id,
                bill_chamber=bill_chamber,
            )

            # add votes and counts
            for vtype in ("yes", "no", "absent", "abstain"):
                vcount = votes["{}_count".format(vtype)] or 0
                vote.set_count(vtype, vcount)
                for voter in votes["{}_votes".format(vtype)]:
                    vote.vote(vtype, voter)

            vote.add_source(url)
            yield vote
예제 #16
0
    def scrape_journal(self, url, chamber, session, date):
        """Yield a ``VoteEvent`` for each measure result block in a journal PDF.

        A block starts at a line containing
        "Resultado de la Votación para la Medida". The three lines after
        it hold the measure name, the result, and (for nominations) the
        nomination result. Individual votes are read from the lines after
        the next line starting with "Votante", up to a blank line or a
        line starting with "Senado de".
        """
        filename = self.urlretrieve(url)[0]
        self.logger.info("Saved journal to %r", filename)
        all_text = convert_pdf(filename, type="text")

        lines = [raw.decode("utf-8").strip() for raw in all_text.split(b"\n")]

        for index, line in enumerate(lines):
            if "Resultado de la Votación para la Medida" not in line:
                continue

            name_line, result_line, nomination_result_line = (
                lines[index + 1],
                lines[index + 2],
                lines[index + 3],
            )

            name_match = re.match(r"^(?P<type>.*) (?P<num>\d*) (?P<ref>.*)$",
                                  name_line).groupdict()

            bill = self.classify_measure_type(name_match)
            if not bill:
                continue

            if re.match("^NM", bill):
                # Nomination: only a confirmation is understood.
                if not re.match(r"(.*)Confirmado", nomination_result_line):
                    msg = "Unhandled nomination result of: {}. Skipping.".format(
                        nomination_result_line)
                    self.logger.warning(msg)
                    continue
                result = "pass"
                # For nominations the second line is used as motion text.
                name_line = result_line

            else:
                # Not a Nomination
                if re.match(r"(.*)Recibido", result_line):
                    msg = "Result was 'Recibido': {}. Skipping.".format(
                        result_line)
                    self.logger.warning(msg)
                    continue

                tally_match = re.match(
                    r".* (?P<yes>\d*)X(?P<no>\d*)X(?P<abstain>\d*)X(?P<absent>\d*) (?P<result>\w*)",
                    result_line,
                )
                if tally_match is None:
                    msg = "Could not determine voting result of: {}. Skipping.".format(
                        result_line)
                    self.logger.warning(msg)
                    continue
                vote_result = tally_match.groupdict()

                if vote_result["result"] == "Aprobada":
                    result = "pass"
                else:
                    # Anything other than "Aprobada" is recorded as a
                    # failure, with a warning to prompt a manual check.
                    result = "fail"
                    msg = "Voting result {} not guarenteed to be 'fail'. Take a look.".format(
                        vote_result["result"])
                    self.logger.warning(msg)

            vote = VoteEvent(
                chamber=chamber,
                start_date=date,
                motion_text=name_line,
                result=result,
                classification="passage",
                legislative_session=session,
                bill=bill,
                bill_chamber=chamber,
            )

            # Find the "Votante" heading, then step onto the first vote row.
            cursor = index + 3
            while not re.match("^Votante", lines[cursor]):
                cursor += 1
            cursor += 1

            tallies = dict.fromkeys(("yes", "no", "absent", "abstain"), 0)

            # Read vote rows until a blank line or a "Senado de" line.
            while True:
                row = lines[cursor]
                if not row.strip() or re.match(r"Senado de", row):
                    break
                name, vtype = parse_vote(row)
                tallies[vtype] += 1
                vote.vote(vtype, name)
                cursor += 1

            for vtype in ("yes", "no", "absent", "abstain"):
                vote.set_count(vtype, tallies[vtype])

            vote.add_source(url)
            yield vote
예제 #17
0
    def scrape_house_vote(self, bill, url):
        """Download a House roll-call PDF and yield the vote parsed from it.

        The PDF is converted to text and scanned line by line for:

        * a line ending in an ``m/d/yyyy`` date (the last one seen wins),
        * the tally line ``YEAS: n  NAYS: n  NOT VOTING: n`` (optionally with
          ``EXCUSED: n``, which is folded into the "other" count); the motion
          text is taken from two lines above it,
        * section headers (``YEAS: n``, ``NAYS: n``, ...) followed by voter
          names separated by runs of three spaces.

        Args:
            bill: passed through unchanged as the ``bill`` of the VoteEvent.
            url: location of the PDF; also recorded as the vote's source and
                ``pupa_id``.

        Yields:
            At most one VoteEvent. Nothing is yielded (a warning is logged
            instead) when the download fails, no date is found, or no tally
            line is found.
        """
        try:
            filename, _resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return
        text = convert_pdf(filename, "text")
        os.remove(filename)

        lines = text.splitlines()

        vote_type = None
        votes = collections.defaultdict(list)
        date = None
        # Stays None until the tally line is seen; the tally branch always
        # sets it together with ``passed`` and the three counts.
        motion = None

        for idx, line in enumerate(lines):
            line = line.rstrip().decode("utf-8")
            match = re.search(r"(\d+)/(\d+)/(\d{4,4})$", line)
            if match:
                date = datetime.datetime.strptime(match.group(0), "%m/%d/%Y")
                continue

            match = re.match(
                r"\s+YEAS: (\d+)\s+NAYS: (\d+)\s+NOT VOTING: (\d+)", line)
            if match:
                motion = (lines[idx - 2].strip()).decode("utf-8")
                if not motion:
                    self.warning("No motion text found for vote")
                    motion = "PASSAGE"
                yes_count, no_count, other_count = [
                    int(g) for g in match.groups()
                ]

                exc_match = re.search(r"EXCUSED: (\d+)", line)
                if exc_match:
                    other_count += int(exc_match.group(1))

                passed = line.endswith(("ADOPTED", "PASSED"))
                continue

            match = re.match(
                r"(YEAS|NAYS|NOT VOTING|PAIRED|EXCUSED):\s+(\d+)\s*$", line)
            if match:
                vote_type = {
                    "YEAS": "yes",
                    "NAYS": "no",
                    "NOT VOTING": "other",
                    "EXCUSED": "other",
                    "PAIRED": "paired",
                }[match.group(1)]
                continue

            if vote_type == "paired":
                for part in line.split("   "):
                    part = part.strip()
                    if not part:
                        continue
                    # Match each entry on its own. Matching the whole line
                    # would credit every pair on the line to the first name.
                    pair_match = re.match(r"([^\(]+)\((YEA|NAY)\)", part)
                    if not pair_match:
                        self.warning("unrecognized paired vote entry: %s" % part)
                        continue
                    name, pair_type = pair_match.groups()
                    name = name.strip()
                    if pair_type == "YEA":
                        votes["yes"].append(name)
                    elif pair_type == "NAY":
                        votes["no"].append(name)
            elif vote_type:
                for name in line.split("   "):
                    name = name.strip()
                    if not name:
                        continue
                    votes[vote_type].append(name)

        if not date:
            self.warning("Syntax Error/Warning using 'convert_pdf'")
            return
        if motion is None:
            # A date without a tally line leaves motion/result/counts unknown.
            self.warning("no vote tally found in %s" % url)
            return

        vote = VoteEvent(
            chamber="lower",
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            result="pass" if passed else "fail",
            classification="passage",
            bill=bill,
        )

        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)
        vote.add_source(url)
        vote.pupa_id = url

        for key, values in votes.items():
            for value in values:
                # Entries mentioning "Committee" are not individual voters.
                if "Committee" in value:
                    continue
                vote.vote(key, value.replace("*", ""))

        yield vote
예제 #18
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session):
        """Yield a VoteEvent for every roll-call PDF listed in ``rollcalls``.

        Each entry's ``"link"`` is fetched through ``PROXY_BASE_URL`` and the
        PDF text is read positionally: line 0 names the chamber, lines 1-2
        carry the date and time, lines 3-4 the outcome wording, lines 4-7 the
        four tallies, and the remaining lines the member names grouped under
        section headers.

        Roll calls that cannot be downloaded, or whose tallies are not
        integers, are skipped with a warning.

        Raises:
            AssertionError: when none of the known outcome words appears on
                lines 3-4 of the document.
        """
        outcome_by_keyword = {
            "FAILED": False,
            "DEFEATED": False,
            "PREVAILED": True,
            "PASSED": True,
            "SUSTAINED": True,
            "NOT SECONDED": False,
            "OVERRIDDEN": True,
            "ADOPTED": True,
        }
        # (header marker with spaces removed, vote option it switches to)
        section_markers = (
            ("yea-", "yes"),
            ("nay-", "no"),
            ("excused-", "excused"),
            ("notvoting-", "not voting"),
        )

        for rollcall in rollcalls:
            link = rollcall["link"]
            proxy_link = PROXY_BASE_URL + link

            try:
                path, _resp = self.urlretrieve(proxy_link)
            except scrapelib.HTTPError as err:
                self.warning(err)
                self.warning(
                    "Unable to contact openstates proxy, skipping vote {}".format(link)
                )
                continue

            lines = convert_pdf(path, "text").decode("utf-8").split("\n")
            os.remove(path)

            if "house of representatives" in lines[0].lower():
                chamber = "lower"
            else:
                chamber = "upper"

            # The last three words of line 1 are the date; line 2 is the time.
            date_words = lines[1].strip().split()[-3:]
            stamp = " ".join(date_words).title() + " " + lines[2].strip()
            naive_dt = datetime.datetime.strptime(stamp, "%b %d, %Y %I:%M:%S %p")
            local_tz = pytz.timezone("America/Indiana/Indianapolis")
            vote_date = local_tz.localize(naive_dt).isoformat()

            # The outcome wording may be split over two lines, so both are
            # checked. When several keywords match, the one latest in the
            # table wins.
            outcome_lines = [candidate.upper() for candidate in lines[3:5]]
            passed = None
            for keyword, outcome in outcome_by_keyword.items():
                if any(keyword in candidate for candidate in outcome_lines):
                    passed = outcome

            if passed is None:
                raise AssertionError("Missing bill passage type")

            yea_fields = lines[4].split()
            motion = " ".join(yea_fields[:-2])
            try:
                yeas = int(yea_fields[-1])
                nays = int(lines[5].split()[-1])
                excused = int(lines[6].split()[-1])
                not_voting = int(lines[7].split()[-1])
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(
                chamber=chamber,
                legislative_session=session,
                bill=bill_id,
                bill_chamber=original_chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
            )

            tallies = (
                ("yes", yeas),
                ("no", nays),
                ("excused", excused),
                ("not voting", not_voting),
            )
            for option, total in tallies:
                vote.set_count(option, total)
            vote.add_source(proxy_link)

            currently_counting = ""

            for raw_line in lines[8:]:
                # NOTE(review): these literals are the two-character sequence
                # "\xc2\xa0", not a single no-break space. Confirm that is
                # what the decoded text actually contains.
                cleaned = raw_line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                cleaned = cleaned.replace("\xc2\xa0", " -")
                squashed = cleaned.lower().replace(" ", "")

                for marker, option in section_markers:
                    if marker in squashed:
                        currently_counting = option
                        break
                else:
                    # Not a section header: skip text before the first header
                    # and the version-number footer, otherwise read names.
                    if currently_counting == "" or re.search(r"v\. \d\.\d", cleaned):
                        continue
                    for voter in cleaned.split("  "):
                        voter = voter.strip()
                        if voter:
                            vote.vote(currently_counting, voter)

            yield vote
예제 #19
0
 def pdf_to_lxml(self):
     """Download the PDF at ``self.url`` and return it as an lxml HTML tree.

     The file that ``urlretrieve`` saved is deleted once the conversion has
     finished (or failed), so repeated calls do not accumulate downloads on
     disk.
     """
     # Local import: the file's import block is outside the visible chunk.
     import os

     filename, _resp = self.scraper.urlretrieve(self.url)
     try:
         text = convert_pdf(filename, "html")
     finally:
         os.remove(filename)
     return lxml.html.fromstring(text)
예제 #20
0
    def scrape_lower(self):
        """Yield committee-meeting Events parsed from the House calendar PDF.

        The PDF text is split first on date headings such as
        "Monday, January 5, 2015", then on committee-name lines. Every meeting
        block that contains a time, a location and a description becomes one
        Event hosted by that committee, with an agenda item for each House
        bill or resolution mentioned in the description. Blocks that do not
        match the expected layout are skipped.
        """
        PDF_URL = "http://www.ohiohouse.gov/Assets/CommitteeSchedule/calendar.pdf"
        (path, _response) = self.urlretrieve(PDF_URL)
        text = convert_pdf(path, type="text-nolayout").decode()
        os.remove(path)

        # re.split with a capturing group returns
        # [preamble, heading, body, heading, body, ...], so headings and
        # bodies are walked in pairs. The heading must match a weekday name
        # ("\w+day"), in the format strptime parses below.
        days = re.split(r"(\w+day, \w+ \d{1,2}, 20\d{2})", text)
        for date, day_text in zip(days[1::2], days[2::2]):
            events = re.split(r"\n((?:\w+\s?)+)\n", day_text)
            for comm, event_text in zip(events[1::2], events[2::2]):
                comm = comm.strip()

                try:
                    (time, location, description) = re.search(
                        r"""(?mxs)
                            (\d{1,2}:\d{2}\s[ap]\.m\.)  # Meeting time
                            .*?,\s  # Potential extra text for meeting time
                            (.*?),\s  # Location, usually a room
                            .*?\n  # Chairman of committee holding event
                            (.*)  # Description of event
                            """,
                        event_text,
                    ).groups()
                except AttributeError:
                    # re.search returned None: not a meeting block.
                    continue

                # "9:30 a.m." -> "9:30 AM" so that %p can parse it.
                time = time.replace(".", "").upper()
                time = datetime.datetime.strptime(
                    time + "_" + date, "%I:%M %p_%A, %B %d, %Y")
                time = self._tz.localize(time)

                location = location.strip()

                # Drop blank lines and lines that start with a digit.
                description = "\n".join([
                    x.strip() for x in description.split("\n")
                    if x.strip() and not x.strip()[0].isdigit()
                ])

                if not description:
                    description = "[No description provided by state]"

                event = Event(
                    name=description,
                    start_date=time,
                    location_name=location,
                    description=description,
                )
                event.add_source(PDF_URL)
                event.add_participant(comm, type="committee", note="host")
                for line in description.split("\n"):
                    bill_match = re.search(
                        r"(H\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$", line)
                    if bill_match:
                        related_bill, relation = bill_match.groups()
                        relation = relation.strip()
                        related_bill = related_bill.replace(".", "")
                        item = event.add_agenda_item(relation)
                        item.add_bill(related_bill)

                yield event
예제 #21
0
 def _get_pdf(self, url):
     """Fetch the PDF at ``url`` and return convert_pdf's "text" output.

     The downloaded file is deleted after a successful conversion.
     """
     pdf_path, _ = self.urlretrieve(url)
     converted = convert_pdf(pdf_path, type="text")
     os.remove(pdf_path)
     return converted
예제 #22
0
 def pdf_to_lxml(self, filename, type="html"):
     """Run convert_pdf on ``filename`` and parse its output with lxml.html.

     ``type`` is handed to convert_pdf unchanged and defaults to "html".
     """
     return lxml.html.fromstring(convert_pdf(filename, type))
예제 #23
0
    def scrape_vote(self, url, session):
        """Parse one roll-call PDF into a VoteEvent.

        The text is read in two phases. The "preamble" holds the bill id, the
        motion (the last non-empty line before the counts, possibly continued
        on a second line ending in "?") and the tally line. The "votes" phase
        lists member names under headers such as "Voting Yea".

        Args:
            url: location of the PDF; "senate" in the URL selects the upper
                chamber. Also used as the vote's source and dedupe key.
            session: passed through as ``legislative_session``.

        Returns:
            The VoteEvent, or None when the PDF is empty, refers to a vetoed
            bill, or lacks a bill id or a motion.

        Raises:
            Whatever ``check_counts(vote, raise_error=True)`` raises when the
            tallies and the recorded voters disagree.
        """
        # Local import: the file's import block is outside the visible chunk.
        import os

        fname, _ = self.urlretrieve(url)
        try:
            text = convert_pdf(fname, type="text").decode()
        finally:
            # The download is only needed for the conversion.
            os.remove(fname)
        lines = text.splitlines()

        chamber = "upper" if "senate" in url else "lower"
        if "Maryland" not in text:
            self.warning(f"empty vote from {url}")
            return
        date = re.findall(r"Legislative Date: (\w+ \d+, \d{4})", text)[0]

        section = "preamble"
        motion = None
        bill_id = None
        how = None
        voters = defaultdict(list)

        for line in lines:
            if section == "preamble":
                if "vetoed" in line.lower():
                    self.warning(
                        f"skipping vote that appears to be on prior session: {line}, {bill_id}"
                    )
                    return
                possible_bill_id = re.findall(r"([HS][BJR] \d+)", line)
                if possible_bill_id:
                    bill_id = possible_bill_id[0]

                # preamble has metadata, then motion, then counts.  our process then is to
                # store the last line as the motion, but if the last line looks like a
                # continuation, append it to the prior line

                line = line.strip()
                counts = re.findall(
                    r"(\d+) Yeas\s+(\d+) Nays\s+(\d+) Not Voting\s+(\d+) Excused\s+(\d+) Absent",
                    line,
                )
                if counts:
                    (yes_count, no_count, nv_count, excused_count,
                     absent_count) = (int(count) for count in counts[0])
                    section = "votes"
                elif line and line != "(Const)":
                    # questions seem to be split across two lines; there is
                    # nothing to append to when this is the first motion line
                    if line.endswith("?") and motion:
                        motion = motion + " " + line
                    else:
                        motion = line
            elif section == "votes":
                if line.startswith("Voting Yea"):
                    how = "yes"
                elif line.startswith("Voting Nay"):
                    how = "no"
                elif line.startswith("Not Voting"):
                    how = "not voting"
                elif line.startswith("Excused from Voting"):
                    how = "excused"
                elif line.startswith("Excused (Absent)"):
                    how = "absent"
                elif how:
                    names = re.split(r"\s{2,}", line)
                    voters[how].extend(names)

        if not bill_id and not motion:
            return
        elif bill_id and not motion:
            self.warning(
                f"got {bill_id} but no motion, not registering as a vote")
            return
        elif motion and not bill_id:
            self.warning(
                f"got {motion} but no bill_id, not registering as a vote")
            return

        # bleh - result not indicated anywhere
        result = "pass" if yes_count > no_count else "fail"
        bill_chamber = "upper" if bill_id.startswith("S") else "lower"
        date = datetime.datetime.strptime(date,
                                          "%b %d, %Y").strftime("%Y-%m-%d")
        vote = VoteEvent(
            chamber=chamber,
            start_date=date,
            result=result,
            classification="passage",
            motion_text=motion,
            legislative_session=session,
            bill=bill_id,
            bill_chamber=bill_chamber,
        )
        # URL includes sequence ID, will be unique
        vote.dedupe_key = url
        vote.add_source(url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)
        vote.set_count("excused", excused_count)
        vote.set_count("absent", absent_count)
        for how, names in voters.items():
            for name in names:
                name = name.strip().replace("*", "")
                # Skip watermark and legend text that lands among the names.
                if name and "COPY" not in name and "Indicates Vote Change" not in name:
                    vote.vote(how, name)
        check_counts(vote, raise_error=True)
        return vote
예제 #24
0
    def scrape_chamber(self, chamber, session):
        """Yield a vote for every roll call found in one chamber's journals.

        The session's journal index page is fetched, every linked PDF is
        converted to text, and the text is walked with a small state machine:

        * idle          -> waits for a "ROLL CALL" line,
        * ``in_motion`` -> accumulates motion text until a line ending in
          "VOTING" / "VOTING.",
        * ``in_vote``   -> collects "CATEGORY: name; name; ..." lists and bill
          numbers until a decision line (passed, adopted, lost, ...).

        One vote is yielded per bill named in a roll call. Where the tallies
        printed in the motion text disagree with the number of names parsed, a
        warning is logged and the printed tally is stored.

        Args:
            chamber: "lower" selects the house journal, anything else the
                senate journal; also passed as the vote's ``chamber``.
            session: key into the session slug table below.

        Raises:
            KeyError: if ``session`` is not in the slug table.
            AssertionError: if the state machine is in both states at once, or
                vote lists ended up inside the motion text.
            IndexError: if the motion text lacks one of the printed tallies.
        """
        chamber_name = "house" if chamber == "lower" else "senate"
        session_slug = {
            "62": "62-2011",
            "63": "63-2013",
            "64": "64-2015",
            "65": "65-2017",
            "66": "66-2019",
        }[session]

        # Words whose presence marks the closing "decision" line of a vote.
        result_words = (
            "passed",
            "adopted",
            "sustained",
            "prevailed",
            "lost",
            "failed",
        )
        # Bill identifiers such as "HB 1234" or "SCR 4001", captured in pieces.
        bill_re = r"(?i)(H|S|J)(C?)(B|R|M) (\d+)"

        # Open the index page of the session's Registers, and open each
        url = "http://www.legis.nd.gov/assembly/%s/journals/%s-journal.html" % (
            session_slug,
            chamber_name,
        )
        page = self.lxmlize(url)
        pdfs = page.xpath("//a[contains(@href, '.pdf')]")
        for pdf in pdfs:

            # Initialize information about the vote parsing
            results = {}
            in_motion = False
            cur_vote = None
            in_vote = False
            cur_motion = ""
            bills = []

            # Determine which URLs the information was pulled from
            pdf_url = pdf.attrib["href"]

            try:
                (path, response) = self.urlretrieve(pdf_url)
            except requests.exceptions.ConnectionError:
                continue

            # Convert the PDF to text
            data = convert_pdf(path, type="text").decode("utf-8")
            os.unlink(path)

            # Determine the date of the document
            date = re.findall(date_re, data)
            if date:
                date = date[0][0]
                cur_date = datetime.datetime.strptime(date, "%A, %B %d, %Y")
            else:
                # If no date is found anywhere, do not process the document
                self.warning("No date was found for the document; skipping.")
                continue

            # Check each line of the text for motion and vote information
            lines = data.splitlines()
            for line in lines:
                # Ignore lines with no information
                if (re.search(chamber_re, line) or re.search(date_re, line)
                        or re.search(page_re, line) or line.strip() == ""):
                    pass

                # Ensure that motion and vote capturing are not _both_ active
                elif in_motion and in_vote:
                    raise AssertionError(
                        "Scraper should not be simultaneously processing " +
                        "motion name and votes, as it is for this motion: " +
                        cur_motion)

                # Start capturing motion text after a ROLL CALL header
                elif not in_motion and not in_vote:
                    if line.strip() == "ROLL CALL":
                        in_motion = True

                elif in_motion and not in_vote:
                    if cur_motion == "":
                        cur_motion = line.strip()
                    else:
                        cur_motion = cur_motion + " " + line.strip()

                    # ABSENT AND NOT VOTING marks the end of each motion name
                    # In this case, prepare to capture votes
                    if line.strip().endswith(("VOTING", "VOTING.")):
                        in_motion = False
                        in_vote = True

                elif not in_motion and in_vote:
                    is_decision = any(x in line.lower() for x in result_words)

                    # Ignore appointments and confirmations
                    if "The Senate advises and consents to the appointment" in line:
                        in_vote = False
                        cur_vote = None
                        results = {}
                        cur_motion = ""
                        bills = []

                    # If votes are being processed, record the voting members
                    elif ":" in line:
                        cur_vote, who = (x.strip() for x in line.split(":", 1))
                        who = [
                            x.strip() for x in who.split(";")
                            if x.strip() != ""
                        ]
                        results[cur_vote] = who

                        # Without a trailing ";" the last name may wrap onto
                        # the next line.
                        name_may_be_continued = not line.endswith(";")

                    # Extracts bill numbers in the closing text
                    # used for when the closing text is multiple lines.
                    elif (cur_vote is not None and re.findall(bill_re, line)
                          and not is_decision):
                        bills.extend(re.findall(bill_re, line))

                    # Continuation of the current category's list of names
                    elif cur_vote is not None and not is_decision:
                        who = [
                            x.strip() for x in line.split(";")
                            if x.strip() != ""
                        ]

                        if name_may_be_continued:
                            results[cur_vote][-1] = (results[cur_vote][-1] +
                                                     " " + who.pop(0))

                        name_may_be_continued = not line.endswith(";")

                        results[cur_vote].extend(who)

                    # At the conclusion of a vote, save its data
                    elif is_decision:

                        in_vote = False
                        cur_vote = None

                        # Identify what is being voted on
                        # Throw a warning if improper information found
                        bills.extend(re.findall(bill_re, line))
                        if bills == [] or cur_motion.strip() == "":
                            # Log before wiping, so the message still carries
                            # the motion text.
                            self.warning("No motion or bill name found: " +
                                         "motion name: " + cur_motion + "; " +
                                         "decision text: " + line.strip())
                            results = {}
                            cur_motion = ""
                            bills = []
                            continue

                        # If votes are found in the motion name, throw an error
                        if "YEAS:" in cur_motion or "NAYS:" in cur_motion:
                            raise AssertionError(
                                "Vote data found in motion name: " +
                                cur_motion)

                        # Use the collected results to determine who voted how
                        keys = {
                            "YEAS": "yes",
                            "NAYS": "no",
                            "ABSENT AND NOT VOTING": "other",
                        }
                        res = {
                            option: results.get(category, [])
                            for category, option in keys.items()
                        }

                        # Count the number of members voting each way
                        yes, no, other = (
                            len(res["yes"]),
                            len(res["no"]),
                            len(res["other"]),
                        )
                        chambers = {
                            "H": "lower",
                            "S": "upper",
                            "J": "legislature"
                        }

                        # Almost all of the time, a vote only applies to one bill and this loop
                        # will only be run once.
                        # Some exceptions exist.

                        for bill in bills:

                            cur_bill_id = "%s%s%s %s" % bill

                            # Identify the source chamber for the bill
                            bc = chambers.get(cur_bill_id[0], "other")

                            # Determine whether or not the vote passed
                            if "over the governor's veto" in cur_motion.lower():
                                VETO_SUPERMAJORITY = 2 / 3
                                passed = yes / (yes + no) > VETO_SUPERMAJORITY
                            else:
                                passed = yes > no
                            # Create a Vote object based on the scraped information
                            vote = Vote(
                                chamber=chamber,
                                start_date=cur_date.strftime("%Y-%m-%d"),
                                motion_text=cur_motion,
                                result="pass" if passed else "fail",
                                legislative_session=session,
                                classification="passage",
                                bill=cur_bill_id,
                                bill_chamber=bc,
                            )

                            vote.add_source(pdf_url)
                            vote.add_source(url)
                            vote.set_count("yes", yes)
                            vote.set_count("no", no)
                            vote.set_count("other", other)
                            # For each category of voting members,
                            # add the individuals to the Vote object
                            for key in res:
                                for voter in res[key]:
                                    vote.vote(key, voter)

                            # Check the vote counts in the motion text against
                            # the parsed results
                            for category_name in keys:
                                # Need to search for the singular, not plural, in the text
                                # so it can find, for example,  " 1 NAY "
                                vote_re = r"(\d+)\s{}".format(
                                    category_name[:-1])
                                motion_count = int(
                                    re.findall(vote_re, cur_motion)[0])

                                for item in vote.counts:
                                    if item["option"] == keys[category_name]:
                                        vote_count = item["value"]

                                if motion_count != vote_count:
                                    self.warning(
                                        "Motion text vote counts ({}) ".format(
                                            motion_count) +
                                        "differed from roll call counts ({}) ".
                                        format(vote_count) +
                                        "for {0} on {1}".format(
                                            category_name, cur_bill_id))

                                    # Store the tally printed in the motion
                                    # text on the vote itself.
                                    for item in vote.counts:
                                        if item["option"] == keys[
                                                category_name]:
                                            item["value"] = motion_count

                            yield vote

                        # With the vote successfully processed,
                        # wipe its data and continue to the next one
                        results = {}
                        cur_motion = ""
                        bills = []
예제 #25
0
    def scrape_votes(self, vote_url, bill, chamber):

        try:
            filename, response = self.urlretrieve(vote_url)
        except scrapelib.HTTPError:
            self.logger.warning("PDF not posted or available")
            return
        # Grabs text from pdf
        pdflines = [
            line.decode("utf-8")
            for line in convert_pdf(filename, "text").splitlines()
        ]
        os.remove(filename)

        vote_date = 0
        voters = defaultdict(list)
        for x in range(len(pdflines)):
            line = pdflines[x]
            if re.search(r"(\d+/\d+/\d+)", line):
                initial_date = line.strip()
            if ("AM" in line) or ("PM" in line):
                split_l = line.split()
                for y in split_l:
                    if ":" in y:
                        time_location = split_l.index(y)
                        motion = " ".join(split_l[0:time_location])
                        time = split_l[time_location:]
                        if len(time) > 0:
                            time = "".join(time)
                        dt = initial_date + " " + time
                        dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                        vote_date = central.localize(dt)
                        vote_date = vote_date.isoformat()
                        # In rare case that no motion is provided
                        if len(motion) < 1:
                            motion = "No Motion Provided"
            if "YEAS:" in line:
                yeas = int(line.split()[-1])
            if "NAYS:" in line:
                nays = int(line.split()[-1])
            if "ABSTAINED:" in line:
                abstained = int(line.split()[-1])
            if "PASSES:" in line:
                abstained = int(line.split()[-1])
            if "NOT VOTING:" in line:
                not_voting = int(line.split()[-1])

            if "YEAS :" in line:
                y = 0
                next_line = pdflines[x + y]
                while "NAYS : " not in next_line:
                    next_line = next_line.split("  ")
                    if next_line and ("YEAS" not in next_line):
                        for v in next_line:
                            if v and "YEAS" not in v:
                                voters["yes"].append(v.strip())
                    next_line = pdflines[x + y]
                    y += 1
            if line and "NAYS :" in line:
                y = 0
                next_line = 0
                next_line = pdflines[x + y]
                while ("ABSTAINED : " not in next_line) and ("PASSES :"
                                                             not in next_line):
                    next_line = next_line.split("  ")
                    if next_line and "NAYS" not in next_line:
                        for v in next_line:
                            if v and "NAYS" not in v:
                                voters["no"].append(v.strip())
                    next_line = pdflines[x + y]
                    y += 1

            if line and ("ABSTAINED :" in line or "PASSES :" in line):
                y = 2
                next_line = 0
                next_line = pdflines[x + y]
                while "NOT VOTING :" not in next_line:
                    next_line = next_line.split("  ")
                    if next_line and ("ABSTAINED" not in next_line
                                      or "PASSES" not in next_line):
                        for v in next_line:
                            if v:
                                voters["abstain"].append(v.strip())
                    next_line = pdflines[x + y]
                    y += 1

            if line and "NOT VOTING : " in line:
                lines_to_go_through = math.ceil(not_voting / len(line.split()))
                next_line = pdflines[x]
                for y in range(lines_to_go_through):
                    if len(pdflines) > (x + y + 2):
                        next_line = pdflines[x + y + 2].split("  ")
                        for v in next_line:
                            if v:
                                voters["not voting"].append(v.strip())
                if yeas > (nays + abstained + not_voting):
                    passed = True
                else:
                    passed = False

                ve = VoteEvent(
                    chamber=chamber,
                    start_date=vote_date,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    bill=bill,
                    classification="passage",
                )
                ve.add_source(vote_url)
                for how_voted, how_voted_voters in voters.items():
                    for voter in how_voted_voters:
                        if len(voter) > 0:
                            ve.vote(how_voted, voter)
                # Resets voters dictionary before going onto next page in pdf
                voters = defaultdict(list)
                yield ve
예제 #26
0
    def scrape_upper(self):
        """Yield one Event per committee meeting found in the Senate calendar PDF.

        The calendar PDF is downloaded, converted to text and the temporary
        file removed. The text is then split on day headings (e.g.
        "Tuesday, March 3"), and each day's text is split again on committee
        headings. Entries whose time/location/description cannot be matched
        are skipped. Every yielded Event carries the PDF as its source, the
        committee as host participant, and one agenda item per Senate bill or
        resolution identifier found in its description.
        """
        PDF_URL = "http://www.ohiosenate.gov/Assets/CommitteeSchedule/calendar.pdf"
        download_path, _response = self.urlretrieve(PDF_URL)
        calendar_text = convert_pdf(download_path, type="text").decode()
        os.remove(download_path)

        # re.split with a single capture group returns
        # [preamble, heading, body, heading, body, ...], so the headings sit
        # at the odd indexes and the matching bodies at the even ones.
        day_chunks = re.split(r"(\w+day, \w+ \d{1,2})", calendar_text)
        for day_heading, day_body in zip(day_chunks[1::2], day_chunks[2::2]):
            # Calendar is put out for the current week, so use that year
            date = day_heading + ", " + str(datetime.datetime.now().year)

            # Same alternating layout: committee name, then that entry's text.
            committee_chunks = re.split(r"\n\n((?:\w+\s?)+),\s", day_body)
            for committee_heading, entry_text in zip(committee_chunks[1::2],
                                                     committee_chunks[2::2]):
                comm = committee_heading.strip()

                entry_match = re.search(
                    r"""(?mxs)
                                    (\d{1,2}:\d{2}\s[AP]M)  # Meeting time
                                    .*?,\s  # Potential extra text for meeting time
                                    (.*?)\n  # Location, usually a room
                                    .*?\n  # Chairman of committee holding event
                                    (.*)  # Description of event
                                    """,
                    entry_text,
                )
                if entry_match is None:
                    # Not a parseable meeting entry; move on to the next one.
                    continue
                time_text, room, raw_description = entry_match.groups()

                start = datetime.datetime.strptime(
                    time_text + "_" + date, "%I:%M %p_%A, %B %d, %Y")
                start = self._tz.localize(start)

                room = room.strip()

                # Drop blank lines, page footers and boilerplate notices.
                kept_lines = []
                for raw_line in raw_description.split("\n"):
                    cleaned = raw_line.strip()
                    if not cleaned:
                        continue
                    if cleaned.startswith(("Page ", "*Possible Vote")):
                        continue
                    if cleaned == "NO OTHER COMMITTEES WILL MEET":
                        continue
                    kept_lines.append(cleaned)
                description = "\n".join(kept_lines)

                if not description:
                    description = "[No description provided by state]"

                meeting = Event(
                    name=description,
                    start_date=start,
                    location_name=room,
                    description=description,
                )
                meeting.add_source(PDF_URL)
                meeting.add_participant(comm, type="committee", note="host")

                # Attach an agenda item for each line that names a bill.
                for agenda_line in description.split("\n"):
                    bill_match = re.search(
                        r"(S\.?(?:[JC]\.?)?[BR]\.?\s+\d+)\s+(.*)$",
                        agenda_line)
                    if not bill_match:
                        continue
                    bill_id, relation = bill_match.groups()
                    agenda_item = meeting.add_agenda_item(relation.strip())
                    agenda_item.add_bill(bill_id.replace(".", ""))

                yield meeting