Пример #1
0
 def apply_votes(self, bill):
     """Parse every vote listed at the bill's status URL and attach it to the bill.

     ``bill`` must carry a ``status_url`` entry. Each item returned by
     ``votes.all_votes_for_url`` is unpacked as
     ``(chamber, vote_desc, pdf_url, these_votes)``, where ``these_votes``
     maps a voter to a code: 'Y' is a yes, 'N' is a no and anything else is
     recorded as "other". The date is taken to be the text after the last
     "-" in ``vote_desc``; descriptions without a "-" are skipped with a
     warning.
     """
     bill_votes = votes.all_votes_for_url(self, bill['status_url'])
     for (chamber, vote_desc, pdf_url, these_votes) in bill_votes:
         # str.split() never raises IndexError, so look for the separator
         # explicitly instead of relying on an exception that cannot fire.
         _head, sep, date = vote_desc.rpartition("-")
         if not sep:
             self.warning("[%s] Couldn't get date out of [%s]" % (bill['bill_id'], vote_desc))
             continue

         yes_votes = []
         no_votes = []
         other_votes = []
         for voter, choice in these_votes.items():
             if choice == 'Y':
                 yes_votes.append(voter)
             elif choice == 'N':
                 no_votes.append(voter)
             else:
                 other_votes.append(voter)

         # not necessarily correct, but not sure where else to get it. maybe from pdf
         passed = len(yes_votes) > len(no_votes)
         vote = Vote(standardize_chamber(chamber), date, vote_desc, passed,
                     len(yes_votes), len(no_votes), len(other_votes),
                     pdf_url=pdf_url)
         for voter in yes_votes:
             vote.yes(voter)
         for voter in no_votes:
             vote.no(voter)
         for voter in other_votes:
             vote.other(voter)
         bill.add_vote(vote)
Пример #2
0
    def scrape(self, chamber, session):
        """Build and save one hard-coded sample bill for ``chamber``.

        The bill is 'SB 1' when ``chamber`` is 'upper' and 'HB 1' otherwise.
        It gets a fixed source, version, document, two sponsors, two votes
        and three actions before being passed to ``self.save_bill``.
        """
        self.validate_session(session)

        is_upper = chamber == 'upper'
        bill_id = 'SB 1' if is_upper else 'HB 1'
        other_chamber = 'lower' if is_upper else 'upper'

        date_format = '%m/%d/%Y'
        first_day = datetime.datetime.strptime('1/29/2010', date_format)
        second_day = datetime.datetime.strptime('1/30/2010', date_format)

        sample = Bill(session, chamber, bill_id, 'A super bill')
        sample.add_source('http://example.com/')
        sample.add_version('As Introduced', 'http://example.com/SB1.html')
        sample.add_document('Google', 'http://google.com')
        for role, sponsor in (('primary', 'Bob Smith'),
                              ('secondary', 'Johnson, Sally')):
            sample.add_sponsor(role, sponsor)

        # Upper-chamber vote: passed 2-0-0.
        upper_vote = Vote('upper', first_day, 'Final passage', True, 2, 0, 0)
        for voter in ('Smith', 'Johnson'):
            upper_vote.yes(voter)

        # Lower-chamber vote: failed 0-1-1.
        lower_vote = Vote('lower', second_day, 'Final passage', False, 0, 1, 1)
        lower_vote.no('Bob Smith')
        lower_vote.other('S. Johnson')

        sample.add_vote(upper_vote)
        sample.add_vote(lower_vote)

        for actor, action, when in ((chamber, 'introduced', first_day),
                                    (chamber, 'read first time', second_day),
                                    (other_chamber, 'introduced', second_day)):
            sample.add_action(actor, action, when)

        self.save_bill(sample)
Пример #3
0
    def scrape_vote(self, bill, date, motion, url):
        """Scrape the roll call at ``url`` and add it to ``bill``.

        Returns without adding anything when the page says it is
        "not yet official". The chamber is 'upper' when ``url`` ends with
        "Senate", otherwise 'lower'. "Non Voting" and "Present" are both
        folded into the other count/list.
        """
        html = self.urlopen(url)

        # Sometimes they link to vote pages before they go live
        if "not yet official" in html:
            return

        doc = lxml.html.fromstring(html)
        actor = "upper" if url.endswith("Senate") else "lower"

        count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

        def tally(label):
            # The count is the last whitespace-separated token of the cell.
            return int(doc.xpath(count_path % label).split()[-1])

        yes_count = tally("Yeas")
        no_count = tally("Nays")
        other_count = tally("Non Voting") + tally("Present")

        passed = yes_count > no_count + other_count
        vote = Vote(actor, date, motion, passed, yes_count, no_count, other_count)
        vote.add_source(url)

        vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
        recorders = (
            ("Yeas", vote.yes),
            ("Nays", vote.no),
            ("Non Voting", vote.other),
            ("Present", vote.other),
        )
        for heading, record in recorders:
            for link in doc.xpath(vote_path % heading):
                record(link.text)

        bill.add_vote(vote)
Пример #4
0
    def parse_vote(self, actor, date, row):
        """
        takes the actor, date and row element and returns a Vote object

        The first span under ``row`` holds "<result>-<yes>-<no>-<other>";
        the text following the second and third spans lists the yes and no
        voters, and the text following the fourth span lists the other
        voters when that span's text starts with 'Absent'. Names equal to
        'None' and empty names are dropped.
        """
        def names_after(span):
            # Voter names are the span's tail text: comma separated, with a
            # "\xa0--\xa0" separator to strip. A span with no tail has no names.
            tail = span.tail or u''
            return [name for name in tail.replace(u'\xa0--\xa0', '').split(',')
                    if name and name != 'None']

        spans = row.xpath('.//span')
        motion = row.text
        # rsplit keeps a hyphen inside the result text from breaking the
        # four-way unpack; only the last three fields are counts.
        passed, yes_count, no_count, other_count = \
            spans[0].text_content().rsplit('-', 3)

        yes_votes = names_after(spans[1])
        no_votes = names_after(spans[2])
        other_votes = []
        # The fourth span is optional and may have no text of its own.
        if len(spans) > 3 and (spans[3].text or '').startswith('Absent'):
            other_votes = names_after(spans[3])

        # NOTE(review): if none of the keywords occurs, ``passed`` stays the
        # raw result text rather than a bool - confirm what callers expect.
        for key, val in {'adopted': True, 'passed': True, 'failed': False}.items():
            if key in passed.lower():
                passed = val
                break

        vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count),
                    int(other_count))
        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for name in other_votes:
            vote.other(name)
        return vote
Пример #5
0
    def parse_vote(self, actor, date, row):
        """
        Build and return a Vote from the actor, date and row element.

        The motion is the row's own text with non-breaking spaces turned
        into spaces and hyphens removed, defaulting to "passage" when that
        leaves nothing. The first span holds "<result>-<yes>-<no>-<other>";
        voter names come from ``self.get_names`` applied to the text after
        each following span.
        """
        spans = row.xpath('.//span')

        cleaned = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
        motion = cleaned or "passage"

        result_text, yes_count, no_count, other_count = \
            spans[0].text_content().rsplit('-', 3)

        yes_votes = self.get_names(spans[1].tail)
        no_votes = self.get_names(spans[2].tail)

        # Any later span labelled Absent/Excused contributes "other" voters.
        other_votes = []
        for extra in spans[3:]:
            if extra.text.startswith(('Absent', 'Excused')):
                other_votes.extend(self.get_names(extra.tail))

        # Map the result text onto a bool; it is left as text if no keyword
        # is found.
        passed = result_text
        lowered = result_text.lower()
        for key, val in {'adopted': True, 'passed': True, 'failed': False}.items():
            if key in lowered:
                passed = val
                break

        vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count),
                    int(other_count))
        for record, voters in ((vote.yes, yes_votes),
                               (vote.no, no_votes),
                               (vote.other, other_votes)):
            for name in voters:
                if name and name != 'None':
                    record(name)
        return vote
Пример #6
0
    def scrape_vote(self, bill, date, url):
        """Scrape the vote page at ``url`` and add the vote to ``bill``.

        The page's ``hdVote`` heading is split on ", ": the second field
        names the location (House/Senate, optionally followed by a committee
        name) and the remaining fields form the motion. Excused and absent
        members are both counted as "other".

        Raises ScrapeError when the location starts with neither 'House'
        nor 'Senate'.
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

            location = header.split(', ')[1]

            if location.startswith('House'):
                chamber = 'lower'
            elif location.startswith('Senate'):
                chamber = 'upper'
            else:
                # Report the text that failed to match; ``chamber`` is not
                # bound on this path.
                raise ScrapeError("Bad chamber: %s" % location)

            # Whatever follows the chamber word is the committee name, unless
            # it is just the tail of "House of Representatives".
            committee = ' '.join(location.split(' ')[1:]).strip()
            if not committee or committee.startswith('of Representatives'):
                committee = None

            motion = ', '.join(header.split(', ')[2:]).strip()

            yes_count = int(
                page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
            other_count = excused_count + absent_count

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                vote_type = 'passage'
            elif motion == 'Concurred in amendments':
                vote_type = 'amendment'
            elif motion == 'Veto override':
                vote_type = 'veto_override'
            else:
                vote_type = 'other'

            vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                        other_count)
            vote['type'] = vote_type

            if committee:
                vote['committee'] = committee

            vote.add_source(url)

            # Each vote cell is preceded by the cell holding the member name.
            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                if td.text == 'Yea':
                    vote.yes(td.getprevious().text.strip())
                elif td.text == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif td.text in ('Excused', 'Absent'):
                    vote.other(td.getprevious().text.strip())

            bill.add_vote(vote)
Пример #7
0
    def scrape_vote(self, bill, vote_chamber, bill_id, vote_id, vote_date,
                    action_text):
        """Fetch one roll call from the ALISON site and add it to ``bill``.

        The page's table text alternates member name / vote code. Codes are
        'Y', 'N', 'P' and 'A'; 'P' and 'A' are both reported as "other".
        Where the page lists "Total Yea", "Total Nay" or "Total Absent",
        the listed figure is asserted to equal the number of names collected.
        """
        template = ('http://alisondb.legislature.state.al.us/Alison/'
                    'GetRollCallVoteResults.aspx?'
                    'VOTE={0}&BODY={1}&INST={2}&SESS={3}')
        url = template.format(vote_id, vote_chamber, bill_id, self.session_id)
        doc = lxml.html.fromstring(self.get(url=url).text)

        voters = {'Y': [], 'N': [], 'P': [], 'A': []}

        # Walk the cell texts two at a time: (name, vote code). Vacant seats,
        # "Total ..." summary cells and blank names are not recorded.
        cells = doc.xpath('//table/tr/td/font/text()')
        for name, choice in zip(cells[0::2], cells[1::2]):
            ignorable = (name.endswith(", Vacant") or
                         name.startswith("Total ") or
                         not name.strip())
            if not ignorable:
                voters[choice].append(name)

        def listed_total(xpath):
            # Integer after the last ":" of the first match, or None when the
            # page does not list this total.
            hits = doc.xpath(xpath)
            if not hits:
                return None
            return int(hits[0].split(":")[-1])

        # Check name counts against totals listed on the site
        total_yea = listed_total('//*[starts-with(text(), "Total Yea")]/text()')
        if total_yea is None:
            total_yea = len(voters['Y'])
        else:
            assert total_yea == len(voters['Y']), "Yea count incorrect"

        total_nay = listed_total('//*[starts-with(text(), "Total Nay")]/text()')
        if total_nay is None:
            total_nay = len(voters['N'])
        else:
            assert total_nay == len(voters['N']), "Nay count incorrect"

        total_absent = listed_total(
            '//*[starts-with(text(), "Total Absent")]/text()')
        if total_absent is not None:
            assert total_absent == len(voters['A']), "Absent count incorrect"
        total_other = len(voters['P']) + len(voters['A'])

        vote = Vote(
            self.CHAMBERS[vote_chamber[0]], vote_date, action_text,
            total_yea > total_nay, total_yea, total_nay, total_other)
        vote.add_source(url)
        for record, members in ((vote.yes, voters['Y']),
                                (vote.no, voters['N']),
                                (vote.other, voters['A'] + voters['P'])):
            for member in members:
                record(member)

        bill.add_vote(vote)
Пример #8
0
    def scrape_vote(self, bill, date, motion, url):
        """Scrape the roll call at ``url`` and add it to ``bill``.

        Pages that are linked before the vote is published (they contain
        "not yet official") are skipped; such pages would otherwise fail in
        the count parsing below. The chamber is 'upper' when ``url`` ends
        with 'Senate', otherwise 'lower'. "Non Voting" and "Present" are
        both folded into the other count/list.
        """
        html = self.urlopen(url)

        if "not yet official" in html:
            # Nothing to parse yet; a later scrape picks the vote up.
            return

        page = lxml.html.fromstring(html)

        if url.endswith('Senate'):
            actor = 'upper'
        else:
            actor = 'lower'

        # Each total sits in a centered cell as "<label>: <n>"; the number is
        # the last whitespace-separated token.
        count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"
        yes_count = int(page.xpath(count_path % "Yeas").split()[-1])
        no_count = int(page.xpath(count_path % "Nays").split()[-1])
        other_count = int(page.xpath(count_path % "Non Voting").split()[-1])
        other_count += int(page.xpath(count_path % "Present").split()[-1])

        passed = yes_count > no_count + other_count
        vote = Vote(actor, date, motion, passed, yes_count,
                    no_count, other_count)
        vote.add_source(url)

        # Member names are links in the table following each <h3> heading.
        vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
        for yes in page.xpath(vote_path % "Yeas"):
            vote.yes(yes.text)
        for no in page.xpath(vote_path % "Nays"):
            vote.no(no.text)
        for other in page.xpath(vote_path % "Non Voting"):
            vote.other(other.text)
        for other in page.xpath(vote_path % "Present"):
            vote.other(other.text)

        bill.add_vote(vote)
Пример #9
0
    def scrape_vote(self, bill, vote_type_id, vote_type):
        """Scrape one vote of the given type for ``bill`` and attach it.

        The vote is always recorded for the 'upper' chamber with
        ``vote_type`` as its motion. The other count is derived as
        13 minus the yes and no counts, floored at zero.
        """
        base_url = "http://dcclims1.dccouncil.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s"
        url = base_url % (vote_type_id, bill["bill_id"])

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            def first(xpath):
                return doc.xpath(xpath)[0]

            vote_date = convert_date(doc.get_element_by_id("VoteDate").text)

            # check if voice vote / approved boxes have an 'x'
            voice = first('//span[@id="VoteTypeVoice"]/b/text()') == "x"
            passed = first('//span[@id="VoteResultApproved"]/b/text()') == "x"

            yes_count = extract_int(first('//span[@id="VoteCount1"]/b/text()'))
            no_count = extract_int(first('//span[@id="VoteCount2"]/b/text()'))
            # every now and then this actually drops below 0 (error in count)
            # NOTE(review): 13 is presumably the number of voting members - confirm.
            other_count = max(13 - (yes_count + no_count), 0)

            vote = Vote("upper", vote_date, vote_type, passed, yes_count,
                        no_count, other_count, voice_vote=voice)

            vote.add_source(url)

            # members are only text on page in a <u> tag
            for member_u in doc.xpath("//u"):
                vote_text = member_u.xpath("../../i/text()")[0]
                if "Yes" in vote_text:
                    record = vote.yes
                elif "No" in vote_text:
                    record = vote.no
                else:
                    record = vote.other
                record(member_u.text)
        bill.add_vote(vote)
Пример #10
0
def test_vote():
    """Check Vote construction, voter recording and count validation."""
    when = datetime.datetime(2012, 1, 1)
    v = Vote('upper', when, 'passage', True, 3, 1, 2, note='note')

    expected = {
        'chamber': 'upper',
        'date': datetime.datetime(2012, 1, 1),
        'motion': 'passage',
        'passed': True,
        'yes_count': 3,
        'no_count': 1,
        'other_count': 2,
        'type': 'other',
        'yes_votes': [],
        'no_votes': [],
        'other_votes': [],
        'note': 'note',
        '_type': 'vote',
        'sources': [],
    }
    assert_equal(v, expected)

    # Record each group of voters and check it lands under the right key.
    groups = (
        (v.yes, 'yes_votes', ['Lincoln', 'Adams', 'Johnson']),
        (v.no, 'no_votes', ['Kennedy']),
        (v.other, 'other_votes', ['Polk', 'Pierce']),
    )
    for record, key, voters in groups:
        for voter in voters:
            record(voter)
        assert_equal(v[key], voters)

    # validate should work
    v.validate()

    # now add someone else and make sure it doesn't validate
    v.yes('Clinton')
    with assert_raises(ValueError):
        v.validate()
Пример #11
0
    def _build_lower_votes(self):
        """Add one 'lower' chamber "Floor Vote" to the bill per table on the votes page.

        The votes URL is registered with ``self.urls`` and added as a bill
        source. Nothing more is done when the page could not be loaded, has
        no ``<pre>`` block, or reports that the bill has no votes.

        Each vote also gets an ``actual_vote`` mapping from the raw vote
        value to the names recorded as "other" in that vote.
        """
        url = self.shared_url + '&Votes=Y'
        self.urls.add(votes=url)
        self.bill.add_source(url)
        doc = self.urls.votes.doc
        if doc is None:
            return

        # Skip bill if votes can't be found.
        pre_blocks = doc.xpath('//pre')
        if not pre_blocks:
            return
        no_votes = ('There are no votes for this bill in this legislative '
                    'session.')
        if pre_blocks[0].text_content() == no_votes:
            return

        for table in doc.xpath('//table'):

            # The value sits in the element right after its label.
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = next(votes[0].itersiblings()).text
            yes_count, no_count = map(int, votes.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # One mapping per vote: a single shared mapping would carry the
            # names from earlier tables into every later vote.
            actual_vote = collections.defaultdict(list)

            # Cell texts alternate name / vote value; an unpaired trailing
            # cell is ignored.
            tds = table.xpath('tr/td/text()')
            for name, vote_val in zip(tds[::2], tds[1::2]):
                name = self._scrub_name(name)

                if vote_val.strip() == 'Y':
                    vote.yes(name)
                elif vote_val.strip() in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
Пример #12
0
    def scrape_vote(self, bill, chamber, date, url):
        """Download the vote PDF at ``url``, parse it and add the vote to ``bill``.

        The motion is the fifth line of the converted text and the totals
        come from "Yeas - N", "Nays - N" and "Not Voting - N". Returns
        without adding a vote when the text has fewer than five lines or no
        "Yeas - N" total. A missing Nays / Not Voting total raises
        AttributeError. The finished vote is checked with ``vote.validate()``
        before being added.
        """
        (path, resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Remove the downloaded file even when conversion fails.
            os.remove(path)

        lines = text.split('\n')
        if len(lines) < 5:
            return
        motion = lines[4].strip()

        yeas = re.search(r'Yeas - (\d+)', text)
        if yeas is None:
            return
        yes_count = int(yeas.group(1))

        no_count = int(re.search(r'Nays - (\d+)', text).group(1))
        other_count = int(re.search(r'Not Voting - (\d+)', text).group(1))
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[9:]:
            if 'after roll call' in line or 'Indication of Vote' in line:
                break
            if 'Presiding' in line:
                continue

            reached_pairs = False
            # Columns on a line are separated by "-<digits>" markers.
            for col in re.split(r'-\d+', line):
                col = col.strip()
                if not col:
                    continue

                match = re.match(r'(Y|N|EX|\*)\s+(.+)$', col)

                if match:
                    if match.group(2) == "PAIR":
                        # Stop reading the roll call at the PAIR entry.
                        reached_pairs = True
                        break
                    if match.group(1) == 'Y':
                        vote.yes(match.group(2))
                    elif match.group(1) == 'N':
                        vote.no(match.group(2))
                    else:
                        vote.other(match.group(2))
                else:
                    # No recognised vote marker: record the text as "other".
                    vote.other(col)

            if reached_pairs:
                break

        vote.validate()
        bill.add_vote(vote)
Пример #13
0
    def build_lower_votes(self):
        """Add one "lower" chamber "Floor Vote" to the bill per table on the votes page.

        The votes URL is built from ``self.bill_id`` and
        ``self.term_start_year``, registered with ``self.urls`` and added as
        a bill source. Nothing more is done when the page could not be
        loaded, has no ``<pre>`` block, or reports that the bill has no votes.

        Each vote also gets an ``actual_vote`` mapping from the raw vote
        value to the names recorded as "other" in that vote.
        """
        url = "http://assembly.state.ny.us/leg/?default_fld=&bn=%s&term=%s&Votes=Y"
        url = url % (self.bill_id, self.term_start_year)
        self.urls.add(votes=url)
        self.bill.add_source(url)
        doc = self.urls.votes.doc
        if doc is None:
            return

        # Skip bill if votes can't be found.
        pre_blocks = doc.xpath("//pre")
        if not pre_blocks:
            return
        no_votes = "There are no votes for this bill in this legislative session."
        if pre_blocks[0].text_content() == no_votes:
            return

        for table in doc.xpath("//table"):

            # The value sits in the element right after its label.
            date = table.xpath('caption/label[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, "%m/%d/%Y")

            votes = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            votes = next(votes[0].itersiblings()).text
            yes_count, no_count = map(int, votes.split("/"))

            passed = yes_count > no_count
            vote = Vote("lower", date, "Floor Vote", passed, yes_count, no_count, other_count=0)

            # One mapping per vote: a single shared mapping would carry the
            # names from earlier tables into every later vote.
            actual_vote = collections.defaultdict(list)

            # Cell texts alternate name / vote value; an unpaired trailing
            # cell is ignored.
            tds = table.xpath("tr/td/text()")
            for name, vote_val in zip(tds[::2], tds[1::2]):
                name = self._scrub_name(name)

                if vote_val.strip() == "Y":
                    vote.yes(name)
                elif vote_val.strip() in ("N", "NO"):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote["other_count"] = len(vote["other_votes"])
            vote["actual_vote"] = actual_vote
            self.bill.add_vote(vote)
Пример #14
0
    def scrape_bill(self, session, bills):
        """Build, enrich and save the bill described by ``bills[0]``.

        ``bills[0]`` is a ``(billdata, details)`` pair. The bill itself is
        built by ``AssemblyBillPage``; this method then adds the source URL,
        companion, summary and the "upper" chamber votes found under
        ``billdata["data"]["bill"]``, and saves the bill.

        Vote counts are derived from the name lists; a vote is marked passed
        when it has more ayes than nays.
        """
        billdata, details = bills[0]

        # Unpacked in full to document the expected shape of ``details``.
        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title, (letter, number, is_amd)) = details

        data = billdata["data"]["bill"]

        assembly = AssemblyBillPage(self, session, bill_chamber, details)
        assembly.build()
        bill = assembly.bill
        bill.add_source(billdata["url"])

        # Add companion.
        if data["sameAs"]:
            bill.add_companion(data["sameAs"])

        if data["summary"]:
            bill["summary"] = data["summary"]

        if data["votes"]:
            for vote_data in data["votes"]:
                vote = Vote(
                    chamber="upper",
                    date=self.date_from_timestamp(vote_data["voteDate"]),
                    motion=vote_data["description"] or "[No motion available.]",
                    passed=False,
                    yes_votes=[],
                    no_votes=[],
                    other_votes=[],
                    yes_count=0,
                    no_count=0,
                    other_count=0,
                )

                for name in vote_data["ayes"]:
                    vote.yes(name)
                    vote["yes_count"] += 1
                for category in ("absent", "excused", "abstains"):
                    # .get() yields None for a category the record lacks;
                    # treat that as "nobody" instead of failing to iterate.
                    for name in vote_data.get(category) or ():
                        vote.other(name)
                        vote["other_count"] += 1
                for name in vote_data["nays"]:
                    vote.no(name)
                    vote["no_count"] += 1

                vote["passed"] = vote["yes_count"] > vote["no_count"]

                bill.add_vote(vote)

        # data['previousVersions'] holds instances of the same bill from
        # prior sessions; it is not used here.

        if not data["title"]:
            bill["title"] = bill["summary"]

        self.save_bill(bill)
Пример #15
0
    def scrape_vote(self, bill, name, url):
        """Scrape the tally page at ``url`` and add it to ``bill`` as a vote with motion ``name``.

        URLs containing "VOTE/H" are treated as lower-chamber pages, all
        others as upper-chamber pages; the two use different column layouts
        in the roll-call table. Pages containing "BUDGET ADDRESS" are
        skipped. The vote passes when the yea count exceeds the
        "Necessary for" count read from the page.
        """
        # Column layout of the roll-call table differs per chamber:
        # (chamber, first column of each member group, offsets from there).
        if "VOTE/H" in url:
            vote_chamber, cols = "lower", (1, 5, 9, 13)
            name_offset, yes_offset, no_offset = 3, 0, 1
        else:
            vote_chamber, cols = "upper", (1, 6)
            name_offset, yes_offset, no_offset = 4, 1, 2

        # Connecticut's SSL is causing problems with Scrapelib, so use Requests
        # (note: certificate verification is switched off here).
        html = requests.get(url, verify=False).text

        if "BUDGET ADDRESS" in html:
            return

        page = lxml.html.fromstring(html)

        def first_number(text):
            # First run of digits found in the text.
            return int(re.match(r"[^\d]*(\d+)[^\d]*", text).group(1))

        yes_count = first_number(
            page.xpath("string(//span[contains(., 'Those voting Yea')])"))
        no_count = first_number(
            page.xpath("string(//span[contains(., 'Those voting Nay')])"))
        other_count = first_number(
            page.xpath("string(//span[contains(., 'Those absent')])"))
        need_count = first_number(
            page.xpath("string(//span[contains(., 'Necessary for')])"))

        # The page gives only month/day; the year comes from the bill's session.
        taken_on = page.xpath("string(//span[contains(., 'Taken on')])")
        month_day = re.match(r".*Taken\s+on\s+(\d+/\s?\d+)", taken_on).group(1)
        month_day = month_day.replace(" ", "")
        date = datetime.datetime.strptime(
            month_day + " " + bill["session"], "%m/%d %Y").date()

        vote = Vote(vote_chamber, date, name, yes_count > need_count,
                    yes_count, no_count, other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for start in cols:
                member = row.xpath("string(td[%d])" % (start + name_offset)).strip()

                if not member or member == "VACANT":
                    continue

                if "Y" in row.xpath("string(td[%d])" % (start + yes_offset)):
                    vote.yes(member)
                elif "N" in row.xpath("string(td[%d])" % (start + no_offset)):
                    vote.no(member)
                else:
                    vote.other(member)

        bill.add_vote(vote)
Пример #16
0
    def scrape_vote(self, bill, date, url):
        """Scrape the vote page at ``url`` and add the vote to ``bill``.

        The page's ``hdVote`` heading is split on ", ": the second field
        names the location (House/Senate, optionally followed by a committee
        name) and the remaining fields form the motion. Votes with no
        detectable motion are skipped. Excused and absent members are both
        counted as "other".

        Raises ScrapeError when the location starts with neither "House"
        nor "Senate".
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)

            header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

            location = header.split(", ")[1]

            if location.startswith("House"):
                chamber = "lower"
            elif location.startswith("Senate"):
                chamber = "upper"
            else:
                # Report the text that failed to match; ``chamber`` is not
                # bound on this path.
                raise ScrapeError("Bad chamber: %s" % location)

            # Whatever follows the chamber word is the committee name, unless
            # it is just the tail of "House of Representatives".
            committee = " ".join(location.split(" ")[1:]).strip()
            if not committee or committee.startswith("of Representatives"):
                committee = None

            motion = ", ".join(header.split(", ")[2:]).strip()
            if not motion:
                # If we can't detect a motion, skip this vote
                return

            yes_count = int(page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(page.xpath("string(//td[contains(@id, 'tdAbsent')])"))
            other_count = excused_count + absent_count

            passed = yes_count > no_count

            if motion.startswith("Do Pass"):
                vote_type = "passage"
            elif motion == "Concurred in amendments":
                vote_type = "amendment"
            elif motion == "Veto override":
                vote_type = "veto_override"
            else:
                vote_type = "other"

            vote = Vote(chamber, date, motion, passed, yes_count, no_count, other_count)
            vote["type"] = vote_type

            if committee:
                vote["committee"] = committee

            vote.add_source(url)

            # Each vote cell is preceded by the cell holding the member name.
            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                if td.text == "Yea":
                    vote.yes(td.getprevious().text.strip())
                elif td.text == "Nay":
                    vote.no(td.getprevious().text.strip())
                elif td.text in ("Excused", "Absent"):
                    vote.other(td.getprevious().text.strip())

            bill.add_vote(vote)
Пример #17
0
    def scrape_bill(self, session, bills):
        """Build, enrich and save the bill described by ``bills[0]``.

        ``bills[0]`` is a ``(billdata, details)`` pair. The bill itself is
        built by ``AssemblyBillPage``; this method then adds the source URL,
        companion, summary and the 'upper' chamber votes found under
        ``billdata['data']['bill']``, and saves the bill.

        Vote counts are derived from the name lists; a vote is marked passed
        when it has more ayes than nays.
        """
        billdata, details = bills[0]

        # Unpacked in full to document the expected shape of ``details``.
        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, (letter, number, is_amd)) = details

        data = billdata['data']['bill']

        assembly = AssemblyBillPage(self, session, bill_chamber, details)
        assembly.build()
        bill = assembly.bill
        bill.add_source(billdata['url'])

        # Add companion.
        if data['sameAs']:
            bill.add_companion(data['sameAs'])

        if data['summary']:
            bill['summary'] = data['summary']

        if data['votes']:
            for vote_data in data['votes']:
                vote = Vote(
                    chamber='upper',
                    date=self.date_from_timestamp(vote_data['voteDate']),
                    motion=vote_data['description'] or '[No motion available.]',
                    passed=False,
                    yes_votes=[],
                    no_votes=[],
                    other_votes=[],
                    yes_count=0,
                    no_count=0,
                    other_count=0)

                for name in vote_data['ayes']:
                    vote.yes(name)
                    vote['yes_count'] += 1
                for category in ('absent', 'excused', 'abstains'):
                    # .get() yields None for a category the record lacks;
                    # treat that as "nobody" instead of failing to iterate.
                    for name in vote_data.get(category) or ():
                        vote.other(name)
                        vote['other_count'] += 1
                for name in vote_data['nays']:
                    vote.no(name)
                    vote['no_count'] += 1

                vote['passed'] = vote['yes_count'] > vote['no_count']

                bill.add_vote(vote)

        # data['previousVersions'] holds instances of the same bill from
        # prior sessions; it is not used here.

        if not data['title']:
            bill['title'] = bill['summary']

        self.save_bill(bill)
Пример #18
0
    def scrape_votes(self, bill, link):
        """Parse every vote in the ``lblVoteData`` span at ``link`` and add each to ``bill``.

        The span text is split into one chunk per vote; each chunk is split
        on a run of ten non-breaking spaces into the motion followed by
        count lines ("Ayes...N", "Noes...N", "Present and not voting...N")
        and name lines ("... voting aye were: a, b -"). The chamber is
        "lower" when ``link`` contains "ChamberVoting=H", else "upper".
        Each vote is checked with ``vote.validate()`` before being added.

        Returns ``bill``.
        """
        # Patterns do not depend on the vote being parsed, so build them once.
        date_regex = re.compile(r"(\d+/\d+/\d+)")
        vote_regex = re.compile(r"\d+$")
        aye_regex = re.compile(r"^.+voting aye were: (.+) -")
        no_regex = re.compile(r"^.+voting no were: (.+) -")
        other_regex = re.compile(r"^.+present and not voting were: (.+) -")

        if "ChamberVoting=H" in link:
            chamber = "lower"
        else:
            chamber = "upper"

        with self.urlopen(link) as page:
            page = lxml.html.fromstring(page)
            raw_vote_data = page.xpath("//span[@id='lblVoteData']")[0].text_content()
            # Text before the first "<word> by <name> -" heading is not a vote.
            raw_vote_data = re.split(r"\w+? by [\w ]+?\s+-", raw_vote_data.strip())[1:]
            for raw_vote in raw_vote_data:
                raw_vote = raw_vote.split(u"\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")
                motion = raw_vote[0]

                # vote_date stays None when the motion carries no date.
                vote_date = date_regex.search(motion)
                if vote_date:
                    vote_date = datetime.datetime.strptime(vote_date.group(), "%m/%d/%Y")

                # A chunk may consist of the motion alone, so only look at
                # the second field when it exists.
                passed = (
                    "Passed" in motion
                    or "Recommended for passage" in motion
                    or (len(raw_vote) > 1 and "Adopted" in raw_vote[1])
                )

                yes_count = 0
                no_count = 0
                other_count = 0
                ayes = []
                nos = []
                others = []

                for v in raw_vote[1:]:
                    v = v.strip()

                    # Count lines end in the number; checked before name lines.
                    tally = vote_regex.search(v)
                    if tally and v.startswith("Ayes..."):
                        yes_count = int(tally.group())
                        continue
                    if tally and v.startswith("Noes..."):
                        no_count = int(tally.group())
                        continue
                    if tally and v.startswith("Present and not voting..."):
                        other_count += int(tally.group())
                        continue

                    found = aye_regex.search(v)
                    if found:
                        ayes = found.group(1).split(", ")
                        continue
                    found = no_regex.search(v)
                    if found:
                        nos = found.group(1).split(", ")
                        continue
                    found = other_regex.search(v)
                    if found:
                        others += found.group(1).split(", ")

                vote = Vote(chamber, vote_date, motion, passed, yes_count, no_count, other_count)
                vote.add_source(link)
                for a in ayes:
                    vote.yes(a)
                for n in nos:
                    vote.no(n)
                for o in others:
                    vote.other(o)

                vote.validate()
                bill.add_vote(vote)

        return bill
Пример #19
0
def record_votes(root, session):
    """Yield a ``Vote`` for each record vote found under *root*.

    A record vote is located through a ``div`` whose text starts with
    "Yeas" followed by an em dash.  The sentence describing the vote is read
    from the element two siblings before that ``div`` and must match the
    "<bill> was adopted|passed ... by (Record N): X Yeas, Y Nays, Z Present"
    pattern; blocks that do not match are skipped.

    Args:
        root: lxml element (or tree) to search with XPath.
        session: session identifier; only its first two characters are
            stored on the vote.

    Yields:
        ``Vote`` objects built with ``None`` chamber and date and
        ``passed=True``, carrying ``bill_id``, ``bill_chamber``, ``session``,
        ``method``, ``record`` and ``type`` keys plus the individual voters.
    """
    for el in root.xpath(u'//div[starts-with(., "Yeas \u2014")]'):
        text = ''.join(el.getprevious().getprevious().itertext())
        # str.replace returns a new string; rebind it, otherwise the
        # newlines stay in the text (and in every captured group).
        text = text.replace('\n', ' ')
        m = re.search(r'(?P<bill_id>\w+\W+\d+)(,?\W+as\W+amended,?)?\W+was\W+'
                      r'(?P<type>adopted|passed'
                      r'(\W+to\W+(?P<to>engrossment|third\W+reading))?)\W+'
                      r'by\W+\(Record\W+(?P<record>\d+)\):\W+'
                      r'(?P<yeas>\d+)\W+Yeas,\W+(?P<nays>\d+)\W+Nays,\W+'
                      r'(?P<present>\d+)\W+Present', text)
        if not m:
            # No recognisable summary sentence precedes this list of names.
            continue

        yes_count = int(m.group('yeas'))
        no_count = int(m.group('nays'))
        other_count = int(m.group('present'))

        bill_id = m.group('bill_id')
        bill_id = bill_id.replace(u'\xa0', ' ')
        # Drop the "CS" prefix so CSHB/CSSB map onto the plain bill id.
        bill_id = re.sub(r'CS(SB|HB)', r'\1', bill_id)

        if bill_id.startswith(('H', 'CSHB')):
            bill_chamber = 'lower'
        elif bill_id.startswith(('S', 'CSSB')):
            bill_chamber = 'upper'
        else:
            continue

        motion = get_motion(m)

        vote = Vote(None, None, motion, True,
                    yes_count, no_count, other_count)
        vote['bill_id'] = bill_id
        vote['bill_chamber'] = bill_chamber
        vote['session'] = session[0:2]
        vote['method'] = 'record'
        vote['record'] = m.group('record')
        vote['type'] = get_type(motion)

        # The matched div itself lists the yes voters.
        for name in names(el):
            vote.yes(name)

        el = next_tag(el)
        if el.text and el.text.startswith('Nays'):
            for name in names(el):
                vote.no(name)
            el = next_tag(el)

        # Any following "Present ..." / "Absent ..." blocks count as other.
        while el.text and re.match(r'Present|Absent', el.text):
            for name in names(el):
                vote.other(name)
            el = next_tag(el)

        # The "Present" figure from the sentence is replaced by the number
        # of other voters actually listed.
        vote['other_count'] = len(vote['other_votes'])
        yield vote
Пример #20
0
    def scrape_votes(self, bill, bill_prefix, number, session):
        """Read the votes page for one bill and add each vote to *bill*.

        The page URL is built from *session*, *bill_prefix* and *number*.
        Every link whose href contains ``JournalText`` is one vote: the link
        text is the date, the second cell of its row holds
        "<chamber> - <motion>", and the next row holds the Yea/Nay name
        tables.  Counts come from the number of names listed, and the vote is
        marked passed when yeas outnumber nays.

        Raises:
            ScrapeError: if the chamber is neither ``House`` nor ``Senate``.
        """
        vote_url = ('http://www.legislature.state.oh.us/votes.cfm?ID=' +
                    session + '_' + bill_prefix + '_' + str(number))
        chamber_names = {'House': 'lower', 'Senate': 'upper'}

        doc = lxml.html.fromstring(self.urlopen(vote_url))

        for journal_link in doc.xpath("//a[contains(@href, 'JournalText')]"):
            date = datetime.datetime.strptime(journal_link.text,
                                              "%m/%d/%Y").date()

            details = journal_link.xpath("string(../../../td[2])")
            pieces = details.split(" - ")

            chamber = pieces[0]
            if chamber not in chamber_names:
                raise ScrapeError("Bad chamber: %s" % chamber)
            chamber = chamber_names[chamber]

            # Only the first line after the chamber is the motion text.
            motion = pieces[1].split("\n")[0].strip()

            vote_row = journal_link.xpath("../../..")[0].getnext()

            yea_div = vote_row.xpath(
                "td/font/div[contains(@id, 'Yea')]")[0]
            yea_cells = (td.xpath("string()")
                         for td in yea_div.xpath("table/tr/td"))
            yeas = [name for name in yea_cells if name]

            nay_div = vote_row.xpath(
                "td/font/div[contains(@id, 'Nay')]")[0]
            nay_cells = (td.xpath("string()")
                         for td in nay_div.xpath("table/tr/td"))
            nays = [name for name in nay_cells if name]

            vote = Vote(chamber, date, motion, len(yeas) > len(nays),
                        len(yeas), len(nays), 0)

            for voter in yeas:
                vote.yes(voter)
            for voter in nays:
                vote.no(voter)

            vote.add_source(vote_url)
            bill.add_vote(vote)
Пример #21
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        """Parse a vote summary page and add the resulting vote to *bill*.

        Nothing is added when the MOTION cell cannot be found (a warning is
        logged), when the motion mentions "withdrawn", or when the page has
        neither a FINAL ACTION entry nor a "passed without objection" motion.

        Args:
            bill: bill object that receives the vote via ``add_vote``.
            vote_url: URL of the vote summary page.
            chamber: chamber value passed straight through to ``Vote``.
            date: vote date passed straight through to ``Vote``.
        """
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath('//td/b/font[text()="MOTION:"]/../../following-sibling::td/font/text()')[0]
        except IndexError:
            # Only the "no MOTION cell" case is expected here; anything else
            # (including KeyboardInterrupt) should propagate.
            self.warning("Vote Summary Page Broken ")
            return

        if 'withdrawn' in motion:
            return

        # Every table row after the one with VOTE in a td/div/b/font
        rolls = page.xpath('//tr[preceding-sibling::tr/td/div/b/font/text()="VOTE"]')

        # The last of those rows carries the totals.
        count_row = rolls[-1]

        def labelled_count(label):
            """Return the text following the bold *label* in the totals row."""
            return count_row.xpath(
                './/b/font[normalize-space(text())="%s"]'
                '/../following-sibling::font[1]/text()' % label)[0]

        yes_count = labelled_count("YES:")
        no_count = labelled_count("NO:")
        exc_count = labelled_count("EXC:")
        nv_count = labelled_count("ABS:")

        final_action = count_row.xpath(
            './/b/font[normalize-space(text())="FINAL ACTION:"]'
            '/../following-sibling::b[1]/font/text()')
        if final_action:
            final = final_action[0]
            # NOTE(review): a substring test for 'pass' would also match text
            # such as "failed to pass" - confirm the values this page emits.
            passed = 'pass' in final.lower() or int(yes_count) > int(no_count)
        elif 'passed without objection' in motion.lower():
            passed = True
            # Without a recorded tally every listed member counts as a yes.
            yes_count = len(rolls[:-2])
        else:
            self.warning("No vote breakdown found for %s" % vote_url)
            return

        other_count = int(exc_count) + int(nv_count)

        vote = Vote(chamber, date, motion, passed,
                    int(yes_count), int(no_count), int(other_count))

        # The final two rows are skipped; the rest hold one member each.
        for roll in rolls[:-2]:
            voter = roll.xpath('td[2]/div/font')[0].text_content()
            voted = roll.xpath('td[3]/div/font')[0].text_content().strip()
            if voted:
                if 'Yes' in voted:
                    vote.yes(voter)
                elif 'No' in voted:
                    vote.no(voter)
                else:
                    vote.other(voter)
            elif 'passed without objection' in motion.lower() and voter:
                vote.yes(voter)

        bill.add_vote(vote)
Пример #22
0
    def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
        """Download a vote document, convert it to text and record the vote.

        The document is converted with ``abiword`` into ``/tmp/ksvote.txt``
        and scanned line by line: a totals line creates the ``Vote``, the
        ``Yeas:``/``Nays:``/``Present``/``Absent or`` lines add the members,
        and a line containing "the motion did not prevail" marks the vote as
        failed.  The vote is added to *bill* only if a totals line was found.

        Args:
            bill: bill object that receives the vote via ``add_vote``.
            vote_date: date string in ``'%a %d %b %Y'`` format.
            vote_chamber: ``'Senate'`` maps to ``'upper'``; any other value
                maps to ``'lower'``.
            vote_status: used as the motion text of the vote.
            vote_url: URL of the vote document.
        """
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        vote_date = datetime.datetime.strptime(vote_date, '%a %d %b %Y')

        vote_doc, resp = self.urlretrieve(vote_url)

        try:
            # An argument list (no shell) hands the file name to abiword
            # verbatim, whatever characters it contains.
            subprocess.check_call(['abiword', '--to=ksvote.txt', vote_doc],
                                  cwd='/tmp/')
            with open('/tmp/ksvote.txt') as converted:
                vote_lines = converted.readlines()
        finally:
            # Remove the download even when the conversion fails.
            os.remove(vote_doc)

        totals_re = re.compile(
            r'Yeas (\d+)[;,] Nays (\d+)[;,] (?:Present but not voting:|Present and Passing) (\d+)[;,] (?:Absent or not voting:|Absent or Not Voting) (\d+)')

        def members(roster_line):
            """Return the names after the first colon, minus 'None.'."""
            roster = roster_line.split(':', 1)[1].strip()
            return [name for name in roster.split(', ') if name != 'None.']

        vote = None
        passed = True
        for line in vote_lines:
            line = line.strip()
            totals = totals_re.findall(line)
            if totals:
                yeas, nays, nv, absent = [int(n) for n in totals[0]]
                # default passed to true
                vote = Vote(vote_chamber, vote_date, vote_status,
                            True, yeas, nays, nv + absent)
            elif line.startswith('Yeas:'):
                for member in members(line):
                    vote.yes(member)
            elif line.startswith('Nays:'):
                for member in members(line):
                    vote.no(member)
            elif line.startswith(('Present ', 'Absent or')):
                for member in members(line):
                    vote.other(member)
            elif 'the motion did not prevail' in line:
                passed = False

        if vote:
            vote['passed'] = passed
            vote.add_source(vote_url)
            bill.add_vote(vote)
Пример #23
0
    def scrape_votes(self, bill, votes_url):
        """Scrape every vote-history PDF linked from *votes_url* onto *bill*.

        Each link whose href contains ``votehistory`` is one vote.  The link
        text is split on ``' - '``: the last piece is the date and, when
        there are exactly three pieces, the middle one is the motion
        (otherwise the motion is ``'Third Reading'``).  The PDF is converted
        to text and scanned for "<code>  <name>" pairs.  The PDF totals are
        not parsed: counts are the number of names found, and the vote is
        marked passed when yes votes outnumber no votes.

        Args:
            bill: bill object that receives the votes via ``add_vote``.
            votes_url: URL of the page listing the vote-history links.
        """
        html = self.urlopen(votes_url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(votes_url)

        # vote indicator (Y, N, E, NV, A, P or -), a few spaces, a name,
        # newline or multiple spaces
        VOTE_RE = re.compile(r'(Y|N|E|NV|A|P|-)\s{2,5}(\w.+?)(?:\n|\s{2})')

        for link in doc.xpath('//a[contains(@href, "votehistory")]'):

            pieces = link.text.split(' - ')
            date = pieces[-1]
            if len(pieces) == 3:
                motion = pieces[1]
            else:
                motion = 'Third Reading'

            chamber = link.xpath('../following-sibling::td/text()')[0]
            if chamber == 'HOUSE':
                chamber = 'lower'
            elif chamber == 'SENATE':
                chamber = 'upper'
            else:
                # The vote is still recorded, with the raw chamber text.
                self.warning('unknown chamber %s' % chamber)

            date = datetime.datetime.strptime(date, "%A, %B %d, %Y")

            # download the file
            fname, resp = self.urlretrieve(link.get('href'))
            try:
                pdflines = convert_pdf(fname, 'text').splitlines()
            finally:
                # Don't leave the download behind when the conversion fails.
                os.remove(fname)

            vote = Vote(chamber, date, motion.strip(), False, 0, 0, 0)

            for line in pdflines:
                for vcode, name in VOTE_RE.findall(line):
                    if vcode == 'Y':
                        vote.yes(name)
                    elif vcode == 'N':
                        vote.no(name)
                    else:
                        vote.other(name)

            # fake the counts
            vote['yes_count'] = len(vote['yes_votes'])
            vote['no_count'] = len(vote['no_votes'])
            vote['other_count'] = len(vote['other_votes'])
            vote['passed'] = vote['yes_count'] > vote['no_count']
            vote.add_source(link.get('href'))

            bill.add_vote(vote)
Пример #24
0
    def get_lower_votes(self):
        """Scrape the lower-chamber floor votes for ``self.bill_id``.

        Fetches the votes page for ``self.bill_id`` / ``self.term_start_year``
        and adds one ``'Floor Vote'`` to ``self.bill`` per ``table`` on the
        page.  Returns early (adding nothing) when the page could not be
        loaded or when it states that there are no votes for the bill.

        Each vote also gets an ``actual_vote`` mapping from the raw vote text
        to the member names, covering only votes that are neither yes nor no.
        """
        url = ('http://assembly.state.ny.us/leg/?'
               'default_fld=&bn=%s&term=%s&Votes=Y')
        url = url % (self.bill_id, self.term_start_year)
        doc = self.url2lxml(url)
        if doc is None:
            return

        pre = doc.xpath('//pre')[0].text_content()
        no_votes = ('There are no votes for this bill in this '
                    'legislative session.')
        if pre == no_votes:
            return

        for table in doc.xpath('//table'):
            # One mapping per vote: a single shared dict would make every
            # vote report the "other" voters of all tables on the page.
            actual_vote = collections.defaultdict(list)

            date = table.xpath('caption/label[contains(., "DATE:")]')
            # The value is held by the element right after the label.
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')

            tally = table.xpath('caption/span/label[contains(., "YEA/NAY:")]')
            tally = next(tally[0].itersiblings()).text
            yes_count, no_count = map(int, tally.split('/'))

            passed = yes_count > no_count
            vote = Vote('lower', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count=0)

            # Cell texts alternate name, vote, name, vote, ...; zipping the
            # iterator with itself pairs them and drops an unpaired last cell.
            cells = iter(table.xpath('tr/td/text()'))
            for name, vote_val in zip(cells, cells):
                name = self._scrub_name(name)
                choice = vote_val.strip()

                if choice == 'Y':
                    vote.yes(name)
                elif choice in ('N', 'NO'):
                    vote.no(name)
                else:
                    vote.other(name)
                    actual_vote[vote_val].append(name)

            # The page doesn't provide an other_count.
            vote['other_count'] = len(vote['other_votes'])
            vote['actual_vote'] = actual_vote
            self.bill.add_vote(vote)
Пример #25
0
    def parse_vote(self, bill, actor, date, motion, url, uniqid):
        """Fetch a vote page, parse its YEAS/NAYS/ABSENT sections and add it.

        *url* is added as a source of both *bill* and the vote.  The vote is
        marked passed when yeas outnumber nays.  When *actor* is ``"upper"``
        or ``"lower"`` it is used as the vote chamber; otherwise the chamber
        is left empty and *actor* is stored as the vote ``location``.
        *uniqid* is passed to ``Vote`` as ``_vote_id``.
        """
        page = self.get(url).text
        bill.add_source(url)

        # Groups 1/3/6 are the counts; groups 2/4/7 are the name blocks that
        # follow each count.
        vote_re = re.compile(
            r"YEAS -?\s?(\d+)(.*)NAYS -?\s?(\d+)"
            r"(.*)ABSENT( OR NOT VOTING)? -?\s?"
            r"(\d+)(.*)",
            re.MULTILINE | re.DOTALL,
        )
        match = vote_re.search(page)
        yes_count, no_count, other_count = [
            int(number) for number in match.group(1, 3, 6)
        ]
        passed = yes_count > no_count

        if actor in ("upper", "lower"):
            vote_chamber, vote_location = actor, ""
        else:
            vote_chamber, vote_location = "", actor

        vote = Vote(
            vote_chamber,
            date,
            motion,
            passed,
            yes_count,
            no_count,
            other_count,
            location=vote_location,
            _vote_id=uniqid,
        )
        vote.add_source(url)

        # Names inside each block are separated by runs of two or more
        # whitespace characters; empty fragments are ignored.
        recorders = ((vote.yes, 2), (vote.no, 4), (vote.other, 7))
        for record, group_index in recorders:
            block = match.group(group_index).strip()
            for name in re.split(r"\s{2,}", block):
                if name:
                    record(name)

        bill.add_vote(vote)
Пример #26
0
    def scrape_vote(self, bill, chamber, url):
        """Scrape one roll-call page and add the vote to *bill*.

        Returns without adding anything when the page says no details are
        available.  The second row of the first table supplies date, motion,
        the three counts and the outcome (``'Pass'`` means passed); the rows
        from the fourth onward that have exactly two cells are member votes.

        Args:
            bill: bill object that receives the vote via ``add_vote``.
            chamber: chamber value passed straight through to ``Vote``.
            url: URL of the roll-call page; also recorded as the vote source.
        """
        page = self.urlopen(url)
        if 'There are no details available for this roll call' in page:
            return
        page = page.replace('&nbsp;', ' ')
        page = lxml.html.fromstring(page)

        info_row = page.xpath("//table[1]/tr[2]")[0]

        date = info_row.xpath("string(td[1])")
        date = datetime.datetime.strptime(date, "%m/%d/%Y")

        motion = info_row.xpath("string(td[2])")
        yes_count = int(info_row.xpath("string(td[3])"))
        no_count = int(info_row.xpath("string(td[4])"))
        other_count = int(info_row.xpath("string(td[5])"))
        passed = info_row.xpath("string(td[6])") == 'Pass'

        vote = Vote(chamber, date, motion, passed,
                    yes_count, no_count, other_count)
        vote.add_source(url)

        for tr in page.xpath("//table[1]/tr")[3:]:
            if len(tr.xpath("td")) != 2:
                continue

            # avoid splitting duplicate names
            name = tr.xpath("string(td[1])").strip()
            if not name.startswith(DOUBLED_NAMES):
                name = name.split(' of')[0]

            cast = tr.xpath("string(td[2])").strip()
            if cast.startswith('Yea'):
                vote.yes(name)
            elif cast.startswith('Nay'):
                vote.no(name)
            elif cast.startswith('Not Voting'):
                # Members marked "Not Voting" are not recorded individually.
                pass
            else:
                vote.other(name)

        bill.add_vote(vote)
Пример #27
0
    def scrape_vote(self, bill, motion, url):
        """Scrape a roll-call page made of labelled cells and add the vote.

        Counts, date and outcome are read from the cell that follows each
        label cell (``Yeas (Y):``, ``Nays (N):``, ``Absent (X):``,
        ``Excused (E):``, ``Date:``, ``Outcome:``).  Absent and excused are
        summed into the other count, and the vote passes when the outcome is
        ``'PREVAILS'``.  The chamber is taken from the ``chamber=`` marker in
        *url*.

        Args:
            bill: bill object that receives the vote via ``add_vote``.
            motion: motion text passed straight through to ``Vote``.
            url: URL of the roll-call page; also recorded as the vote source.

        Raises:
            ValueError: if *url* names neither the House nor the Senate.
        """
        page = self.urlopen(url, retry_on_404=True)
        page = lxml.html.fromstring(page)

        def value_after(label):
            """Return the text of the cell following the *label* cell."""
            cell = page.xpath("//td[text() = '%s']" % label)[0]
            return cell.xpath("string(following-sibling::td)")

        yes_count = int(value_after('Yeas (Y):'))
        no_count = int(value_after('Nays (N):'))
        abs_count = int(value_after('Absent (X):'))
        ex_count = int(value_after('Excused (E):'))

        other_count = abs_count + ex_count

        if 'chamber=House' in url:
            chamber = 'lower'
        elif 'chamber=Senate' in url:
            chamber = 'upper'
        else:
            # Fail with a clear message instead of an unbound local later on.
            raise ValueError('cannot determine chamber from %s' % url)

        date = value_after('Date:')
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            # Some dates use an abbreviated month followed by a period.
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome = value_after('Outcome:')

        vote = Vote(chamber, date, motion,
                    outcome == 'PREVAILS',
                    yes_count, no_count, other_count)
        vote.add_source(url)

        member_cell = page.xpath("//td[text() = 'Member']")[0]
        # Skip the header row of the member table.
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")

            vtype = row.xpath("string(td[4])")
            if vtype == 'Y':
                vote.yes(name)
            elif vtype == 'N':
                vote.no(name)
            elif vtype in ('X', 'E'):
                vote.other(name)

        bill.add_vote(vote)
Пример #28
0
    def _parse_senate_votes(self, vote_data):
        """Build an upper-chamber ``Vote`` from one vote record.

        Args:
            vote_data: mapping with ``voteDate`` (``YYYY-MM-DD``),
                ``voteType`` (``'FLOOR'`` or ``'COMMITTEE'``), ``committee``
                (read only for committee votes) and ``memberVotes``, whose
                ``items`` map vote codes to ``{'items': [legislators]}``.

        Returns:
            The ``Vote``, with counts equal to the number of legislators
            listed under each code and ``passed`` set when yes votes
            outnumber no votes.

        Raises:
            ValueError: if ``voteType`` is neither ``FLOOR`` nor
                ``COMMITTEE``.
        """
        vote_datetime = datetime.datetime.strptime(vote_data['voteDate'],
            '%Y-%m-%d')

        vote = Vote(
            chamber='upper',
            date=vote_datetime.date(),
            motion='[No motion available.]',
            passed=False,
            yes_votes=[],
            no_votes=[],
            other_votes=[],
            yes_count=0,
            no_count=0,
            other_count=0)

        if vote_data['voteType'] == 'FLOOR':
            vote['motion'] = 'Floor Vote'
        elif vote_data['voteType'] == 'COMMITTEE':
            vote['motion'] = '{} Vote'.format(vote_data['committee']['name'])
        else:
            raise ValueError('Unknown vote type encountered.')

        vote_rolls = vote_data['memberVotes']['items']

        # (vote code, method recording the voter, counter to increment)
        tallies = (
            ('AYE', vote.yes, 'yes_count'),
            ('AYEWR', vote.yes, 'yes_count'),
            ('NAY', vote.no, 'no_count'),
            ('EXC', vote.other, 'other_count'),
            ('ABS', vote.other, 'other_count'),
            ('ABD', vote.other, 'other_count'),
        )
        for code, record, counter in tallies:
            # Every code gets the same guard: a missing code, or an entry
            # without 'items', simply contributes no voters.
            roll = vote_rolls.get(code) or {}
            for legislator in roll.get('items') or ():
                record(legislator['fullName'])
                vote[counter] += 1

        vote['passed'] = vote['yes_count'] > vote['no_count']

        return vote
Пример #29
0
    def scrape_votes(self, bill, page):
        """Add an upper-chamber 'Floor Vote' to *bill* for each vote block.

        A vote block starts at a bold element whose text begins with
        ``VOTE: FLOOR VOTE:``; the date follows the first ``-`` in that text.
        Inside the sibling blockquote(s), bold headings (``Ayes``, ``Nays``,
        ``Excused``, ``Abstains``, ``Absent``) carry a ``(N):`` count and
        switch the current category, and each link is a member name filed
        under the current category.  The vote passes only when the yes count
        exceeds the no and other counts combined.

        Raises:
            ValueError: if a bold heading is not one of the known categories.
        """
        for header in page.xpath("//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"):
            date_text = header.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date_text, "%b %d, %Y").date()

            voters = {'yes': [], 'no': [], 'other': []}
            counts = {'yes': 0, 'no': 0, 'other': 0}

            category = None
            for tag in header.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    heading = tag.text
                    if heading.startswith('Ayes'):
                        category = 'yes'
                    elif heading.startswith('Nays'):
                        category = 'no'
                    elif heading.startswith(('Excused', 'Abstains', 'Absent')):
                        category = 'other'
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)

                    tally = int(re.search(r'\((\d+)\):', heading).group(1))
                    if category == 'other':
                        # Several headings feed the "other" total.
                        counts['other'] += tally
                    else:
                        counts[category] = tally
                elif tag.tag == 'a':
                    member = tag.text.strip()
                    if category is not None:
                        voters[category].append(member)

            passed = counts['yes'] > (counts['no'] + counts['other'])

            vote = Vote('upper', date, 'Floor Vote', passed, counts['yes'],
                        counts['no'], counts['other'])

            for member in voters['yes']:
                vote.yes(member)
            for member in voters['no']:
                vote.no(member)
            for member in voters['other']:
                vote.other(member)

            bill.add_vote(vote)
Пример #30
0
    def scrape_votes(self, bill_page, bill, insert, year):
        """Scrape the passage votes linked from a bill page onto *bill*.

        Every link whose text contains "Passage" is one vote.  The link text
        is the motion; a motion mentioning "Assembly" is a lower-chamber
        vote, anything else upper.  The vote report is fetched from the
        ``Reports`` directory of session *insert*, added as a bill source and
        parsed for the date, the totals and the per-member table.  The vote
        is marked passed when yeas outnumber nays.

        Args:
            bill_page: HTML text of the bill page.
            bill: bill object that receives sources and votes.
            insert: session path segment used to build the report URL.
            year: not used by this method.
        """
        root = lxml.html.fromstring(bill_page)
        for link in root.xpath('//a[contains(text(), "Passage")]'):
            motion = link.text
            if "Assembly" in motion:
                chamber = "lower"
            else:
                chamber = "upper"
            vote_url = "http://www.leg.state.nv.us/Session/%s/Reports/%s" % (insert, link.get("href"))
            bill.add_source(vote_url)
            with self.urlopen(vote_url) as page:
                page = page.decode("utf8").replace(u"\xa0", " ")
                vote_root = lxml.html.fromstring(page)

                date_text = vote_root.xpath("//h1/text()")[-1].strip()
                try:
                    # %p only affects the hour when it is parsed with %I, so
                    # a 12-hour parse is needed to honour AM/PM.
                    date = datetime.strptime(date_text, "%B %d, %Y at %I:%M %p")
                except ValueError:
                    # Hour outside 1-12: keep accepting it as a 24-hour time.
                    date = datetime.strptime(date_text, "%B %d, %Y at %H:%M %p")

                top_block_text = vote_root.xpath('//div[@align="center"]')[0].text_content()
                yes_count = int(re.findall(r"(\d+) Yea", top_block_text)[0])
                no_count = int(re.findall(r"(\d+) Nay", top_block_text)[0])
                excused = int(re.findall(r"(\d+) Excused", top_block_text)[0])
                not_voting = int(re.findall(r"(\d+) Not Voting", top_block_text)[0])
                absent = int(re.findall(r"(\d+) Absent", top_block_text)[0])
                other_count = excused + not_voting + absent
                passed = yes_count > no_count

                vote = Vote(
                    chamber,
                    date,
                    motion,
                    passed,
                    yes_count,
                    no_count,
                    other_count,
                    not_voting=not_voting,
                    absent=absent,
                )

                # Second table: member name in the 2nd cell, vote in the 3rd.
                for el in vote_root.xpath("//table[2]/tr"):
                    tds = el.xpath("td")
                    name = tds[1].text_content().strip()
                    vote_result = tds[2].text_content().strip()

                    if vote_result == "Yea":
                        vote.yes(name)
                    elif vote_result == "Nay":
                        vote.no(name)
                    else:
                        vote.other(name)
                bill.add_vote(vote)
Пример #31
0
    def parse_house_vote(self, url):
        """Parse a house vote PDF into a ``Vote``.

        House votes are PDFs that can be converted to text; regexes are
        needed to get the votes out reliably.  The passage indicator is
        faked: the vote counts as passed when yeas outnumber nays.

        Args:
            url: URL of the vote PDF; also recorded as the vote source.

        Returns:
            The ``Vote`` (chamber ``'lower'``, motion ``'house passage'``),
            or ``None`` when the PDF yields no text (an image PDF), in which
            case a warning is logged.
        """
        fname, resp = self.urlretrieve(url)
        try:
            text = convert_pdf(fname, 'text')
        finally:
            # Remove the download on every path, including the early return
            # below and a failed conversion.
            os.remove(fname)

        if not text.strip():
            self.warning('image PDF %s' % url)
            return

        # get date
        date = re.findall(r'(\d+/\d+/\d+)', text)[0]
        date = datetime.strptime(date, '%m/%d/%y')

        # get totals
        # NOTE(review): HOUSE_TOTAL_RE is read from self while HOUSE_VOTE_RE
        # below is a bare name - confirm both exist where they are looked up.
        absent, yea, nay, exc = self.HOUSE_TOTAL_RE.findall(text)[0]

        # make vote (faked passage indicator)
        vote = Vote('lower', date, 'house passage', int(yea) > int(nay),
                    int(yea), int(nay), int(absent)+int(exc))
        vote.add_source(url)

        # votes
        real_votes = False
        for v, name in HOUSE_VOTE_RE.findall(text):
            # our regex is a bit broad, wait until we see 'Nays' to start
            # and end when we see CERTIFIED or ____ signature line
            if 'Nays' in name or 'Excused' in name:
                real_votes = True
                continue
            elif 'CERTIFIED' in name or '___' in name:
                break
            elif real_votes and name.strip():
                if v == 'Y':
                    vote.yes(name)
                elif v == 'N':
                    vote.no(name)
                else:   # excused/absent
                    vote.other(name)
        return vote
Пример #32
0
    def scrape_votes(self, bill_page, bill, insert, year):
        """Scrape the passage votes linked from a bill page onto *bill*.

        Every link whose text contains "Passage" is one vote.  The link text
        is the motion; a motion mentioning 'Assembly' is a lower-chamber
        vote, anything else upper.  The vote report is fetched from the
        ``Reports`` directory of session *insert*, added as a bill source and
        parsed for the date, the totals and the per-member table.  The vote
        is marked passed when yeas outnumber nays.

        Args:
            bill_page: HTML text of the bill page.
            bill: bill object that receives sources and votes.
            insert: session path segment used to build the report URL.
            year: not used by this method.
        """
        root = lxml.html.fromstring(bill_page)
        for link in root.xpath('//a[contains(text(), "Passage")]'):
            motion = link.text
            if 'Assembly' in motion:
                chamber = 'lower'
            else:
                chamber = 'upper'
            vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link.get('href'))
            bill.add_source(vote_url)
            page = self.urlopen(vote_url)
            page = page.replace(u"\xa0", " ")
            vote_root = lxml.html.fromstring(page)

            date_text = vote_root.xpath('//h1/text()')[-1].strip()
            try:
                # %p only affects the hour when it is parsed with %I, so a
                # 12-hour parse is needed to honour AM/PM.
                date = datetime.strptime(date_text, "%B %d, %Y at %I:%M %p")
            except ValueError:
                # Hour outside 1-12: keep accepting it as a 24-hour time.
                date = datetime.strptime(date_text, "%B %d, %Y at %H:%M %p")

            top_block_text = vote_root.xpath('//div[@align="center"]')[0].text_content()
            yes_count = int(re.findall(r"(\d+) Yea", top_block_text)[0])
            no_count = int(re.findall(r"(\d+) Nay", top_block_text)[0])
            excused = int(re.findall(r"(\d+) Excused", top_block_text)[0])
            not_voting = int(re.findall(r"(\d+) Not Voting", top_block_text)[0])
            absent = int(re.findall(r"(\d+) Absent", top_block_text)[0])
            other_count = excused + not_voting + absent
            passed = yes_count > no_count

            vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                        other_count, not_voting=not_voting, absent=absent)

            # Second table: member name in the 2nd cell, vote in the 3rd.
            for el in vote_root.xpath('//table[2]/tr'):
                tds = el.xpath('td')
                name = tds[1].text_content().strip()
                vote_result = tds[2].text_content().strip()

                if vote_result == 'Yea':
                    vote.yes(name)
                elif vote_result == 'Nay':
                    vote.no(name)
                else:
                    vote.other(name)
            bill.add_vote(vote)
Пример #33
0
    def scrape_vote(self, bill, moid, vote_id, body, inst, motion, chamber):
        """Fetch one roll-call result page and add the vote to *bill*.

        The page is requested with *moid*, *vote_id*, *body*, *inst* and
        ``self.session_id``.  Each cell with ``width="33%"`` is a label or a
        legislator name, and the cell two positions after it holds the
        matching value: a total, the legislative date, or the member's vote
        code (``Y``, ``N``, ``P`` or ``A``).  The other count is the absent
        total plus the number of ``P`` voters.
        """
        url = "http://alisondb.legislature.state.al.us/acas/GetRollCallVoteResults.asp?MOID=%s&VOTE=%s&BODY=%s&INST=%s&SESS=%s" % (
            moid, vote_id, body, inst, self.session_id)
        doc = lxml.html.fromstring(self.urlopen(url))

        # Legislator names grouped by the vote code shown on the page.
        voters = {'Y': [], 'N': [], 'P': [], 'A': []}

        for cell in doc.xpath('//td[@width="33%"]'):
            name = cell.text
            two_after = cell.xpath('following-sibling::td')[1].text
            if name == 'Total Yea:':
                total_yea = int(two_after)
            elif name == 'Total Nay:':
                total_nay = int(two_after)
            elif name == 'Total Abs:':
                total_abs = int(two_after)
            elif name == 'Legislative Date:':
                vote_date = datetime.datetime.strptime(two_after, '%m/%d/%Y')
            elif name in ('Legislative Day:', 'Vote ID:') or 'Vacant' in name:
                # Labels that carry no vote data, and vacant seats.
                continue
            else:
                # Anything else is a legislator; file under the vote code.
                voters[two_after].append(name)

        # TODO: passed is faked
        passed = total_yea > total_nay
        total_other = total_abs + len(voters['P'])
        vote = Vote(chamber, vote_date, motion, passed,
                    total_yea, total_nay, total_other)
        vote.add_source(url)

        for member in voters['Y']:
            vote.yes(member)
        for member in voters['N']:
            vote.no(member)
        for code in ('A', 'P'):
            for member in voters[code]:
                vote.other(member)

        bill.add_vote(vote)
Пример #34
0
 def scrape(self, chamber, session):
     """Scrape and save every vote listed for *chamber*.

     The votes page for the chamber is posted to once per date returned by
     ``get_dates``; each parsed result is a mapping whose values describe
     one vote (``count``, ``meta``, ``time``, ``source``, ``votes``).  Each
     of those becomes a ``Vote`` that is passed to ``save_vote``.

     The chamber recorded on a vote comes from the vote's own metadata
     (``H``/``S``), not from the *chamber* argument.
     """
     chamber_codes = {
         "H": "lower",
         "S": "upper"
     }
     base_url = {
         "upper": "%s/%s" % (RI_URL_BASE, "SVotes"),
         "lower": "%s/%s" % (RI_URL_BASE, "HVotes")
     }[chamber]
     action = "%s/%s" % (base_url, "votes.asp")

     for date in self.get_dates(base_url):
         listing = self.post_to(action, date)
         for vote_dict in self.parse_vote_page(listing, base_url, session):
             for key in vote_dict:
                 record = vote_dict[key]
                 count = record['count']
                 vote_chamber = chamber_codes[record['meta']['chamber']]
                 v = Vote(
                     vote_chamber,
                     record['time'],
                     record['meta']['extra']['motion'],
                     count['passage'],
                     int(count['YEAS']),
                     int(count['NAYS']),
                     int(count['NOT VOTING']),
                     session=session,
                     bill_id=record['meta']['bill'],
                     bill_chamber=vote_chamber,
                     bill_session=record['meta']['year'],
                 )
                 v.add_source(record['source'])
                 for ballot in record['votes']:
                     choice = ballot['vote']
                     if choice == "Y":
                         v.yes(ballot['name'])
                     elif choice == "N":
                         v.no(ballot['name'])
                     else:
                         v.other(ballot['name'])
                 self.save_vote(v)
Пример #35
0
    def scrape_votes(self, bill):
        """Fetch the roll calls for *bill* from the web service and add them.

        The bill number is the second whitespace-separated token of
        ``bill['bill_id']``.  Each ``wa:RollCall`` element yields one vote;
        absent and excused counts are summed into the other count, and the
        vote passes only when yeas exceed nays and others combined.
        """
        bill_num = bill['bill_id'].split()[1]
        chambers = {'House': 'lower', 'Senate': 'upper'}

        url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
               "GetRollCalls?billNumber=%s&biennium=%s" %
               (bill_num, self.biennium))
        response = self.get(url)
        doc = lxml.etree.fromstring(response.content)

        for roll_call in xpath(doc, "//wa:RollCall"):
            motion = xpath(roll_call, "string(wa:Motion)")

            # Keep only the date part of the timestamp (before the "T").
            day = xpath(roll_call, "string(wa:VoteDate)").split("T")[0]
            date = datetime.datetime.strptime(day, "%Y-%m-%d").date()

            yes_count = int(xpath(roll_call, "string(wa:YeaVotes/wa:Count)"))
            no_count = int(xpath(roll_call, "string(wa:NayVotes/wa:Count)"))
            abs_count = int(xpath(roll_call, "string(wa:AbsentVotes/wa:Count)"))
            ex_count = int(xpath(roll_call, "string(wa:ExcusedVotes/wa:Count)"))
            other_count = abs_count + ex_count

            agency = xpath(roll_call, "string(wa:Agency)")
            chamber = chambers[agency]

            passed = yes_count > (no_count + other_count)
            vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                        other_count)

            for member_vote in xpath(roll_call, "wa:Votes/wa:Vote"):
                name = xpath(member_vote, "string(wa:Name)")
                # NOTE(review): the element name is spelled "VOte" here,
                # presumably mirroring the service's schema - confirm
                # before "correcting" it.
                cast = xpath(member_vote, "string(wa:VOte)")

                if cast == 'Yea':
                    vote.yes(name)
                elif cast == 'Nay':
                    vote.no(name)
                else:
                    vote.other(name)

            bill.add_vote(vote)
Пример #36
0
    def parse_vote(self, bill, link):
        """Scrape the roll-call page at ``link`` and attach a Vote to ``bill``.

        The first ``h3`` under ``#main_content`` is read as the header
        (chamber first, date last, status in between); every ``h3`` carrying
        a parenthesised number contributes a yea/nay/present/absent count.
        Member names come from the anchors under ``#main_content`` and are
        assigned by position: the first ``yes_count`` names are yeas, the next
        ``no_count`` are nays, the rest are "other".  The first anchor is
        skipped.

        When the page has no ``h3`` headings a warning is logged and nothing
        is added.

        :param bill: object exposing ``add_vote``
        :param link: URL of the vote page; also recorded as the vote's source
        """
        member_doc = lxml.html.fromstring(self.get(link).text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
        if not opinions:
            self.warning("No Votes for: %s", link)
            return

        header = opinions[0].split()
        vote_chamber = 'upper' if header[0] == 'Senate' else 'lower'
        vote_date = datetime.datetime.strptime(header[-1], '%m/%d/%Y')
        vote_status = " ".join(header[2:-2])
        # Fall back to the h4 motion text when the header carries no status.
        vote_status = vote_status if vote_status.strip() else motion[0]

        # Every tally starts at zero so that a heading missing from the page
        # no longer leaves a name unbound when the Vote is built.
        counts = {'yea': 0, 'nay': 0, 'present': 0, 'absent': 0}
        for heading in opinions:
            try:
                count = int(heading[heading.find("(") + 1:heading.find(")")])
            except ValueError:
                # No parseable "(N)" in this heading: skip it rather than
                # reuse the number parsed from a previous heading.
                continue
            lowered = heading.lower()
            for key in ('yea', 'nay', 'present', 'absent'):
                if key in lowered:
                    counts[key] = count
                    break

        yes_count = counts['yea']
        no_count = counts['nay']
        vote = Vote(vote_chamber, vote_date, vote_status,
                    yes_count > no_count, yes_count, no_count,
                    counts['present'] + counts['absent'])
        vote.add_source(link)

        a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
        for position in range(1, len(a_links)):
            # Commas are dropped and only the first whitespace-separated
            # token of the anchor text is used as the name.
            name = re.sub(',', '', a_links[position]).split()[0]
            if position <= yes_count:
                vote.yes(name)
            elif position <= yes_count + no_count:
                vote.no(name)
            else:
                vote.other(name)
        bill.add_vote(vote)
Пример #37
0
    def scrape_vote(self, bill, chamber, date, url):
        """Parse a roll-call PDF and attach the resulting Vote to ``bill``.

        The PDF at ``url`` is downloaded, converted to text and deleted.  The
        motion is the fifth line of the text; totals come from the
        "Yeas - N", "Nays - N" and "Not Voting - N" markers.  Individual
        entries are read from the tenth line onward until a line containing
        'after roll call'.  The vote is validated before being added.
        """
        (path, resp) = self.urlretrieve(url)
        text = convert_pdf(path, 'text')
        os.remove(path)

        lines = text.split('\n')
        motion = lines[4].strip()

        def tally(pattern):
            # A missing marker makes re.search return None, which raises
            # AttributeError here.
            return int(re.search(pattern, text).group(1))

        yes_count = tally(r'Yeas - (\d+)')
        no_count = tally(r'Nays - (\d+)')
        other_count = tally(r'Not Voting - (\d+)')
        passed = yes_count > (no_count + other_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[9:]:
            if 'after roll call' in line:
                break
            if 'Presiding' in line:
                continue

            # Columns on a line are separated by "-<digits>" markers.
            for column in re.split(r'-\d+', line):
                entry = column.strip()
                if not entry:
                    continue

                match = re.match(r'(Y|N|EX)\s+(.+)$', entry)
                if match is None:
                    # No recognised vote code: the whole entry is recorded
                    # as an "other" vote.
                    vote.other(entry)
                    continue

                code, name = match.group(1), match.group(2)
                if code == 'Y':
                    vote.yes(name)
                elif code == 'N':
                    vote.no(name)
                else:
                    vote.other(name)

        vote.validate()
        bill.add_vote(vote)
Пример #38
0
 def apply_votes(self, bill):
     """Given a bill (and assuming it has a status_url in its dict), parse
     all of the votes and attach them to the bill.

     Each entry returned by ``votes.all_votes_for_url`` is unpacked as
     ``(chamber, vote_desc, pdf_url, these_votes)``, where ``these_votes``
     maps a voter to ``'Y'``, ``'N'`` or anything else (counted as "other").
     The date is the text after the last ``-`` in ``vote_desc``; entries
     without a ``-`` are reported with a warning and skipped.
     """
     bill_votes = votes.all_votes_for_url(self, bill['status_url'])
     for (chamber, vote_desc, pdf_url, these_votes) in bill_votes:
         # str.split() always returns at least one element, so indexing its
         # result can never raise IndexError; check for the separator
         # explicitly so the warn-and-skip path is actually reachable.
         if "-" not in vote_desc:
             self.warning("[%s] Couldn't get date out of [%s]" %
                          (bill['bill_id'], vote_desc))
             continue
         date = vote_desc.split("-")[-1]

         yes_votes = []
         no_votes = []
         other_votes = []
         # .items() works on both Python 2 and Python 3 mappings.
         for voter, cast in these_votes.items():
             if cast == 'Y':
                 yes_votes.append(voter)
             elif cast == 'N':
                 no_votes.append(voter)
             else:
                 other_votes.append(voter)

         # not necessarily correct, but not sure where else to get it.
         # maybe from pdf
         passed = len(yes_votes) > len(no_votes)
         vote = Vote(standardize_chamber(chamber),
                     date,
                     vote_desc,
                     passed,
                     len(yes_votes),
                     len(no_votes),
                     len(other_votes),
                     pdf_url=pdf_url)
         for voter in yes_votes:
             vote.yes(voter)
         for voter in no_votes:
             vote.no(voter)
         for voter in other_votes:
             vote.other(voter)
         bill.add_vote(vote)
Пример #39
0
def record_votes(root, session):
    """Yield a Vote for every valid vote element found under ``root``.

    Candidate ``div`` elements are selected with the module-level
    ``vote_selectors`` and wrapped in ``MaybeVote``; invalid ones are
    skipped.  Each yielded Vote has no chamber and no date, uses the motion
    'passage' when the vote passed and 'other' otherwise, and stores the
    first two characters of ``session`` under ``'session'``.
    """
    selector = '//div{}'.format(''.join(vote_selectors))
    for node in root.xpath(selector):
        candidate = MaybeVote(node)
        if not candidate.is_valid:
            continue

        motion = 'passage' if candidate.passed else 'other'
        # Missing tallies are stored as 0.
        record = Vote(None, None, motion, candidate.passed,
                      candidate.yeas or 0, candidate.nays or 0,
                      candidate.present or 0)
        record['bill_id'] = candidate.bill_id
        record['bill_chamber'] = candidate.chamber
        record['is_amendment'] = candidate.is_amendment
        record['session'] = session[0:2]
        record['method'] = 'record'

        tallies = candidate.votes
        for name in tallies['yeas']:
            record.yes(name)
        for name in tallies['nays']:
            record.no(name)
        # Present and absent members are both recorded as "other".
        for name in tallies['present'] + tallies['absent']:
            record.other(name)

        yield record
Пример #40
0
    def scrape_vote(self, chamber, session, bill_id, vote_url):
        """Scrape one vote page and save it with ``self.save_vote``.

        Returns without saving when the request was redirected to the
        "no vote found" page.  The Vote is always created with 'lower' as its
        chamber and an "other" count of 0; ``chamber`` is stored as the
        bill's chamber.
        """
        NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
        with self.urlopen(vote_url) as html:

            # a broken link redirects to NO_VOTE_URL
            if html.response.url == NO_VOTE_URL:
                return

            doc = lxml.html.fromstring(html)
            paragraphs = doc.xpath('//h1/following-sibling::p')

            # The first paragraph carries the motion and the totals.
            summary_lines = paragraphs[0].text_content().splitlines()
            # The motion is the third line, or the second when the third is
            # empty.
            motion = summary_lines[2] or summary_lines[1]
            # The last line is matched against self.yeanay_re for the totals.
            raw_yeas, raw_nays = self.yeanay_re.match(
                summary_lines[-1]).groups()
            yeas = int(raw_yeas)
            nays = int(raw_nays)

            # The second paragraph carries the date.
            date_text = self.date_re.match(
                paragraphs[1].text_content()).groups()[0]
            date = datetime.datetime.strptime(date_text, '%m/%d/%Y')

            vote = Vote('lower', date, motion, yeas > nays, yeas, nays, 0,
                        session=session, bill_id=bill_id,
                        bill_chamber=chamber)
            vote.add_source(vote_url)

            # The first table lists the yeas, the second the nays.
            name_tables = (
                ('//table[1]/tr/td/font/text()', vote.yes),
                ('//table[2]/tr/td/font/text()', vote.no),
            )
            for table_path, record in name_tables:
                for name in doc.xpath(table_path):
                    record(name.strip())

            self.save_vote(vote)
Пример #41
0
    def scrape_vote(self, bill, vote_type_id, vote_type):
        """Scrape the voting page for ``bill`` and attach the resulting Vote.

        The page is selected by ``vote_type_id`` and ``bill['bill_id']``;
        ``vote_type`` is used as the motion.  The Vote is always created with
        'upper' as its chamber.
        """
        base_url = 'http://www.dccouncil.washington.dc.us/lims/voting.aspx?VoteTypeID=%s&LegID=%s'
        url = base_url % (vote_type_id, bill['bill_id'])

        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            def first(path):
                # First node matched by the XPath; IndexError when none is.
                return doc.xpath(path)[0]

            vote_date = convert_date(doc.get_element_by_id('VoteDate').text)

            # The voice-vote and approved boxes are "checked" with an 'x'.
            voice = first('//span[@id="VoteTypeVoice"]/b/text()') == 'x'
            passed = first('//span[@id="VoteResultApproved"]/b/text()') == 'x'

            yes_count = extract_int(first('//span[@id="VoteCount1"]/b/text()'))
            no_count = extract_int(first('//span[@id="VoteCount2"]/b/text()'))
            # "Other" is whatever remains of 13 after yeas and nays, clamped
            # at zero because the published counts are occasionally wrong.
            # NOTE(review): 13 is presumably the number of members -- confirm.
            other_count = max(13 - (yes_count + no_count), 0)

            vote = Vote('upper', vote_date, vote_type, passed, yes_count,
                        no_count, other_count, voice_vote=voice)
            vote.add_source(url)

            # Member names are the only text on the page inside <u> tags;
            # the cast vote is read from an <i> two levels up.
            for underlined in doc.xpath('//u'):
                member = underlined.text
                cast = underlined.xpath('../../i/text()')[0]
                if 'Yes' in cast:
                    vote.yes(member)
                elif 'No' in cast:
                    vote.no(member)
                else:
                    vote.other(member)
        bill.add_vote(vote)
Пример #42
0
    def scrape_vote(self, bill, chamber, date, td):
        """Build a Vote from a table cell and attach it to ``bill``.

        The cell's leading text is the motion.  Its first ``span`` holds the
        result: the first word decides whether the vote passed and a trailing
        ``N-N-N`` triple gives the yes/no/other counts.  Names are read from
        the text following the 'AYES', 'NAYS' and 'Absent' spans and the
        per-category name counts are asserted to equal the stated totals.
        """
        motion = td.text
        result = td.xpath("string(span[1])").strip()
        passed = result.split()[0] == "PASSED"
        yes, no, other = map(
            int, re.search(r'(\d+)-(\d+)-(\d+)$', result).groups())

        vote = Vote(chamber, date, motion, passed, yes, no, other)

        # Each name list is the tail text of the span labelling it.
        name_sources = (
            ("span[. = 'AYES']", vote.yes),
            ("span[. = 'NAYS']", vote.no),
            ("span[contains(., 'Absent')]", vote.other),
        )
        for span_path, record in name_sources:
            for name in split_names(td.xpath(span_path)[0].tail):
                record(name)

        # Sanity checks: scraped names must add up to the stated counts.
        assert len(vote['yes_votes']) == vote['yes_count']
        assert len(vote['no_votes']) == vote['no_count']
        assert len(vote['other_votes']) == vote['other_count']

        bill.add_vote(vote)
Пример #43
0
    def parse_vote(self, actor, date, row):
        """
        Build and return a Vote from the actor, date and row element.

        The row's leading text (non-breaking spaces turned into spaces and
        dashes removed) is the motion, defaulting to "passage" when empty.
        The first span holds "<result>-<yes>-<no>-<other>"; the second and
        third spans are followed by the yes and no names; later spans whose
        text starts with 'Absent' or 'Excused' contribute "other" names.
        """
        spans = row.xpath('.//span')
        cleaned = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
        motion = cleaned or "passage"

        # rsplit keeps any dash inside the result text itself intact.
        summary = spans[0].text_content()
        passed, yes_count, no_count, other_count = summary.rsplit('-', 3)

        yes_votes = self.get_names(spans[1].tail)
        no_votes = self.get_names(spans[2].tail)
        other_votes = []
        for span in spans[3:]:
            if span.text.startswith(('Absent', 'Excused')):
                other_votes.extend(self.get_names(span.tail))

        # Turn the result text into a boolean.  NOTE(review): when none of
        # the keywords occurs, ``passed`` stays the original string.
        outcomes = {
            'adopted': True,
            'passed': True,
            'failed': False
        }
        for keyword, outcome in outcomes.items():
            if keyword in passed.lower():
                passed = outcome
                break

        vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count),
                    int(other_count))

        # Empty names and the literal placeholder 'None' are not recorded.
        groups = (
            (yes_votes, vote.yes),
            (no_votes, vote.no),
            (other_votes, vote.other),
        )
        for names, record in groups:
            for name in names:
                if name and name != 'None':
                    record(name)
        return vote
Пример #44
0
    def scrape_vote(self, bill, date, motion, url):
        """Scrape the vote page at ``url`` and attach a Vote to ``bill``.

        Pages containing 'not yet official' are ignored.  The chamber is
        'upper' when ``url`` ends with 'Senate' and 'lower' otherwise.
        "Non Voting" and "Present" are combined into the "other" tally, and
        the vote passes only when yeas exceed nays plus others.
        """
        raw_page = self.urlopen(url)

        if 'not yet official' in raw_page:
            # Sometimes they link to vote pages before they go live
            return

        page = lxml.html.fromstring(raw_page)

        actor = 'upper' if url.endswith('Senate') else 'lower'

        count_path = "string(//td[@align = 'center' and contains(., '%s: ')])"

        def tally(label):
            # The count is the last whitespace-separated token of the cell.
            return int(page.xpath(count_path % label).split()[-1])

        yes_count = tally("Yeas")
        no_count = tally("Nays")
        other_count = tally("Non Voting")
        other_count += tally("Present")

        passed = yes_count > no_count + other_count
        vote = Vote(actor, date, motion, passed, yes_count,
                    no_count, other_count)
        vote.add_source(url)

        # Names are the links in the first table after each section heading.
        vote_path = "//h3[. = '%s']/following-sibling::table[1]/tr/td/a"
        sections = (
            ("Yeas", vote.yes),
            ("Nays", vote.no),
            ("Non Voting", vote.other),
            ("Present", vote.other),
        )
        for heading, record in sections:
            for cell in page.xpath(vote_path % heading):
                record(cell.text)

        bill.add_vote(vote)
Пример #45
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of data)

        For every usable table row a Bill is built with its versions, history
        actions, sponsors and votes, and then saved with ``self.save_bill``.

        ``chamber`` must be "Senate" or "House"; any other value raises
        KeyError.  An AssertionError is raised when the bill id reported by
        the votes page differs from the id read from the sheet.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Column positions within each table row.
        # NOTE(review): "votes" is not read anywhere in this method.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        with self.urlopen(sheet_url) as sheet_html:
            sheet_page = lxml.html.fromstring(sheet_html)

            bills = sheet_page.xpath('//table/tr')

            for bill in bills:
                bill_id = self.read_td(bill[index["id"]][0])

                if bill_id == None:
                    # Every other entry is null for some reason
                    continue

                # Keep only the part of the id before the first '.', if any.
                dot_loc = bill_id.find('.')
                if dot_loc != -1:
                    # budget bills are missing the .pdf, don't truncate
                    bill_id = bill_id[:dot_loc]
                title_and_sponsor = bill[index["title_sponsor"]][0]

                # The element's leading text is the title; the sponsors are
                # what remains of the full text content once the title and
                # the " & ..." marker are removed, split on "--".
                bill_title = title_and_sponsor.text
                bill_title_and_sponsor = title_and_sponsor.text_content()
                sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                    replace(" & ...", "").split("--")

                # Bill-id prefix -> bill type.
                cats = {
                    "SB": "bill",
                    "HB": "bill",
                    "HR": "resolution",
                    "SR": "resolution",
                    "SCR": "concurrent resolution",
                    "HCR": "concurrent resolution",
                    "SJR": "joint resolution",
                    "HJR": "joint resolution",
                    "SM": "memorial",
                    "HM": "memorial"
                }

                # Stays None when no prefix matches.
                bill_type = None

                for cat in cats:
                    if bill_id[:len(cat)] == cat:
                        bill_type = cats[cat]

                b = Bill(session,
                         bill_chamber,
                         bill_id,
                         bill_title,
                         type=bill_type)

                b.add_source(sheet_url)

                # Versions are listed on a separate page linked from the row.
                versions_url = \
                    bill[index["version"]].xpath('font/a')[0].attrib["href"]
                versions_url = CO_URL_BASE + versions_url
                versions = self.parse_versions(versions_url)
                for version in versions:
                    b.add_version(version['name'],
                                  version['link'],
                                  mimetype=version['mimetype'])

                bill_history_href = CO_URL_BASE + \
                    bill[index["history"]][0][0].attrib['href']
                # ^^^^^^^ We assume this is a full path to the target.
                # might want to consider some better rel-path support
                # XXX: Look at this ^

                history = self.parse_history(bill_history_href)
                b.add_source(bill_history_href)

                for action in history:
                    self.add_action_to_bill(b, action)

                # Every non-empty sponsor other than "(NONE)" is added as a
                # primary sponsor.
                for sponsor in sponsors:
                    if sponsor != None and sponsor != "(NONE)" and \
                       sponsor != "":
                        b.add_sponsor("primary", sponsor)

                # Now that we have history, let's see if we can't grab some
                # votes

                bill_vote_href = self.get_vote_url(bill_id, session)
                votes = self.parse_votes(bill_vote_href)

                # The votes page reports which bill it belongs to; a mismatch
                # is logged and then aborts the scrape via the assert.
                if votes['sanity-check'] != bill_id:
                    self.warning("XXX: READ ME! Sanity check failed!")
                    self.warning(" -> Scraped ID: " + votes['sanity-check'])
                    self.warning(" -> 'Real' ID:  " + bill_id)
                    assert votes['sanity-check'] == bill_id

                for vote in votes['votes']:
                    filed_votes = vote['votes']
                    passage = vote['meta']
                    result = vote['result']

                    composite_time = "%s %s" % (passage['x-parent-date'],
                                                passage['TIME'])
                    # It's now like: 04/01/2011 02:10:14 PM
                    pydate = dt.datetime.strptime(composite_time,
                                                  "%m/%d/%Y %I:%M:%S %p")
                    # The actor is derived from the committee name: "joint"
                    # when it mentions both chambers, "upper" when it
                    # mentions neither or only the Senate.
                    hasHouse = "House" in passage['x-parent-ctty']
                    hasSenate = "Senate" in passage['x-parent-ctty']

                    if hasHouse and hasSenate:
                        actor = "joint"
                    elif hasHouse:
                        actor = "lower"
                    else:
                        actor = "upper"

                    other = (int(result['EXC']) + int(result['ABS']))
                    # OK, sometimes the Other count is wrong.
                    # Recount from the individual votes: everything that is
                    # not "yes"/"no" (case-insensitive, stripped) is "other".
                    local_other = 0
                    for voter in filed_votes:
                        l_vote = filed_votes[voter].lower().strip()
                        if l_vote != "yes" and l_vote != "no":
                            local_other = local_other + 1

                    # The recount wins over the published total.
                    if local_other != other:
                        self.warning( \
                            "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES" )
                        self.warning(" -> Old: %s // New: %s" %
                                     (other, local_other))
                        other = local_other

                    v = Vote(actor,
                             pydate,
                             passage['MOTION'],
                             (result['FINAL_ACTION'] == "PASS"),
                             int(result['YES']),
                             int(result['NO']),
                             other,
                             moved=passage['MOVED'],
                             seconded=passage['SECONDED'])

                    v.add_source(vote['meta']['url'])
                    # v.add_source( bill_vote_href )

                    # XXX: Add more stuff to kwargs, we have a ton of data
                    # NOTE(review): the loop below rebinds ``vote`` (the
                    # outer loop variable) to each member's vote string, so
                    # the vote dict must not be used after this point.
                    # NOTE(review): unlike the recount above, this
                    # comparison does not strip whitespace.
                    for voter in filed_votes:
                        who = voter
                        vote = filed_votes[who]
                        if vote.lower() == "yes":
                            v.yes(who)
                        elif vote.lower() == "no":
                            v.no(who)
                        else:
                            v.other(who)
                    b.add_vote(v)
                self.save_bill(b)
Пример #46
0
    def parse_vote(self, bill, vote_date, vote_chamber, vote_status, vote_url):
        """Download a vote document, convert it to text and attach the Vote.

        The document at ``vote_url`` is converted with abiword into
        ``/tmp/ksvote.txt``.  A Vote is started at the line carrying the
        totals and the member lists found on later lines are added to it.
        The vote is marked as passed unless some line contains
        'the motion did not prevail'.  Nothing is added to ``bill`` when no
        totals line is found or when the conversion command fails.

        :param bill: object exposing ``add_vote``
        :param vote_date: date string in ``'%a %d %b %Y'`` form
        :param vote_chamber: ``'Senate'`` maps to upper, anything else to
            lower
        :param vote_status: motion text (stripped before use)
        :param vote_url: URL of the vote document; also recorded as source
        """
        vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'
        vote_date = datetime.datetime.strptime(vote_date, '%a %d %b %Y')

        vote_doc, resp = self.urlretrieve(vote_url)

        try:
            # NOTE(review): the command is built by string interpolation and
            # run through the shell; vote_doc comes from self.urlretrieve
            # here -- confirm it can never contain shell metacharacters.
            subprocess.check_call('timeout 10 abiword --to=ksvote.txt %s' %
                                  vote_doc,
                                  shell=True,
                                  cwd='/tmp/')
        except subprocess.CalledProcessError:
            # timeout failed, some documents hang abiword
            self.error('abiword hung for longer than 10s on conversion')
            return
        finally:
            # The downloaded file is no longer needed once the conversion
            # has been attempted; removing it here also covers the failure
            # path, which previously left it behind.
            os.remove(vote_doc)

        # Close the converted file deterministically instead of relying on
        # garbage collection.
        with open('/tmp/ksvote.txt') as converted:
            vote_lines = converted.readlines()

        totals_re = re.compile(
            r'Yeas (\d+)[;,] Nays (\d+)[;,] '
            r'(?:Present but not voting|Present and Passing):? (\d+)[;,] '
            r'(?:Absent or not voting|Absent or Not Voting):? (\d+)')
        comma_or_and = re.compile(r', |\sand\s')

        vote = None
        passed = True
        for line in vote_lines:
            totals = totals_re.search(line)
            line = line.strip()
            if totals:
                yeas, nays, nv, absent = [int(n) for n in totals.groups()]
                # default passed to true
                vote = Vote(vote_chamber, vote_date, vote_status.strip(), True,
                            yeas, nays, nv + absent)
                continue

            # Member lists are only meaningful once a totals line has
            # started a vote.
            record = None
            if vote:
                if line.startswith('Yeas:'):
                    record = vote.yes
                elif line.startswith('Nays:'):
                    record = vote.no
                elif line.startswith(('Present ', 'Absent or')):
                    record = vote.other

            if record is not None:
                members = line.split(':', 1)[1].strip()
                for member in comma_or_and.split(members):
                    # 'None.' is the placeholder for an empty list.
                    if member != 'None.':
                        record(member)
            elif 'the motion did not prevail' in line:
                passed = False

        if vote:
            vote['passed'] = passed
            vote.add_source(vote_url)
            bill.add_vote(vote)
Пример #47
0
class IDBillScraper(BillScraper):
    state = 'id'

    # the following are only used for parsing legislation from 2008 and earlier
    vote = None
    in_vote = False
    ayes = False
    nays = False
    other = False
    last_date = None

    def scrape_subjects(self, session):
        """Populate ``self._subjects`` from the session's topic index page.

        ``self._subjects`` becomes a ``defaultdict(list)`` mapping the text
        of each bill link to the subjects it is listed under.  Anchors are
        read in document order: an anchor with a ``name`` attribute sets the
        current subject, and every following anchor whose ``href`` contains
        'legislation' is filed under that subject.
        """
        self._subjects = defaultdict(list)

        url = 'http://legislature.idaho.gov/legislation/%s/topicind.htm' % session
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)

        # No subject is known until the first named anchor has been seen.
        subject = None
        for a in doc.xpath('//td[@width="95%"]//a'):
            name = a.get('name')
            if name:
                # if anchor has a name, that's the subject
                subject = name
                continue

            # An anchor may carry neither name nor href; treat a missing
            # href as empty instead of testing membership on None.
            href = a.get('href') or ''
            # if anchor is a link to a bill, save that reference (bill links
            # that appear before any subject heading are ignored)
            if subject is not None and 'legislation' in href:
                self._subjects[a.text].append(subject)

    def scrape(self, chamber, session):
        """
        Scrape all the bills for a given session and chamber.

        The first four characters of ``session`` are read as a year.
        Sessions from 2009 on load the subject index first and then use
        ``scrape_post_2009``; earlier sessions use ``scrape_pre_2009``.
        """
        first_year = int(session[:4])
        if first_year >= 2009:
            self.scrape_subjects(session)
            self.scrape_post_2009(chamber, session)
            return
        self.scrape_pre_2009(chamber, session)

    def scrape_post_2009(self, chamber, session):
        """Scrape the bill index for sessions from 2009 on.

        Every matching index row is handed to ``self.scrape_bill`` together
        with the bill id ("<letters> <digits>") and the short title taken
        from the row's second cell.
        """
        url = BILLS_URL % session
        # Only rows whose id contains 'bill' and whose link text starts with
        # the chamber's first letter are used, which keeps menu links out
        # (might not be necessary).
        row_query = ('//tr[contains(@id, "bill") and '
                     'starts-with(descendant::td/a/text(), "%s")]'
                     % _CHAMBERS[chamber][0])
        with self.urlopen(url) as bill_index:
            index_doc = lxml.html.fromstring(bill_index)
            for row in index_doc.xpath(row_query):
                label = row[0].text_content().strip()
                # Split the label into its letter prefix and its number.
                parts = re.match(r'([A-Z]*)([0-9]+)', label)
                bill_id = " ".join(parts.groups()).strip()
                short_title = row[1].text_content().strip()
                self.scrape_bill(chamber, session, bill_id, short_title)

    def scrape_pre_2009(self, chamber, session):
        """scrapes legislation from 2008 and below.

        Every link on the index page whose text starts with a bill id for
        ``chamber`` is handed to ``self.scrape_pre_2009_bill`` together with
        the short title, which is the text following the link up to the
        first '..'.
        """
        url = BILLS_URL + 'l'
        url = url % session
        with self.urlopen(url) as bill_index:
            html = lxml.html.fromstring(bill_index)
            html.make_links_absolute(url)
            # Chamber letter, optional further capitals, then the number.
            bill_id_re = re.compile(
                r'(%s[A-Z]*)([0-9]+)' % _CHAMBERS[chamber][0])
            for link in html.xpath('//a'):
                # '//a' also selects anchors without a leading text node
                # (text is None); re.match would raise TypeError on those.
                if link.text is None:
                    continue
                matches = bill_id_re.match(link.text)
                if matches:
                    bill_id = " ".join(matches.groups())
                    # NOTE(review): still raises ValueError when the text
                    # after the link contains no '..'.
                    short_title = link.tail[:link.tail.index('..')]
                    self.scrape_pre_2009_bill(chamber, session, bill_id,
                                              short_title)

    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        The bill page is addressed by ``session`` and ``bill_id`` with its
        spaces removed.  The resulting Bill is saved with ``self.save_bill``.
        ``short_title``, when given and different from the page title
        (case-insensitively), is added as an extra title.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            html.make_links_absolute(
                'http://legislature.idaho.gov/legislation/%s/' % session)
            # Tables nested in the second cell of the page's outer table:
            # [0] is read for sponsors, [1] for the title, [2] for actions.
            bill_tables = html.xpath('./body/table/tr/td[2]')[0].xpath(
                './/table')
            title = bill_tables[1].text_content().strip()
            bill_type = get_bill_type(bill_id)
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill.add_source(url)
            # self._subjects is the defaultdict(list) built by
            # scrape_subjects, so an unknown bill id yields an empty list.
            bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

            if short_title and bill['title'].lower() != short_title.lower():
                bill.add_title(short_title)

            # documents
            # Links named 'Engrossment' or 'Bill Text' are versions; every
            # other link is a supporting document.
            doc_links = html.xpath('//span/a')
            for link in doc_links:
                name = link.text_content().strip()
                href = link.get('href')
                if 'Engrossment' in name or 'Bill Text' in name:
                    bill.add_version(name, href)
                else:
                    bill.add_document(name, href)

            # sponsors range from a committee to one legislator to a group of legs
            # NOTE(review): the split is on the bare substring 'by', so a
            # name containing "by" is split as well, and the names are added
            # without stripping whitespace.
            sponsor_lists = bill_tables[0].text_content().split('by')
            if len(sponsor_lists) > 1:
                for sponsors in sponsor_lists[1:]:
                    for person in sponsors.split(','):
                        bill.add_sponsor('primary', person)

            # The actor starts as the bill's own chamber and is updated as
            # actions mention the House or the Senate.
            actor = chamber
            last_date = None
            for row in bill_tables[2]:
                # lots of empty rows
                if len(row) == 1:
                    continue
                # Each remaining row is expected to have exactly four cells;
                # only the second (date) and third (action) are used.
                _, date, action, _ = [x.text_content().strip() for x in row]

                # Rows without a date reuse the date of the last dated row.
                # NOTE(review): if the first row has no date, ``date`` is
                # None and the concatenation below raises TypeError.
                if date:
                    last_date = date
                else:
                    date = last_date

                # Dates are month/day only; the year is the first four
                # characters of the session.
                date = datetime.datetime.strptime(date + '/' + session[0:4],
                                                  "%m/%d/%Y")
                if action.startswith('House'):
                    actor = 'lower'
                elif action.startswith('Senate'):
                    actor = 'upper'

                # votes
                if 'AYES' in action or 'NAYS' in action:
                    vote = self.parse_vote(actor, date, row[2])
                    vote.add_source(url)
                    bill.add_vote(vote)
                # some td's text is separated by br elements
                if len(row[2]):
                    action = "".join(row[2].itertext())
                action = action.replace(u'\xa0', ' ').strip()
                atype = get_action(actor, action)
                bill.add_action(actor, action, date, type=atype)
                # after voice vote/roll call and some actions the bill is sent
                # 'to House' or 'to Senate'
                if 'to House' in action:
                    actor = 'lower'
                elif 'to Senate' in action:
                    actor = 'upper'
            self.save_bill(bill)

    def scrape_pre_2009_bill(self, chamber, session, bill_id, short_title=''):
        """Scrape a bill from 2008 or earlier.

        These bills are published inside a 'pre' element, so it is simpler
        to parse them as text, line by line.  Lines starting with "MM/DD"
        are actions; the lines in between may carry the names belonging to
        the most recent vote.  The Bill is saved with ``self.save_bill``.

        NOTE(review): ``short_title`` is accepted but not used in this
        method, and ``url`` is never registered through ``bill.add_source``.
        """
        url = 'http://legislature.idaho.gov/legislation/%s/%s.html' % (
            session, bill_id.replace(' ', ''))
        with self.urlopen(url) as bill_page:
            html = lxml.html.fromstring(bill_page)
            # Only the leading text node of the first <pre> is used, split
            # on CRLF line endings.
            text = html.xpath('//pre')[0].text.split('\r\n')

            # title
            # Built from the all-uppercase dash-separated parts of the
            # second line.
            title = " - ".join(
                [x.strip() for x in text[1].split('-') if x.isupper()])
            # bill type
            bill_type = get_bill_type(bill_id)

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            # sponsors
            # Everything after the last 'by' on the first line, split on
            # commas (names are added without stripping whitespace).
            sponsors = text[0].split('by')[-1]
            for sponsor in sponsors.split(','):
                bill.add_sponsor('primary', sponsor)

            actor = chamber
            # self.flag is defined outside this view; presumably it sets the
            # ayes/nays/other flags from its keyword arguments -- confirm.
            self.flag()  # clear last bills vote flags
            self.vote = None  # drop any vote left over from a previous bill

            for line in text:

                if re.match(r'^\d\d/\d\d', line):
                    # Action line: "MM/DD <action text>"; the year is the
                    # first four characters of the session.
                    # NOTE(review): the doubled ``date = date =`` is
                    # redundant but harmless.
                    date = date = datetime.datetime.strptime(
                        line[0:5] + '/' + session[0:4], "%m/%d/%Y")
                    self.last_date = date
                    action_text = line[5:].strip()
                    # actor
                    if action_text.lower().startswith('house') or \
                       action_text.lower().startswith('senate'):
                        # NOTE(review): the check is case-insensitive but
                        # the lookup key is not; a lowercase 'h'/'s' would
                        # raise KeyError.
                        actor = {'H': 'lower', 'S': 'upper'}[action_text[0]]

                    action = get_action(actor, action_text)
                    bill.add_action(actor, action_text, date, type=action)
                    if "bill:passed" in action or "bill:failed" in action:
                        passed = False if 'FAILED' in action_text else True
                        # A "yes-no-other" triple in the action text starts
                        # a new vote whose names follow on later lines.
                        votes = re.search(r'(\d+)-(\d+)-(\d+)', action_text)
                        if votes:
                            yes, no, other = votes.groups()
                            self.in_vote = True
                            # NOTE(review): the Vote is created with
                            # ``chamber`` rather than the current ``actor``
                            # -- confirm this is intended.
                            self.vote = Vote(chamber, date, action_text,
                                             passed, int(yes), int(no),
                                             int(other))
                else:
                    # Continuation line (no leading date).
                    # NOTE(review): ``date`` is assigned here but not read
                    # again in this branch.
                    date = self.last_date
                    # nothing to do if its not a vote
                    # A "Floor Sponsor" line ends the current vote, which is
                    # only then attached to the bill; a vote that is never
                    # followed by such a line is not added.
                    if "Floor Sponsor" in line:
                        self.in_vote = False
                        if self.vote:
                            bill.add_vote(self.vote)
                            self.vote = None

                    if not self.in_vote:
                        continue
                    # A section marker switches which list the names on this
                    # and the following lines belong to.
                    if 'AYES --' in line:
                        self.flag(ayes=True)
                    elif 'NAYS --' in line:
                        self.flag(nays=True)
                    elif 'Absent and excused' in line:
                        self.flag(other=True)

                    # The marker is removed from the line and the rest is
                    # split on commas; blank names are skipped.
                    if self.ayes:
                        for name in line.replace('AYES --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.yes(name)

                    if self.nays:
                        for name in line.replace('NAYS --', '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.no(name)

                    if self.other:
                        for name in line.replace('Absent and excused --',
                                                 '').split(','):
                            name = name.strip()
                            if name:
                                self.vote.other(name)

            self.save_bill(bill)

    def parse_vote(self, actor, date, row):
        """
        Build a Vote object from the actor, date and row element.

        The row's own text is used as the motion.  Of the ``<span>``
        elements under the row, the first holds a dash-separated
        ``result-yes-no-other`` summary; the text trailing the second and
        third spans lists the yes and no voters, and the text trailing the
        fourth span lists the other voters when that span's text starts
        with "Absent".
        """
        def names_after(span):
            # Voter names trail the span, comma separated, behind a
            # non-breaking-space padded "--" marker.  Pieces are kept
            # exactly as split (no whitespace trimming).
            listing = span.tail.replace(u'\xa0--\xa0', '')
            return [piece for piece in listing.split(',') if piece]

        spans = row.xpath('.//span')
        motion = row.text
        summary = spans[0].text_content().split('-')
        passed, yes_count, no_count, other_count = summary

        ayes = names_after(spans[1])
        nays = names_after(spans[2])
        if spans[3].text.startswith('Absent'):
            others = names_after(spans[3])
        else:
            others = []

        # Translate the textual result into a boolean.
        # NOTE(review): when none of the keywords occurs, ``passed`` is left
        # as the raw summary text - confirm that is acceptable downstream.
        outcomes = {
            'adopted': True,
            'passed': True,
            'failed': False
        }
        for keyword, outcome in outcomes.items():
            if keyword in passed.lower():
                passed = outcome
                break

        vote = Vote(actor, date, motion, passed, int(yes_count), int(no_count),
                    int(other_count))

        # The literal 'None' stands in for an empty list of voters.
        recorders = (
            (vote.yes, ayes),
            (vote.no, nays),
            (vote.other, others),
        )
        for record, voters in recorders:
            for voter in voters:
                if voter != 'None':
                    record(voter)
        return vote

    def flag(self, ayes=False, nays=False, other=False):
        """Set the markers that track which part of a text vote is parsed.

        All three markers are overwritten on every call, so any marker not
        passed as True is reset to False.
        """
        self.ayes, self.nays, self.other = ayes, nays, other
Пример #48
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of
        data) and save one Bill per usable table row.

        Each row yields the bill id, title and sponsors; the row's links are
        followed to collect versions, history actions and votes before the
        bill is handed to ``self.save_bill``.  Rows without a bill id or
        without a title are skipped.

        session -- passed to ``get_bill_folder`` and to ``Bill``.
        chamber -- "Senate" or "House"; anything else raises KeyError.

        Raises AssertionError when the bill id reported by the votes page
        differs from the id on the sheet, and Exception when one voter is
        listed twice in the same vote.
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        # Position of each cell of interest within a table row.
        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        # Bill id prefix -> bill type.  No prefix is the start of another
        # one, so at most one entry can match a given id.
        cats = {
            "SB": "bill",
            "HB": "bill",
            "HR": "resolution",
            "SR": "resolution",
            "SCR": "concurrent resolution",
            "HCR": "concurrent resolution",
            "SJR": "joint resolution",
            "HJR": "joint resolution",
            "SM": "memorial",
            "HM": "memorial"
        }

        sheet_html = self.urlopen(sheet_url)
        sheet_page = lxml.html.fromstring(sheet_html)
        sheet_page.make_links_absolute(sheet_url)

        for bill in sheet_page.xpath('//table/tr'):
            bill_id = self.read_td(bill[index["id"]][0])

            if bill_id is None:
                # Every other entry is null for some reason
                continue

            dot_loc = bill_id.find('.')
            if dot_loc != -1:
                # budget bills are missing the .pdf, don't truncate
                bill_id = bill_id[:dot_loc]
            title_and_sponsor = bill[index["title_sponsor"]][0]

            bill_title = title_and_sponsor.text
            bill_title_and_sponsor = title_and_sponsor.text_content()
            if bill_title is None:
                continue  # Odd ...

            # Whatever the cell holds besides the title is the sponsor
            # list, with "--" between entries.
            sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                replace(" & ...", "").split("--")

            bill_type = None
            for prefix, category in cats.items():
                if bill_id.startswith(prefix):
                    bill_type = category

            b = Bill(session,
                     bill_chamber,
                     bill_id,
                     bill_title,
                     type=bill_type)

            b.add_source(sheet_url)

            versions_url = \
                bill[index["version"]].xpath('font/a')[0].attrib["href"]
            for version in self.parse_versions(versions_url):
                b.add_version(version['name'],
                              version['link'],
                              mimetype=version['mimetype'])

            bill_history_href = bill[index["history"]][0][0].attrib['href']

            history = self.parse_history(bill_history_href)
            b.add_source(bill_history_href)

            # Every history action is attributed to the sheet's own chamber.
            for action, date in history:
                attrs = dict(actor=bill_chamber, action=action, date=date)
                attrs.update(self.categorizer.categorize(action))
                b.add_action(**attrs)

            for sponsor in sponsors:
                if not sponsor or sponsor == "(NONE)":
                    continue
                if "&" in sponsor:
                    # Several names joined with "&" share one entry.
                    for co_sponsor in sponsor.split("&"):
                        b.add_sponsor("primary", co_sponsor.strip())
                else:
                    b.add_sponsor("primary", sponsor)

            # Now that we have history, let's see if we can't grab some
            # votes.  The unpacking insists on exactly one "Votes" link in
            # the row and raises ValueError otherwise.
            vote_link, = bill.xpath(".//a[contains(text(), 'Votes')]")
            bill_vote_href = vote_link.attrib['href']
            votes = self.parse_votes(bill_vote_href)

            if (votes['sanity-check'] == 'This site only supports frames '
                    'compatible browsers!'):
                # We got the frames placeholder instead of a votes page.
                votes['votes'] = []
            elif votes['sanity-check'] != bill_id:
                self.warning("XXX: READ ME! Sanity check failed!")
                self.warning(" -> Scraped ID: " + votes['sanity-check'])
                self.warning(" -> 'Real' ID:  " + bill_id)
                # Raised explicitly (instead of ``assert``) so the check is
                # not stripped when Python runs with -O.
                raise AssertionError(
                    "votes page is for %r, bill sheet row is %r"
                    % (votes['sanity-check'], bill_id))

            for vote in votes['votes']:
                filed_votes = vote['votes']
                passage = vote['meta']
                result = vote['result']

                composite_time = "%s %s" % (passage['x-parent-date'],
                                            passage['TIME'])
                # It's now like: 04/01/2011 02:10:14 PM
                pydate = dt.datetime.strptime(composite_time,
                                              "%m/%d/%Y %I:%M:%S %p")
                hasHouse = "House" in passage['x-parent-ctty']
                hasSenate = "Senate" in passage['x-parent-ctty']

                if hasHouse and hasSenate:
                    actor = "joint"
                elif hasHouse:
                    actor = "lower"
                else:
                    actor = "upper"

                other = (int(result['EXC']) + int(result['ABS']))
                # OK, sometimes the Other count is wrong, so recount from
                # the individual votes and trust that number instead.
                local_other = 0
                for voter in filed_votes:
                    l_vote = filed_votes[voter].lower().strip()
                    if l_vote != "yes" and l_vote != "no":
                        local_other = local_other + 1

                if local_other != other:
                    self.warning(
                        "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES")
                    self.warning(" -> Old: %s // New: %s" %
                                 (other, local_other))
                    other = local_other

                passed = (result['FINAL_ACTION'] == "PASS")
                if passage['MOTION'].strip() == "":
                    continue

                if "without objection" in passage['MOTION'].lower():
                    passed = True

                v = Vote(actor,
                         pydate,
                         passage['MOTION'],
                         passed,
                         int(result['YES']),
                         int(result['NO']),
                         other,
                         moved=passage['MOVED'],
                         seconded=passage['SECONDED'])

                v.add_source(passage['url'])

                # XXX: Add more stuff to kwargs, we have a ton of data
                seen = set()
                for who in filed_votes:
                    if who in seen:
                        raise Exception("Seeing the double-thing. - bug #702")
                    seen.add(who)

                    # Normalised the same way as in the recount above so
                    # the roll call and the "other" total cannot disagree.
                    how = filed_votes[who].lower().strip()
                    if how == "yes":
                        v.yes(who)
                    elif how == "no":
                        v.no(who)
                    else:
                        v.other(who)
                b.add_vote(v)
            self.save_bill(b)
Пример #49
0
    def scrape_votes(self, bill_page, bill, insert, year):
        """
        Attach the roll-call votes linked from a bill page to the bill.

        Every link whose text contains "Passage" is followed; the report
        behind it supplies the date, the five summary counts and the
        per-member votes, which are turned into a Vote and added to
        ``bill``.

        bill_page -- HTML source of the bill page.
        bill      -- receives ``add_source`` / ``add_vote`` calls.
        insert    -- path segment placed after ``/Session/`` in report URLs.
        year      -- appended to the report's month-day text to form the
                     vote date.
        """
        def count_in_column(doc, column):
            # The leading whitespace-separated token of a summary cell is
            # the number we want.
            cell = doc.xpath(
                'string(/html/body/center/table/tr/td[%d])' % column)
            return int(cell.split()[0])

        root = lxml.html.fromstring(bill_page)
        for link in root.xpath('//a[contains(text(), "Passage")]'):
            motion = link.text
            # Links that do not mention the Assembly are treated as Senate.
            chamber = 'lower' if 'Assembly' in motion else 'upper'
            vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link.get('href'))
            bill.add_source(vote_url)
            with self.urlopen(vote_url) as page:
                html = page.decode("utf8").replace(u"\xa0", " ")
                vote_root = lxml.html.fromstring(html)

                # The last token of the header text is the month-day part
                # of the date; the year comes from the caller.
                date = vote_root.xpath(
                    'string(/html/body/center/font)').split()[-1]
                date = date + "-" + str(year)
                date = datetime.strptime(date, "%m-%d-%Y")

                yes_count = count_in_column(vote_root, 1)
                no_count = count_in_column(vote_root, 2)
                excused = count_in_column(vote_root, 3)
                not_voting = count_in_column(vote_root, 4)
                absent = count_in_column(vote_root, 5)
                other_count = excused + not_voting + absent
                # Simple majority of the counts; the report's own verdict
                # is not read.
                passed = yes_count > no_count

                vote = Vote(chamber,
                            date,
                            motion,
                            passed,
                            yes_count,
                            no_count,
                            other_count,
                            not_voting=not_voting,
                            absent=absent)

                for el in vote_root.xpath('/html/body/table[2]/tr'):
                    name = str(el.xpath('string(td[1])').strip())
                    vote_result = el.xpath('string(td[2])').split()[0]

                    if vote_result == 'Yea':
                        vote.yes(name)
                    elif vote_result == 'Nay':
                        vote.no(name)
                    else:
                        vote.other(name)
                bill.add_vote(vote)
Пример #50
0
    def scrape_vote(self, bill, name, url):
        """
        Read one roll-call page and add the resulting Vote to ``bill``.

        bill -- supplies ``bill['session']`` (used as the year of the vote
                date) and receives the vote through ``add_vote``.
        name -- motion text stored on the Vote.
        url  -- roll-call page; "VOTE/H" in it selects the lower-chamber
                table layout, anything else the upper-chamber one.

        Returns None in every case; pages containing 'BUDGET ADDRESS' are
        skipped without adding a vote.
        """
        # (chamber, first column of each member group, and the offsets of
        # the name / yes / no cells relative to that column)
        if "VOTE/H" in url:
            layout = ('lower', (1, 5, 9, 13), 3, 0, 1)
        else:
            layout = ('upper', (1, 6), 4, 1, 2)
        vote_chamber, cols, name_offset, yes_offset, no_offset = layout

        with self.urlopen(url) as page:
            if 'BUDGET ADDRESS' in page:
                return

            doc = lxml.html.fromstring(page)

            def first_number(span_xpath):
                # First run of digits in the text of the matching span.
                text = doc.xpath(span_xpath)
                return int(re.match(r'[^\d]*(\d+)[^\d]*', text).group(1))

            yes_count = first_number(
                "string(//span[contains(., 'Those voting Yea')])")
            no_count = first_number(
                "string(//span[contains(., 'Those voting Nay')])")
            other_count = first_number(
                "string(//span[contains(., 'Those absent')])")
            need_count = first_number(
                "string(//span[contains(., 'Necessary for')])")

            # The page only gives month/day; the year is the bill's session.
            taken = doc.xpath("string(//span[contains(., 'Taken on')])")
            month_day = re.match(
                r'.*Taken\s+on\s+(\d+/\s?\d+)', taken).group(1)
            month_day = month_day.replace(' ', '')
            date = datetime.datetime.strptime(
                month_day + " " + bill['session'], "%m/%d %Y").date()

            # NOTE(review): a vote with exactly ``need_count`` yeas is
            # recorded as not passed - confirm whether ">=" was intended.
            passed = yes_count > need_count
            vote = Vote(vote_chamber, date, name, passed,
                        yes_count, no_count, other_count)
            vote.add_source(url)

            table = doc.xpath("//table")[0]
            for row in table.xpath("tr"):
                for col in cols:
                    voter = row.xpath(
                        "string(td[%d])" % (col + name_offset)).strip()

                    if not voter or voter == 'VACANT':
                        continue

                    if "Y" in row.xpath(
                            "string(td[%d])" % (col + yes_offset)):
                        vote.yes(voter)
                    elif "N" in row.xpath(
                            "string(td[%d])" % (col + no_offset)):
                        vote.no(voter)
                    else:
                        vote.other(voter)

            bill.add_vote(vote)
Пример #51
0
    def scrape_senate(self, session):
        """
        Scrape Senate roll-call votes out of the journal PDFs of a session.

        Every link found on the journal index page is downloaded, converted
        to text and scanned line by line; each complete roll call that has
        both a question and a bill id is saved with ``self.save_vote``.
        The downloaded file is removed afterwards, also when conversion or
        parsing raises.
        """
        url = journals % (session, 'Senate')
        page = self.lxmlize(url)

        for href in page.xpath("//font//a"):
            journal_url = href.attrib['href']
            (path, response) = self.urlretrieve(journal_url)
            try:
                data = convert_pdf(path, type='text')
                self._scrape_senate_journal(session, journal_url, data)
            finally:
                # Do not leave the download behind when something fails.
                os.unlink(path)

    def _scrape_senate_journal(self, session, journal_url, data):
        """Scan the text of one journal and save every roll call in it.

        The scan is a line-driven state machine:

        * until a date is found, every line is tried against ``date_re``;
        * outside a roll call, lines are checked for a vote summary
          (``vote_re``), for the start of a question, and for a leading
          bill id;
        * while a question is open, following lines are appended to it;
        * inside a roll call, lines are mined for individual votes with
          ``votes_re`` until a line without any space ends the roll call.
        """
        cur_bill_id = None
        cur_vote_count = None
        in_vote = False
        cur_question = None
        in_question = False
        known_date = None
        cur_vote = {}

        for line in data.split("\n"):
            if not known_date:
                found = date_re.findall(line)
                if found:
                    date_text, dow = found[0]
                    date_text = date_text.replace(',', '')
                    known_date = datetime.datetime.strptime(
                        date_text, "%A %B %d %Y")

            if in_question:
                line = line.strip()
                # A line starting with digits, or one without any space,
                # ends the question.
                if re.match(r"\d+", line):
                    in_question = False
                    continue
                try:
                    # Drop the last token of the line before keeping it.
                    line, _ = line.rsplit(" ", 1)
                except ValueError:
                    in_question = False
                    continue
                # Appended exactly once; doing it both inside and after
                # the ``try`` duplicated every continuation line.
                # NOTE(review): pieces are joined without a separator -
                # confirm whether a space should be inserted.
                cur_question += line.strip()

            if not in_vote:
                summ = vote_re.findall(line)
                if summ:
                    # A vote summary opens a new roll call.
                    cur_vote = {}
                    cur_vote_count = summ[0]
                    in_vote = True
                    continue

                if ("The question being" in line) or \
                   ("On motion of" in line) or \
                   ("the following" in line) or \
                   ("moved that the" in line):
                    cur_question, _ = line.strip().rsplit(" ", 1)
                    cur_question = cur_question.strip()
                    in_question = True

                if line.strip() == "":
                    continue
                if line[0] != " ":
                    if " " not in line:
                        # a lone token cannot be "bill id + text"
                        continue

                    # A line starting in the first column may begin with
                    # a bill id: 3+ characters, H or S first, then one of
                    # B / J / R / M.
                    bill_id, kruft = line.split(" ", 1)
                    if len(bill_id) < 3:
                        continue
                    if bill_id[0] != "H" and bill_id[0] != "S":
                        continue
                    if bill_id[1] not in ['B', 'J', 'R', 'M']:
                        continue

                    cur_bill_id = bill_id
            else:
                line = line.strip()
                try:
                    line, lineno = line.rsplit(" ", 1)
                except ValueError:
                    # A line without a space closes the roll call.
                    in_vote = False
                    if cur_question is None or cur_bill_id is None:
                        continue

                    self._save_senate_vote(session, journal_url, known_date,
                                           cur_question, cur_bill_id,
                                           cur_vote_count, cur_vote)

                    cur_vote, cur_question, cur_vote_count = (
                        None, None, None)
                    continue

                for name, li, vot in re.findall(votes_re, line):
                    cur_vote[name] = vot

    def _save_senate_vote(self, session, journal_url, known_date, question,
                          bill_id, vote_count, recorded):
        """Build one Senate Vote from a finished roll call and save it.

        ``vote_count`` is the (yes, no, excused, absent) group of the vote
        summary and ``recorded`` maps each captured name to its vote code.
        """
        yes, no, exc, ab = vote_count
        other = int(exc) + int(ab)
        yes, no = int(yes), int(no)

        bc = {'H': 'lower', 'S': 'upper'}[bill_id[0]]

        vote = Vote('upper',
                    known_date,
                    question,
                    (yes > no),
                    yes,
                    no,
                    other,
                    session=session,
                    bill_id=bill_id,
                    bill_chamber=bc)
        for person in recorded:
            if person is None:
                continue

            howvote = recorded[person]

            # A trailing Y / N / E on the captured name is taken as the
            # vote code and removed from the name, checked in that order.
            # NOTE(review): this also trims names that really end in one
            # of those capitals - verify against the journal text.
            for code in ("Y", "N", "E"):
                if person.endswith(code):
                    howvote = code
                    person = person[:-1]

            howvote = howvote.upper()
            if howvote == 'Y':
                vote.yes(person)
            elif howvote == 'N':
                vote.no(person)
            else:
                vote.other(person)
        vote.add_source(journal_url)
        self.save_vote(vote)
Пример #52
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
            committee_abbr_regex=get_committee_name_regex()):
        """
        Convert every CABill of one measure type in a session into a saved
        Bill carrying its title, summary, sponsors, actions and votes.

        chamber  -- 'upper' keeps authors whose house is 'SENATE', any
                    other value keeps 'ASSEMBLY' authors; it is also the
                    actor of actions that name none.
        session  -- matched against ``session_year``; special sessions get
                    ' Special Session N' appended for the Bill.
        bill_type -- first entry of the saved bill's type list.
        type_abbr -- matched against ``measure_type``.
        committee_abbr_regex -- its ``findall`` over an action's text
                    yields the committee abbreviations mentioned there.

        Bills for which no title can be found are skipped with a warning.

        Raises KeyError for a committee abbreviation without a known name,
        AssertionError when a committee name comes back empty, and
        ScrapeError for a vote whose location names neither chamber.
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        chamber_codes = {'Assembly': 'lower', 'Senate': 'upper'}

        def to_chamber_code(matchobj):
            # re.sub only ever passes a real match of ^(Assembly|Senate).
            return chamber_codes[matchobj.group()]

        # Compiled once here rather than once per vote.
        session_suffix_rgx = re.compile(r'(\w+)( Extraordinary)? Session$',
                                        re.IGNORECASE)
        chamber_prefix_rgx = re.compile(r'^(Senate|Assembly) ',
                                        re.IGNORECASE)

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            # The real title is filled in further down.
            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version(bill_id, source_url, 'text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest text (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    # Put a space after a ")" that is glued to the next
                    # character.
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            # Later versions overwrite title, type and subject, so the
            # values of the last version carrying bill XML win.
            for version in bill.versions:
                if not version.bill_xml:
                    continue

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    type_.append('fiscal committee')
                if version.local_program == 'Yes':
                    type_.append('local program')
                if version.urgency == 'Yes':
                    type_.append('urgency')
                if version.taxlevy == 'Yes':
                    type_.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['summary'] = summary
            fsbill['type'] = type_
            # A real list on Python 2 and 3 alike (filter() is lazy on 3).
            fsbill['subjects'] = [s for s in [subject] if s]
            fsbill['impact_clause'] = impact_clause

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            fsbill['alternate_titles'] = list(all_titles)

            # ``version`` is still bound to the last entry the loop above
            # visited, i.e. the final element of bill.versions.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(SPONSOR_TYPES[author.contribution],
                                       author.name,
                                       official_type=author.contribution)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = chamber_codes[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'other'
                else:
                    # Only swap a leading chamber name for its code; the
                    # rest of the text is still needed further down.
                    actor = re.sub(r'^(Assembly|Senate)', to_chamber_code,
                                   actor)

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                kwargs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees,
                # if any.
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if 'Com. on' in action.action and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)
                        else:
                            committees.append(name)

                    committees = [name for name in committees if name]
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    if len(committees) != len(matched_abbrs):
                        # An empty name was dropped above, so names and
                        # abbreviations no longer pair up.  Raised
                        # explicitly so the check survives ``python -O``.
                        raise AssertionError(
                            'Empty committee name among %r in action %r'
                            % (matched_abbrs, action.action))
                    # Spell the committee names out in the action text.
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)

                # Collapse the actor to a bare chamber code, or 'other'.
                changed = False
                for string in ['upper', 'lower', 'joint']:
                    if actor.startswith(string):
                        actor = string
                        changed = True
                        break
                if not changed:
                    actor = 'other'
                if actor != action.actor:
                    # Keep the original actor text alongside the code.
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                fsbill.add_action(actor, act_str, action.action_date.date(),
                                  **kwargs)

            for vote in bill.votes:
                result = vote.vote_result == '(PASS)'

                # The first word of the location names the chamber; the
                # remainder is 'Floor' or a committee.
                full_loc = vote.location.description
                loc_parts = full_loc.split(' ')
                first_part = loc_parts[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)
                vote_location = ' '.join(loc_parts[1:])

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session, chamber and bill-number boilerplate from
                # the motion text.
                motion = session_suffix_rgx.sub('', motion)
                motion = chamber_prefix_rgx.sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                # NOTE(review): unlike its two neighbours this pattern has
                # no SJR alternative - confirm whether that is intended.
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              self._tz.localize(vote.vote_date_time),
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type_=vtype)

                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                for k in ('yes', 'no', 'other'):
                    # Kill dupe votes.
                    key = k + '_votes'
                    fsvote[key] = list(set(fsvote[key]))
                    # In a small percentage of bills, the integer vote
                    # counts are inaccurate, so let's ignore them and
                    # count the names instead.
                    fsvote[k + '_count'] = len(fsvote[key])

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Пример #53
0
    def scrape_pdf_for_votes(self, session, chamber, date, motion, href):
        """Build a ``Vote`` from the roll-call PDF at ``href``.

        The PDF's text lines are scanned for a pass/fail word, a tally line
        ("<n> YEAS <n> NAYS <n> PRESENT ..."), and -- only once the tally
        line has been seen -- lines that start with one of ``VOTE_VALUES``.
        Those lines are handed to ``find_columns_and_parse`` to recover each
        member's vote code.

        session -- key into ``self.metadata['session_details']``; used to
            replace "Mr. Speaker" / "Mr. President" with the name stored
            there.
        chamber, date, motion -- passed through to ``Vote``; ``chamber`` also
            decides whether a missing pass/fail word triggers a warning.
        href -- URL of the PDF; read through ``self.fetch_pdf_lines`` and
            recorded as the vote's source.

        Returns the populated ``Vote``, or ``False`` when the PDF yields no
        lines.  Raises ``Exception`` when more than one pass/fail word is
        found in the document.
        """
        warned = False
        COUNT_RE = re.compile(r'^(\d+)\s+YEAS?\s+(\d+)\s+NAYS?\s+(\d+)\s+PRESENT(?:\s+(\d+)\s+NOT\sVOTING)?\s*$')
        PASS_FAIL_WORDS = {
            'PASSED': True,
            'PREVAILED': True,
            'ADOPTED': True,
            'CONCURRED': True,
            'FAILED': False,
            'LOST': False,
        }

        pdflines = self.fetch_pdf_lines(href)

        if not pdflines:
            return False

        yes_count = no_count = present_count = other_count = 0
        yes_votes = []
        no_votes = []
        present_votes = []
        other_vote_detail = defaultdict(list)
        passed = None
        counts_found = False
        vote_lines = []
        for line in pdflines:
            # consider pass/fail as a document property instead of a result of the vote count
            # extract the vote count from the document instead of just using counts of names
            stripped = line.strip()
            if not stripped:
                continue
            if stripped in PASS_FAIL_WORDS:
                if passed is not None:
                    raise Exception("Duplicate pass/fail matches in [%s]" % href)
                passed = PASS_FAIL_WORDS[stripped]
                continue

            count_match = COUNT_RE.match(line)
            if count_match:
                # The optional fourth group (NOT VOTING) is not used.
                yes_count, no_count, present_count = [
                    int(count) for count in count_match.groups()[:3]]
                counts_found = True
            elif counts_found:
                # Member lines are only collected after the tally line.
                for value in VOTE_VALUES:
                    if re.search(r'^\s*({})\s+\w'.format(value), line):
                        vote_lines.append(line)
                        break

        votes = find_columns_and_parse(vote_lines)
        for name, vcode in votes.items():
            if name == 'Mr. Speaker':
                name = self.metadata['session_details'][session]['speaker']
            elif name == 'Mr. President':
                name = self.metadata['session_details'][session]['president']
            if vcode == 'Y':
                yes_votes.append(name)
            elif vcode == 'N':
                no_votes.append(name)
            else:
                other_vote_detail[vcode].append(name)
                other_count += 1
                if vcode == 'P':
                    present_votes.append(name)

        if yes_count == 0 and no_count == 0 and present_count == 0:
            # No usable tally in the document: fake the counts from the names.
            yes_count = len(yes_votes)
            no_count = len(no_votes)
        else:
            # Audit the parsed names against the document's own tally.
            if yes_count != len(yes_votes):
                self.warning("Mismatched yes count [expect: %i] [have: %i]" % (yes_count, len(yes_votes)))
                warned = True
            if no_count != len(no_votes):
                self.warning("Mismatched no count [expect: %i] [have: %i]" % (no_count, len(no_votes)))
                warned = True
            if present_count != len(present_votes):
                self.warning("Mismatched present count [expect: %i] [have: %i]" % (present_count, len(present_votes)))
                warned = True

        if passed is None:
            if chamber == 'lower':  # senate doesn't have these lines
                self.warning("No pass/fail word found; fall back to comparing yes and no vote.")
                warned = True
            passed = yes_count > no_count
        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count, other_vote_detail=other_vote_detail)
        for name in yes_votes:
            vote.yes(name)
        for name in no_votes:
            vote.no(name)
        for names in other_vote_detail.values():
            for name in names:
                vote.other(name)
        vote.add_source(href)

        if warned:
            self.warning("Warnings were issued. Best to check %s" % href)
        return vote
Пример #54
0
    def scrape_lower_committee_votes(self, session_number, bill):
        '''
        House committee roll calls are not available on the Senate's
        website. Furthermore, the House uses an internal ID system in
        its URLs, making accessing those pages non-trivial.

        This function will fetch all the House committee votes for the
        given bill, and add the votes to that object.

        session_number -- value submitted as the ``ddlSession`` form field.
        bill -- bill object; its ``bill_id`` supplies the number to search
            for, and each parsed vote is attached with ``bill.add_vote``.

        Raises ``IndexError`` when a member's vote code is not recognised.
        Tuple-unpacking of the xpath results raises ``ValueError`` if the
        page does not contain exactly one match where one is expected.
        '''

        house_url = 'http://www.myfloridahouse.gov/Sections/Bills/bills.aspx'

        # Keep the digits and all following characters in the bill's ID
        bill_number = re.search(r'^\w+\s(\d+\w*)$', bill['bill_id']).group(1)

        form = {
            'rblChamber': 'B',
            'ddlSession': session_number,
            'ddlBillList': '-1',
            'txtBillNumber': bill_number,
            'ddlSponsor': '-1',
            'ddlReferredTo': '-1',
            'SubmittedByControl': '',
        }
        doc = lxml.html.fromstring(self.post(url=house_url, data=form).text)
        doc.make_links_absolute(house_url)

        (bill_link, ) = doc.xpath(
            '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')
        bill_doc = self.lxmlize(bill_link)
        links = bill_doc.xpath('//a[text()="See Votes"]/@href')

        for link in links:
            vote_doc = self.lxmlize(link)

            (date, ) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblDate"]/text()')
            date = datetime.datetime.strptime(date,
                                              '%m/%d/%Y %I:%M:%S %p').date()

            # Collapse all whitespace so the totals can be matched on one line.
            totals = vote_doc.xpath('//table//table')[-1].text_content()
            totals = re.sub(r'(?mu)\s+', " ", totals).strip()
            # Both fragments are raw strings so that \s and \d reach the
            # regex engine rather than being treated as string escapes.
            (yes_count, no_count, other_count) = [
                int(x) for x in re.search(
                    r'(?m)Total Yeas:\s+(\d+)\s+Total Nays:\s+(\d+)\s+'
                    r'Total Missed:\s+(\d+)', totals).groups()
            ]
            passed = yes_count > no_count

            (committee, ) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblCommittee"]/text()')
            (action, ) = vote_doc.xpath(
                '//span[@id="ctl00_ContentPlaceHolder1_lblAction"]/text()')
            motion = "{} ({})".format(action, committee)

            vote = Vote('lower', date, motion, passed, yes_count, no_count,
                        other_count)
            vote.add_source(link)

            for cell in vote_doc.xpath('//table//table//table//td'):
                if not cell.text_content().strip():
                    continue

                # First span holds the vote code, second the member's name.
                (member, ) = cell.xpath('span[2]//text()')
                (vote_code, ) = cell.xpath('span[1]//text()')

                if vote_code == "Y":
                    vote.yes(member)
                elif vote_code == "N":
                    vote.no(member)
                elif vote_code == "-":
                    vote.other(member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif re.search(r'\([YN]\)', vote_code):
                    continue
                else:
                    raise IndexError(
                        "Unknown vote type found: {}".format(vote_code))

            vote.validate()
            bill.add_vote(vote)
Пример #55
0
    def scrape_floor_vote(self, chamber, bill, date, url):
        """Parse a floor-vote PDF and attach the resulting vote to ``bill``.

        The PDF is downloaded, converted to text, and read positionally:
        the motion, the totals line and the first roll-call line are looked
        up by line index, and the indexes are shifted when the motion is
        missing or spans extra lines.

        chamber, date -- passed through to ``Vote``.
        bill -- receives the vote via ``bill.add_vote``.
        url -- location of the PDF; also recorded as the vote's source.

        ``vote.validate()`` is called a final time without a handler, so
        whatever it raises when the names still do not match the totals
        propagates to the caller.
        """
        (path, _resp) = self.urlretrieve(url)
        try:
            text = convert_pdf(path, 'text')
        finally:
            # Always drop the downloaded file, even if conversion fails.
            os.remove(path)
        lines = text.split("\n")

        MOTION_INDEX = 4
        TOTALS_INDEX = 6
        VOTE_START_INDEX = 9

        motion = lines[MOTION_INDEX].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if (not motion
                and not lines[MOTION_INDEX - 1].startswith("Calendar Page:")):
            motion = lines[MOTION_INDEX - 1]
            MOTION_INDEX -= 1
            TOTALS_INDEX -= 1
            VOTE_START_INDEX -= 1
        else:
            assert motion, "Floor vote's motion name appears to be empty"

        # The motion may continue on up to two more lines; every extra
        # line pushes the totals and the roll call down by one.
        for _extra_motion_line in range(2):
            MOTION_INDEX += 1
            if lines[MOTION_INDEX].strip():
                motion = "{}, {}".format(motion, lines[MOTION_INDEX].strip())
                TOTALS_INDEX += 1
                VOTE_START_INDEX += 1
            else:
                break

        (yes_count, no_count, other_count) = [
            int(x) for x in re.search(
                r'^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$',
                lines[TOTALS_INDEX]).groups()
        ]
        passed = (yes_count > no_count)

        vote = Vote(chamber, date, motion, passed, yes_count, no_count,
                    other_count)
        vote.add_source(url)

        for line in lines[VOTE_START_INDEX:]:
            # The roll call ends at the first blank line.
            if not line.strip():
                break

            # Drop the presiding officer's title so only the name remains.
            if " President " in line:
                line = line.replace(" President ", " ")
            elif " Speaker " in line:
                line = line.replace(" Speaker ", " ")

            # Votes follow the pattern of:
            # [vote code] [member name]-[district number]
            for member in re.findall(r'\s*Y\s+(.*?)-\d{1,3}\s*', line):
                vote.yes(member)
            for member in re.findall(r'\s*N\s+(.*?)-\d{1,3}\s*', line):
                vote.no(member)
            for member in re.findall(r'\s*(?:EX|AV)\s+(.*?)-\d{1,3}\s*', line):
                vote.other(member)

        try:
            vote.validate()
        except ValueError:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.logger.info("Votes don't add up; looking for additional ones")
            for line in lines[VOTE_START_INDEX:]:
                if not line.strip():
                    break
                for member in re.findall(r'\s{8,}([A-Z][a-z\'].*?)-\d{1,3}',
                                         line):
                    vote.other(member)

        vote.validate()
        bill.add_vote(vote)
Пример #56
0
    def build_senate_votes(self):
        """Add the Senate floor and committee votes on the page to the bill.

        Each vote is introduced by a ``<b>`` element whose text starts with
        "VOTE: FLOOR VOTE:" or "VOTE: COMMITTEE VOTE:".  The elements inside
        the following ``<blockquote>`` siblings are walked in order: a
        ``<b>`` header ("Ayes (n):", "Nays (n):", "Excused (n):", ...)
        switches the current category and supplies its count, and every
        ``<a>`` element is a legislator in the current category.

        Reads ``self.urls.senate`` and ``self.url``; every vote built is
        attached to ``self.bill``.  Floor votes additionally carry an
        ``actual_vote`` mapping from the non-yes/no category
        (Excused / Abstain / Absent) to the names recorded under it.

        Raises ``ValueError`` when a category header is not recognised.
        """
        count_re = re.compile(r'\((\d+)\):')
        other_labels = ('Excused', 'Abstain', 'Absent')

        xpath = "//div/b[starts-with(., 'VOTE: FLOOR VOTE:')]"
        for b in self.urls.senate.xpath(xpath):
            date = b.text.split('-')[1].strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0
            actual_vote = collections.defaultdict(list)

            vtype = None
            other_label = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count = int(count_re.search(text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count = int(count_re.search(text).group(1))
                    elif text.startswith(other_labels):
                        vtype = 'other'
                        # Remember which category this is so that
                        # actual_vote is keyed by it rather than by the
                        # legislator's own name.
                        other_label = next(
                            label for label in other_labels
                            if text.startswith(label))
                        other_count += int(count_re.search(text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append((name, other_label))

            # Anything that is not a yes counts against passage.
            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, 'Floor Vote', passed, yes_count,
                        no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name, vote_val in other_votes:
                vote.other(name)
                actual_vote[vote_val].append(name)

            vote['actual_vote'] = actual_vote
            vote.add_source(self.url)
            self.bill.add_vote(vote)

        xpath = "//div/b[starts-with(., 'VOTE: COMMITTEE VOTE:')]"
        for b in self.urls.senate.xpath(xpath):
            # Header text is "<prefix> <tab> - <committee> <tab> - <date>".
            _, committee, date = re.split(r'\s*\t+\s*-\s*', b.text)
            date = date.strip()
            date = datetime.datetime.strptime(date, "%b %d, %Y").date()

            yes_votes, no_votes, other_votes = [], [], []
            yes_count, no_count, other_count = 0, 0, 0

            vtype = None
            for tag in b.xpath("following-sibling::blockquote/*"):
                if tag.tag == 'b':
                    text = tag.text
                    if text.startswith('Ayes'):
                        vtype = 'yes'
                        yes_count += int(count_re.search(text).group(1))
                    elif text.startswith('Nays'):
                        vtype = 'no'
                        no_count += int(count_re.search(text).group(1))
                    elif text.startswith(other_labels):
                        vtype = 'other'
                        other_count += int(count_re.search(text).group(1))
                    else:
                        raise ValueError('bad vote type: %s' % tag.text)
                elif tag.tag == 'a':
                    name = tag.text.strip()
                    if vtype == 'yes':
                        yes_votes.append(name)
                    elif vtype == 'no':
                        no_votes.append(name)
                    elif vtype == 'other':
                        other_votes.append(name)

            passed = yes_count > (no_count + other_count)

            vote = Vote('upper', date, '%s Committee Vote' % committee, passed,
                        yes_count, no_count, other_count)

            for name in yes_votes:
                vote.yes(name)
            for name in no_votes:
                vote.no(name)
            for name in other_votes:
                vote.other(name)

            vote.add_source(self.url)
            self.bill.add_vote(vote)
Пример #57
0
    def scrape_digest(self, bill):
        """Fill in a bill's description, sponsors, actions and votes.

        The bill's digest PDF is downloaded and converted to text, then
        split with regular expressions into a sponsor block, a description
        and a dated action list.  Within the action list a "ROLL CALL" line
        starts a vote whose voter lists and totals are read from the lines
        that follow.

        bill -- mapping-like bill object providing ``session``, ``bill_id``
            and ``chamber``; it is updated in place.

        Returns ``None``.  A warning is logged and nothing else is added when
        the digest cannot be fetched or converts to empty text.
        """
        digest_url = 'http://legisweb.state.wy.us/%(session)s/Digest/%(bill_id)s.pdf' % bill
        bill.add_source(digest_url)

        try:
            (filename, response) = self.urlretrieve(digest_url)
            all_text = convert_pdf(filename, type='text')
        except scrapelib.HTTPError:
            self.warning('no digest for %s' % bill['bill_id'])
            return
        if all_text.strip() == "":
            self.warning('Non-functional digest for bill {}'.format(
                bill['bill_id']))
            return

        # Split the digest's text into sponsors, description, and actions
        SPONSOR_RE = r'(?sm)Sponsored By:\s+(.*?)\n\n'
        DESCRIPTION_RE = r'(?sm)\n\n((?:AN\s*?ACT|A JOINT RESOLUTION) .*?)\n\n'
        ACTIONS_RE = r'(?sm)\n\n(\d{1,2}/\d{1,2}/\d{4}.*)'

        ext_title = re.search(DESCRIPTION_RE, all_text).group(1)
        bill_desc = ext_title.replace('\n', ' ')
        # Squeeze runs of spaces left over from the joined lines.
        bill_desc = re.sub("  *", " ",
                           bill_desc.decode('utf-8')).encode('utf-8')
        bill['description'] = bill_desc

        sponsor_span = re.search(SPONSOR_RE, all_text).group(1)
        sponsors = sponsor_span.replace('\n', ' ')
        if sponsors:
            if 'Committee' in sponsors:
                bill.add_sponsor('primary', sponsors)
            else:
                # The other chamber's sponsors follow an "and ...(s)" marker.
                if bill['chamber'] == 'lower':
                    sp_lists = sponsors.split('and Senator(s)')
                else:
                    sp_lists = sponsors.split('and Representative(s)')
                for spl in sp_lists:
                    for sponsor in split_names(spl):
                        sponsor = sponsor.strip()
                        if sponsor != "":
                            bill.add_sponsor('primary', sponsor)

        action_re = re.compile(r'(\d{1,2}/\d{1,2}/\d{4})\s+(H |S )?(.+)')
        vote_total_re = re.compile(
            r'(Ayes )?(\d*)(\s*)Nays(\s*)(\d+)(\s*)Excused(\s*)(\d+)(\s*)Absent(\s*)(\d+)(\s*)Conflicts(\s*)(\d+)'
        )
        # "Header:" prefixes that introduce a voter list on the same line.
        breakers = [
            "Ayes:", "Nays:", "Nayes:", "Excused:", "Absent:",
            "Conflicts:"
        ]
        # Bare headers that announce the voter list on the next line.
        voter_headers = ('Ayes', 'Nays', 'Excused', 'Absent', 'Conflicts')

        # initial actor is bill chamber
        actor = bill['chamber']
        action_lines = re.search(ACTIONS_RE, all_text).group(1).split('\n')
        # A shared iterator lets the roll-call loop below consume lines
        # from the same stream as the outer loop.
        action_lines = iter(action_lines)
        for line in action_lines:
            line = clean_line(line)

            # skip blank lines
            if not line:
                continue

            amatch = action_re.match(line)
            if amatch:
                date, achamber, action = amatch.groups()

                # change actor if one is on this action
                if achamber == 'H ':
                    actor = 'lower'
                elif achamber == 'S ':
                    actor = 'upper'

                date = datetime.datetime.strptime(date, '%m/%d/%Y')
                bill.add_action(actor,
                                action.strip(),
                                date,
                                type=categorize_action(action))
            elif line == 'ROLL CALL':
                voters = defaultdict(str)
                # if we hit a roll call, use an inner loop to consume lines
                # in a pseudo-state machine manner, 3 types
                # Ayes|Nays|Excused|... - indicates next line is voters
                # : (Senators|Representatives): ... - voters
                # \d+ Nays \d+ Excused ... - totals
                voters_type = None
                for ainext in action_lines:
                    nextline = clean_line(ainext)
                    if not nextline:
                        continue

                    for breaker in breakers:
                        if nextline.startswith(breaker):
                            voters_type = breaker[:-1]
                            if voters_type == "Nayes":
                                voters_type = "Nays"
                                self.log("Fixed a case of 'Naye-itis'")
                            # Keep the colon so the line reads ": names".
                            nextline = nextline[len(breaker) - 1:]

                    if nextline.startswith(': '):
                        voters[voters_type] = nextline
                        continue
                    if nextline in voter_headers:
                        voters_type = nextline
                        continue

                    totals_match = vote_total_re.match(nextline)
                    if totals_match is None:
                        # if it is a stray line within the vote, it is a
                        # continuation of the voter list
                        # (sometimes has a newline)
                        voters[voters_type] += ' ' + nextline
                        continue

                    totals = totals_match.groups()
                    ayes = totals[1]
                    nays = totals[4]
                    exc = totals[7]
                    absent = totals[10]
                    con = totals[13]

                    # `action`, `date` and `actor` come from the most
                    # recent dated action line above this roll call.
                    passed = (('Passed' in action or 'Do Pass' in action
                               or 'Did Concur' in action
                               or 'Referred to' in action)
                              and 'Failed' not in action)
                    vote = Vote(actor, date, action, passed, int(ayes),
                                int(nays),
                                int(exc) + int(absent) + int(con))
                    vote.add_source(digest_url)

                    for vtype, names in voters.items():
                        for voter in split_names(names):
                            if voter:
                                if vtype == 'Ayes':
                                    vote.yes(voter)
                                elif vtype == 'Nays':
                                    vote.no(voter)
                                else:
                                    vote.other(voter)
                    # done collecting this vote
                    bill.add_vote(vote)
                    break
Пример #58
0
    def scrape_vote(self, bill, name, url):
        """Parse one roll-call PDF and attach the vote to ``bill``.

        bill -- bill object; ``bill['bill_id']`` and ``bill['session']`` are
            used to locate and build the vote date, and the finished vote
            is added with ``bill.add_vote``.
        name -- link text of the form "<Senate|House> Vote on ...,<motion>";
            anything else is ignored.
        url -- location of the PDF; also recorded as the vote's source.

        Returns ``None``.  Nothing is added when ``name`` does not describe
        a vote or when no date can be found in the PDF.
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            vote_kind = 'passage'
        elif motion.startswith('AMENDMENT'):
            vote_kind = 'amendment'
        elif 'ON 3RD READING' in motion:
            vote_kind = 'reading:3'
        else:
            vote_kind = 'other'

        vote = Vote(chamber, None, motion, None, None, None, None)
        vote['type'] = vote_kind
        vote.add_source(url)

        with self.urlopen(url) as text:
            (fd, temp_path) = tempfile.mkstemp()
            try:
                with os.fdopen(fd, 'wb') as w:
                    w.write(text)
                html = pdf_to_lxml(temp_path)
            finally:
                # Remove the scratch file even if conversion fails.
                os.remove(temp_path)

            vote_type = None
            total_re = re.compile(r'^Total--(\d+)$')
            body = html.xpath('string(/html/body)')

            # The four digits after the bill id are read as MMDD below.
            date_match = re.search(r'%s (\d{4,4})' % bill['bill_id'], body)
            if date_match is None:
                self.warning("BAD VOTE")
                return
            date = date_match.group(1)
            month = int(date[0:2])
            day = int(date[2:4])
            # The session value is used as the year.
            date = datetime.date(int(bill['session']), month, day)
            vote['date'] = date

            for line in body.replace(u'\xa0', '\n').split('\n'):
                line = line.replace('&nbsp;', '').strip()
                if not line:
                    continue

                if line in ('YEAS', 'NAYS', 'ABSENT'):
                    vote_type = {
                        'YEAS': 'yes',
                        'NAYS': 'no',
                        'ABSENT': 'other'
                    }[line]
                elif vote_type:
                    match = total_re.match(line)
                    if match:
                        vote['%s_count' % vote_type] = int(match.group(1))
                    elif vote_type == 'yes':
                        vote.yes(line)
                    elif vote_type == 'no':
                        vote.no(line)
                    elif vote_type == 'other':
                        vote.other(line)

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        if vote['yes_count'] > (vote['no_count'] + vote['other_count']):
            vote['passed'] = True
        else:
            vote['passed'] = False

        bill.add_vote(vote)
Пример #59
0
    def scrape_vote(self, bill, name, url):
        """Parse one roll-call PDF and attach the vote to ``bill``.

        bill -- bill object that receives the vote via ``bill.add_vote``.
        name -- link text of the form "<Senate|House> Vote on ...,<motion>";
            anything else is ignored.
        url -- location of the PDF; also recorded as the vote's source.

        Returns ``None``.  Nothing is added when ``name`` does not describe
        a vote or when the PDF has no "Date: m/d/yyyy" field.
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            vote_kind = 'passage'
        elif motion.startswith('AMENDMENT'):
            vote_kind = 'amendment'
        elif 'ON 3RD READING' in motion:
            vote_kind = 'reading:3'
        else:
            vote_kind = 'other'

        vote = Vote(chamber, None, motion, None, None, None, None)
        vote['type'] = vote_kind
        vote.add_source(url)

        (fd, temp_path) = tempfile.mkstemp()
        # Only the path is needed; close the descriptor right away so it
        # cannot leak if the download or the conversion fails.
        os.close(fd)
        try:
            self.urlretrieve(url, temp_path)
            html = pdf_to_lxml(temp_path)
        finally:
            os.remove(temp_path)

        vote_type = None
        total_re = re.compile(r'^Total--(\d+)$')
        body = html.xpath('string(/html/body)')

        date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
        if date_match is None:
            self.warning("BAD VOTE: date error")
            return
        date = date_match.group(1)

        vote['date'] = datetime.datetime.strptime(date, '%m/%d/%Y')

        for line in body.replace(u'\xa0', '\n').split('\n'):
            line = line.replace('&nbsp;', '').strip()
            if not line:
                continue

            if line in ('YEAS', 'NAYS', 'ABSENT'):
                vote_type = {
                    'YEAS': 'yes',
                    'NAYS': 'no',
                    'ABSENT': 'other'
                }[line]
            elif line in ('Total', '--'):
                # End of the current name list.
                vote_type = None
            elif vote_type:
                match = total_re.match(line)
                if match:
                    # Recorded here, then replaced by the tally below.
                    vote['%s_count' % vote_type] = int(match.group(1))
                elif vote_type == 'yes':
                    vote.yes(line)
                elif vote_type == 'no':
                    vote.no(line)
                elif vote_type == 'other':
                    vote.other(line)

        # tally counts
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        if vote['yes_count'] > (vote['no_count'] + vote['other_count']):
            vote['passed'] = True
        else:
            vote['passed'] = False

        bill.add_vote(vote)
Пример #60
0
    def scrape_house(self, session):
        """Scrape votes out of the House journal PDFs for ``session``.

        Every PDF linked from the journal index page is converted to text
        and read line by line with a small state machine that tracks the
        journal date, the most recent bill id, the question being voted on,
        the vote totals and the per-member votes.  Each completed vote is
        stored with ``self.save_vote``.

        Relies on the module-level names ``journals``, ``date_re``,
        ``votes_re`` and ``vote_re``, which are defined outside this method.
        """
        url = journals % (session, 'House')
        page = self.lxmlize(url)
        hrefs = page.xpath("//font//a")

        for href in hrefs:
            (path, response) = self.urlretrieve(href.attrib['href'])
            data = convert_pdf(path, type='text')

            # Parser state, reset for every journal PDF.
            in_vote = False
            cur_vote = {}
            known_date = None
            cur_vote_count = None
            in_question = False
            cur_question = None
            cur_bill_id = None

            for line in data.split("\n"):
                # The first date found in the document is used for all of
                # its votes.
                if known_date is None:
                     dt = date_re.findall(line)
                     if dt != []:
                        dt, dow = dt[0]
                        known_date = datetime.datetime.strptime(dt,
                            "%A, %B %d, %Y")

                # Lines that do not begin with a number are "non-standard";
                # those containing a blacklisted phrase are dropped.
                non_std = False
                if re.match("(\s+)?\d+.*", line) is None:
                    non_std = True
                    l = line.lower().strip()
                    skip = False
                    blacklist = [
                        "house",
                        "page",
                        "general assembly",
                        "state of colorado",
                        "session",
                        "legislative day"
                    ]
                    for thing in blacklist:
                        if thing in l:
                            skip = True
                    if skip:
                        continue

                # Remember the most recent bill id mentioned anywhere.
                found = re.findall(
                    "(?P<bill_id>(H|S|SJ|HJ)(B|M|R)\d{2}-\d{3,4})",
                    line
                )
                if found != []:
                    found = found[0]
                    cur_bill_id, chamber, typ = found

                try:
                    # Standard lines lose their first token (presumably the
                    # journal's line number -- TODO confirm).  A line with
                    # only that token cannot be unpacked and raises
                    # ValueError, which ends any question or vote in
                    # progress.
                    if not non_std:
                        _, line = line.strip().split(" ", 1)
                    line = line.strip()
                except ValueError:
                    in_vote = False
                    in_question = False
                    continue

                # While inside a question, keep appending lines to it.
                if in_question:
                    cur_question += " " + line.strip()
                    continue

                # Phrases that start a new question.
                if ("The question being" in line) or \
                   ("On motion of" in line) or \
                   ("the following" in line) or \
                   ("moved that the" in line):
                    cur_question = line.strip()
                    in_question = True


                if in_vote:
                    # NOTE(review): this flag is overwritten with False on
                    # the very next statement, so the blank-line check has
                    # no effect as written.
                    if line == "":
                        likely_garbage = True

                    likely_garbage = False
                    if "co-sponsor" in line.lower():
                        likely_garbage = True

                    if 'the speaker' in line.lower():
                        likely_garbage = True

                    votes = re.findall(votes_re, line)
                    if likely_garbage:
                        votes = []

                    for person, _, v in votes:
                        cur_vote[person] = v

                    # An entry for "speaker" marks the last roll-call line.
                    last_line = False
                    for who, _, vote in votes:
                        if who.lower() == "speaker":
                            last_line = True

                    if votes == [] or last_line:
                        in_vote = False
                        # save vote
                        yes, no, other = cur_vote_count
                        # NOTE(review): skipping here leaves cur_vote
                        # populated, so these names carry over into the
                        # next vote -- confirm whether that is intended.
                        if cur_bill_id is None or cur_question is None:
                            continue

                        # Chamber of the bill, from the id's first letter.
                        bc = {
                            "H": "lower",
                            "S": "upper",
                            "J": "joint"
                        }[cur_bill_id[0].upper()]

                        vote = Vote('lower',
                                    known_date,
                                    cur_question,
                                    (yes > no),
                                    yes,
                                    no,
                                    other,
                                    session=session,
                                    bill_id=cur_bill_id,
                                    bill_chamber=bc)

                        vote.add_source(href.attrib['href'])
                        vote.add_source(url)

                        for person in cur_vote:
                            if person is None:
                                continue

                            vot = cur_vote[person]

                            # A trailing Y/N/E on the captured name is
                            # treated as the vote code and stripped off.
                            if person.endswith("Y"):
                                vot = "Y"
                                person = person[:-1]
                            if person.endswith("N"):
                                vot = "N"
                                person = person[:-1]
                            if person.endswith("E"):
                                vot = "E"
                                person = person[:-1]

                            if vot == 'Y':
                                vote.yes(person)
                            elif vot == 'N':
                                vote.no(person)
                            elif vot == 'E' or vot == '-':
                                vote.other(person)

                        self.save_vote(vote)

                        # Reset the state for the next vote.
                        cur_vote = {}
                        in_question = False
                        cur_question = None
                        in_vote = False
                        cur_vote_count = None
                        continue

                # A totals line (yes, no, excused, absent) starts a vote;
                # excused and absent are combined into "other".
                summ = vote_re.findall(line)
                if summ == []:
                    continue
                summ = summ[0]
                yes, no, exc, ab = summ
                yes, no, exc, ab = \
                        int(yes), int(no), int(exc), int(ab)
                other = exc + ab
                cur_vote_count = (yes, no, other)
                in_vote = True
                continue
            # Remove the downloaded PDF once it has been processed.
            os.unlink(path)