def test_fix_bill_id():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')

    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')

    oi.import_data([org1.as_dict()])

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                         start_date='1900-04-02', classification='passage:bill',
                         result='fail', bill_chamber='lower', bill='HB1',
                         identifier='4',
                         bill_action='passage',
                         organization=org1._id)

    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve.as_dict(),
    ])

    IMPORT_TRANSFORMERS['bill'] = {}

    ve = VoteEvent.objects.get()
    ve.bill.identifier == 'HB 1'
Пример #2
0
    def scrape_senate_vote(self, bill, url, date):
        try:
            filename, resp = self.urlretrieve(url)
        except scrapelib.HTTPError:
            self.warning("missing vote file %s" % url)
            return

        vote = Vote(
            chamber='upper',
            start_date=date.strftime("%Y-%m-%d"),
            motion_text='Passage',
            # setting 'fail' for now.
            result='fail',
            classification='passage',
            bill=bill
        )
        vote.add_source(url)

        text = convert_pdf(filename, 'text').decode('utf-8')
        os.remove(filename)

        if re.search('Yea:\s+\d+\s+Nay:\s+\d+\s+Absent:\s+\d+', text):
            yield from self.scrape_senate_vote_3col(bill, vote, text, url, date)
            return

        data = re.split(r'(Yea|Nay|Absent)s?:', text)[::-1]
        data = filter(None, data)
        keymap = dict(yea='yes', nay='no')
        actual_vote = collections.defaultdict(int)
        vote_count = {
            'yes': 0,
            'no': 0,
            'other': 0
        }
        while True:
            if not data:
                break
            vote_val = data.pop()
            key = keymap.get(vote_val.lower(), 'other')
            values = data.pop()
            for name in re.split(r'(?:[\s,]+and\s|[\s,]{2,})', values):
                if name.lower().strip() == 'none.':
                    continue
                name = name.replace('..', '')
                name = re.sub(r'\.$', '', name)
                name = name.strip('-1234567890 \n')
                if not name:
                    continue
                vote.vote(key, name)
                actual_vote[vote_val] += 1
                vote_count[key] += 1
            assert actual_vote[vote_val] == vote_count[key]

        for key, value in vote_count.items():
            vote.set_count(key, value)
        # updating result with actual value
        vote.result = 'pass' if vote_count['yes'] > (vote_count['no'] +
                                                     vote_count['other']) else 'fail'

        yield vote
Пример #3
0
    def add_vote(self, bill, chamber, date, text, url):
        votes = re.findall(r'Ayes,?[\s]?(\d+)[,;]\s+N(?:oes|ays),?[\s]?(\d+)', text)
        yes, no = int(votes[0][0]), int(votes[0][1])

        vtype = 'other'
        for regex, type in motion_classifiers.items():
            if re.match(regex, text):
                vtype = type
                break

        v = VoteEvent(
            chamber=chamber,
            start_date=TIMEZONE.localize(date),
            motion_text=text,
            result='pass' if yes > no else 'fail',
            classification=vtype,
            bill=bill,
        )
        v.pupa_id = url.split('/')[-1]
        v.set_count('yes', yes)
        v.set_count('no', no)

        # fetch the vote itself
        if url:
            v.add_source(url)

            if 'av' in url:
                self.add_house_votes(v, url)
            elif 'sv' in url:
                self.add_senate_votes(v, url)

        return v
Пример #4
0
 def _get_votes(self, date, actor, action, bill, url):
     vre = r'(?P<leader>.*)(AYES|YEAS):\s+(?P<yeas>\d+)\s+(NOES|NAYS):\s+(?P<nays>\d+).*'
     if 'YEAS' in action.upper() or 'AYES' in action.upper():
         match = re.match(vre, action)
         if match:
             v = match.groupdict()
             yes, no = int(v['yeas']), int(v['nays'])
             vote = VoteEvent(
                 chamber=actor,
                 motion_text=v['leader'],
                 result='pass' if yes > no else 'fail',
                 classification='passage',
                 start_date=TIMEZONE.localize(date),
                 bill=bill,
             )
             vote.add_source(url)
             yield vote
Пример #5
0
    def get_vote_event(self, bill, act, votes, result):
        '''Make VoteEvent object from given Bill, action, votes and result.'''
        organization = json.loads(act['organization_id'].lstrip('~'))
        vote_event = VoteEvent(legislative_session=bill.legislative_session,
                               motion_text=act['description'],
                               organization=organization,
                               classification=None,
                               start_date=act['date'],
                               result=result,
                               bill=bill)

        legistar_web, legistar_api = [src['url'] for src in bill.sources]

        vote_event.add_source(legistar_web)
        vote_event.add_source(legistar_api + '/histories')

        for vote in votes:
            raw_option = vote['VoteValueName'].lower()

            if raw_option == 'suspended':
                continue

            clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
            vote_event.vote(clean_option, vote['VotePersonName'].strip())

        return vote_event
Пример #6
0
def test_vote_event_identifier_dedupe():
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    j.legislative_sessions.create(name='1900', identifier='1900')

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 identifier='Roll Call No. 1')
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, dmi)

    _, what = VoteEventImporter('jid', dmi, dmi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, dmi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, dmi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new bill, insert
    vote_event.identifier = 'Roll Call 2'
    _, what = VoteEventImporter('jid', dmi, dmi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_vote_event_pupa_identifier_dedupe():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(id='org-id', name='Legislature',
                                classification='legislature',
                                jurisdiction=j)

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 identifier='Roll Call No. 1')
    vote_event.pupa_id = 'foo'

    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    bi = BillImporter('jid', dmi, oi)

    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new bill identifier, update
    vote_event.identifier = 'First Roll Call'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new pupa identifier, insert
    vote_event.pupa_id = 'bar'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
Пример #8
0
def test_full_vote_event():
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    j.legislative_sessions.create(name='1900', identifier='1900')
    sp1 = ScrapePerson('John Smith', primary_org='lower')
    sp2 = ScrapePerson('Adam Smith', primary_org='lower')
    org = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org._id)
    vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                 start_date='1900-04-01', classification='passage:bill',
                                 result='pass', bill_chamber='lower', bill='HB 1',
                                 organization=org._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp1.as_dict(), sp2.as_dict()])

    mi = MembershipImporter('jid', pi, oi, DumbMockImporter())
    mi.import_data([sp1._related[0].as_dict(), sp2._related[0].as_dict()])

    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])

    VoteEventImporter('jid', pi, oi, bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill == Bill.objects.get()
    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20
    votes = list(ve.votes.all())
    assert len(votes) == 2
    for v in ve.votes.all():
        if v.voter_name == 'John Smith':
            assert v.option == 'yes'
            assert v.voter == Person.objects.get(name='John Smith')
        else:
            assert v.option == 'no'
            assert v.voter == Person.objects.get(name='Adam Smith')
def test_vote_event_bill_actions_two_stage():
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id)

    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    # disambiguate them
    ve1.pupa_id = 'one'
    ve2.pupa_id = 'two'

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
Пример #10
0
def viva_voce_votes(root, session, chamber):
    for el in root.xpath(u'//div[starts-with(., "All Members are deemed")]'):
        mv = MaybeViva(el)
        if not mv.is_valid:
            continue

        v = VoteEvent(
            chamber=chamber,
            start_date=None,
            motion_text='passage' if mv.passed else 'other',
            result='pass' if mv.passed else 'fail',
            classification='passage' if mv.passed else 'other',
            legislative_session=session[0:2],
            bill=mv.bill_id,
            bill_chamber=mv.chamber
        )

        v.set_count('yes', 0)
        v.set_count('no', 0)
        v.set_count('absent', 0)
        v.set_count('not voting', 0)

        yield v
Пример #11
0
def test_vote_event_bill_id_dedupe():
    j = create_jurisdiction()
    session = j.legislative_sessions.create(name='1900', identifier='1900')
    org = Organization.objects.create(id='org-id', name='House', classification='lower',
                                      jurisdiction=j)
    bill = Bill.objects.create(id='bill-1', identifier='HB 1', legislative_session=session,
                               from_organization=org)
    bill2 = Bill.objects.create(id='bill-2', identifier='HB 2', legislative_session=session,
                                from_organization=org)

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 bill=bill.identifier, bill_chamber='lower',
                                 chamber='lower')
    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    bi = BillImporter('jid', dmi, oi)

    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new vote event, insert
    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 bill=bill2.identifier, bill_chamber='lower',
                                 chamber='lower')
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
Пример #12
0
def test_vote_event_bill_clearing():
    # ensure that we don't wind up with vote events sitting around forever on bills as
    # changes make it look like there are multiple vote events
    j = create_jurisdiction()
    session = j.legislative_sessions.create(name='1900', identifier='1900')
    org = Organization.objects.create(id='org-id', name='House', classification='lower',
                                      jurisdiction=j)
    bill = Bill.objects.create(id='bill-1', identifier='HB 1', legislative_session=session,
                               from_organization=org)
    Bill.objects.create(id='bill-2', identifier='HB 2', legislative_session=session,
                        from_organization=org)
    oi = OrganizationImporter('jid')
    dmi = DumbMockImporter()
    bi = BillImporter('jid', dmi, oi)

    vote_event1 = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                  classification='anything', result='passed',
                                  motion_text='a vote on somthing',             # typo intentional
                                  bill=bill.identifier, bill_chamber='lower',
                                  chamber='lower'
                                  )
    vote_event2 = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                  classification='anything', result='passed',
                                  motion_text='a vote on something else',
                                  bill=bill.identifier, bill_chamber='lower',
                                  chamber='lower'
                                  )

    # have to use import_data so postimport is called
    VoteEventImporter('jid', dmi, oi, bi).import_data([
        vote_event1.as_dict(),
        vote_event2.as_dict()
    ])
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 vote events now
    vote_event1.motion_text = 'a vote on something'
    VoteEventImporter('jid', dmi, oi, bi).import_data([
        vote_event1.as_dict(),
        vote_event2.as_dict()
    ])
    assert VoteEvent.objects.count() == 2
Пример #13
0
    def createVoteEvent(self, motion, agenda_item_version):
        version = agenda_item_version
        date = self.toDate(version['date'])
        v = VoteEvent(
            motion_text=motion['title_text'],
            result=RESULT_MAP[motion['result']],
            classification=motion['action'],
            start_date=date,
            legislative_session=version['session'],
        )

        if motion['mover']:
            v.extras['mover'] = motion['mover']
        if motion['body_text']:
            v.extras['body'] = motion['body_text']

        v.set_bill(version['bill_identifier'])
        v.add_source(version['url'])

        return v
Пример #14
0
    def process_committee_vote(self, committee_action, bill):
        try:
            date = committee_action["ActionDate"]
            vote_info = committee_action["Vote"]

        except KeyError:
            self.logger.warning("Committee vote has no data. Skipping.")
            return
        date = self.date_format(date)

        other_count = 0
        for v in vote_info:
            vote_count = 0 if v["VoteCount"] == "" else int(v["VoteCount"])

            if v["VoteType"] == "Yes":
                yes_count = vote_count
            elif v["VoteType"] == "No":
                no_count = vote_count
            else:
                other_count += vote_count

        result = 'fail'
        if yes_count > no_count:
            result = 'pass'

        v = VoteEvent(chamber='legislature',
                      start_date=date,
                      motion_text='Committee Vote',
                      result=result,
                      classification='committee',
                      bill=bill
                      )
        v.set_count('yes', yes_count)
        v.set_count('no', no_count)
        v.set_count('other', other_count)

        return v
Пример #15
0
    def scrape_vote(self, bill, vote_json, session):

        if vote_json['amendmentNumber']:
            motion = '{}: {}'.format(
                vote_json['amendmentNumber'], vote_json['action'])
        else:
            motion = vote_json['action']

        result = 'pass' if vote_json['yesVotesCount'] > vote_json['noVotesCount'] else 'fail'

        v = VoteEvent(
            chamber=self.chamber_abbrev_map[vote_json['chamber']],
            start_date=self.parse_local_date(vote_json['voteDate']),
            motion_text=motion,
            result=result,
            legislative_session=session,
            bill=bill,
            classification='other',
        )

        v.set_count(option='yes', value=vote_json['yesVotesCount'])
        v.set_count('no', vote_json['noVotesCount'])
        v.set_count('absent', vote_json['absentVotesCount'])
        v.set_count('excused', vote_json['excusedVotesCount'])
        v.set_count('other', vote_json['conflictVotesCount'])

        for name in vote_json['yesVotes'].split(','):
            if name.strip():
                v.yes(name.strip())

        for name in vote_json['noVotes'].split(','):
            if name.strip():
                v.no(name.strip())

        # add votes with other classifications
        # option can be 'yes', 'no', 'absent',
        # 'abstain', 'not voting', 'paired', 'excused'
        for name in vote_json['absentVotes'].split(','):
            if name.strip():
                v.vote(option="absent",
                       voter=name)

        for name in vote_json['excusedVotes'].split(','):
            if name.strip():
                v.vote(option="excused",
                       voter=name)

        for name in vote_json['conflictVotes'].split(','):
            if name.strip():
                v.vote(option="other",
                       voter=name)

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
            session, vote_json['billNumber'])
        v.add_source(source_url)

        yield v
Пример #16
0
    def parse_vote_pdf(self, vote_url, bill):

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type='text').decode()
        lines = text.splitlines()

        if 'Senate' in vote_url:
            chamber = 'upper'
        else:
            chamber = 'lower'

        date_string = lines[0].split('Calendar Date:')[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        page_index = None
        for index, line in enumerate(lines):
            if 'Yeas' in line and 'Nays' in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ['yes', 'no', 'not voting', 'excused', 'absent']

        if page_index:

            counts = re.split(r'\s{2,}', lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(' ', 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ['Consent Calendar' in line for line in lines[:page_index]])
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r'\s{2,}', lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r'\s{2,}',
                                              lines[page_index - 1].strip())
            assert consent_calendar_bills, "Could not find bills for consent calendar vote"

        motion_keywords = [
            'favorable', 'reading', 'amendment', 'motion', 'introduced',
            'bill pass', 'committee'
        ]
        motion_lines = [
            3, 2, 4, 5
        ]  # Relative LineNumbers to be checked for existence of motion

        for i in motion_lines:
            if any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
                break
            motion = re.split(r'\s{2,}', lines[page_index - i].strip())[0]
        else:
            if not any(motion_keyword in motion.lower()
                       for motion_keyword in motion_keywords):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not any(motion_keyword in motion.lower()
                       for motion_keyword in motion_keywords):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            classification='passage',
            result='pass' if passed else 'fail',
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = '{}#{}'.format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            'Voting Nay', 'Not Voting', 'COPY', 'Excused',
            'indicates vote change', 'Indicates Vote Change'
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            if not current_line or 'Voting Yea' in current_line:
                page_index += 1
                continue

            if any(show_stopper in current_line
                   for show_stopper in show_stoppers):
                page_index += 1
                vote_index = (vote_index + 1)
                continue

            names = re.split(r'\s{2,}', current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Пример #17
0
    def scrape_vote(self, chamber, session, bill_id, vote_url):
        NO_VOTE_URL = 'http://www.house.leg.state.mn.us/votes/novotefound.asp'
        resp = self.get(vote_url)
        html = resp.text

        # sometimes the link is broken, will redirect to NO_VOTE_URL
        if resp.url == NO_VOTE_URL:
            return

        doc = lxml.html.fromstring(html)
        try:
            motion = doc.xpath("//div[@id='leg_PageContent']/div/h2/text()")[0]
        except IndexError:
            self.logger.warning("Bill was missing a motion number, skipping")
            return

        vote_count = doc.xpath(
            ".//div[@id='leg_PageContent']/div/h3/text()")[1].split()
        yeas = int(vote_count[0])
        nays = int(vote_count[3])

        # second paragraph has date
        paragraphs = doc.xpath(".//div[@id='leg_PageContent']/div/p/text()")
        date = None
        for p in paragraphs:
            try:
                date = datetime.datetime.strptime(p.strip(), '%m/%d/%Y').date()
                break
            except ValueError:
                pass
        if date is None:
            self.logger.warning("No date could be found for vote on %s" %
                                motion)
            return

        vote = VoteEvent(chamber='lower',
                         start_date=date,
                         motion_text=motion,
                         result='pass' if yeas > nays else 'fail',
                         classification='passage',
                         legislative_session=session,
                         bill=bill_id,
                         bill_chamber=chamber)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.add_source(vote_url)

        # first table has YEAs
        for name in doc.xpath('//table[1]/tr/td/font/text()'):
            vote.yes(name.strip())

        # second table is nays
        for name in doc.xpath('//table[2]/tr/td/font/text()'):
            vote.no(name.strip())

        yield vote
Пример #18
0
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[:4], bill_id.replace(' ', '-'))
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
                session[-4:], bill_id.replace(' ', '-'))
            html = self.get(url).text
            if ('Page Not Found' in html or
                    'The bill you are looking for is not available yet' in html):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute('http://legislature.mi.gov')

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u'\xa0', ' ')
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = (
                    'primary'
                    if sponsor.tail and 'primary' in sponsor.tail
                    else 'cosponsor'
                )
            else:
                classification = 'primary'
            bill.add_sponsorship(
                name=name,
                chamber=chamber,
                entity_type='person',
                primary=classification == 'primary',
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE)
            if submatch and tds[2].xpath('a'):
                version_url = tds[2].xpath('a/@href')[0]
                version_name = tds[2].xpath('a/text()')[0].strip()
                version_name = 'Substitute {}'.format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif version_url.lower().endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(version_name, version_url, media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath('a/@href')
                if journal_link:
                    objectname = journal_link[0].rsplit('=', 1)[-1]
                    chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                    vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                        session, chamber_name, objectname)
                    results = self.parse_roll_call(vote_url, rc_num)

                    if results is not None:
                        vote_passed = len(results['yes']) > len(results['no'])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result='pass' if vote_passed else 'fail',
                            classification='passage',
                        )

                        # check the expected counts vs actual
                        count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results['yes']):
                            self.warning('vote count mismatch for %s %s, %d != %d' %
                                         (bill_id, action, count, len(results['yes'])))
                        count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results['no']):
                            self.warning('vote count mismatch for %s %s, %d != %d' %
                                         (bill_id, action, count, len(results['no'])))

                        vote.set_count('yes', len(results['yes']))
                        vote.set_count('no', len(results['no']))
                        vote.set_count('other', len(results['other']))

                        for name in results['yes']:
                            vote.yes(name)
                        for name in results['no']:
                            vote.no(name)
                        for name in results['other']:
                            vote.vote('other', name)

                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith('.pdf'):
                    mimetype = 'application/pdf'
                elif url.endswith('.htm'):
                    mimetype = 'text/html'
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
    def scrape_vote(self, bill, name, url):
        if "VOTE/H" in url:
            vote_chamber = 'lower'
            cols = (1, 5, 9, 13)
            name_offset = 3
            yes_offset = 0
            no_offset = 1
        else:
            vote_chamber = 'upper'
            cols = (1, 6)
            name_offset = 4
            yes_offset = 1
            no_offset = 2

        # Connecticut's SSL is causing problems with Scrapelib, so use Requests
        page = requests.get(url, verify=False).text

        if 'BUDGET ADDRESS' in page:
            return

        page = lxml.html.fromstring(page)

        yes_count = page.xpath(
            "string(//span[contains(., 'Those voting Yea')])")
        yes_count = int(re.match(r'[^\d]*(\d+)[^\d]*', yes_count).group(1))

        no_count = page.xpath(
            "string(//span[contains(., 'Those voting Nay')])")
        no_count = int(re.match(r'[^\d]*(\d+)[^\d]*', no_count).group(1))

        other_count = page.xpath("string(//span[contains(., 'Those absent')])")
        other_count = int(re.match(r'[^\d]*(\d+)[^\d]*', other_count).group(1))

        need_count = page.xpath("string(//span[contains(., 'Necessary for')])")
        need_count = int(re.match(r'[^\d]*(\d+)[^\d]*', need_count).group(1))

        date = page.xpath("string(//span[contains(., 'Taken on')])")
        date = re.match(r'.*Taken\s+on\s+(\d+/\s?\d+)', date).group(1)
        date = date.replace(' ', '')
        date = datetime.datetime.strptime(
            date + " " + bill.legislative_session, "%m/%d %Y").date()

        # not sure about classification.
        vote = Vote(chamber=vote_chamber,
                    start_date=date,
                    motion_text=name,
                    result='pass' if yes_count > need_count else 'fail',
                    classification='passage',
                    bill=bill)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)

        table = page.xpath("//table")[0]
        for row in table.xpath("tr"):
            for i in cols:
                name = row.xpath("string(td[%d])" % (i + name_offset)).strip()

                if not name or name == 'VACANT':
                    continue

                if "Y" in row.xpath("string(td[%d])" % (i + yes_offset)):
                    vote.yes(name)
                elif "N" in row.xpath("string(td[%d])" % (i + no_offset)):
                    vote.no(name)
                else:
                    vote.vote('other', name)

        yield vote
    def scrape_votes(self, bill_page, page_url, bill, insert, year):
        root = lxml.html.fromstring(bill_page)
        trs = root.xpath('/html/body/div/table[6]//tr')
        assert len(trs) >= 1, "Didn't find the Final Passage Votes' table"

        for tr in trs[1:]:
            links = tr.xpath('td/a[contains(text(), "Passage")]')
            if len(links) == 0:
                self.warning("Non-passage vote found for {}; ".format(bill.identifier) +
                             "probably a motion for the calendar. It will be skipped.")
            else:
                assert len(links) == 1, \
                    "Too many votes found for XPath query, on bill {}".format(bill.identifier)
                link = links[0]

            motion = link.text
            if 'Assembly' in motion:
                chamber = 'lower'
            else:
                chamber = 'upper'

            votes = {}
            tds = tr.xpath('td')
            for td in tds:
                if td.text:
                    text = td.text.strip()
                    date = re.match('... .*?, ....', text)
                    count = re.match('(?P<category>.*?) (?P<votes>[0-9]+)[,]?', text)
                    if date:
                        vote_date = datetime.strptime(text, '%b %d, %Y')
                    elif count:
                        votes[count.group('category')] = int(count.group('votes'))

            yes = votes['Yea']
            no = votes['Nay']
            excused = votes['Excused']
            not_voting = votes['Not Voting']
            absent = votes['Absent']
            other = excused + not_voting + absent
            passed = yes > no

            vote = VoteEvent(chamber=chamber, start_date=self._tz.localize(vote_date),
                             motion_text=motion, result='pass' if passed else 'fail',
                             classification='passage', bill=bill,
                             )
            vote.set_count('yes', yes)
            vote.set_count('no', no)
            vote.set_count('other', other)
            vote.set_count('not voting', not_voting)
            vote.set_count('absent', absent)
            # try to get vote details
            try:
                vote_url = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link.get('href'))
                vote.pupa_id = vote_url
                vote.add_source(vote_url)

                if vote_url in self._seen_votes:
                    self.warning('%s is included twice, skipping second', vote_url)
                    continue
                else:
                    self._seen_votes.add(vote_url)

                page = self.get(vote_url).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                for el in root.xpath('//table[2]/tr'):
                    tds = el.xpath('td')
                    name = tds[1].text_content().strip()
                    vote_result = tds[2].text_content().strip()

                    if vote_result == 'Yea':
                        vote.yes(name)
                    elif vote_result == 'Nay':
                        vote.no(name)
                    else:
                        vote.vote('other', name)
                vote.add_source(page_url)
            except scrapelib.HTTPError:
                self.warning("failed to fetch vote page, adding vote without details")

            yield vote
Пример #21
0
    def scrape_votes(self, session, zip_url):
        votes = {}
        last_line = []

        for line in self.zf.open('tblrollcallsummary.txt'):
            if line.strip() == "":
                continue

            line = line.split('|')
            if len(line) < 14:
                if len(last_line + line[1:]) == 14:
                    line = last_line
                    self.warning('used bad vote line')
                else:
                    last_line = line
                    self.warning('bad vote line %s' % '|'.join(line))
            session_yr = line[0]
            body = line[1]
            vote_num = line[2]
            timestamp = line[3]
            bill_id = line[4].strip()
            yeas = int(line[5])
            nays = int(line[6])
            # present = int(line[7])
            # absent = int(line[8])
            motion = line[11].strip() or '[not available]'

            if session_yr == session and bill_id in self.bills_by_id:
                actor = 'lower' if body == 'H' else 'upper'
                time = dt.datetime.strptime(timestamp,
                                            '%m/%d/%Y %I:%M:%S %p')
                # TODO: stop faking passed somehow
                passed = yeas > nays
                vote = Vote(chamber=actor,
                            start_date=time.strftime("%Y-%m-%d"),
                            motion_text=motion,
                            result='pass' if passed else 'fail',
                            classification='passage',
                            bill=self.bills_by_id[bill_id])
                vote.set_count('yes', yeas)
                vote.set_count('no', nays)
                vote.add_source(zip_url)
                votes[body+vote_num] = vote

        for line in self.zf.open('tblrollcallhistory.txt'):
            # 2012    | H   | 2    | 330795  | HB309  | Yea |1/4/2012 8:27:03 PM
            session_yr, body, v_num, employee, bill_id, vote, date \
                    = line.split('|')

            if not bill_id:
                continue

            if session_yr == session and bill_id.strip() in self.bills_by_id:
                try:
                    leg = self.legislators[employee]['name']
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)
                    continue

                vote = vote.strip()
                if body+v_num not in votes:
                    self.warning("Skipping processing this vote:")
                    self.warning("Bad ID: %s" % (body+v_num))
                    continue
                other_count = 0
                # code = self.legislators[employee]['seat']
                if vote == 'Yea':
                    votes[body+v_num].yes(leg)
                elif vote == 'Nay':
                    votes[body+v_num].no(leg)
                else:
                    votes[body+v_num].other(leg)
                    other_count += 1
                votes[body+v_num].set_count('other', other_count)
        for vote in votes.values():
            yield vote
Пример #22
0
    def process_vote(self, votes, url, base_url, bill, legislators,
                     chamber_dict, vote_results):
        for v in votes["items"]:
            try:
                v["yeas"]
            except KeyError:
                # sometimes the actual vote is buried a second layer deep
                v = self.get(base_url + v["link"]).json()
                try:
                    v["yeas"]
                except KeyError:
                    self.logger.warning("No vote info available, skipping")
                    continue

            try:
                chamber = chamber_dict[v["chamber"]]
            except KeyError:
                chamber = "lower" if "house" in v["apn"] else "upper"
            try:
                date = self._tz.localize(
                    datetime.datetime.strptime(v["date"], "%m/%d/%y"))
                date = "{:%Y-%m-%d}".format(date)
            except KeyError:
                try:
                    date = self._tz.localize(
                        datetime.datetime.strptime(v["occurred"], "%m/%d/%y"))
                    date = "{:%Y-%m-%d}".format(date)
                except KeyError:
                    self.logger.warning("No date found for vote, skipping")
                    continue
            try:
                motion = v["action"]
            except KeyError:
                motion = v["motiontype"]

            # Sometimes Ohio's SOLAR will only return part of the JSON, so in that case skip
            if (not motion and isinstance(v['yeas'], str)
                    and isinstance(v['nays'], str)):
                waringText = 'Malformed JSON found for vote ("revno" of {}); skipping'
                self.warning(waringText.format(v['revno']))
                continue

            result = v.get("results") or v.get("passed")
            if result is None:
                if len(v['yeas']) > len(v['nays']):
                    result = "passed"
                else:
                    result = "failed"

            passed = vote_results[result.lower()]
            if "committee" in v:
                vote = VoteEvent(
                    chamber=chamber,
                    start_date=date,
                    motion_text=motion,
                    result='pass' if passed else 'fail',
                    # organization=v["committee"],
                    bill=bill,
                    classification='passed')
            else:
                vote = VoteEvent(chamber=chamber,
                                 start_date=date,
                                 motion_text=motion,
                                 result='pass' if passed else 'fail',
                                 classification='passed',
                                 bill=bill)
            # Concatenate the bill identifier and vote identifier to avoid collisions
            vote.pupa_id = '{}:{}'.format(bill.identifier.replace(' ', ''),
                                          v['revno'])
            # the yea and nay counts are not displayed, but vote totals are
            # and passage status is.
            yes_count = 0
            no_count = 0
            absent_count = 0
            excused_count = 0
            for voter_id in v["yeas"]:
                vote.yes(legislators[voter_id])
                yes_count += 1
            for voter_id in v["nays"]:
                vote.no(legislators[voter_id])
                no_count += 1
            if "absent" in v:
                for voter_id in v["absent"]:
                    vote.vote('absent', legislators[voter_id])
                    absent_count += 1
            if "excused" in v:
                for voter_id in v["excused"]:
                    vote.vote('excused', legislators[voter_id])
                    excused_count += 1

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('absent', absent_count)
            vote.set_count('excused', excused_count)
            # check to see if there are any other things that look
            # like vote categories, throw a warning if so
            for key, val in v.items():
                if (type(val) == list and len(val) > 0
                        and key not in ["yeas", "nays", "absent", "excused"]):
                    if val[0] in legislators:
                        self.logger.warning(
                            "{k} looks like a vote type that's not being counted."
                            " Double check it?".format(k=key))
            vote.add_source(url)

            yield vote
    def scrape_vote(self, bill, date, url):
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        header = page.xpath("string(//h4[contains(@id, 'hdVote')])")

        if 'No Bill Action' in header:
            self.warning("bad vote header -- skipping")
            return
        location = header.split(', ')[1]

        if location.startswith('House'):
            chamber = 'lower'
        elif location.startswith('Senate'):
            chamber = 'upper'
        elif location.startswith('Joint'):
            chamber = 'legislature'
        else:
            raise ScrapeError("Bad chamber: %s" % location)

        # committee = ' '.join(location.split(' ')[1:]).strip()
        # if not committee or committee.startswith('of Representatives'):
        #     committee = None

        motion = ', '.join(header.split(', ')[2:]).strip()
        if motion:
            # If we can't detect a motion, skip this vote
            yes_count = int(
                page.xpath("string(//td[contains(@id, 'tdAyes')])"))
            no_count = int(
                page.xpath("string(//td[contains(@id, 'tdNays')])"))
            excused_count = int(
                page.xpath("string(//td[contains(@id, 'tdExcused')])"))
            absent_count = int(
                page.xpath("string(//td[contains(@id, 'tdAbsent')])"))

            passed = yes_count > no_count

            if motion.startswith('Do Pass'):
                type = 'passage'
            elif motion == 'Concurred in amendments':
                type = 'amendment'
            elif motion == 'Veto override':
                type = 'veto_override'
            else:
                type = 'other'

            vote = VoteEvent(chamber=chamber,
                             start_date=date,
                             motion_text=motion,
                             result='pass' if passed else 'fail',
                             classification=type,
                             bill=bill
                             )
            vote.pupa_id = url      # vote id is in URL

            vote.add_source(url)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', excused_count)
            vote.set_count('absent', absent_count)

            for td in page.xpath("//table[contains(@id, 'tblVotes')]/tr/td"):
                if td.text in ('Aye', 'Yea'):
                    vote.yes(td.getprevious().text.strip())
                elif td.text == 'Nay':
                    vote.no(td.getprevious().text.strip())
                elif td.text == 'Excused':
                    vote.vote('excused', td.getprevious().text.strip())
                elif td.text == 'Absent':
                    vote.vote('absent', td.getprevious().text.strip())

            yield vote
Пример #24
0
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[:4],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if ("Page Not Found" in html or
                "The bill you are looking for is not available yet" in html):
            url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
                session[-4:],
                bill_id.replace(" ", "-"),
            )
            html = self.get(url).text
            if ("Page Not Found" in html
                    or "The bill you are looking for is not available yet"
                    in html):
                self.warning(
                    "Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute("http://legislature.mi.gov")

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(" ")[0][1:]]

        bill = Bill(bill_id,
                    session,
                    title,
                    chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u"\xa0", " ")
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = ("primary"
                                  if sponsor.tail and "primary" in sponsor.tail
                                  else "cosponsor")
            else:
                classification = "primary"
            bill.add_sponsorship(
                name=name.strip(),
                chamber=chamber,
                entity_type="person",
                primary=classification == "primary",
                classification=classification,
            )

        bill.subject = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath("td")  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(
                datetime.datetime.strptime(date, "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = "upper" if "SJ" in journal else "lower"
            classification = categorize_action(action)
            bill.add_action(action,
                            date,
                            chamber=actor,
                            classification=classification)

            # check if action mentions a sub
            submatch = re.search(r"WITH SUBSTITUTE\s+([\w\-\d]+)", action,
                                 re.IGNORECASE)
            if submatch and tds[2].xpath("a"):
                version_url = tds[2].xpath("a/@href")[0]
                version_name = tds[2].xpath("a/text()")[0].strip()
                version_name = "Substitute {}".format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith(".pdf"):
                    mimetype = "application/pdf"
                elif version_url.lower().endswith(".htm"):
                    mimetype = "text/html"
                bill.add_version_link(version_name,
                                      version_url,
                                      media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath("a/@href")
                if journal_link:
                    objectname = journal_link[0].rsplit("=", 1)[-1]
                    chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                    vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                        session,
                        chamber_name,
                        objectname,
                    )
                    results = self.parse_roll_call(vote_url, rc_num, session)

                    if results is not None:
                        vote_passed = len(results["yes"]) > len(results["no"])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result="pass" if vote_passed else "fail",
                            classification="passage",
                        )

                        # check the expected counts vs actual
                        count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["yes"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d" %
                                (bill_id, action, count, len(results["yes"])))
                        count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["no"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d" %
                                (bill_id, action, count, len(results["no"])))

                        vote.set_count("yes", len(results["yes"]))
                        vote.set_count("no", len(results["no"]))
                        vote.set_count("other", len(results["other"]))
                        possible_vote_results = ["yes", "no", "other"]
                        for pvr in possible_vote_results:
                            for name in results[pvr]:
                                if session == "2017-2018":
                                    names = name.split("\t")
                                    for n in names:
                                        vote.vote(pvr, name.strip())
                                else:
                                    # Prevents voter names like "House Bill No. 4451, entitled" and other sentences
                                    if len(name.split()) < 5:
                                        vote.vote(pvr, name.strip())
                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith(".pdf"):
                    mimetype = "application/pdf"
                elif url.endswith(".htm"):
                    mimetype = "text/html"
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
Пример #25
0
    def _build_lower_votes(self):
        url = self.shared_url + '&Floor%26nbspVotes=Y'
        self.urls.add(votes=url)
        self.bill.add_source(url)
        doc = self.urls.votes.doc
        if doc is None:
            return

        # Grab bill information.
        try:
            pre = doc.xpath('//pre')[0].text_content().strip()

            no_votes = 'There are no votes for this bill in this legislative '

            if pre == no_votes:
                raise ValueError('No votes for this bill.')
        # Skip bill if votes can't be found.
        except (IndexError, ValueError):
            return

        for table in doc.xpath('//table'):

            date = table.xpath('caption/span[contains(., "DATE:")]')
            date = next(date[0].itersiblings()).text
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            date = date.replace(tzinfo=timezone('UTC'))

            spanText = table.xpath('caption/span/text()')
            motion = spanText[2].strip() + spanText[3].strip()

            votes = table.xpath('caption/span/span')[0].text.split(
                ':')[1].split('/')
            yes_count, no_count = map(int, votes)
            passed = yes_count > no_count
            vote = VoteEvent(chamber='lower',
                             start_date=date,
                             motion_text=motion,
                             bill=self.bill,
                             result='pass' if passed else 'fail',
                             classification='passage')

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            absent_count = 0
            excused_count = 0
            tds = table.xpath('tr/td/text()')
            votes = [tds[i:i + 2] for i in range(0, len(tds), 2)]

            vote_dictionary = {
                'Y': 'yes',
                'NO': 'no',
                'ER': 'excused',
                'AB': 'absent',
                'NV': 'not voting'
            }

            for vote_pair in votes:
                name, vote_val = vote_pair
                vote.vote(vote_dictionary[vote_val], name)
                if vote_val == 'AB':
                    absent_count += 1
                elif vote_val == 'ER':
                    excused_count += 1

            vote.set_count('absent', absent_count)
            vote.set_count('excused', excused_count)
            vote.add_source(url)
            vote.pupa_id = url + motion + spanText[1]

            yield vote
Пример #26
0
    def scrape_votes(self, bill):
        bill_num = bill.identifier.split()[1]

        url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
               "GetRollCalls?billNumber=%s&biennium=%s" %
               (bill_num, self.biennium))
        page = self.get(url)
        page = lxml.etree.fromstring(page.content)

        for rc in xpath(page, "//wa:RollCall"):
            motion = xpath(rc, "string(wa:Motion)")
            seq_no = xpath(rc, "string(wa:SequenceNumber)")

            date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
            date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

            yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
            no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
            abs_count = int(xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
            ex_count = int(xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))

            other_count = abs_count + ex_count

            agency = xpath(rc, "string(wa:Agency)")
            chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

            vote = Vote(chamber=chamber,
                        start_date=date,
                        motion_text='{} (#{})'.format(motion, seq_no),
                        result='pass' if yes_count >
                        (no_count + other_count) else 'fail',
                        classification='other',
                        bill=bill)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('other', other_count)
            vote.add_source(url)
            for sv in xpath(rc, "wa:Votes/wa:Vote"):
                name = xpath(sv, "string(wa:Name)")
                vtype = xpath(sv, "string(wa:VOte)")

                if vtype == 'Yea':
                    vote.yes(name)
                elif vtype == 'Nay':
                    vote.no(name)
                else:
                    vote.vote('other', name)

            yield vote
Пример #27
0
    def scrape_votes(self, session):
        self.session_key = SESSION_KEYS[session]
        self.legislators = index_legislators(self, self.session_key)
        measures_response = self.api_client.get('votes', page=500, session=self.session_key)

        for measure in measures_response:
            bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

            measure_history = measure['MeasureHistoryActions']
            for event in measure_history:
                if event['MeasureVotes']:
                    tally = self.tally_votes(event, 'measure')
                    passed = self.passed_vote(tally)

                    classification = self.determine_vote_classifiers(event['ActionText'])
                    when = datetime.datetime.strptime(event['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                    when = self.tz.localize(when)

                    vote = VoteEvent(
                        start_date=when,
                        bill_chamber=self.chamber_code[bid[0]],
                        motion_text=event['ActionText'],
                        classification=classification,
                        result='pass' if passed else 'fail',
                        legislative_session=session,
                        bill=bid,
                        chamber=self.chamber_code[event['Chamber']]
                    )

                    vote.set_count('yes', tally['yes'])
                    vote.set_count('no', tally['no'])
                    vote.set_count('absent', tally['absent'])

                    vote_call = event['MeasureVotes']
                    self.add_individual_votes(vote, vote_call, 'measure')

                    vote.add_source(
                        'https://olis.leg.state.or.us/liz/{session}'
                        '/Measures/Overview/{bid}'.format(
                            session=self.session_key,
                            bid=bid.replace(' ', '')
                        ))

                    yield vote

            committee_history = measure['CommitteeAgendaItems']
            for event in committee_history:
                if event['CommitteeVotes']:
                    tally = self.tally_votes(event, 'committee')
                    passed = self.passed_vote(tally)

                    # there is at least one event w/o an Action listed
                    action = event['Action'] or event['Comments']

                    classification = self.determine_vote_classifiers(action)
                    when = datetime.datetime.strptime(event['MeetingDate'], '%Y-%m-%dT%H:%M:%S')
                    when = self.tz.localize(when)

                    vote = VoteEvent(
                        start_date=when,
                        bill_chamber=self.chamber_code[bid[0]],
                        motion_text=action,
                        classification=classification,
                        result='pass' if passed else 'fail',
                        legislative_session=session,
                        bill=bid,
                        chamber=self.chamber_code[event['CommitteCode'][0]]
                    )

                    vote.set_count('yes', tally['yes'])
                    vote.set_count('no', tally['no'])
                    vote.set_count('absent', tally['absent'])

                    vote_call = event['CommitteeVotes']
                    self.add_individual_votes(vote, vote_call, 'committee')

                    meeting_date = when.strftime('%Y-%m-%d-%H-%M')
                    vote.add_source(
                        'https://olis.leg.state.or.us/liz/{session}/Committees'
                        '/{committee}/{meeting_date}/{bid}/Details'.format(
                            session=self.session_key,
                            committee=event['CommitteCode'],
                            meeting_date=meeting_date,
                            bid=bid.replace(' ', '')
                        ))

                    yield vote
Пример #28
0
    def handle_page(self):
        MOTION_INDEX = 4
        TOTALS_INDEX = 6
        VOTE_START_INDEX = 9

        if len(self.lines) < 2:
            self.scraper.warning("Bad PDF! " + self.url)
            return

        motion = self.lines[MOTION_INDEX].strip()
        # Sometimes there is no motion name, only "Passage" in the line above
        if not motion and not self.lines[MOTION_INDEX -
                                         1].startswith("Calendar Page:"):
            motion = self.lines[MOTION_INDEX - 1]
            MOTION_INDEX -= 1
            TOTALS_INDEX -= 1
            VOTE_START_INDEX -= 1
        else:
            assert motion, "Floor vote's motion name appears to be empty"

        for _extra_motion_line in range(2):
            MOTION_INDEX += 1
            if self.lines[MOTION_INDEX].strip():
                motion = "{}, {}".format(motion,
                                         self.lines[MOTION_INDEX].strip())
                TOTALS_INDEX += 1
                VOTE_START_INDEX += 1
            else:
                break

        (yes_count, no_count, nv_count) = [
            int(x) for x in re.search(
                r"^\s+Yeas - (\d+)\s+Nays - (\d+)\s+Not Voting - (\d+)\s*$",
                self.lines[TOTALS_INDEX],
            ).groups()
        ]
        result = "pass" if yes_count > no_count else "fail"

        vote = VoteEvent(
            start_date=self.kwargs["date"],
            chamber=self.kwargs["chamber"],
            bill=self.kwargs["bill"],
            motion_text=motion,
            result=result,
            classification="passage",
        )
        vote.add_source(self.url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("not voting", nv_count)

        for line in self.lines[VOTE_START_INDEX:]:
            if not line.strip():
                break

            if " President " in line:
                line = line.replace(" President ", " ")
            elif " Speaker " in line:
                line = line.replace(" Speaker ", " ")

            # Votes follow the pattern of:
            # [vote code] [member name]-[district number]
            for vtype, member in re.findall(
                    r"\s*(Y|N|EX|AV)\s+(.*?)-\d{1,3}\s*", line):
                vtype = {
                    "Y": "yes",
                    "N": "no",
                    "EX": "excused",
                    "AV": "abstain"
                }[vtype]
                member = member.strip()
                vote.vote(vtype, member)

        # check totals line up
        yes_count = no_count = nv_count = 0
        for vc in vote.counts:
            if vc["option"] == "yes":
                yes_count = vc["value"]
            elif vc["option"] == "no":
                no_count = vc["value"]
            else:
                nv_count += vc["value"]

        for vr in vote.votes:
            if vr["option"] == "yes":
                yes_count -= 1
            elif vr["option"] == "no":
                no_count -= 1
            else:
                nv_count -= 1

        if yes_count != 0 or no_count != 0:
            raise ValueError("vote count incorrect: " + self.url)

        if nv_count != 0:
            # On a rare occasion, a member won't have a vote code,
            # which indicates that they didn't vote. The totals reflect
            # this.
            self.scraper.info(
                "Votes don't add up; looking for additional ones")
            for line in self.lines[VOTE_START_INDEX:]:
                if not line.strip():
                    break
                for member in re.findall(r"\s{8,}([A-Z][a-z\'].*?)-\d{1,3}",
                                         line):
                    member = member.strip()
                    vote.vote("not voting", member)
        yield vote
Пример #29
0
    def parse_committee_votes(self, bill, url):
        bill.add_source(url)
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        chamber = ('upper' if 'Senate' in doc.xpath('string(//h1)') else 'lower')
        committee = tuple(doc.xpath('//h2')[0].itertext())[-2].strip()
        for link in doc.xpath("//a[contains(@href, 'listVoteSummary.cfm')]"):

            # Date
            for fmt in ("%m/%d/%Y", "%m-%d-%Y"):
                date = link.xpath('../../td')[0].text_content()
                try:
                    date = datetime.datetime.strptime(date, fmt)
                except ValueError:
                    continue
                break

            # Motion
            motion = link.text_content().split(' - ')[-1].strip()
            motion = 'Committee vote (%s): %s' % (committee, motion)

            # Roll call
            vote_url = link.attrib['href']
            rollcall = self.parse_upper_committee_vote_rollcall(bill, vote_url)

            vote = VoteEvent(
                chamber=chamber,
                start_date=tz.localize(date),
                motion_text=motion,
                classification='other',
                result='pass' if rollcall['passed'] else 'fail',
                bill=bill,
            )
            vote.pupa_id = vote_url
            vote.set_count('yes', rollcall['yes_count'])
            vote.set_count('no', rollcall['no_count'])
            vote.set_count('other', rollcall['other_count'])

            for voteval in ('yes', 'no', 'other'):
                for name in rollcall.get(voteval + '_votes', []):
                    vote.vote(voteval, name)

            vote.add_source(url)
            vote.add_source(vote_url)

            yield vote
Пример #30
0
    def handle_page(self):
        (_, motion) = self.lines[5].split("FINAL ACTION:")
        motion = motion.strip()
        if not motion:
            self.scraper.warning("Vote appears to be empty")
            return

        vote_top_row = [
            self.lines.index(x) for x in self.lines
            if re.search(r"^\s+Yea\s+Nay.*?(?:\s+Yea\s+Nay)+$", x)
        ][0]
        yea_columns_end = self.lines[vote_top_row].index("Yea") + len("Yea")
        nay_columns_begin = self.lines[vote_top_row].index("Nay")

        votes = {"yes": [], "no": [], "other": []}
        for line in self.lines[(vote_top_row + 1):]:
            if line.strip():
                member = re.search(
                    r"""(?x)
                        ^\s+(?:[A-Z\-]+)?\s+    # Possible vote indicator
                        ([A-Z][a-z]+            # Name must have lower-case characters
                        [\w\-\s]+)              # Continue looking for the rest of the name
                        (?:,[A-Z\s]+?)?         # Leadership has an all-caps title
                        (?:\s{2,}.*)?           # Name ends when many spaces are seen
                        """,
                    line,
                ).group(1)
                # sometimes members have trailing X's from other motions in the
                # vote sheet we aren't collecting
                member = re.sub(r"(\s+X)+", "", member)
                # Usually non-voting members won't even have a code listed
                # Only a couple of codes indicate an actual vote:
                # "VA" (vote after roll call) and "VC" (vote change)
                did_vote = bool(re.search(r"^\s+(X|VA|VC)\s+[A-Z][a-z]", line))
                if did_vote:
                    # Check where the "X" or vote code is on the page
                    vote_column = len(line) - len(line.lstrip())
                    if vote_column <= yea_columns_end:
                        votes["yes"].append(member)
                    elif vote_column >= nay_columns_begin:
                        votes["no"].append(member)
                    else:
                        raise ValueError(
                            "Unparseable vote found for {0} in {1}:\n{2}".
                            format(member, self.url, line))
                else:
                    votes["other"].append(member)

            # End loop as soon as no more members are found
            else:
                break

        totals = re.search(r"(?msu)\s+(\d{1,3})\s+(\d{1,3})\s+.*?TOTALS",
                           self.text).groups()
        yes_count = int(totals[0])
        no_count = int(totals[1])
        result = "pass" if (yes_count > no_count) else "fail"

        vote = VoteEvent(
            start_date=self.kwargs["date"],
            bill=self.kwargs["bill"],
            chamber="upper",
            motion_text=motion,
            classification="committee",
            result=result,
        )
        vote.add_source(self.url)
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", len(votes["other"]))

        # set voters
        for vtype, voters in votes.items():
            for voter in voters:
                voter = voter.strip()
                # Removes the few voter names with a ton of extra spaces with  VA at the end.
                # Ex: Cruz                                                               VA
                if "  VA" in voter:
                    voter = " ".join(voter.split()[:-2])
                if len(voter) > 0:
                    vote.vote(vtype, voter)

        yield vote
Пример #31
0
    def scrape_action_page(self, bill, page):
        action_rows = page.xpath('//tbody/tr')
        for row in action_rows:
            action_date = row.xpath('td[1]/text()')[0]
            action_date = datetime.strptime(action_date, '%m/%d/%Y')
            action_year = action_date.year
            action_date = action_date.strftime('%Y-%m-%d')

            if row.xpath('td[2]/text()'):
                action_actor = row.xpath('td[2]/text()')[0]
                action_actor = self.chamber_map_reverse[action_actor.strip()]

            action_name = row.xpath('string(td[3])')

            # House votes
            if "Supplement" in action_name:
                actor = "lower"
                vote_action = action_name.split(' -')[0]
                y = int(action_name.strip().split('-')[1].split('YEAS')[0])
                n = int(action_name.strip().split('YEAS to')[1].split('NAYS')[0])

                # get supplement number
                n_supplement = int(action_name.strip().split('No. ')[1].split(r')')[0])
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                housevote_pdf = 'http://www.mass.gov/legis/journal/combined{}RCs.pdf'.format(
                    action_year
                )
                # note: 2014-2015 different format and no data on website for years prior to 2014
                self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
                cached_vote.add_source(housevote_pdf)

                cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)

                yield cached_vote

            # Senate votes
            if "Roll Call" in action_name:
                actor = "upper"
                # placeholder
                vote_action = action_name.split(' -')[0]
                try:
                    y, n = re.search('(\d+) yeas .*? (\d+) nays', action_name.lower()).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    y = int(re.search(r"yeas\s*(\d*)", action_name.lower()).group(1))
                    n = int(re.search(r"nays\s*(\d*)", action_name.lower()).group(1))

                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                yield cached_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs['classification'],
            )
            for com in attrs.get('committees', []):
                action.add_related_entity(com, entity_type='organization')
Пример #32
0
    def handle_page(self):
        # Checks to see if any vote totals are provided
        if (len(
                self.doc.xpath(
                    '//span[contains(@id, "ctl00_MainContent_lblTotal")]/text()'
                )) > 0):
            (date,
             ) = self.doc.xpath('//span[contains(@id, "lblDate")]/text()')
            date = format_datetime(
                datetime.datetime.strptime(date, "%m/%d/%Y %I:%M:%S %p"),
                "US/Eastern")
            # ctl00_MainContent_lblTotal //span[contains(@id, "ctl00_MainContent_lblTotal")]
            yes_count = int(
                self.doc.xpath('//span[contains(@id, "lblYeas")]/text()')[0])
            no_count = int(
                self.doc.xpath('//span[contains(@id, "lblNays")]/text()')[0])
            other_count = int(
                self.doc.xpath('//span[contains(@id, "lblMissed")]/text()')[0])
            result = "pass" if yes_count > no_count else "fail"

            (committee,
             ) = self.doc.xpath('//span[contains(@id, "lblCommittee")]/text()')
            (action,
             ) = self.doc.xpath('//span[contains(@id, "lblAction")]/text()')
            motion = "{} ({})".format(action, committee)

            vote = VoteEvent(
                start_date=date,
                bill=self.kwargs["bill"],
                chamber="lower",
                motion_text=motion,
                result=result,
                classification="committee",
            )
            vote.add_source(self.url)
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("not voting", other_count)

            for member_vote in self.doc.xpath(
                    '//ul[contains(@class, "vote-list")]/li'):
                if not member_vote.text_content().strip():
                    continue

                (member, ) = member_vote.xpath("span[2]//text()")
                (member_vote, ) = member_vote.xpath("span[1]//text()")

                member = member.strip()
                if member_vote == "Y":
                    vote.yes(member)
                elif member_vote == "N":
                    vote.no(member)
                elif member_vote == "-":
                    vote.vote("not voting", member)
                # Parenthetical votes appear to not be counted in the
                # totals for Yea, Nay, _or_ Missed
                elif re.search(r"\([YN]\)", member_vote):
                    continue
                else:
                    raise ValueError(
                        "Unknown vote type found: {}".format(member_vote))

            yield vote
Пример #33
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session,
                       proxy):
        result_types = {
            "FAILED": False,
            "DEFEATED": False,
            "PREVAILED": True,
            "PASSED": True,
            "SUSTAINED": True,
            "NOT SECONDED": False,
            "OVERRIDDEN": True,
            "ADOPTED": True,
        }

        for r in rollcalls:
            proxy_link = proxy["url"] + r["link"]

            try:
                (path, resp) = self.urlretrieve(proxy_link)
            except scrapelib.HTTPError as e:
                self.warning(e)
                self.warning(
                    "Unable to contact openstates proxy, skipping vote {}".
                    format(r["link"]))
                continue

            text = convert_pdf(path, "text").decode("utf-8")
            lines = text.split("\n")
            os.remove(path)

            chamber = ("lower" if "house of representatives"
                       in lines[0].lower() else "upper")
            date_parts = lines[1].strip().split()[-3:]
            date_str = " ".join(date_parts).title() + " " + lines[2].strip()

            vote_date = datetime.datetime.strptime(date_str,
                                                   "%b %d, %Y %I:%M:%S %p")
            vote_date = pytz.timezone("America/Indiana/Indianapolis").localize(
                vote_date)
            vote_date = vote_date.isoformat()

            passed = None

            for res, val in result_types.items():
                # We check multiple lines now because the result of the
                # roll call vote as parsed can potentially be split.
                # PDF documents suck.
                for line in lines[3:5]:
                    if res in line.upper():
                        passed = val
                        break

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                yeas = int(lines[4].split()[-1])
                nays = int(lines[5].split()[-1])
                excused = int(lines[6].split()[-1])
                not_voting = int(lines[7].split()[-1])
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(
                chamber=chamber,
                legislative_session=session,
                bill=bill_id,
                bill_chamber=original_chamber,
                start_date=vote_date,
                motion_text=motion,
                result="pass" if passed else "fail",
                classification="passage",
            )

            vote.set_count("yes", yeas)
            vote.set_count("no", nays)
            vote.set_count("excused", excused)
            vote.set_count("not voting", not_voting)
            vote.add_source(proxy_link)

            currently_counting = ""

            possible_vote_lines = lines[8:]
            for line in possible_vote_lines:
                line = line.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                line = line.replace("\xc2\xa0", " -")
                if "yea-" in line.lower().replace(" ", ""):
                    currently_counting = "yes"
                elif "nay-" in line.lower().replace(" ", ""):
                    currently_counting = "no"
                elif "excused-" in line.lower().replace(" ", ""):
                    currently_counting = "excused"
                elif "notvoting-" in line.lower().replace(" ", ""):
                    currently_counting = "not voting"
                elif currently_counting == "":
                    pass
                elif re.search(r"v\. \d\.\d", line):
                    # this gets rid of the version number
                    # which is often found at the bottom of the doc
                    pass
                else:
                    voters = line.split("  ")
                    for v in voters:
                        if v.strip():
                            vote.vote(currently_counting, v.strip())

            yield vote
Пример #34
0
    def _parse_votes(self, url, vote, bill):
        '''Given a vote url and a vote object, extract the voters and
        the vote counts from the vote page and update the vote object.
        '''
        if url.lower().endswith('.pdf'):

            try:
                resp = self.get(url)
            except HTTPError:
                # This vote document wasn't found.
                msg = 'No document found at url %r' % url
                self.logger.warning(msg)
                return

            try:
                v = PDFCommitteeVote(url, resp.content, bill)
                return v.asvote()
            except PDFCommitteeVoteParseError:
                # Warn and skip.
                self.warning("Could't parse committee vote at %r" % url)
                return

        html = self.get(url).text
        doc = lxml.html.fromstring(html)

        # Yes, no, excused, absent.
        try:
            vals = doc.xpath('//table')[1].xpath('tr/td/text()')
        except IndexError:
            # Most likely was a bogus link lacking vote data.
            return

        yes_count, no_count, excused_count, absent_count = map(int, vals)

        # Get the motion.
        try:
            motion = doc.xpath('//br')[-1].tail.strip()
        except:
            # Some of them mysteriously have no motion listed.
            motion = vote['action']

        if not motion:
            motion = vote['action']

        vote['motion'] = motion

        action = vote['action']
        vote_url = vote['vote_url']

        vote = VoteEvent(
            chamber=vote['chamber'],
            start_date=vote['date'],
            motion_text=vote['motion'],
            result='fail',  # placeholder
            classification='passage',
            bill=bill,
            bill_action=vote['action'],
        )
        vote.add_source(vote_url)
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('excused', excused_count)
        vote.set_count('absent', absent_count)

        for text in doc.xpath('//table')[2].xpath('tr/td/text()'):
            if not text.strip(u'\xa0'):
                continue
            v, name = filter(None, text.split(u'\xa0'))
            # Considering Name is brackets as short name
            regex = re.compile(".*?\((.*?)\)")
            short_name = re.findall(regex, name)
            if len(short_name) > 0:
                note = 'Short Name: ' + short_name[0]
            else:
                note = ''
            # Name without brackets like 'Kary, Douglas'
            name = re.sub("[\(\[].*?[\)\]]", "", name)
            if v == 'Y':
                vote.yes(name, note=note)
            elif v == 'N':
                vote.no(name, note=note)
            elif v == 'E':
                vote.vote('excused', name, note=note)
            elif v == 'A':
                vote.vote('absent', name, note=note)

        # code to deterimine value of `passed`
        passed = None

        # some actions take a super majority, so we aren't just
        # comparing the yeas and nays here.
        for i in vote_passage_indicators:
            if i in action:
                passed = True
                break
        for i in vote_failure_indicators:
            if i in action and passed:
                # a quick explanation:  originally an exception was
                # thrown if both passage and failure indicators were
                # present because I thought that would be a bug in my
                # lists.  Then I found 2007 HB 160.
                # Now passed = False if the nays outnumber the yays..
                # I won't automatically mark it as passed if the yays
                # ounumber the nays because I don't know what requires
                # a supermajority in MT.
                if no_count >= yes_count:
                    passed = False
                    break
                else:
                    raise Exception("passage and failure indicator"
                                    "both present at: %s" % url)
            if i in action and passed is None:
                passed = False
                break
        for i in vote_ambiguous_indicators:
            if i in action:
                passed = yes_count > no_count
                break
        if passed is None:
            raise Exception("Unknown passage at: %s" % url)

        vote.result = 'pass' if passed else 'fail'

        return vote
Пример #35
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        if ("AM" in motion or "PM" in motion) and "/" in motion:
            motion = "Motion not given."

        if "withdrawn" not in motion:
            yes_no_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Aye')]]/font/text()"
            )
            other_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Absent')]]/font/text()"
            )
            abstain_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'17C')]]/font/text()"
            )
            yes_count = int(yes_no_counts[0])
            no_count = int(yes_no_counts[2])
            exc_count = int(other_counts[2])
            absent_count = int(other_counts[0])
            abstain_count = 0
            if abstain_counts:
                abstain_count = int(abstain_counts[0])

            # fix for
            # http://leg.colorado.gov/content/hb19-1029vote65e72e
            if absent_count == -1:
                absent_count = 0

            passed = yes_count > no_count
            vote = VoteEvent(
                chamber=chamber,
                start_date=self._tz.localize(date),
                motion_text=motion,
                result="pass" if passed else "fail",
                bill=bill,
                classification="passage",
            )
            vote.pupa_id = vote_url
            vote.set_count("yes", yes_count)
            vote.set_count("no", no_count)
            vote.set_count("excused", exc_count)
            vote.set_count("absent", absent_count)
            vote.set_count("abstain", abstain_count)
            vote.add_source(vote_url)

            rolls = page.xpath(
                "//tr[preceding-sibling::tr/descendant::"
                "td/div/b/font[contains(text(),'Vote')]]"
            )

            vote_abrv = {
                "Y": "yes",
                "N": "no",
                "E": "excused",
                "A": "absent",
                "-": "absent",
                "17C": "abstain",
            }
            for roll in rolls:
                voted = roll.xpath(".//td/div/font/text()")[0].strip()
                voter = roll.xpath(".//td/font/text()")[0].strip()
                if voted == "V":
                    continue
                vote.vote(vote_abrv[voted], voter)
            yield vote
Пример #36
0
    def asvote(self):
        v = VoteEvent(
            chamber=self.chamber(),
            start_date=self.date(),
            motion_text=self.motion(),
            result='pass' if self.passed() else 'fail',
            classification='passage',
            bill=self.bill,
        )
        v.set_count('yes', self.yes_count())
        v.set_count('no', self.no_count())
        v.set_count('other', self.other_count())

        for voter in self.yes_votes():
            v.yes(voter)
        for voter in self.no_votes():
            v.no(voter)
        for voter in self.other_votes():
            v.vote('other', voter)
        v.add_source(self.url)
        return v
Пример #37
0
    def scrape_vote(self, bill, motion, url):
        page = self.get(url, retry_on_404=True).text
        page = lxml.html.fromstring(page)

        yeas_cell = page.xpath("//td[text() = 'Yeas (Y):']")[0]
        yes_count = int(yeas_cell.xpath("string(following-sibling::td)"))

        nays_cell = page.xpath("//td[text() = 'Nays (N):']")[0]
        no_count = int(nays_cell.xpath("string(following-sibling::td)"))

        abs_cell = page.xpath("//td[text() = 'Absent (X):']")[0]
        abs_count = int(abs_cell.xpath("string(following-sibling::td)"))

        ex_cell = page.xpath("//td[text() = 'Excused (E):']")[0]
        ex_count = int(ex_cell.xpath("string(following-sibling::td)"))

        other_count = abs_count + ex_count

        if 'chamber=House' in url:
            chamber = 'lower'
        elif 'chamber=Senate' in url:
            chamber = 'upper'

        date_cell = page.xpath("//td[text() = 'Date:']")[0]
        date = date_cell.xpath("string(following-sibling::td)")
        try:
            date = datetime.datetime.strptime(date, "%B %d, %Y")
        except ValueError:
            date = datetime.datetime.strptime(date, "%b. %d, %Y")

        outcome_cell = page.xpath("//td[text()='Outcome:']")[0]
        outcome = outcome_cell.xpath("string(following-sibling::td)")

        vote = VoteEvent(
            chamber=chamber,
            start_date=date.strftime('%Y-%m-%d'),
            motion_text=motion,
            result='pass' if outcome == 'PREVAILS' else 'fail',
            classification='passage',
            bill=bill,
        )
        vote.set_count('yes', yes_count)
        vote.set_count('no', no_count)
        vote.set_count('other', other_count)
        vote.add_source(url)
        vote.pupa_id = url

        member_cell = page.xpath("//td[text() = 'Member']")[0]
        for row in member_cell.xpath("../../tr")[1:]:
            name = row.xpath("string(td[2])")
            # name = name.split(" of ")[0]

            vtype = row.xpath("string(td[4])")
            if vtype == 'Y':
                vote.vote('yes', name)
            elif vtype == 'N':
                vote.vote('no', name)
            elif vtype == 'X' or vtype == 'E':
                vote.vote('other', name)

        yield vote
Пример #38
0
    def handle_page(self):
        summary = self.doc.xpath('/'.join([
            '//h4[starts-with(text(), "SUMMARY")]',
            '/following-sibling::p',
            'text()',
        ]))
        if summary and summary[0].strip():
            self.obj.add_abstract(abstract=summary[0].strip(), note='summary')

        # versions
        for va in self.doc.xpath(
                '//h4[text()="FULL TEXT"]/following-sibling::ul[1]/li/a[1]'):

            # 11/16/09 \xa0House: Prefiled and ordered printed; offered 01/13/10 10100110D
            date, desc = va.text.split(u' \xa0')
            desc.rsplit(' ', 1)[0]  # chop off last part
            link = va.get('href')
            if 'http' not in link:
                link = '{}{}'.format(BASE_URL, link)
            date = datetime.datetime.strptime(date, '%m/%d/%y').date()

            # budget bills in VA are searchable but no full text available
            if '+men+' in link:
                self.warning(
                    'not adding budget version, bill text not available')
            else:
                # VA duplicates reprinted bills, lets keep the original name
                self.obj.add_version_link(desc,
                                          link,
                                          date=date,
                                          media_type='text/html',
                                          on_duplicate='ignore')

        # actions
        cached_vote = None
        cached_action = None
        for ali in self.doc.xpath(
                '//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
            vote = None

            date, action = ali.text_content().split(u' \xa0')
            actor, action = action.split(': ', 1)

            # Bill history entries purely in parentheses tend to be
            # notes and not actions, so we'll skip them.
            if action.startswith('(') and action.endswith(')'):
                continue

            actor = self.actor_map[actor]
            date = datetime.datetime.strptime(date.strip(), '%m/%d/%y').date()

            # if action ends in (##-Y ##-N) remove that part
            vrematch = self.vote_strip_re.match(action)
            # The following conditional logic is messy to handle
            # Virginia's crazy and inconsistently formatted bill
            # histories. Someone less harried and tired than me
            # could probably make this much cleaner. - alo
            if vrematch:
                vote_action, y, n, o = vrematch.groups()
                y = int(y)
                n = int(n)
                # Set default count for "other" votes to 0. We have to
                # do this explicitly as it's excluded from the action
                # text when there were no abstentions (the only type of
                # "other" vote encountered thus far).
                if o is None:
                    o = 0
                else:
                    o = int(o)

                vote_url = ali.xpath('a/@href')

                # Caches relevant information from the current action if
                # vote count encountered, then searches for the presence
                # of identical counts in the next entry (we assume that
                # it's probably there). If matching votes are found, it
                # pulls the cached data to create a unified vote record.
                #
                # This is because Virginia usually publishes two lines
                # of history data for a single vote, without guaranteed
                # order, so we cache and unsafely attempt to match on
                # identical vote counts in the next line.
                if cached_vote is None:
                    cached_action = action
                    cached_vote = VoteEvent(
                        start_date=date,
                        chamber=actor,
                        motion_text=vote_action,
                        result='pass' if y > n else 'fail',
                        classification='passage',
                        bill=self.obj,
                    )
                    cached_vote.set_count('yes', y)
                    cached_vote.set_count('no', n)
                    cached_vote.set_count('other', o)
                    if vote_url:
                        cached_vote.add_source(vote_url[0])
                    else:
                        cached_vote.add_source(self.url)
                    continue
                elif cached_vote is not None:
                    if vote_action.startswith(u'VOTE:'):
                        counts = {
                            count['option']: count['value']
                            for count in cached_vote.counts
                        }
                        if (vote_url and counts['yes'] == y
                                and counts['no'] == n
                                and counts['other'] == o):
                            vote = cached_vote
                            list(
                                self.scrape_page_items(VotePage,
                                                       url=vote_url[0],
                                                       obj=vote))
                            vote.add_source(vote_url[0])
                            action = cached_action
                    elif cached_vote.motion_text.startswith('VOTE:'):
                        counts = {
                            count['option']: count['value']
                            for count in cached_vote.counts
                        }
                        if (counts['yes'] == y and counts['no'] == n
                                and counts['other'] == o):
                            vote = cached_vote
                            vote['motion'] = vote_action
                    else:
                        # Cached vote doesn't match up to the current
                        # one. Save, then cache the current vote to
                        # begin the next search.
                        yield from add_pupa_id(cached_vote)
                        cached_vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            motion_text=vote_action,
                            result='pass' if y > n else 'fail',
                            classification='passage',
                            bill=self.obj,
                        )
                        cached_vote.set_count('yes', y)
                        cached_vote.set_count('no', n)
                        cached_vote.set_count('other', o)
                        if vote_url:
                            cached_vote.add_source(vote_url[0])
                        else:
                            cached_vote.add_source(self.url)
                        cached_action = action
                        continue

                if vote is not None:
                    yield from add_pupa_id(vote)
            else:
                # If this action isn't a vote, but the last one was,
                # there's obviously no additional vote data to match.
                # Go ahead and save the cached data.
                if cached_vote is not None:
                    yield from add_pupa_id(cached_vote)

            cached_vote = cached_action = None

            # categorize actions
            for pattern, atype in ACTION_CLASSIFIERS:
                if re.match(pattern, action):
                    break
            else:
                atype = None

            # if matched a 'None' atype, don't add the action
            if atype != SKIP:
                self.obj.add_action(action,
                                    date,
                                    chamber=actor,
                                    classification=atype)
Пример #39
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.
        
        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # If this Boolean field is True, then do not scrape the Bill.
            # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API:
            # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826
            if matter['MatterRestrictViewViaWeb']:
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = matter['legistar_url']

            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Пример #40
0
    def scrape_votes(self, bill):
        bill_num = bill.identifier.split()[1]

        url = ("http://wslwebservices.leg.wa.gov/legislationservice.asmx/"
               "GetRollCalls?billNumber=%s&biennium=%s" % (
                   bill_num, self.biennium))
        page = self.get(url)
        page = lxml.etree.fromstring(page.content)

        for rc in xpath(page, "//wa:RollCall"):
            motion = xpath(rc, "string(wa:Motion)")
            seq_no = xpath(rc, "string(wa:SequenceNumber)")

            date = xpath(rc, "string(wa:VoteDate)").split("T")[0]
            date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

            yes_count = int(xpath(rc, "string(wa:YeaVotes/wa:Count)"))
            no_count = int(xpath(rc, "string(wa:NayVotes/wa:Count)"))
            abs_count = int(
                xpath(rc, "string(wa:AbsentVotes/wa:Count)"))
            ex_count = int(
                xpath(rc, "string(wa:ExcusedVotes/wa:Count)"))

            other_count = abs_count + ex_count

            agency = xpath(rc, "string(wa:Agency)")
            chamber = {'House': 'lower', 'Senate': 'upper'}[agency]

            vote = Vote(chamber=chamber, start_date=date,
                        motion_text='{} (#{})'.format(motion, seq_no),
                        result='pass' if yes_count > (no_count + other_count) else 'fail',
                        classification='other', bill=bill)
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('other', other_count)
            vote.add_source(url)
            for sv in xpath(rc, "wa:Votes/wa:Vote"):
                name = xpath(sv, "string(wa:Name)")
                vtype = xpath(sv, "string(wa:VOte)")

                if vtype == 'Yea':
                    vote.yes(name)
                elif vtype == 'Nay':
                    vote.no(name)
                else:
                    vote.vote('other', name)

            yield vote
Пример #41
0
    def scrape_votes(self, bill, page):
        base_url = "https://apps.azleg.gov/api/BillStatusFloorAction"
        for header in page["FloorHeaders"]:
            params = {
                "billStatusId": page["BillId"],
                "billStatusActionId": header["BillStatusActionId"],
                "includeVotes": "true",
            }
            resp = self.get(base_url, params=params)
            actions = json.loads(resp.content.decode("utf-8"))

            for action in actions:
                if action["Action"] == "No Action":
                    continue
                action_date = datetime.datetime.strptime(
                    action["ReportDate"], "%Y-%m-%dT%H:%M:%S")
                vote = VoteEvent(
                    chamber={
                        "S": "upper",
                        "H": "lower"
                    }[header["LegislativeBody"]],
                    motion_text=action["Action"],
                    classification="passage",
                    result=("pass" if action["UnanimouslyAdopted"]
                            or action["Ayes"] > action["Nays"] else "fail"),
                    start_date=action_date.strftime("%Y-%m-%d"),
                    bill=bill,
                )
                vote.add_source(resp.url)
                vote.set_count("yes", action["Ayes"] or 0)
                vote.set_count("no", action["Nays"] or 0)
                vote.set_count("other", (action["Present"] or 0))
                vote.set_count("absent", (action["Absent"] or 0))
                vote.set_count("excused", (action["Excused"] or 0))
                vote.set_count("not voting", (action["NotVoting"] or 0))

                for v in action["Votes"]:
                    vote_type = {"Y": "yes", "N": "no"}.get(v["Vote"], "other")
                    vote.vote(vote_type, v["Legislator"]["FullName"])
                vote.pupa_id = resp.url + str(action["ReferralNumber"])
                yield vote
Пример #42
0
    def parse_bill_actions_table(self, bill, action_table, bill_id, session, url, bill_chamber):
        for action in action_table.xpath('*')[1:]:
            date = action[0].text_content()
            date = dt.datetime.strptime(date, "%m/%d/%Y").strftime('%Y-%m-%d')
            actor = action[1].text_content().upper()
            string = action[2].text_content()
            actor = {
                "S": "upper",
                "H": "lower",
                "D": "legislature",  # "Data Systems",
                "$": "Appropriation measure",
                "CONAM": "Constitutional Amendment"
            }[actor]
            act_type, committees = categorize_action(string)
            # XXX: Translate short-code to full committee name for the
            #      matcher.

            real_committees = []

            if committees:
                for committee in committees:
                    try:
                        committee = self.short_ids[committee]['name']
                        real_committees.append(committee)
                    except KeyError:
                        pass
            act = bill.add_action(string, date, chamber=actor,
                                  classification=act_type)
            for committee in real_committees:
                act.add_related_entity(name=committee, entity_type="organization")
            vote = self.parse_vote(string)
            if vote:
                v, motion = vote
                vote = VoteEvent(start_date=date,
                                 chamber=actor,
                                 bill=bill_id,
                                 bill_chamber=bill_chamber,
                                 legislative_session=session,
                                 motion_text=motion,
                                 result='pass' if 'passed' in string.lower() else 'fail',
                                 classification='passage')
                vote.add_source(url)
                vote.set_count('yes', int(v['n_yes'] or 0))
                vote.set_count('no', int(v['n_no'] or 0))
                vote.set_count('not voting', int(v['n_excused'] or 0))
                for voter in split_specific_votes(v['yes']):
                    vote.yes(voter)
                for voter in split_specific_votes(v['yes_resv']):
                    vote.yes(voter)
                for voter in split_specific_votes(v['no']):
                    vote.no(voter)
                for voter in split_specific_votes(v['excused']):
                    vote.vote('not voting', voter)

                yield vote
Пример #43
0
    def scrape_action_page(self, bill, page):
        action_rows = page.xpath('//tbody/tr')
        for row in action_rows:
            action_date = row.xpath('td[1]/text()')[0]
            action_date = datetime.strptime(action_date, '%m/%d/%Y')
            action_year = action_date.year
            action_date = action_date.strftime('%Y-%m-%d')

            if row.xpath('td[2]/text()'):
                action_actor = row.xpath('td[2]/text()')[0]
                action_actor = self.chamber_map_reverse[action_actor.strip()]

            action_name = row.xpath('string(td[3])')

            # House votes
            if "Supplement" in action_name:
                actor = "lower"
                vote_action = re.findall(r'(.+)-\s*\d+\s*YEAS', action_name)[0].strip()

                y = int(re.findall(r'(\d+)\s*YEAS', action_name)[0])
                n = int(re.findall(r'(\d+)\s*NAYS', action_name)[0])

                # get supplement number
                n_supplement = int(re.findall(r'No\.\s*(\d+)', action_name)[0])
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                housevote_pdf = 'https://malegislature.gov/Journal/House/{}/{}/RollCalls'.format(
                    bill.legislative_session, action_year)
                self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
                cached_vote.add_source(housevote_pdf)

                cached_vote.pupa_id = '{}#{}'.format(housevote_pdf, n_supplement)

                # XXX: disabled house votes on 8/1 to try to get MA importing again
                # will leaving this in and commented out once we resolve the ID issue
                # yield cached_vote

            # Senate votes
            if "Roll Call" in action_name:
                actor = "upper"
                # placeholder
                vote_action = action_name.split(' -')[0]
                try:
                    y, n = re.search(r'(\d+) yeas .*? (\d+) nays', action_name.lower()).groups()
                    y = int(y)
                    n = int(n)
                except AttributeError:
                    y = int(re.search(r"yeas\s+(\d+)", action_name.lower()).group(1))
                    n = int(re.search(r"nays\s+(\d+)", action_name.lower()).group(1))

                # TODO: other count isn't included, set later
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result='pass' if y > n else 'fail',
                    classification='passage',
                    bill=bill,
                )
                cached_vote.set_count('yes', y)
                cached_vote.set_count('no', n)

                rollcall_pdf = 'http://malegislature.gov' + row.xpath('string(td[3]/a/@href)')
                self.scrape_senate_vote(cached_vote, rollcall_pdf)
                cached_vote.add_source(rollcall_pdf)
                cached_vote.pupa_id = rollcall_pdf
                # XXX: also disabled, see above note
                # yield cached_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs['classification'],
            )
            for com in attrs.get('committees', []):
                action.add_related_entity(com, entity_type='organization')
Пример #44
0
    def parse_vote(self, bill, link):
        member_doc = lxml.html.fromstring(self.get(link).text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
        if len(opinions) > 0:
            temp = opinions[0].split()
            vote_chamber = temp[0]
            vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
            vote_status = " ".join(temp[2:-2])
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except:
                    pass
                if "yea" in i.lower():
                    yes_count = count
                elif "nay" in i.lower():
                    no_count = count
                elif "present" in i.lower():
                    p_count = count
                elif "absent" in i.lower():
                    a_count = count
            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Пример #45
0
    def parse_roll_call(self, bill, link, chamber, date):
        url = link.attrib['href']
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        xpath = 'string(//div[@class="Column-OneFourth"]/div[3])'
        motion = page.xpath(xpath).strip()
        motion = re.sub(r'\s+', ' ', motion)

        if motion == 'FP':
            motion = 'FINAL PASSAGE'

        if motion == 'FINAL PASSAGE':
            type = 'passage'
        elif re.match(r'CONCUR(RENCE)? IN \w+ AMENDMENTS', motion):
            type = 'amendment'
        else:
            type = 'other'
            motion = link.text_content()

        yeas = int(page.xpath("//div[text() = 'YEAS']")[0].getnext().text)
        nays = int(page.xpath("//div[text() = 'NAYS']")[0].getnext().text)
        lve = int(page.xpath("//div[text() = 'LVE']")[0].getnext().text)
        nv = int(page.xpath("//div[text() = 'N/V']")[0].getnext().text)
        other = lve + nv

        vote = VoteEvent(
            chamber=chamber,
            start_date=tz.localize(date),
            motion_text=motion,
            classification=type,
            result='pass' if yeas > (nays + other) else 'fail',
            bill=bill,
        )
        # pupa_id situation here is a bit weird, same vote can be used for
        # multiple bills see:
        # http://www.legis.state.pa.us/CFDOCS/Legis/RC/Public/rc_view_action2.cfm?sess_yr=2017&sess_ind=0&rc_body=H&rc_nbr=11       # noqa
        # so we toss the bill id onto the end of the URL
        vote.pupa_id = url + '#' + bill.identifier
        vote.add_source(url)
        vote.set_count('yes', yeas)
        vote.set_count('no', nays)
        vote.set_count('other', other)

        for div in page.xpath('//*[contains(@class, "RollCalls-Vote")]'):
            name = div.text_content().strip()
            name = re.sub(r'^[\s,]+', '', name)
            name = re.sub(r'[\s,]+$', '', name)
            class_attr = div.attrib['class'].lower()
            if 'yea' in class_attr:
                voteval = 'yes'
            elif 'nay' in class_attr:
                voteval = 'no'
            elif 'nvote' in class_attr:
                voteval = 'other'
            elif 'lve' in class_attr:
                voteval = 'other'
            else:
                msg = 'Unrecognized vote val: %s' % class_attr
                raise Exception(msg)
            vote.vote(voteval, name)

        return vote
Пример #46
0
    def scrape_votes(self, bill, bill_page, chamber):
        vote_links = bill_page.xpath(
            '//table[contains(@class,"history")]//a[contains(@href, "view_votes")]'
        )
        for vote_link in vote_links:
            vote_url = vote_link.attrib['href']
            date_td, motion_td, *_ = vote_link.xpath('ancestor::tr/td')
            date = datetime.strptime(date_td.text, '%b %d, %Y')
            motion_text = motion_td.text_content()
            vote_page = self.lxmlize(vote_url)
            passed = ('Passed' in motion_text or 'Advanced' in motion_text)
            cells = vote_page.xpath(
                '//div[contains(@class,"table-responsive")]/table//td')
            vote = VoteEvent(
                bill=bill,
                chamber=chamber,
                start_date=TIMEZONE.localize(date),
                motion_text=motion_text,
                classification='passage',
                result='pass' if passed else 'fail',
            )

            yes_count = self.process_count(vote_page, 'Yes:')
            no_count = self.process_count(vote_page, 'No:')
            exc_count = self.process_count(vote_page, 'Excused - Not Voting:')
            absent_count = self.process_count(vote_page,
                                              'Absent - Not Voting:')
            present_count = self.process_count(vote_page,
                                               'Present - Not Voting:')

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', exc_count)
            vote.set_count('absent', absent_count)
            vote.set_count('abstain', present_count)

            query_params = urllib.parse.parse_qs(
                urllib.parse.urlparse(vote_url).query)
            vote.pupa_id = query_params['KeyID'][0]
            vote.add_source(vote_url)
            for chunk in range(0, len(cells), 2):
                name = cells[chunk].text
                vote_type = cells[chunk + 1].text
                if name and vote_type:
                    vote.vote(VOTE_TYPE_MAP.get(vote_type.lower(), 'other'),
                              name)
            yield vote
Пример #47
0
    def scrape_vote(self, bill, vote_id, session):
        vote_url = 'https://legis.delaware.gov/json/RollCall/GetRollCallVoteByRollCallId'
        form = {
            'rollCallId': vote_id,
            'sort': '',
            'group': '',
            'filter': '',
        }

        page = self.post(url=vote_url, data=form, allow_redirects=True).json()
        if page:
            roll = page['Model']
            vote_chamber = self.chamber_map[roll['ChamberName']]
            # "7/1/16 01:00 AM"
            vote_date = dt.datetime.strptime(roll['TakenAtDateTime'],
                                             '%m/%d/%y %I:%M %p').strftime('%Y-%m-%d')

            # TODO: What does this code mean?
            vote_motion = roll['RollCallVoteType']

            vote_passed = 'pass' if roll['RollCallStatus'] == 'Passed' else 'fail'
            other_count = (int(roll['NotVotingCount']) +
                           int(roll['VacantVoteCount']) +
                           int(roll['AbsentVoteCount']) +
                           int(roll['ConflictVoteCount'])
                           )
            vote = Vote(chamber=vote_chamber,
                        start_date=vote_date,
                        motion_text=vote_motion,
                        result=vote_passed,
                        classification='other',
                        bill=bill.identifier,
                        legislative_session=session
                        )
            vote.add_source(vote_url)
            vote.set_count('yes', roll['YesVoteCount'])
            vote.set_count('no', roll['NoVoteCount'])
            vote.set_count('other', other_count)

            for row in roll['AssemblyMemberVotes']:
                # AssemblyMemberId looks like it should work here,
                # but for some sessions it's bugged to only return session
                try:
                    voter = self.legislators_by_short[str(row['ShortName'])]
                    name = voter['DisplayName']
                except KeyError:
                    self.warning('could not find legislator short name %s',
                                 row['ShortName'])
                    name = row['ShortName']
                if row['SelectVoteTypeCode'] == 'Y':
                    vote.yes(name)
                elif row['SelectVoteTypeCode'] == 'N':
                    vote.no(name)
                else:
                    vote.vote('other', name)

            # bill.add_vote_event(vote)
            yield vote
Пример #48
0
    def parse_vote_pdf(self, vote_url, bill):

        filename, response = self.urlretrieve(vote_url)

        text = convert_pdf(filename, type="text").decode()
        lines = text.splitlines()

        if "Senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        date_string = lines[0].split("Calendar Date:")[1].strip()
        date = datetime.datetime.strptime(date_string, "%b %d, %Y %I:%M (%p)")

        page_index = None
        for index, line in enumerate(lines):
            if "Yeas" in line and "Nays" in line:
                page_index = index
                break

        vote_counts = 5 * [0]
        vote_types = ["yes", "no", "not voting", "excused", "absent"]

        if page_index:

            counts = re.split(r"\s{2,}", lines[page_index].strip())

            for index, count in enumerate(counts):
                number, string = count.split(" ", 1)
                number = int(number)
                vote_counts[index] = number
        else:
            raise ValueError("Vote Counts Not found at %s" % vote_url)

        passed = vote_counts[0] > vote_counts[1]

        # Consent calendar votes address multiple bills in one VoteEvent
        # eg, http://mgaleg.maryland.gov/2018RS/votes/Senate/0478.pdf
        is_consent_calendar = any(
            ["Consent Calendar" in line for line in lines[:page_index]])
        consent_calendar_bills = None
        motion = ""
        if is_consent_calendar:
            motion = re.split(r"\s{2,}", lines[page_index - 4].strip())[0]
            consent_calendar_bills = re.split(r"\s{2,}",
                                              lines[page_index - 1].strip())
            assert (consent_calendar_bills
                    ), "Could not find bills for consent calendar vote"

        motion_keywords = [
            "favorable",
            "reading",
            "amendment",
            "motion",
            "introduced",
            "bill pass",
            "committee",
        ]
        motion_lines = [
            3,
            2,
            4,
            5,
        ]  # Relative LineNumbers to be checked for existence of motion

        for i in motion_lines:
            if any(motion_keyword in motion.lower()
                   for motion_keyword in motion_keywords):
                break
            motion = re.split(r"\s{2,}", lines[page_index - i].strip())[0]
        else:
            if not any(motion_keyword in motion.lower()
                       for motion_keyword in motion_keywords):
                # This condition covers for the bad formating in SB 1260
                motion = lines[page_index - 3]
            if not any(motion_keyword in motion.lower()
                       for motion_keyword in motion_keywords):
                # Check this one for SB 747
                motion = "No motion given"
                self.warning("No motion given")

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )

        # Include bill ID to avoid duplication for consent calendars
        vote.pupa_id = "{}#{}".format(vote_url, bill.identifier)

        for index, vote_type in enumerate(vote_types):
            vote.set_count(vote_type, vote_counts[index])
        page_index = page_index + 2

        # Keywords for identifying where names are located in the pdf
        show_stoppers = [
            "Voting Nay",
            "Not Voting",
            "COPY",
            "Excused",
            "indicates vote change",
            "Indicates Vote Change",
        ]
        vote_index = 0

        # For matching number of names extracted with vote counts(extracted independently)
        vote_name_counts = 5 * [0]

        while page_index < len(lines):

            current_line = lines[page_index].strip()

            if not current_line or "Voting Yea" in current_line:
                page_index += 1
                continue

            if any(show_stopper in current_line
                   for show_stopper in show_stoppers):
                page_index += 1
                vote_index = vote_index + 1
                continue

            names = re.split(r"\s{2,}", current_line)

            vote_name_counts[vote_index] += len(names)

            for name in names:
                vote.vote(vote_types[vote_index], name)
            page_index += 1

        if vote_counts != vote_name_counts:
            raise ValueError("Votes Count and Number of Names don't match")

        return vote
Пример #49
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r'<.*?>'

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
            format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError(
                    "Unknown bill type found: '{}'".
                    format(info['BillNumber'])
                )

            bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info['Title'],
                classification=bill_type
            )
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li'
            )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type='person',
                        primary=(sponsor_type == 'primary')
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a'
            )

            for version in versions:
                if version.xpath('text()'):
                    bill.add_version_link(
                        note=version.xpath('text()')[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf'
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode('utf-8')
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
                yield bill
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'executive'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    # assert chambers_passed == set("HS")
                    action_type = 'executive-signature'
                elif "Vetoed by the Governor" in action['FullStatus']:
                    action_type = 'executive-veto'
                elif "Read first time" in action['FullStatus'] \
                        or "Read 1st time" in action['FullStatus']:
                    action_type = 'introduction'
                elif "Reported favorably" in action['FullStatus']:
                    action_type = 'committee-passage-favorable'
                elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("H")
                elif actor == 'upper' and any(x.lower().startswith(' aspassed')
                                              or x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("S")
                else:
                    action_type = None

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                    date=datetime.datetime.strftime(
                        datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    chamber=actor,
                    classification=action_type
                )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name, _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote['FullStatus'] or
                        "Veto of Governor overridden" in vote['FullStatus']):
                    did_pass = True
                elif ("Failed -- " in vote['FullStatus'] or
                      'Veto of the Governor sustained' in vote['FullStatus']):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = VoteEvent(
                    bill=bill,
                    chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=datetime.datetime.strftime(
                        datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote('not voting', member)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Пример #50
0
    def parse_vote(self, bill, link):
        member_doc = lxml.html.fromstring(self.get(link).text)
        motion = member_doc.xpath("//div[@id='main_content']/h4/text()")
        opinions = member_doc.xpath("//div[@id='main_content']/h3/text()")
        if len(opinions) > 0:
            temp = opinions[0].split()
            vote_chamber = temp[0]
            vote_date = datetime.datetime.strptime(temp[-1], '%m/%d/%Y')
            vote_status = " ".join(temp[2:-2])
            vote_status = vote_status if vote_status.strip() else motion[0]
            vote_chamber = 'upper' if vote_chamber == 'Senate' else 'lower'

            for i in opinions:
                try:
                    count = int(i[i.find("(") + 1:i.find(")")])
                except ValueError:
                    # This is likely not a vote-count text chunk
                    # It's probably '`On roll call the vote was:`
                    pass
                else:
                    if "yea" in i.lower():
                        yes_count = count
                    elif "nay" in i.lower():
                        no_count = count
                    elif "present" in i.lower():
                        p_count = count
                    elif "absent" in i.lower():
                        a_count = count

            vote = VoteEvent(
                bill=bill,
                start_date=vote_date.strftime('%Y-%m-%d'),
                chamber=vote_chamber,
                motion_text=vote_status,
                result='pass' if yes_count > no_count else 'fail',
                classification='passage',
            )
            vote.pupa_id = link

            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('abstain', p_count)
            vote.set_count('absent', a_count)

            vote.add_source(link)

            a_links = member_doc.xpath("//div[@id='main_content']/a/text()")
            for i in range(1, len(a_links)):
                if i <= yes_count:
                    vote.vote('yes', re.sub(',', '', a_links[i]).split()[0])
                elif no_count != 0 and i > yes_count and i <= yes_count + no_count:
                    vote.vote('no', re.sub(',', '', a_links[i]).split()[0])
                else:
                    vote.vote('other', re.sub(',', '', a_links[i]).split()[0])
            yield vote
        else:
            self.warning("No Votes for: %s", link)
Пример #51
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    passed = len(vote_record['yes']) > len(vote_record['no'])
    vote_event = VoteEvent(
        result='pass' if passed else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] is 'S' else 'lower'
    )
    vote_event.pupa_id = url
    vote_event.set_count('yes', len(vote_record['yes']))
    vote_event.set_count('no', len(vote_record['no']))
    vote_event.set_count('excused', len(vote_record['excused']))
    vote_event.set_count('absent', len(vote_record['absent']))
    vote_event.set_count('other', len(vote_record['other']))
    for vote_type in ['yes', 'no', 'excused', 'absent', 'other']:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Пример #52
0
    def scrape_action_page(self, bill, page):
        action_rows = page.xpath("//tbody/tr")
        for row in action_rows:
            action_date = row.xpath("td[1]/text()")[0]
            action_date = datetime.strptime(action_date, "%m/%d/%Y")
            action_year = action_date.year
            action_date = action_date.strftime("%Y-%m-%d")

            if row.xpath("td[2]/text()"):
                action_actor = row.xpath("td[2]/text()")[0]
                action_actor = self.chamber_map_reverse[action_actor.strip()]

            action_name = row.xpath("string(td[3])")

            # House votes
            if "Supplement" in action_name:
                actor = "lower"
                vote_action = re.findall(r"(.+)-\s*\d+\s*YEAS", action_name)[0].strip()

                y = int(re.findall(r"(\d+)\s*YEAS", action_name)[0])
                n = int(re.findall(r"(\d+)\s*NAYS", action_name)[0])

                # get supplement number
                n_supplement = int(re.findall(r"No\.\s*(\d+)", action_name)[0])
                cached_vote = VoteEvent(
                    chamber=actor,
                    start_date=action_date,
                    motion_text=vote_action,
                    result="pass" if y > n else "fail",
                    classification="passage",
                    bill=bill,
                )
                cached_vote.set_count("yes", y)
                cached_vote.set_count("no", n)

                housevote_pdf = "https://malegislature.gov/Journal/House/{}/{}/RollCalls".format(
                    bill.legislative_session, action_year
                )
                self.scrape_house_vote(cached_vote, housevote_pdf, n_supplement)
                cached_vote.add_source(housevote_pdf)

                cached_vote.pupa_id = "{}#{}".format(housevote_pdf, n_supplement)

                # XXX: disabled house votes on 8/1 to try to get MA importing again
                # will leaving this in and commented out once we resolve the ID issue
                # yield cached_vote

            # Senate votes
            if "Roll Call" in action_name:
                actor = "upper"
                # placeholder
                vote_action = action_name.split(" -")[0]
                # 2019 H86 Breaks our regex,
                # Ordered to a third reading --
                # see Senate   Roll Call #25 and House Roll Call 56
                if "yeas" in action_name and "nays" in action_name:
                    try:
                        y, n = re.search(
                            r"(\d+) yeas .*? (\d+) nays", action_name.lower()
                        ).groups()
                        y = int(y)
                        n = int(n)
                    except AttributeError:
                        y = int(
                            re.search(r"yeas\s+(\d+)", action_name.lower()).group(1)
                        )
                        n = int(
                            re.search(r"nays\s+(\d+)", action_name.lower()).group(1)
                        )

                    # TODO: other count isn't included, set later
                    cached_vote = VoteEvent(
                        chamber=actor,
                        start_date=action_date,
                        motion_text=vote_action,
                        result="pass" if y > n else "fail",
                        classification="passage",
                        bill=bill,
                    )
                    cached_vote.set_count("yes", y)
                    cached_vote.set_count("no", n)

                    rollcall_pdf = "http://malegislature.gov" + row.xpath(
                        "string(td[3]/a/@href)"
                    )
                    self.scrape_senate_vote(cached_vote, rollcall_pdf)
                    cached_vote.add_source(rollcall_pdf)
                    cached_vote.pupa_id = rollcall_pdf
                    # XXX: also disabled, see above note
                    # yield cached_vote

            attrs = self.categorizer.categorize(action_name)
            action = bill.add_action(
                action_name.strip(),
                action_date,
                chamber=action_actor,
                classification=attrs["classification"],
            )
            for com in attrs.get("committees", []):
                com = com.strip()
                action.add_related_entity(com, entity_type="organization")
Пример #53
0
def build_vote(session, bill_id, url, vote_record, chamber, motion_text):
    passed = len(vote_record['yes']) > len(vote_record['no'])
    vote_event = VoteEvent(
        result='pass' if passed else 'fail',
        chamber=chamber,
        start_date=vote_record['date'].strftime('%Y-%m-%d'),
        motion_text=motion_text,
        classification='passage',
        legislative_session=session,
        bill=bill_id,
        bill_chamber='upper' if bill_id[0] is 'S' else 'lower')
    vote_event.pupa_id = url
    vote_event.set_count('yes', len(vote_record['yes']))
    vote_event.set_count('no', len(vote_record['no']))
    vote_event.set_count('excused', len(vote_record['excused']))
    vote_event.set_count('absent', len(vote_record['absent']))
    vote_event.set_count('other', len(vote_record['other']))
    for vote_type in ['yes', 'no', 'excused', 'absent', 'other']:
        for voter in vote_record[vote_type]:
            vote_event.vote(vote_type, voter)

    vote_event.add_source(url)
    return vote_event
Пример #54
0
    def scrape_vote(self, bill, vote_url, chamber, date):
        page = self.lxmlize(vote_url)

        try:
            motion = page.xpath("//font/text()")[2]
        except IndexError:
            self.warning("Vote Summary Page Broken ")
            return

        # eg. http://leg.colorado.gov/content/sb18-033vote563ce6
        if ('AM' in motion or 'PM' in motion) and '/' in motion:
            motion = "Motion not given."

        if 'withdrawn' not in motion:
            yes_no_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Aye')]]/font/text()")
            other_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'Absent')]]/font/text()")
            abstain_counts = page.xpath(
                "//tr/td[preceding-sibling::td/descendant::"
                "font[contains(text(),'17C')]]/font/text()")
            yes_count = int(yes_no_counts[0])
            no_count = int(yes_no_counts[2])
            exc_count = int(other_counts[2])
            absent_count = int(other_counts[0])
            abstain_count = 0
            if abstain_counts:
                abstain_count = int(abstain_counts[0])

            # fix for
            # http://leg.colorado.gov/content/hb19-1029vote65e72e
            if absent_count == -1:
                absent_count = 0

            passed = yes_count > no_count
            vote = VoteEvent(
                chamber=chamber,
                start_date=self._tz.localize(date),
                motion_text=motion,
                result='pass' if passed else 'fail',
                bill=bill,
                classification='passage',
            )
            vote.pupa_id = vote_url
            vote.set_count('yes', yes_count)
            vote.set_count('no', no_count)
            vote.set_count('excused', exc_count)
            vote.set_count('absent', absent_count)
            vote.set_count('abstain', abstain_count)
            vote.add_source(vote_url)

            rolls = page.xpath("//tr[preceding-sibling::tr/descendant::"
                               "td/div/b/font[contains(text(),'Vote')]]")

            vote_abrv = {
                'Y': 'yes',
                'N': 'no',
                'E': 'excused',
                'A': 'absent',
                '-': 'absent',
                '17C': 'abstain'
            }
            for roll in rolls:
                voted = roll.xpath(".//td/div/font/text()")[0].strip()
                voter = roll.xpath(".//td/font/text()")[0].strip()
                if voted == 'V':
                    continue
                vote.vote(vote_abrv[voted], voter)
            yield vote
Пример #55
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date'] :
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try :
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError :
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []) :
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title: #these are ominbus
                    bill.add_related_bill(identifier = related_bill['label'],
                                          legislative_session = bill.legislative_session,
                                          relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other
                
            for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
                if i == 0 :
                    primary = True
                    sponsorship_type = "Primary"
                else :
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 
                                            'Mendoza, Susana')) :
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)) :
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')) :
                    bill.add_sponsorship(sponsor_name, 
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id = _make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details :
                for subject in leg_details[u'Topic'].split(',') :
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []) :
                if attachment['label'] :
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']) :
                action_description = action['Action']
                try :
                    action_date =  self.toTime(action['Date']).date().isoformat()
                except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description :
                    try :
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError  :
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council' :
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred' :
                        try :
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError :
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies :
                            for controlling_body in controlling_bodies :
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee") :
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else :
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id = _make_pseudo_id(name=body_name))


                    if 'url' in action['Action\xa0Details'] :
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                               motion_text=action_description,
                                               organization={'name': responsible_org},
                                               classification=None,
                                               start_date=action_date,
                                               result=result,
                                               bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes :
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification' : leg_summary['Type']}
                            
            yield bill
        print(unreachable_urls)
Пример #56
0
    def parse_vote(self, actor, date, row, session, bill_id, bill_chamber,
                   source):
        """
        takes the actor, date and row element and returns a Vote object
        """
        spans = row.xpath(".//span")
        motion = row.text.replace(u"\u00a0", " ").replace("-", "").strip()
        motion = motion if motion else "passage"
        passed, yes_count, no_count, other_count = (
            spans[0].text_content().rsplit("-", 3))
        yes_votes = self.get_names(spans[1].tail)
        no_votes = self.get_names(spans[2].tail)

        other_votes = []
        for span in spans[3:]:
            if span.text.startswith(("Absent", "Excused")):
                other_votes += self.get_names(span.tail)
        for key, val in {
                "adopted": "pass",
                "passed": "pass",
                "failed": "fail"
        }.items():
            if key in passed.lower():
                passed = val
                break
        vote = VoteEvent(
            chamber=actor,
            start_date=date,
            motion_text=motion,
            bill=bill_id,
            bill_chamber=bill_chamber,
            result=passed,
            classification="passage",
            legislative_session=session,
        )
        vote.add_source(source)
        vote.set_count("yes", int(yes_count))
        vote.set_count("no", int(no_count))
        vote.set_count("absent", int(other_count))
        for name in yes_votes:
            if name and name != "None":
                vote.yes(name)
        for name in no_votes:
            if name and name != "None":
                vote.no(name)
        for name in other_votes:
            if name and name != "None":
                vote.vote("absent", name)
        yield vote
Пример #57
0
    def parse_vote(self, actor, date, row, session, bill_id, bill_chamber, source):
        """
        takes the actor, date and row element and returns a Vote object
        """
        spans = row.xpath('.//span')
        motion = row.text.replace(u'\u00a0', " ").replace("-", "").strip()
        motion = motion if motion else "passage"
        passed, yes_count, no_count, other_count = spans[0].text_content().rsplit('-', 3)
        yes_votes = self.get_names(spans[1].tail)
        no_votes = self.get_names(spans[2].tail)

        other_votes = []
        for span in spans[3:]:
            if span.text.startswith(('Absent', 'Excused')):
                other_votes += self.get_names(span.tail)
        for key, val in {'adopted': 'pass', 'passed': 'pass', 'failed': 'fail'}.items():
            if key in passed.lower():
                passed = val
                break
        vote = VoteEvent(chamber=actor,
                         start_date=date,
                         motion_text=motion,
                         bill=bill_id,
                         bill_chamber=bill_chamber,
                         result=passed,
                         classification="passage",
                         legislative_session=session)
        vote.add_source(source)
        vote.set_count('yes', int(yes_count))
        vote.set_count('no', int(no_count))
        vote.set_count('absent', int(other_count))
        for name in yes_votes:
            if name and name != 'None':
                vote.yes(name)
        for name in no_votes:
            if name and name != 'None':
                vote.no(name)
        for name in other_votes:
            if name and name != 'None':
                vote.vote('absent', name)
        yield vote
Пример #58
0
    def scrape_votes(self, vote_url, bill, chamber):
        # Grabs text from pdf
        pdflines = [
            line.decode("utf-8")
            for line in convert_pdf(vote_url, "text").splitlines()
        ]
        vote_date = 0
        voters = defaultdict(list)
        for x in range(len(pdflines)):
            line = pdflines[x]
            if re.search(r"(\d+/\d+/\d+)", line):
                initial_date = line.strip()
            if ("AM" in line) or ("PM" in line):
                split_l = line.split()
                for y in split_l:
                    if ":" in y:
                        time_location = split_l.index(y)
                        motion = " ".join(split_l[0:time_location])
                        time = split_l[time_location:]
                        if len(time) > 0:
                            time = "".join(time)
                        dt = initial_date + " " + time
                        dt = datetime.strptime(dt, "%m/%d/%Y %I:%M:%S%p")
                        vote_date = central.localize(dt)
                        vote_date = vote_date.isoformat()
                        # In rare case that no motion is provided
                        if len(motion) < 1:
                            motion = "No Motion Provided"
            if "YEAS:" in line:
                yeas = int(line.split()[-1])
            if "NAYS:" in line:
                nays = int(line.split()[-1])
            if "ABSTAINED:" in line:
                abstained = int(line.split()[-1])
            if "PASSES:" in line:
                abstained = int(line.split()[-1])
            if "NOT VOTING:" in line:
                not_voting = int(line.split()[-1])

            if "YEAS :" in line:
                y = 0
                next_line = pdflines[x + y]
                while "NAYS : " not in next_line:
                    next_line = next_line.split("  ")
                    if next_line and ("YEAS" not in next_line):
                        for v in next_line:
                            if v and "YEAS" not in v:
                                voters["yes"].append(v.strip())
                    next_line = pdflines[x + y]
                    y += 1
            if line and "NAYS :" in line:
                y = 0
                next_line = 0
                next_line = pdflines[x + y]
                while ("ABSTAINED : " not in next_line) and ("PASSES :"
                                                             not in next_line):
                    next_line = next_line.split("  ")
                    if next_line and "NAYS" not in next_line:
                        for v in next_line:
                            if v and "NAYS" not in v:
                                voters["no"].append(v.strip())
                    next_line = pdflines[x + y]
                    y += 1

            if line and ("ABSTAINED :" in line or "PASSES :" in line):
                y = 2
                next_line = 0
                next_line = pdflines[x + y]
                while "NOT VOTING :" not in next_line:
                    next_line = next_line.split("  ")
                    if next_line and ("ABSTAINED" not in next_line
                                      or "PASSES" not in next_line):
                        for v in next_line:
                            if v:
                                voters["abstain"].append(v.strip())
                    next_line = pdflines[x + y]
                    y += 1

            if line and "NOT VOTING : " in line:
                lines_to_go_through = math.ceil(not_voting / len(line.split()))
                next_line = pdflines[x]
                for y in range(lines_to_go_through):
                    next_line = pdflines[x + y + 2].split("  ")
                    for v in next_line:
                        if v:
                            voters["not voting"].append(v.strip())
                if yeas > (nays + abstained + not_voting):
                    passed = True
                else:
                    passed = False

                ve = VoteEvent(
                    chamber=chamber,
                    start_date=vote_date,
                    motion_text=motion,
                    result="pass" if passed else "fail",
                    classification="bill",
                    bill=bill,
                )
                ve.add_source(vote_url)
                for how_voted, how_voted_voters in voters.items():
                    for voter in how_voted_voters:
                        if len(voter) > 0:
                            ve.vote(how_voted, voter)
                # Resets voters dictionary before going onto next page in pdf
                voters = defaultdict(list)
                yield ve
Пример #59
0
    def _process_votes(self, rollcalls, bill_id, original_chamber, session, proxy):
        result_types = {
            'FAILED': False,
            'DEFEATED': False,
            'PREVAILED': True,
            'PASSED': True,
            'SUSTAINED': True,
            'NOT SECONDED': False,
            'OVERRIDDEN': True,
            'ADOPTED': True,
        }

        for r in rollcalls:
            proxy_link = proxy["url"] + r["link"]
            (path, resp) = self.urlretrieve(proxy_link)
            text = convert_pdf(path, 'text').decode("utf-8")
            lines = text.split("\n")
            os.remove(path)

            chamber = "lower" if "house of representatives" in lines[0].lower() else "upper"
            date_parts = lines[1].strip().split()[-3:]
            date_str = " ".join(date_parts).title() + " " + lines[2].strip()

            vote_date = datetime.datetime.strptime(date_str, "%b %d, %Y %I:%M:%S %p")
            vote_date = pytz.timezone('America/Indiana/Indianapolis').localize(vote_date)
            vote_date = vote_date.isoformat()

            passed = None

            for res, val in result_types.items():
                # We check multiple lines now because the result of the
                # roll call vote as parsed can potentially be split.
                # PDF documents suck.
                for line in lines[3:5]:
                    if res in line.upper():
                        passed = val
                        break

            if passed is None:
                raise AssertionError("Missing bill passage type")

            motion = " ".join(lines[4].split()[:-2])
            try:
                yeas = int(lines[4].split()[-1])
                nays = int(lines[5].split()[-1])
                excused = int(lines[6].split()[-1])
                not_voting = int(lines[7].split()[-1])
            except ValueError:
                self.logger.warning("Vote format is weird, skipping")
                continue

            vote = VoteEvent(chamber=chamber,
                             legislative_session=session,
                             bill=bill_id,
                             bill_chamber=original_chamber,
                             start_date=vote_date,
                             motion_text=motion,
                             result="pass" if passed else "fail",
                             classification="passage")

            vote.set_count('yes', yeas)
            vote.set_count('no', nays)
            vote.set_count('excused', excused)
            vote.set_count('not voting', not_voting)
            vote.add_source(proxy_link)

            currently_counting = ""

            possible_vote_lines = lines[8:]
            for l in possible_vote_lines:
                l = l.replace("NOT\xc2\xa0VOTING", "NOT VOTING")
                l = l.replace("\xc2\xa0", " -")
                if "yea-" in l.lower().replace(" ", ""):
                    currently_counting = "yes"
                elif "nay-" in l.lower().replace(" ", ""):
                    currently_counting = "no"
                elif "excused-" in l.lower().replace(" ", ""):
                    currently_counting = "excused"
                elif "notvoting-" in l.lower().replace(" ", ""):
                    currently_counting = "not voting"
                elif currently_counting == "":
                    pass
                elif re.search(r'v\. \d\.\d', l):
                    # this gets rid of the version number
                    # which is often found at the bottom of the doc
                    pass
                else:
                    voters = l.split("  ")
                    for v in voters:
                        if v.strip():
                            vote.vote(currently_counting, v.strip())

            yield vote
Пример #60
0
    def parse_vote_page(self, vote_url, bill):
        vote_html = self.get(vote_url).text
        doc = lxml.html.fromstring(vote_html)
        # chamber
        if "senate" in vote_url:
            chamber = "upper"
        else:
            chamber = "lower"

        # date in the following format: Mar 23, 2009
        date = doc.xpath('//td[starts-with(text(), "Legislative")]')[0].text
        date = date.replace(u"\xa0", " ")
        date = datetime.datetime.strptime(date[18:], "%b %d, %Y")

        # motion
        motion = "".join(x.text_content()
                         for x in doc.xpath('//td[@colspan="23"]'))
        if motion == "":
            motion = "No motion given"  # XXX: Double check this. See SJ 3.
        motion = motion.replace(u"\xa0", " ")

        # totals
        tot_class = doc.xpath('//td[contains(text(), "Yeas")]')[0].get("class")
        totals = doc.xpath('//td[@class="%s"]/text()' % tot_class)[1:]
        yes_count = int(totals[0].split()[-1])
        no_count = int(totals[1].split()[-1])
        other_count = int(totals[2].split()[-1])
        other_count += int(totals[3].split()[-1])
        other_count += int(totals[4].split()[-1])
        passed = yes_count > no_count

        vote = VoteEvent(
            bill=bill,
            chamber=chamber,
            start_date=date.strftime("%Y-%m-%d"),
            motion_text=motion,
            classification="passage",
            result="pass" if passed else "fail",
        )
        vote.pupa_id = vote_url  # contains sequence number
        vote.set_count("yes", yes_count)
        vote.set_count("no", no_count)
        vote.set_count("other", other_count)

        # go through, find Voting Yea/Voting Nay/etc. and next tds are voters
        func = None
        for td in doc.xpath("//td/text()"):
            td = td.replace(u"\xa0", " ")
            if td.startswith("Voting Yea"):
                func = vote.yes
            elif td.startswith("Voting Nay"):
                func = vote.no
            elif td.startswith("Not Voting"):
                func = vote.other
            elif td.startswith("Excused"):
                func = vote.other
            elif func:
                td = td.rstrip("*")
                func(td)

        return vote