Example #1
    def scrape_bill(self, chamber, session, bill_id, session_id):
        """
        Scrapes documents, actions, vote counts and votes for
        a given bill.
        """
        bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \
                        'legislativeBody={}'.format(bill_id, session_id, self.chamber_map[chamber])
        response = self.get(bill_json_url)
        page = json.loads(response.content.decode('utf-8'))

        bill_title = page['ShortTitle']
        bill_id = page['Number']
        internal_id = page['BillId']
        bill_type = self.get_bill_type(bill_id)
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill = self.scrape_actions(bill, page, chamber)
        bill = self.scrape_versions(bill, internal_id)
        bill = self.scrape_sponsors(bill, internal_id)
        bill = self.scrape_subjects(bill, internal_id)

        bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
                    internal_id, session_id)
        bill.add_source(bill_url)

        bill = self.sort_bill_actions(bill)

        yield bill
Example #2
def test_fix_bill_id():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')

    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')

    oi.import_data([org1.as_dict()])

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                         start_date='1900-04-02', classification='passage:bill',
                         result='fail', bill_chamber='lower', bill='HB1',
                         identifier='4',
                         bill_action='passage',
                         organization=org1._id)

    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve.as_dict(),
    ])

    IMPORT_TRANSFORMERS['bill'] = {}

    ve = VoteEvent.objects.get()
    assert ve.bill.identifier == 'HB 1'
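
# Note (added for clarity, not part of the original test): the transformer
# registered above is what lets the vote event's bill='HB1' match the bill
# scraped as 'HB 1' -- both identifiers normalize to the same form:
#
#     >>> import re
#     >>> re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', 'HB1', 1)
#     'HB 1'
#     >>> re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', 'HB 0004', 1)
#     'HB 4'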
Example #3
    def get_bill(self, bill_id, **kwargs):
        url = kwargs.pop('url')
        agenda_item = kwargs.pop('agenda_item')
        _type = self.get_type(bill_id)
        bill = Bill(bill_id, self.session, type=_type, **kwargs)
        bill.add_source(url, note='detail')
        return bill
Example #4
    def scrape_bill(self, chamber, session, bill_id, session_id):
        bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \
                        'legislativeBody={}'.format(bill_id, session_id, self.chamber_map[chamber])
        response = self.get(bill_json_url)
        page = json.loads(response.content.decode('utf-8'))

        if not page:
            self.warning('null page for %s', bill_id)
            return

        bill_title = page['ShortTitle']
        bill_id = page['Number']
        internal_id = page['BillId']
        bill_type = self.get_bill_type(bill_id)
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        self.scrape_actions(bill, page, chamber)
        self.scrape_versions_and_documents(bill, internal_id)
        self.scrape_sponsors(bill, internal_id)
        self.scrape_subjects(bill, internal_id)
        yield from self.scrape_votes(bill, page)

        bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
                    internal_id, session_id)
        bill.add_source(bill_url)

        self.sort_bill_actions(bill)

        yield bill
Example #5
    def scrape(self):

        for i, page in enumerate(self.searchLegislation()):
            for legislation_summary in self.parseSearchResults(page):
                title = legislation_summary['Title'].strip()
                if title == "":
                    continue

                if legislation_summary['Type'].lower() in ('order',
                                                           'claim',
                                                           'communication',
                                                           'report',
                                                           'oath of office'):
                    continue
                else:
                    bill_type = legislation_summary['Type'].lower()

                bill_session = self.session(legislation_summary['Intro\xa0Date'])

                bill = Bill(identifier=legislation_summary['Record #'],
                            legislative_session=bill_session,
                            title=title,
                            classification=bill_type,
                            from_organization=self.jurisdiction.name)

                bill.add_source(legislation_summary['url'])

                bill, votes = self.addDetails(bill, legislation_summary['url'])

                yield bill
                yield from votes
Example #6
def test_bill_sponsor_by_identifier():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass')
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])
    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id,
                              organization_id=org.id)

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
Example #7
def toy_bill():
    b = Bill(
        identifier="HB 2017",
        legislative_session="2012A",
        title="A bill for an act to raise the cookie budget by 200%",
        from_organization="Foo Senate",
        classification="bill",
    )
    b.add_source("http://uri.example.com/", note="foo")
    return b
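
# toy_bill() reads like a shared test helper; a minimal sketch of how it
# might be wired up as a pytest fixture (the fixture is our suggestion and
# assumes pytest, not something shown in the original):
#
#     import pytest
#
#     @pytest.fixture
#     def bill():
#         return toy_bill()
#
#     def test_bill_has_source(bill):
#         assert bill.sources[0]['url'] == 'http://uri.example.com/'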
Example #8
    def get_bill(self, bill_id, **kwargs):
        if bill_id == '1':
            assert kwargs == {'extra': 'param'}
            raise self.ContinueScraping
        else:
            assert bill_id == '2'
            assert kwargs == {}
            b = Bill('1', self.session, 'title')
            b.add_source('http://example.com')
            return b
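
# A hedged sketch of the calling side implied by this example: the scraper
# raises self.ContinueScraping to skip a record, so a driver loop would
# catch it and move on. The `scraper` object and the id list here are
# hypothetical stand-ins.
#
#     for bill_id, extra in [('1', {'extra': 'param'}), ('2', {})]:
#         try:
#             bill = scraper.get_bill(bill_id, **extra)
#         except scraper.ContinueScraping:
#             continue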
Example #9
    def scrape_bill(self, row, chamber, session):
        bill_id = row['LegislationNumber']

        # TODO: re-evaluate if these should be separate bills
        if 'SA' in bill_id or 'HA' in bill_id:
            self.warning('skipping amendment %s', bill_id)
            return

        bill_type = self.classify_bill(bill_id)
        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=row['LongTitle'],
                    classification=bill_type)
        if row['Synopsis']:
            bill.add_abstract(row['Synopsis'], 'synopsis')
        if row['ShortTitle']:
            bill.add_title(row['ShortTitle'], 'short title')
        if row['SponsorPersonId']:
            self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
            row['LegislationId']
        )
        bill.add_source(html_url, note='text/html')

        html = self.lxmlize(html_url)

        # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
        additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                         '/following-sibling::div/a/@href')
        for sponsor_url in additional_sponsors:
            sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                             'personId=', '')
            self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

        # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
        cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                                'following-sibling::div/a/@href')
        for sponsor_url in cosponsors:
            sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                             'personId=', '')
            self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

        versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            version_name = 'Bill Text'
            # on_duplicate='error'
            bill.add_version_link(version_name, version_url, media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row['LegislationId'])
        yield from self.scrape_votes(bill, row['LegislationId'], session)

        yield bill
Example #10
def test_bill_chamber_param():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')

    oi = OrganizationImporter('jid')
    BillImporter('jid', oi).import_data([bill.as_dict()])

    assert Bill.objects.get().from_organization_id == org.id
Example #11
    def scrape_bill(self, session, bill_id, chamber):
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException:
            self.warning(u'Server Error on {}'.format(bill_url))
            return

        html = response.text

        page = lxml.html.fromstring(html)

        if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            self.warning(u'Server Error on {}'.format(bill_url))
            return

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=bill_title, classification='bill')

        bill_summary = None
        if page.xpath('//p[@id="pinslip"]/text()'):
            bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dt following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsorship(sponsor, classification='primary', primary=True,
                                 entity_type='person')

        self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                             "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

        # yield back votes and bill
        yield from self.scrape_actions(bill, bill_url, session)
        yield bill
Example #12
def test_vote_event_bill_actions_two_stage():
    # this test is very similar to what we're testing in test_vote_event_bill_actions w/
    # ve3 and ve4, that two bills that reference the same action won't conflict w/ the
    # OneToOneField, but in this case we do it in two stages so that the conflict is found
    # even if the votes weren't in the same scrape
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id)

    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    # disambiguate them
    ve1.pupa_id = 'one'
    ve2.pupa_id = 'two'

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when second is imported, ensure that action stays pinned to first just as it would
    # have if they were both in same import
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None
Example #13
    def get_bill(self, bill_id, **kwargs):
        url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id
        self.urls.add(detail=url)

        bill_id = kwargs.pop('number')
        bill = Bill(bill_id, self.session, kwargs['title'], 'butt',
                    type=['bills'])
        bill.add_source(url, note='detail')

        xpath = '//table[contains(@class, "history")]/tr'
        for tr in self.urls.detail.xpath(xpath):
            # TODO: parse the action history rows
            pass

        return bill
Example #14
    def scrape_bill(self, bill_page_url):
        bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

        title = bill_page.xpath('//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()')
        if title:
            title = title[0]
        else:
            self.warning('Missing bill title {}'.format(bill_page_url))
            return

        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()')
        if bill_no:
            bill_no = bill_no[0]
        else:
            bill_no = bill_page.xpath(
                '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()')
            if bill_no:
                bill_no = bill_no[0]
            else:
                self.error('Missing bill number {}'.format(bill_page_url))
                return

        bill = Bill(
            bill_no,
            legislative_session=self.session,
            chamber='legislature',
            title=title,
            classification='bill'
        )

        bill.add_source(bill_page_url)

        self.parse_versions(bill, bill_page, bill_no)
        self.parse_acts(bill, bill_page)

        sponsors = bill_page.xpath('//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()')
        if sponsors:
            self.assign_sponsors(bill, sponsors[0], 'primary')

        cosponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()')
        if cosponsors:
            self.assign_sponsors(bill, cosponsors[0], 'cosponsor')

        self.parse_date_actions(bill, bill_page)
        self.parse_actions(bill, bill_page)

        yield bill
Example #15
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {'H': 'lower', 'S': 'upper'}

        for row in page:
            bill_id = row['bill_num']
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(identifier=bill_id,
                        legislative_session=session,
                        title=row['bill_title'],
                        classification=bill_type,
                        chamber=chamber)
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsorship(name=str(introducer),
                                     classification='primary',
                                     primary=True,
                                     entity_type='person')

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning('no such bill: ' + bill_id)
Example #16
def test_full_vote_event():
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    j.legislative_sessions.create(name='1900', identifier='1900')
    sp1 = ScrapePerson('John Smith', primary_org='lower')
    sp2 = ScrapePerson('Adam Smith', primary_org='lower')
    org = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org._id)
    vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                 start_date='1900-04-01', classification='passage:bill',
                                 result='pass', bill_chamber='lower', bill='HB 1',
                                 organization=org._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp1.as_dict(), sp2.as_dict()])

    mi = MembershipImporter('jid', pi, oi, DumbMockImporter())
    mi.import_data([sp1._related[0].as_dict(), sp2._related[0].as_dict()])

    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])

    VoteEventImporter('jid', pi, oi, bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill == Bill.objects.get()
    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20
    votes = list(ve.votes.all())
    assert len(votes) == 2
    for v in ve.votes.all():
        if v.voter_name == 'John Smith':
            assert v.option == 'yes'
            assert v.voter == Person.objects.get(name='John Smith')
        else:
            assert v.option == 'no'
            assert v.voter == Person.objects.get(name='Adam Smith')
Example #17
    def parse_bill(self, chamber, session, special, link):
        bill_num = link.text.strip()
        type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

        if type_abbr == 'B':
            btype = ['bill']
        elif type_abbr == 'R':
            btype = ['resolution']
        else:
            # the 'type=(B|R|)' pattern can match its empty alternative;
            # bail out rather than hit a NameError on btype below
            return

        bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

        url = utils.info_url(chamber, session, special, type_abbr, bill_num)
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        xpath = '/'.join([
            '//div[contains(@class, "BillInfo-ShortTitle")]',
            'div[@class="BillInfo-Section-Data"]',
        ])
        title = page.xpath(xpath).pop().text_content().strip()
        if not title:
            return
        bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                    classification=btype)
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(bill, chamber, utils.history_url(chamber, session, special,
                           type_abbr, bill_num))

        # only fetch votes if votes were seen in history
        # if vote_count:
        yield from self.parse_votes(
            bill,
            utils.vote_url(chamber, session, special, type_abbr, bill_num),
        )

        # Dedupe sources; removing items from a list while iterating it
        # skips elements, so build a new list instead.
        unique_sources = []
        for source in bill.sources:
            if source not in unique_sources:
                unique_sources.append(source)
        bill.sources = unique_sources

        yield bill
Example #18
    def _recursively_process_bills(self,
                                   request_session,
                                   chamber,
                                   session,
                                   first_item=1):
        '''
        Once a search has been initiated, this function will save a
        Bill object for every Paper from the given chamber
        '''

        url = 'http://legislature.maine.gov/LawMakerWeb/searchresults.asp'
        r = request_session.get(url, params={'StartWith': first_item})
        r.raise_for_status()

        bills = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
        if bills:
            for bill in bills:
                bill_id_slug = bill.xpath('./@href')[0]
                if bill_id_slug == 'summary.asp?ID=280068396':
                    continue
                bill_url = 'http://legislature.maine.gov/LawMakerWeb/{}'.format(
                    bill_id_slug)
                bill_id = bill.text[:2] + " " + bill.text[2:]

                if bill_id in BLACKLISTED_BILL_IDS[session]:
                    continue

                bill = Bill(
                    identifier=bill_id,
                    legislative_session=session,
                    title="",
                    chamber=chamber,
                )
                bill.add_source(bill_url)

                yield from self.scrape_bill(bill, chamber)
                yield bill

            # Make a recursive call to this function, for the next page
            PAGE_SIZE = 25
            yield from self._recursively_process_bills(
                request_session=request_session,
                chamber=chamber,
                session=session,
                first_item=first_item + PAGE_SIZE)
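
# The recursion above is just page chasing. For reference, a hedged
# iterative equivalent under the same assumptions (PAGE_SIZE of 25, and a
# results page past the end yields no '//tr/td/b/a' nodes):
#
#     first_item = 1
#     while True:
#         r = request_session.get(url, params={'StartWith': first_item})
#         r.raise_for_status()
#         bills = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
#         if not bills:
#             break
#         ...  # process each bill as in the for-loop above
#         first_item += 25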
Example #19
    def scrape_bill(self, chamber, session, bill_id, url):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        try:
            title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
            # TODO: grab summary (none present at time of writing)
        except IndexError:
            if 'Unable to retrieve the requested information. Please try again' in html:
                self.warning('Soft error page, skipping.')
                return
            else:
                raise

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(
            bill_id, legislative_session=session, chamber=chamber, title=title,
            classification=_type)
        bill.add_source(url)

        # process sponsors
        sponsors = _get_td(doc, 'All Sponsors:').text_content()
        sponsors = sponsors.replace('Delegates ', '')
        sponsors = sponsors.replace('Delegate ', '')
        sponsors = sponsors.replace('Senator ', '')
        sponsors = sponsors.replace('Senators ', '')
        sponsor_type = 'primary'

        for sponsor in re.split(', (?:and )?', sponsors):
            sponsor = sponsor.strip()
            if not sponsor:
                continue
            bill.add_sponsorship(
                sponsor,
                sponsor_type,
                primary=sponsor_type == 'primary',
                entity_type='person',
            )
            sponsor_type = 'cosponsor'

        # subjects
        subject_list = []
        for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
            subjects = _get_td(doc, heading).xpath('a/text()')
            subject_list += [s.split(' -see also-')[0] for s in subjects if s]
        bill.subject = subject_list

        # documents
        self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
        # actions
        self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

        yield bill
Example #20
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            if not bill_id.startswith(("S", "H")):
                continue

            # create a bill
            desc = bill.xpath("text()")[0].strip()
            chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            bill = Bill(
                bill_id,
                self.kwargs["session"],
                desc,
                chamber=chamber,
                classification=bill_type,
            )

            bill_url = link.get("href")
            sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
                self.kwargs["session_id"], bill_id.replace(" ", ""))

            list(
                self.scrape_page_items(BillSponsorPage,
                                       url=sponsor_url,
                                       obj=bill))
            yield from self.scrape_page_items(BillDetailPage,
                                              url=bill_url,
                                              obj=bill)
            bill.subject = self.kwargs["subjects"][bill_id]
            bill.add_source(bill_url)
            yield bill

        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(BillListPage,
                                              url=next_url[0],
                                              **self.kwargs)
Example #21
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib['href'] + '/ByCategory'

        if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
            bill_type = 'bill'
        elif bill_id.startswith(('HR ', 'SR ')):
            bill_type = 'resolution'
        elif bill_id.startswith(('HJR ', 'SJR ')):
            bill_type = 'joint resolution'
        elif bill_id.startswith(('SCR ', 'HCR ')):
            bill_type = 'concurrent resolution'
        elif bill_id.startswith(('SM ', 'HM ')):
            bill_type = 'memorial'
        else:
            raise ValueError('Failed to identify bill type.')

        bill = Bill(bill_id, self.kwargs['session'], title,
                    chamber='lower' if bill_id[0] == 'H' else 'upper',
                    classification=bill_type)
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
        bill.subject = list(self.kwargs['subjects'][subj_bill_id])

        sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
        for sp in sponsor.split(', '):
            bill.add_sponsorship(sp, 'primary', 'person', True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
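
# For reference (added, not from the original): the normalization above
# maps the padded display id onto the compact key used by the subjects
# mapping, e.g.
#
#     >>> import re
#     >>> re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', 'HB 0004')
#     'H4'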
Example #22
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=row[3],
                        classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep on eye on this post 2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" %
                           (session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #23
    def scrape_bill(self, session, session_slug, chamber, url):
        page = lxml.html.fromstring(self.get(url).text)
        bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

        # bill data gets filled in from another call
        bill_data_base = (
            "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
            "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}")
        bill_data_url = bill_data_base.format(session_slug, internal_id,
                                              time.time() * 1000)

        bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

        short_title = self.get_header_field(bill_page, "Summary:").text
        short_title = short_title.replace(u"\u00a0", " ")

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber,
        )

        long_title = self.get_header_field(bill_page, "Title:").text
        if long_title is not None:
            bill.add_abstract(long_title, "Summary")

        sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
        if sponsor_div is not None:
            self.add_sponsors(sponsor_div, bill, "primary")

        cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
        if cosponsor_div is not None:
            self.add_sponsors(cosponsor_div, bill, "cosponsor")

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        bill.subject = list(set(self.subject_mapping[bill_no]))

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        bill.extras["NV_ID"] = internal_id

        bill.add_source(url)
        yield bill
Example #24
    def scrape_chamber(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        for tr in page.xpath(
                "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(bill_id,
                        legislative_session=session,
                        title=title,
                        chamber=chamber,
                        classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version_link(a.text,
                                      a.get('href'),
                                      media_type='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document_link('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document_link('Summary', summary[0].get('href'))

            bill.add_source(url)
            yield bill
Example #25
    def get_bill(self, bill_id, **kwargs):
        url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id
        self.urls.add(detail=url)

        bill_id = kwargs.pop('number')
        bill = Bill(bill_id,
                    self.session,
                    kwargs['title'],
                    'butt',
                    type=['bills'])
        bill.add_source(url, note='detail')

        xpath = '//table[contains(@class, "history")]/tr'
        for tr in self.urls.detail.xpath(xpath):
            # TODO: parse the action history rows
            pass

        return bill
Example #26
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter="|")

        for row in page:
            bill_chamber = {"H": "lower", "S": "upper"}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = {
                "B": "bill",
                "R": "resolution",
                "JR": "joint resolution",
                "CR": "concurrent resolution",
                "MR": "memorial",
                "CMR": "concurrent memorial",
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row[3],
                classification=bill_type,
            )
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(
                    primary,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            version_url = (
                "ftp://www.arkleg.state.ar.us/Bills/"
                "%s/Public/Searchable/%s.pdf" % (self.slug, bill_id.replace(" ", ""))
            )
            bill.add_version_link(bill_id, version_url, media_type="application/pdf")

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
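
# get_utf_16_ftp_content is a project helper whose definition isn't shown
# in these examples; judging from its name and usage, a plausible sketch
# (an assumption, not the project's actual implementation) is:
#
#     from urllib.request import urlopen
#
#     def get_utf_16_ftp_content(url):
#         # urllib handles ftp:// URLs; the legislature's file is UTF-16
#         with urlopen(url) as resp:
#             return resp.read().decode('utf-16')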
Example #27
def test_bill_type_setting():
    # default
    b = Bill(identifier="some bill", legislative_session="session", title="the title")
    assert b.classification == ["bill"]

    # string -> list
    b = Bill(identifier="some bill", legislative_session="session", title="the title",
             classification="string")
    assert b.classification == ["string"]

    # list unmodified
    b = Bill(identifier="some bill", legislative_session="session", title="the title",
             classification=["two", "items"])
    assert b.classification == ["two", "items"]

    # tuple -> list
    b = Bill(identifier="some bill", legislative_session="session", title="the title",
             classification=("two", "items"))
    assert b.classification == ["two", "items"]
Example #28
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=row[3],
                        classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #29
def test_bill_update():
    create_jurisdiction()
    create_org()

    bill = ScrapeBill('HB 1', '1900', 'First Bill')

    oi = OrganizationImporter('jid')
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'insert'
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'noop'

    # ensure no new object was created
    assert Bill.objects.count() == 1

    # test basic update
    bill = ScrapeBill('HB 1', '1900', '1st Bill')
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'update'
    assert Bill.objects.get().title == '1st Bill'
Example #30
def test_bill_update():
    create_jurisdiction()
    create_org()

    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')

    oi = OrganizationImporter('jid')
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'insert'
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'noop'

    # ensure no new object was created
    assert Bill.objects.count() == 1

    # test basic update
    bill = ScrapeBill('HB 1', '1900', '1st Bill', chamber='lower')
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'update'
    assert Bill.objects.get().title == '1st Bill'
Example #31
def test_bill_sponsor_limit_lookup():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1',
                      '1900',
                      'Axe & Tack Tax Act',
                      classification='tax bill',
                      chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    PersonImporter('another-jurisdiction').import_data([zs2.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry, ) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
Example #32
def test_bill_action_extras():
    create_jurisdiction()
    create_org()

    bill = ScrapeBill('HB 1',
                      '1900',
                      'Axe & Tack Tax Act',
                      classification='tax bill',
                      chamber='lower')
    bill.add_action('sample',
                    '1900-01-01',
                    chamber='lower',
                    extras={'test': 3})

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    b = Bill.objects.get()
    assert b.actions.all()[0].extras == {'test': 3}
Example #33
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, bill_num))

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=[bill_type])
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(note=version['note'],
                                      url=version['url'],
                                      media_type=version['media_type'])
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(note=document['note'],
                                       url=document['url'],
                                       media_type=document['media_type'])
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Example #34
def test_bill_sponsor_limit_lookup():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id,
                              organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    PersonImporter('another-jurisdiction').import_data([zs2.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
Example #35
    def scrape_bill(self, session, session_slug, chamber, url):
        page = lxml.html.fromstring(self.get(url).text)
        bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)

        # bill data gets filled in from another call
        bill_data_base = 'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/' \
            'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
        bill_data_url = bill_data_base.format(
            session_slug, internal_id, time.time() * 1000)

        bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

        short_title = self.get_header_field(bill_page, 'Summary:').text
        short_title = short_title.replace(u'\u00a0', ' ')

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber
        )

        long_title = self.get_header_field(bill_page, 'Title:').text
        if long_title is not None:
            bill.add_abstract(long_title, 'Summary')

        sponsor_div = self.get_header_field(bill_page, 'Primary Sponsor')
        if sponsor_div is not None:
            self.add_sponsors(sponsor_div, bill, 'primary')

        cosponsor_div = self.get_header_field(bill_page, 'Co-Sponsor')
        if cosponsor_div is not None:
            self.add_sponsors(cosponsor_div, bill, 'cosponsor')

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        bill.subject = list(set(self.subject_mapping[bill_no]))

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras['BDR'] = bdr

        bill.extras['NV_ID'] = internal_id

        bill.add_source(url)
        yield bill
Example #36
    def scrape_bill(self, session, chamber, bill_url):

        try:
            page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
        except scrapelib.HTTPError as e:
            if e.response.status_code == 503:
                self.error('Skipping %s w/ 503', bill_url)
                return
            else:
                raise

        bill_number = page.xpath('//div[contains(@class,"field-name-field-bill-number")]'
                                 '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])')
        bill_summary = bill_summary.strip()
        bill = Bill(
            bill_number,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
        )
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')
        bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

        self.scrape_sponsors(bill, page)
        self.scrape_actions(bill, page)
        self.scrape_versions(bill, page)
        self.scrape_research_notes(bill, page)
        self.scrape_fiscal_notes(bill, page)
        self.scrape_committee_report(bill, page)
        self.scrape_amendments(bill, page)
        yield bill
        yield from self.scrape_votes(bill, page)
Example #37
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            if not bill_id.startswith(('S', 'H')):
                continue

            # create a bill
            desc = bill.xpath('text()')[0].strip()
            chamber = {
                'H': 'lower',
                'S': 'upper',
            }[bill_id[0]]
            bill_type = {
                'B': 'bill',
                'J': 'joint resolution',
                'R': 'resolution'
            }[bill_id[1]]
            bill = Bill(bill_id, self.kwargs['session'], desc,
                        chamber=chamber, classification=bill_type)

            bill_url = link.get('href')
            sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
                self.kwargs['session_id'],
                bill_id.replace(' ', ''),
            )

            list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
            yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)
            bill.subject = self.kwargs['subjects'][bill_id]
            bill.add_source(bill_url)
            yield bill

        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(BillListPage, url=next_url[0], **self.kwargs)
Example #39
def test_no_whitespace_in_uri():
    b = Bill(identifier="HB 2017",
             legislative_session="2012A",
             title="A bill for an act to raise the cookie budget by 200%",
             from_organization="Foo Senate",
             classification="bill")
    b.add_source("http://uri.example.com/fail here", note="foo")
    with pytest.raises(ScrapeValueError):
        b.validate()
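
# The failure above comes from the raw space in the URI. A hedged fix
# sketch using only the stdlib (urllib.parse.quote is real; applying it
# to a fresh Bill is our suggestion, not part of the test):
#
#     from urllib.parse import quote
#
#     b2 = Bill(identifier="HB 2017", legislative_session="2012A",
#               title="...", classification="bill")
#     b2.add_source(quote("http://uri.example.com/fail here", safe=':/'),
#                   note="foo")
#     b2.validate()  # the space is now %20, so this should not raise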
Example #40
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)
        page.make_links_absolute(self.BASE_URL)

        if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
            bill_id = page.xpath(
                '//h2[@style="font-size:1.3rem;"]/a[1]/text()')[0].strip()
        elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
            bill_id = page.xpath(
                '//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
        else:
            self.warning("No bill id for {}".format(url))
            return
        title = page.xpath(
            '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
        )[0].strip()

        if "B" in bill_id:
            _type = ["bill"]
        elif "J" in bill_id:
            _type = ["joint resolution"]
        elif "HS" in bill_id:
            _type = ["resolution"]
        else:
            raise ValueError("unknown bill type " + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=_type,
        )
        bill.add_source(url)

        self.scrape_bill_subjects(bill, page)
        self.scrape_bill_sponsors(bill, page)
        self.scrape_bill_actions(bill, page)

        # fiscal note
        if page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
        ):
            fiscal_note = page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
            )[0]
            fiscal_url = fiscal_note.get("href")
            fiscal_title = fiscal_note.text_content()
            bill.add_document_link(
                fiscal_title,
                fiscal_url,
                media_type="application/pdf",
            )

        # yield from self.parse_bill_votes_new(doc, bill)
        yield bill
Example #41
    def scrape_bill(self, chamber, session, bill_id, session_id):
        bill_json_url = (
            "https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&"
            "legislativeBody={}".format(bill_id, session_id,
                                        self.chamber_map[chamber]))
        response = self.get(bill_json_url)
        page = json.loads(response.content.decode("utf-8"))

        if not page:
            self.warning("null page for %s", bill_id)
            return

        bill_title = page["ShortTitle"]
        bill_id = page["Number"]
        internal_id = page["BillId"]
        bill_type = self.get_bill_type(bill_id)
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        self.scrape_actions(bill, page, chamber)
        self.scrape_versions_and_documents(bill, internal_id)
        self.scrape_sponsors(bill, internal_id)
        self.scrape_subjects(bill, internal_id)
        yield from self.scrape_votes(bill, page)

        bill_url = "https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}".format(
            internal_id, session_id)
        bill.add_source(bill_url)

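        # emit actions in chronological order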
        bill.actions = sorted(bill.actions, key=lambda action: action["date"])

        yield bill
Example #42
    def _recursively_process_bills(
            self, request_session, chamber, session, first_item=1):
        '''
        Once a search has been initiated, this function will save a
        Bill object for every Paper from the given chamber
        '''

        url = 'http://legislature.maine.gov/LawMakerWeb/searchresults.asp'
        r = request_session.get(url, params={'StartWith': first_item})
        r.raise_for_status()

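        # each search-result row links to a bill's detail page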
        bills = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
        if bills:
            for bill_link in bills:
                bill_id_slug = bill_link.xpath('./@href')[0]
                bill_url = 'http://legislature.maine.gov/LawMakerWeb/{}'.format(bill_id_slug)
                bill_id = bill_link.text[:2] + " " + bill_link.text[2:]

                bill = Bill(
                    identifier=bill_id,
                    legislative_session=session,
                    title="",
                    chamber=chamber,
                )
                bill.add_source(bill_url)

                yield from self.scrape_bill(bill, chamber)
                yield bill

            # Make a recursive call to this function, for the next page
            PAGE_SIZE = 25
            yield from self._recursively_process_bills(
                request_session=request_session,
                chamber=chamber,
                session=session,
                first_item=first_item + PAGE_SIZE
            )
Example #43
    def scrape_bill_list(self, chamber, session, url):
        if 'joint_resolution' in url:
            bill_type = 'joint resolution'
        elif 'resolution' in url:
            bill_type = 'resolution'
        elif 'bill' in url:
            bill_type = 'bill'
        else:
            raise ValueError('unknown bill type for %s' % url)

        try:
            data = self.get(url).text
        except scrapelib.HTTPError:
            self.warning('skipping URL %s' % url)
            return
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)
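        # each row holds the bill link (span3) and its "Relating to" title (span6)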
        bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
        for b in bill_list:
            bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
            bill_id = bill_url.rsplit('/', 1)[-1]
            bill_id = bill_id.upper()

            title = b.xpath(
                './div[@class="span6"]/text()'
            )[0].replace(' - Relating to: ', '').strip()

            bill = Bill(
                bill_id,
                legislative_session=session,
                title=title,
                chamber=chamber,
                classification=bill_type,
            )
            bill.subject = list(set(self.subjects[bill_id]))
            yield from self.scrape_bill_history(bill, bill_url, chamber)

            yield bill
Example #45
def test_fix_bill_id():
    create_jurisdiction()
    create_org()

    bill = ScrapeBill('HB1',
                      '1900',
                      'Test Bill ID',
                      classification='bill',
                      chamber='lower')

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier':
        lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }
    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])
    IMPORT_TRANSFORMERS['bill'] = {}

    b = Bill.objects.get()
    assert b.identifier == 'HB 1'
Example #46
def test_from_organization():
    # none set
    assert ((get_pseudo_id(
        Bill('HB 1', '2014', 'Some Bill').from_organization) == {
            'classification': 'legislature'
        }))

    # chamber set
    assert (get_pseudo_id(
        Bill('SB 1', '2014', 'Some Bill',
             chamber='upper').from_organization) == {
                 'classification': 'upper'
             })
    # org direct set
    assert Bill('HB 1', '2014', 'Some Bill',
                from_organization='test').from_organization == 'test'

    # can't set both
    with pytest.raises(ValueError):
        Bill('HB 1',
             '2014',
             'Some Bill',
             from_organization='upper',
             chamber='upper')
Example #47
    def scrape_chamber(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

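        # skip the header row of the bills table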
        for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                        classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip empty links and references to other bills
                if not a.text or a.text.startswith('See'):
                    continue
                bill.add_version_link(a.text, a.get('href'),
                                      media_type='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document_link('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document_link('Summary', summary[0].get('href'))

            bill.add_source(url)
            yield bill
Example #48
    def scrape_bill(self, chamber, session, bill_id, url):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)

        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(
            bill_id, legislative_session=session, chamber=chamber, title=title,
            classification=_type)
        bill.add_source(url)

        # process sponsors
        sponsors = _get_td(doc, 'All Sponsors:').text_content()
        sponsors = sponsors.replace('Delegates ', '')
        sponsors = sponsors.replace('Delegate ', '')
        sponsors = sponsors.replace('Senator ', '')
        sponsors = sponsors.replace('Senators ', '')
        sponsor_type = 'primary'

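        # the first sponsor listed is primary; the rest are cosponsors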
        for sponsor in re.split(', (?:and )?', sponsors):
            sponsor = sponsor.strip()
            if not sponsor:
                continue
            bill.add_sponsorship(
                sponsor,
                sponsor_type,
                primary=sponsor_type == 'primary',
                entity_type='person',
            )
            sponsor_type = 'cosponsor'

        # subjects
        subject_list = []
        for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
            subjects = _get_td(doc, heading).xpath('a/text()')
            subject_list += [s.split(' -see also-')[0] for s in subjects if s]
        bill.subject = subject_list

        # documents
        yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
        # actions
        self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

        yield bill
Example #49
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {"H": "lower", "S": "upper"}

        for row in page:
            bill_id = row["bill_num"]
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            if re.match(r"^(S|H)J", bill_id):
                bill_type = "joint resolution"
            elif re.match(r"^(S|H)R", bill_id):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title=row["bill_title"],
                classification=bill_type,
                chamber=chamber,
            )
            bill.add_source(info_url)

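            # normalize sponsor names: drop Rep./Sen. titles and district suffixes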
            for introducer in self._introducers[bill_id]:
                introducer = string.capwords(
                    introducer.decode("utf-8").replace("Rep. ", "").replace(
                        "Sen. ", ""))
                if "Dist." in introducer:
                    introducer = " ".join(introducer.split()[:-2])
                bill.add_sponsorship(
                    name=introducer,
                    classification="primary",
                    primary=True,
                    entity_type="person",
                )

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning("no such bill: " + bill_id)
Example #50
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

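            # row[11] holds the primary sponsor; fall back to row[12] when it is empty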
            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)
            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep on eye on this post 2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #51
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" % (
                               self.slug, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #52
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {'H': 'lower', 'S': 'upper'}

        for row in page:
            bill_id = row['bill_num']
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            if re.match(r'^(S|H)J', bill_id):
                bill_type = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(identifier=bill_id,
                        legislative_session=session,
                        title=row['bill_title'],
                        classification=bill_type,
                        chamber=chamber)
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsorship(name=introducer.decode('utf-8'),
                                     classification='primary',
                                     primary=True,
                                     entity_type='person')

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning('no such bill: ' + bill_id)
Example #53
    def createBill(self, agenda_item):
        title = agenda_item['Title'].replace('\n', ' ')
        title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = re.match(
            agenda_item_title_re, title).groups()

        bill = {
            'identifier': agenda_item['Item No.'],
            'title': title,
            'legislative_session': agenda_item['session'],
            # TODO: Add agenda_item type to OCD
            'classification': 'bill',
            'from_organization': {
                'name': self.jurisdiction.name
            },
        }

        b = Bill(**bill)
        b.add_source(agenda_item['url'], note='web')

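        # record mover and seconder only when the title regex captured both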
        if primary_sponsor and secondary_sponsor:
            b.add_sponsorship(primary_sponsor, 'mover', 'person', True)
            b.add_sponsorship(secondary_sponsor, 'seconder', 'person', False)

        return b
Example #54
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.kwargs["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.kwargs["subjects"][subj_bill_id])

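        # strip the Rep./Sen. prefix, then split the comma-separated sponsor list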
        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            bill.add_sponsorship(sp, "primary", "person", True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
Example #55
    def scrape_bill_2012(self, chamber, session, bill_id, url):
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath(
            '//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            classification=_type,
            chamber=chamber,
            title=title,
        )
        bill.add_abstract(summary, note='summary')
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)  # sponsors
        self.parse_bill_actions(doc, bill)  # actions
        self.parse_bill_documents(doc, bill)  # documents and versions
        yield from self.parse_bill_votes(doc, bill)  # votes

        # subjects
        subjects = []
        for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
            subjects.append(subj.text.split('-see also-')[0])
        bill.subject = subjects

        # emit the completed bill
        yield bill
Example #56
    def scrape_bill(self, session, chamber, bill_url):

        try:
            page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
        except scrapelib.HTTPError as e:
            if e.response.status_code == 503:
                self.error('Skipping %s w/ 503', bill_url)
                return
            else:
                raise

        bill_number = page.xpath(
            '//div[contains(@class,"field-name-field-bill-number")]'
            '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])')
        bill_summary = bill_summary.strip()
        bill = Bill(
            bill_number,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
        )
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')
        bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

        self.scrape_sponsors(bill, page)
        self.scrape_actions(bill, page)
        self.scrape_versions(bill, page)
        self.scrape_research_notes(bill, page)
        self.scrape_fiscal_notes(bill, page)
        self.scrape_committee_report(bill, page)
        self.scrape_amendments(bill, page)
        yield bill
        yield from self.scrape_votes(bill, page)
Example #57
    def scrape_bill(self, session, chamber, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)

        title = page.xpath('//div[@id="main_0_header"]//h1/text()')[0].strip()

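        # the internal bill id is carried in the bId query-string parameter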
        parsed = urlparse.urlparse(url)
        bill_id = urlparse.parse_qs(parsed.query)['bId'][0]

        portfolio = self.dd(page, 'Portfolio')
        orig_house = self.dd(page, 'Originating house')
        self.info('%s %s %s %s', bill_id, title, portfolio, orig_house)

        bill_chamber = self.CHAMBERS[orig_house]

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=bill_chamber,
                    title=title,
                    classification='bill')

        sponsor = self.dd(page, 'Sponsor(s)')
        if sponsor:
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        self.scrape_bill_actions(page, bill)
        self.scrape_bill_versions(page, bill)
        self.scrape_bill_documents(page, bill)

        bill.add_source(url)

        yield bill
Example #58
    def scrape(self):
        for leg_summary in self.legislation(
                created_after=datetime.datetime(2014, 1, 1)):
            leg_type = BILL_TYPES[leg_summary['Type']]

            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name": "New York City Council"})
            bill.add_source(leg_summary['url'])

            leg_details = self.legDetails(leg_summary['url'])
            history = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            if leg_details['Law number']:
                bill.add_identifier(leg_details['Law number'],
                                    note='law number')

            for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
                sponsor, sponsorship_type, primary = sponsorship
                bill.add_sponsorship(sponsor,
                                     sponsorship_type,
                                     'person',
                                     primary,
                                     entity_id=_make_pseudo_id(name=sponsor))

            for attachment in leg_details.get('Attachments', []):
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

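            # materialize the generator; it is iterated more than once below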
            history = list(history)

            if history:
                earliest_action = min(
                    self.toTime(action['Date']) for action in history)

                bill.legislative_session = self.sessions(earliest_action)
            else:
                bill.legislative_session = str(self.SESSION_STARTS[0])

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                responsible_org = action['Action\xa0By']
                if responsible_org == 'City Council':
                    responsible_org = 'New York City Council'
                elif responsible_org == 'Administration':
                    responsible_org = 'Mayor'

                if responsible_org == 'Town Hall Meeting':
                    continue
                else:
                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=action_class)

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral':
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details[
                            'Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(
                            referred_committee,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_committee))
                    result, votes = self.extractVotes(action_detail_url)
                    if votes:
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=action_class,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

            text = self.text(leg_summary['url'])

            if text:
                bill.extras = {
                    'local_classification': leg_summary['Type'],
                    'full_text': text
                }
            else:
                bill.extras = {'local_classification': leg_summary['Type']}

            yield bill
Example #59
    def scrape_chamber(self, chamber, session):
        chamber_name = "Senate" if chamber == "upper" else "House"
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + "bill_status/").text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:

            bill_id = bill_data["BILLNO"]

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if "CR" in bill_id:
                btype = "concurrent resolution"
            elif "R" in bill_id:
                btype = "resolution"
            elif "B" in bill_id:
                btype = "bill"

            title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

            # main
            bill = Bill(bill_id,
                        session,
                        title,
                        chamber=chamber,
                        classification=btype)
            bill.extras = {"status": bill_data["STATUS"]}

            bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

            if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
                bill.add_title(bill_data["LONGTITLE"])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type="organization"
                    if "committee" in primary_sponsor.lower() else "person",
                    primary=True,
                    classification="original sponsor",
                )
            for sponsor in bill_data["SPONSOR_NAMES"]:
                if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="organization"
                    if "committee" in sponsor.lower() else "person",
                    primary=False,
                    classification="cosponsor",
                )

            # history is backwards
            for event in reversed(bill_data["HISTORY"]):
                actor = "upper" if event["chamber"] == "Senate" else "lower"

                date = event["session_date"]
                # append committee names if present
                if "committee_names" in event:
                    action = (event["status"] + " " +
                              " and ".join(event["committee_names"]))
                else:
                    action = event["status"]

                if event["action_code"] not in ksapi.action_codes:
                    self.warning(
                        "unknown action code on %s: %s %s" %
                        (bill_id, event["action_code"], event["status"]))
                    atype = None
                else:
                    atype = ksapi.action_codes[event["action_code"]]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions']`,
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill