def scrape_bill(self, chamber, session, bill_id, session_id):
    """
    Scrapes documents, actions, vote counts and votes for a given bill.
    """
    bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \
                    'legislativeBody={}'.format(bill_id, session_id, self.chamber_map[chamber])
    response = self.get(bill_json_url)
    # print(response.content)
    page = json.loads(response.content.decode('utf-8'))

    bill_title = page['ShortTitle']
    bill_id = page['Number']
    internal_id = page['BillId']
    bill_type = self.get_bill_type(bill_id)
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    bill = self.scrape_actions(bill, page, chamber)
    bill = self.scrape_versions(bill, internal_id)
    bill = self.scrape_sponsors(bill, internal_id)
    bill = self.scrape_subjects(bill, internal_id)

    bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
        internal_id, session_id)
    bill.add_source(bill_url)

    bill = self.sort_bill_actions(bill)

    yield bill

def test_fix_bill_id():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                         start_date='1900-04-02', classification='passage:bill',
                         result='fail', bill_chamber='lower', bill='HB1',
                         identifier='4', bill_action='passage',
                         organization=org1._id)

    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve.as_dict(),
    ])

    IMPORT_TRANSFORMERS['bill'] = {}

    ve = VoteEvent.objects.get()
    # the original was missing the assert, making this check a no-op
    assert ve.bill.identifier == 'HB 1'

def get_bill(self, bill_id, **kwargs):
    url = kwargs.pop('url')
    agenda_item = kwargs.pop('agenda_item')
    _type = self.get_type(bill_id)
    bill = Bill(bill_id, self.session, type=_type, **kwargs)
    bill.add_source(url, note='detail')
    return bill

def scrape_bill(self, chamber, session, bill_id, session_id):
    bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \
                    'legislativeBody={}'.format(bill_id, session_id, self.chamber_map[chamber])
    response = self.get(bill_json_url)
    page = json.loads(response.content.decode('utf-8'))

    if not page:
        self.warning('null page for %s', bill_id)
        return

    bill_title = page['ShortTitle']
    bill_id = page['Number']
    internal_id = page['BillId']
    bill_type = self.get_bill_type(bill_id)
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    self.scrape_actions(bill, page, chamber)
    self.scrape_versions_and_documents(bill, internal_id)
    self.scrape_sponsors(bill, internal_id)
    self.scrape_subjects(bill, internal_id)
    yield from self.scrape_votes(bill, page)

    bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
        internal_id, session_id)
    bill.add_source(bill_url)

    self.sort_bill_actions(bill)

    yield bill

def scrape(self):
    for i, page in enumerate(self.searchLegislation()):
        for legislation_summary in self.parseSearchResults(page):
            title = legislation_summary['Title'].strip()
            if title == "":
                continue

            if legislation_summary['Type'].lower() in ('order', 'claim',
                                                       'communication', 'report',
                                                       'oath of office'):
                continue
            else:
                bill_type = legislation_summary['Type'].lower()

            bill_session = self.session(legislation_summary['Intro\xa0Date'])

            bill = Bill(identifier=legislation_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization=self.jurisdiction.name)

            bill.add_source(legislation_summary['url'])

            bill, votes = self.addDetails(bill, legislation_summary['url'])

            yield bill

            for vote in votes:
                yield vote

def test_bill_sponsor_by_identifier():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass')
    zs.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"

def toy_bill():
    b = Bill(
        identifier="HB 2017",
        legislative_session="2012A",
        title="A bill for an act to raise the cookie budget by 200%",
        from_organization="Foo Senate",
        classification="bill",
    )
    b.add_source("http://uri.example.com/", note="foo")
    return b

def get_bill(self, bill_id, **kwargs):
    if bill_id == '1':
        assert kwargs == {'extra': 'param'}
        raise self.ContinueScraping
    else:
        assert bill_id == '2'
        assert kwargs == {}
        b = Bill('1', self.session, 'title')
        # fixed malformed scheme: the original read 'http;//example.com'
        b.add_source('http://example.com')
        return b

def scrape_bill(self, row, chamber, session):
    bill_id = row['LegislationNumber']

    # TODO: re-evaluate if these should be separate bills
    if 'SA' in bill_id or 'HA' in bill_id:
        self.warning('skipping amendment %s', bill_id)
        return

    bill_type = self.classify_bill(bill_id)

    bill = Bill(identifier=bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row['LongTitle'],
                classification=bill_type)

    if row['Synopsis']:
        bill.add_abstract(row['Synopsis'], 'synopsis')

    if row['ShortTitle']:
        bill.add_title(row['ShortTitle'], 'short title')

    if row['SponsorPersonId']:
        self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

    # TODO: Is there a way to get additional sponsors and cosponsors,
    # and versions/fns via the API?
    html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
        row['LegislationId']
    )
    bill.add_source(html_url, note='text/html')

    html = self.lxmlize(html_url)

    # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
    additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                     '/following-sibling::div/a/@href')
    for sponsor_url in additional_sponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

    # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
    # (the original code re-used the "Additional Sponsor(s)" label here,
    # contradicting its own comment; fixed to match the comment)
    cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                            'following-sibling::div/a/@href')
    for sponsor_url in cosponsors:
        sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                         'personId=', '')
        self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

    versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
    for version_url in versions:
        media_type = self.mime_from_link(version_url)
        version_name = 'Bill Text'
        # on_duplicate='error'
        bill.add_version_link(version_name, version_url, media_type=media_type)

    fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
    for fiscal in fiscals:
        self.scrape_fiscal_note(bill, fiscal)

    self.scrape_actions(bill, row['LegislationId'])
    yield from self.scrape_votes(bill, row['LegislationId'], session)

    yield bill

def test_bill_chamber_param():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')

    oi = OrganizationImporter('jid')
    BillImporter('jid', oi).import_data([bill.as_dict()])

    assert Bill.objects.get().from_organization_id == org.id

def scrape_bill(self, session, bill_id, chamber):
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)
    bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException as e:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text

    page = lxml.html.fromstring(html)
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=bill_title, classification='bill')

    bill_summary = None
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')

    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsorship(sponsor, classification='primary', primary=True,
                             entity_type='person')

    self.scrape_cosponsors(bill, bill_url)

    version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                         "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

    # yield back votes and bill
    yield from self.scrape_actions(bill, bill_url, session)
    yield bill

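# Quick illustration (not part of the scraper above) of what the bill_id
# cleanup regex does. Note that inside a character class the '|' characters
# are literal, so [^S|H|D|\d] keeps S, H, D, digits, and any literal '|'
# while stripping everything else (spaces, dots, etc.).
import re

assert re.sub(r'[^S|H|D|\d]', '', 'SD 2739') == 'SD2739'
assert re.sub(r'[^S|H|D|\d]', '', 'H.4100') == 'H4100'
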
def test_vote_event_bill_actions_two_stage():
    # this test is very similar to what we're testing in test_vote_event_bill_actions
    # w/ ve3 and ve4: two bills that reference the same action won't conflict w/ the
    # OneToOneField. In this case we do it in two stages, so that the conflict is
    # caught even when the votes weren't in the same scrape.
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id)
    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          bill_action='passage',
                          organization=org1._id)
    # disambiguate them
    ve1.pupa_id = 'one'
    ve2.pupa_id = 'two'

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    # first imports just fine
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 1
    assert votes[0].bill_action is not None

    # when the second is imported, ensure that the action stays pinned to the
    # first, just as it would have if they were both in the same import
    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
    ])
    votes = list(VoteEvent.objects.all())
    assert len(votes) == 2
    assert votes[0].bill_action is not None
    assert votes[1].bill_action is None

def get_bill(self, bill_id, **kwargs):
    url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id
    self.urls.add(detail=url)

    bill_id = kwargs.pop('number')
    bill = Bill(bill_id, self.session, kwargs['title'], 'butt', type=['bills'])
    bill.add_source(url, note='detail')

    xpath = '//table[contains(@class, "history")]/tr'
    for tr in self.urls.detail.xpath(xpath):
        import pdb
        pdb.set_trace()

    return bill

def scrape_bill(self, bill_page_url):
    bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

    title = bill_page.xpath('//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()')
    if title:
        title = title[0]
    else:
        self.warning('Missing bill title {}'.format(bill_page_url))
        return False

    bill_no = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()')
    if bill_no:
        bill_no = bill_no[0]
    else:
        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()')
        if bill_no:
            bill_no = bill_no[0]
        else:
            self.error('Missing bill number {}'.format(bill_page_url))
            return False

    bill = Bill(
        bill_no,
        legislative_session=self.session,
        chamber='legislature',
        title=title,
        classification='bill'
    )

    bill.add_source(bill_page_url)

    self.parse_versions(bill, bill_page, bill_no)
    self.parse_acts(bill, bill_page)

    sponsors = bill_page.xpath('//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()')
    if sponsors:
        self.assign_sponsors(bill, sponsors[0], 'primary')

    cosponsors = bill_page.xpath(
        '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()')
    if cosponsors:
        self.assign_sponsors(bill, cosponsors[0], 'cosponsor')

    self.parse_date_actions(bill, bill_page)
    self.parse_actions(bill, bill_page)

    yield bill

def scrape_bill_info(self, session, chambers):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    data = self.get(info_url)
    page = open_csv(data)

    chamber_map = {'H': 'lower', 'S': 'upper'}

    for row in page:
        bill_id = row['bill_num']
        chamber = chamber_map[bill_id[0]]

        if chamber not in chambers:
            continue

        # assert that the bill data is from this session, CT is tricky
        assert row['sess_year'] == session

        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    title=row['bill_title'],
                    classification=bill_type,
                    chamber=chamber)
        bill.add_source(info_url)

        for introducer in self._introducers[bill_id]:
            bill.add_sponsorship(name=str(introducer),
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')

        try:
            for subject in self._subjects[bill_id]:
                bill.subject.append(subject)

            self.bills[bill_id] = [bill, chamber]

            yield from self.scrape_bill_page(bill)
        except SkipBill:
            self.warning('no such bill: ' + bill_id)

def test_full_vote_event():
    j = Jurisdiction.objects.create(id='jid', division_id='did')
    j.legislative_sessions.create(name='1900', identifier='1900')
    sp1 = ScrapePerson('John Smith', primary_org='lower')
    sp2 = ScrapePerson('Adam Smith', primary_org='lower')
    org = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org._id)
    vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                 start_date='1900-04-01', classification='passage:bill',
                                 result='pass', bill_chamber='lower', bill='HB 1',
                                 organization=org._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp1.as_dict(), sp2.as_dict()])

    mi = MembershipImporter('jid', pi, oi, DumbMockImporter())
    mi.import_data([sp1._related[0].as_dict(), sp2._related[0].as_dict()])

    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])

    VoteEventImporter('jid', pi, oi, bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill == Bill.objects.get()
    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20

    votes = list(ve.votes.all())
    assert len(votes) == 2
    for v in ve.votes.all():
        if v.voter_name == 'John Smith':
            assert v.option == 'yes'
            assert v.voter == Person.objects.get(name='John Smith')
        else:
            assert v.option == 'no'
            assert v.voter == Person.objects.get(name='Adam Smith')

def parse_bill(self, chamber, session, special, link):
    bill_num = link.text.strip()
    type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

    if type_abbr == 'B':
        btype = ['bill']
    elif type_abbr == 'R':
        btype = ['resolution']

    bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

    url = utils.info_url(chamber, session, special, type_abbr, bill_num)
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    xpath = '/'.join([
        '//div[contains(@class, "BillInfo-ShortTitle")]',
        'div[@class="BillInfo-Section-Data"]',
    ])
    title = page.xpath(xpath).pop().text_content().strip()
    if not title:
        return

    bill = Bill(bill_id, legislative_session=session, title=title,
                chamber=chamber, classification=btype)
    bill.add_source(url)

    self.parse_bill_versions(bill, page)

    self.parse_history(bill, chamber,
                       utils.history_url(chamber, session, special,
                                         type_abbr, bill_num))

    # only fetch votes if votes were seen in history
    # if vote_count:
    yield from self.parse_votes(
        bill,
        utils.vote_url(chamber, session, special, type_abbr, bill_num),
    )

    # Dedupe sources.
    sources = bill.sources
    for source in sources:
        if 1 < sources.count(source):
            sources.remove(source)

    yield bill

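# The source-dedupe loop above mutates `sources` while iterating over it,
# which can skip elements when duplicates sit next to each other. A minimal
# order-preserving alternative (a sketch, not the scraper's actual code;
# pupa sources are dicts, so set() can't be used directly):
def dedupe_sources(sources):
    seen = []
    for source in sources:
        if source not in seen:  # dict equality comparison, no hashing needed
            seen.append(source)
    return seen
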
def _recursively_process_bills(self, request_session, chamber, session, first_item=1):
    '''
    Once a search has been initiated, this function will save a
    Bill object for every Paper from the given chamber
    '''
    url = 'http://legislature.maine.gov/LawMakerWeb/searchresults.asp'
    r = request_session.get(url, params={'StartWith': first_item})
    r.raise_for_status()

    bills = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
    if bills:
        for bill in bills:
            bill_id_slug = bill.xpath('./@href')[0]
            if bill_id_slug == 'summary.asp?ID=280068396':
                continue
            bill_url = 'http://legislature.maine.gov/LawMakerWeb/{}'.format(
                bill_id_slug)
            bill_id = bill.text[:2] + " " + bill.text[2:]

            if bill_id in BLACKLISTED_BILL_IDS[session]:
                continue

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title="",
                chamber=chamber,
            )
            bill.add_source(bill_url)

            yield from self.scrape_bill(bill, chamber)
            yield bill

        # Make a recursive call to this function, for the next page
        PAGE_SIZE = 25
        yield from self._recursively_process_bills(
            request_session=request_session,
            chamber=chamber,
            session=session,
            first_item=first_item + PAGE_SIZE)

def scrape_bill(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    try:
        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)
    except IndexError:
        if 'Unable to retrieve the requested information. Please try again' in html:
            self.warning('Soft error page, skipping.')
            return
        else:
            raise

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id, legislative_session=session, chamber=chamber, title=title,
        classification=_type)
    bill.add_source(url)

    # process sponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'

    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == 'primary',
            entity_type='person',
        )
        sponsor_type = 'cosponsor'

    # subjects
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill.subject = subject_list

    # documents
    self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

    yield bill

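# Illustration (not part of the scraper) of the sponsor split used above:
# the pattern breaks on ', ' and also swallows an optional 'and ' before
# the final name, so Oxford-comma lists come out clean.
import re

assert re.split(', (?:and )?', 'Smith, Jones, and Brown') == ['Smith', 'Jones', 'Brown']
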
def handle_page(self):
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for bill in bills:
        link = bill.getchildren()[0]
        bill_id = str(link.text_content())

        if not bill_id.startswith(("S", "H")):
            continue

        # create a bill
        desc = bill.xpath("text()")[0].strip()
        chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
        bill_type = {
            "B": "bill",
            "J": "joint resolution",
            "R": "resolution"
        }[bill_id[1]]
        bill = Bill(
            bill_id,
            self.kwargs["session"],
            desc,
            chamber=chamber,
            classification=bill_type,
        )

        bill_url = link.get("href")
        sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
            self.kwargs["session_id"],
            bill_id.replace(" ", ""))

        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)
        bill.subject = self.kwargs["subjects"][bill_id]
        bill.add_source(bill_url)
        yield bill

    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0],
                                          **self.kwargs)

def handle_list_item(self, item):
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib['href'] + '/ByCategory'

    if bill_id.startswith(('SB ', 'HB ', 'SPB ', 'HPB ')):
        bill_type = 'bill'
    elif bill_id.startswith(('HR ', 'SR ')):
        bill_type = 'resolution'
    elif bill_id.startswith(('HJR ', 'SJR ')):
        bill_type = 'joint resolution'
    elif bill_id.startswith(('SCR ', 'HCR ')):
        bill_type = 'concurrent resolution'
    elif bill_id.startswith(('SM ', 'HM ')):
        bill_type = 'memorial'
    else:
        raise ValueError('Failed to identify bill type.')

    bill = Bill(bill_id, self.kwargs['session'], title,
                chamber='lower' if bill_id[0] == 'H' else 'upper',
                classification=bill_type)
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
    bill.subject = list(self.kwargs['subjects'][subj_bill_id])

    sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
    for sp in sponsor.split(', '):
        bill.add_sponsorship(sp, 'primary', 'person', True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

    yield bill

def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.get(url).text
    page = unicode_csv_reader(StringIO(page), delimiter='|')

    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'
        }[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(bill_id, legislative_session=session,
                    chamber=chamber, title=row[3],
                    classification=bill_type)
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsorship(primary, classification='primary',
                                 entity_type='person', primary=True)

        # ftp://www.arkleg.state.ar.us/Bills/
        # TODO: Keep an eye on this post-2017 to see if they apply R going forward.
        session_code = '2017R' if session == '2017' else session

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (session_code, bill_id.replace(' ', '')))
        bill.add_version_link(bill_id, version_url, media_type='application/pdf')

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill

def scrape_bill(self, session, session_slug, chamber, url):
    page = lxml.html.fromstring(self.get(url).text)
    bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

    # bill data gets filled in from another call
    bill_data_base = (
        "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
        "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}")
    bill_data_url = bill_data_base.format(session_slug, internal_id,
                                          time.time() * 1000)

    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

    short_title = self.get_header_field(bill_page, "Summary:").text
    short_title = short_title.replace(u"\u00a0", " ")

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber,
    )

    long_title = self.get_header_field(bill_page, "Title:").text
    if long_title is not None:
        bill.add_abstract(long_title, "Summary")

    sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, "primary")

    cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, "cosponsor")

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras["BDR"] = bdr

    bill.extras["NV_ID"] = internal_id

    bill.add_source(url)
    yield bill

def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath(
            "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, title=title,
                    chamber=chamber, classification=bill_type)

        yield from self.scrape_digest(bill, chamber)

        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))

        bill.add_source(url)
        yield bill

def get_bill(self, bill_id, **kwargs):
    url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id
    self.urls.add(detail=url)

    bill_id = kwargs.pop('number')
    bill = Bill(bill_id, self.session, kwargs['title'], 'butt', type=['bills'])
    bill.add_source(url, note='detail')

    xpath = '//table[contains(@class, "history")]/tr'
    for tr in self.urls.detail.xpath(xpath):
        import pdb
        pdb.set_trace()

    return bill

def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
    page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter="|")

    for row in page:
        bill_chamber = {"H": "lower", "S": "upper"}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
        bill_type = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
            "MR": "memorial",
            "CMR": "concurrent memorial",
        }[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=row[3],
            classification=bill_type,
        )
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsorship(
                primary,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        version_url = (
            "ftp://www.arkleg.state.ar.us/Bills/"
            "%s/Public/Searchable/%s.pdf" % (self.slug, bill_id.replace(" ", ""))
        )
        bill.add_version_link(bill_id, version_url, media_type="application/pdf")

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill

def test_bill_type_setting():
    # default
    b = Bill(identifier="some bill", legislative_session="session",
             title="the title")
    assert b.classification == ["bill"]

    # string -> list
    b = Bill(identifier="some bill", legislative_session="session",
             title="the title", classification="string")
    assert b.classification == ["string"]

    # list unmodified
    b = Bill(identifier="some bill", legislative_session="session",
             title="the title", classification=["two", "items"])
    assert b.classification == ["two", "items"]

    # tuple -> list
    b = Bill(identifier="some bill", legislative_session="session",
             title="the title", classification=("two", "items"))
    assert b.classification == ["two", "items"]

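# A minimal sketch of the coercion behaviour the test above exercises -- an
# assumption about how Bill normalizes `classification`, not pupa's actual
# implementation:
def coerce_classification(value, default='bill'):
    if value is None:            # nothing given -> default
        return [default]
    if isinstance(value, str):   # string -> single-item list
        return [value]
    return list(value)           # list kept as-is, tuple -> list
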
def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
    page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter='|')

    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'
        }[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(bill_id, legislative_session=session,
                    chamber=chamber, title=row[3],
                    classification=bill_type)
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsorship(primary, classification='primary',
                                 entity_type='person', primary=True)

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/Searchable/%s.pdf" % (self.slug,
                                                        bill_id.replace(' ', '')))
        bill.add_version_link(bill_id, version_url, media_type='application/pdf')

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill

def test_bill_update():
    create_jurisdiction()
    create_org()
    bill = ScrapeBill('HB 1', '1900', 'First Bill')

    oi = OrganizationImporter('jid')

    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'insert'
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'noop'

    # ensure no new object was created
    assert Bill.objects.count() == 1

    # test basic update
    bill = ScrapeBill('HB 1', '1900', '1st Bill')
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'update'
    assert Bill.objects.get().title == '1st Bill'

def test_bill_update():
    create_jurisdiction()
    create_org()
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')

    oi = OrganizationImporter('jid')

    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'insert'
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'noop'

    # ensure no new object was created
    assert Bill.objects.count() == 1

    # test basic update
    bill = ScrapeBill('HB 1', '1900', '1st Bill', chamber='lower')
    _, what = BillImporter('jid', oi).import_item(bill.as_dict())
    assert what == 'update'
    assert Bill.objects.get().title == '1st Bill'

def test_bill_sponsor_limit_lookup():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    PersonImporter('another-jurisdiction').import_data([zs.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"

def test_bill_action_extras():
    create_jurisdiction()
    create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_action('sample', '1900-01-01', chamber='lower', extras={'test': 3})

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')
    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    b = Bill.objects.get()
    assert b.actions.all()[0].extras == {'test': 3}

def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill

def test_bill_sponsor_limit_lookup():
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    PersonImporter('another-jurisdiction').import_data([zs.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"

def scrape_bill(self, session, session_slug, chamber, url):
    page = lxml.html.fromstring(self.get(url).text)
    bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
    # state bill id
    internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)

    # bill data gets filled in from another call
    bill_data_base = 'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/' \
                     'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
    bill_data_url = bill_data_base.format(
        session_slug, internal_id, time.time() * 1000)

    bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

    short_title = self.get_header_field(bill_page, 'Summary:').text
    short_title = short_title.replace(u'\u00a0', ' ')

    bill = Bill(
        identifier=bill_no,
        legislative_session=session,
        title=short_title,
        chamber=chamber
    )

    long_title = self.get_header_field(bill_page, 'Title:').text
    if long_title is not None:
        bill.add_abstract(long_title, 'Summary')

    sponsor_div = self.get_header_field(bill_page, 'Primary Sponsor')
    if sponsor_div is not None:
        self.add_sponsors(sponsor_div, bill, 'primary')

    cosponsor_div = self.get_header_field(bill_page, 'Co-Sponsor')
    if cosponsor_div is not None:
        self.add_sponsors(cosponsor_div, bill, 'cosponsor')

    self.add_actions(bill_page, bill, chamber)
    self.add_versions(session_slug, internal_id, bill)

    bill.subject = list(set(self.subject_mapping[bill_no]))

    bdr = self.extract_bdr(short_title)
    if bdr:
        bill.extras['BDR'] = bdr

    bill.extras['NV_ID'] = internal_id

    bill.add_source(url)
    yield bill

def scrape_bill(self, session, chamber, bill_url):
    try:
        page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
    except scrapelib.HTTPError as e:
        if e.response.status_code == 503:
            self.error('Skipping %s w/ 503', bill_url)
            return
        else:
            raise

    bill_number = page.xpath('//div[contains(@class,"field-name-field-bill-number")]'
                             '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])')
    bill_summary = bill_summary.strip()

    bill = Bill(
        bill_number,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
    )
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')
    bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_amendments(bill, page)

    yield bill
    yield from self.scrape_votes(bill, page)

def handle_page(self):
    bills = self.doc.xpath('//ul[@class="linkSect"]/li')
    for bill in bills:
        link = bill.getchildren()[0]
        bill_id = str(link.text_content())

        if not bill_id.startswith(('S', 'H')):
            continue

        # create a bill
        desc = bill.xpath('text()')[0].strip()
        chamber = {
            'H': 'lower',
            'S': 'upper',
        }[bill_id[0]]
        bill_type = {
            'B': 'bill',
            'J': 'joint resolution',
            'R': 'resolution'
        }[bill_id[1]]
        bill = Bill(bill_id, self.kwargs['session'], desc,
                    chamber=chamber, classification=bill_type)

        bill_url = link.get('href')
        sponsor_url = BASE_URL + URL_PATTERNS['sponsors'].format(
            self.kwargs['session_id'],
            bill_id.replace(' ', ''),
        )

        list(self.scrape_page_items(BillSponsorPage, url=sponsor_url, obj=bill))
        yield from self.scrape_page_items(BillDetailPage, url=bill_url, obj=bill)
        bill.subject = self.kwargs['subjects'][bill_id]
        bill.add_source(bill_url)
        yield bill

    next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
    if next_url:
        yield from self.scrape_page_items(BillListPage, url=next_url[0],
                                          **self.kwargs)

def test_no_whitespace_in_uri():
    b = Bill(identifier="HB 2017",
             legislative_session="2012A",
             title="A bill for an act to raise the cookie budget by 200%",
             from_organization="Foo Senate",
             classification="bill")
    b.add_source("http://uri.example.com/fail here", note="foo")

    with pytest.raises(ScrapeValueError):
        b.validate()

def scrape_bill(self, chamber, session, url):
    html = self.get(url).content
    page = lxml.html.fromstring(html)
    page.make_links_absolute(self.BASE_URL)

    if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
        bill_id = page.xpath(
            '//h2[@style="font-size:1.3rem;"]/a[1]/text()')[0].strip()
    elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
        bill_id = page.xpath(
            '//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
    else:
        self.warning("No bill id for {}".format(url))
        return

    title = page.xpath(
        '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
    )[0].strip()

    if "B" in bill_id:
        _type = ["bill"]
    elif "J" in bill_id:
        _type = ["joint resolution"]
    elif "HS" in bill_id:
        _type = ["resolution"]
    else:
        raise ValueError("unknown bill type " + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=_type,
    )
    bill.add_source(url)

    self.scrape_bill_subjects(bill, page)
    self.scrape_bill_sponsors(bill, page)
    self.scrape_bill_actions(bill, page)

    # fiscal note
    if page.xpath('//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
        fiscal_note = page.xpath(
            '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a')[0]
        fiscal_url = fiscal_note.get("href")
        fiscal_title = fiscal_note.text_content()
        bill.add_document_link(
            fiscal_title,
            fiscal_url,
            media_type="application/pdf",
        )

    # yield from self.parse_bill_votes_new(doc, bill)

    yield bill

def scrape_bill(self, chamber, session, bill_id, session_id):
    bill_json_url = (
        "https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&"
        "legislativeBody={}".format(bill_id, session_id, self.chamber_map[chamber]))
    response = self.get(bill_json_url)
    page = json.loads(response.content.decode("utf-8"))

    if not page:
        self.warning("null page for %s", bill_id)
        return

    bill_title = page["ShortTitle"]
    bill_id = page["Number"]
    internal_id = page["BillId"]
    bill_type = self.get_bill_type(bill_id)
    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type,
    )

    self.scrape_actions(bill, page, chamber)
    self.scrape_versions_and_documents(bill, internal_id)
    self.scrape_sponsors(bill, internal_id)
    self.scrape_subjects(bill, internal_id)
    yield from self.scrape_votes(bill, page)

    bill_url = "https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}".format(
        internal_id, session_id)
    bill.add_source(bill_url)

    bill.actions = sorted(bill.actions, key=lambda action: action["date"])

    yield bill

def _recursively_process_bills(
        self, request_session, chamber, session, first_item=1):
    '''
    Once a search has been initiated, this function will save a
    Bill object for every Paper from the given chamber
    '''
    url = 'http://legislature.maine.gov/LawMakerWeb/searchresults.asp'
    r = request_session.get(url, params={'StartWith': first_item})
    r.raise_for_status()

    bills = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
    if bills:
        for bill in bills:
            bill_id_slug = bill.xpath('./@href')[0]
            bill_url = 'http://legislature.maine.gov/LawMakerWeb/{}'.format(bill_id_slug)
            bill_id = bill.text[:2] + " " + bill.text[2:]

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title="",
                chamber=chamber,
            )
            bill.add_source(bill_url)

            yield from self.scrape_bill(bill, chamber)
            yield bill

        # Make a recursive call to this function, for the next page
        PAGE_SIZE = 25
        yield from self._recursively_process_bills(
            request_session=request_session,
            chamber=chamber,
            session=session,
            first_item=first_item + PAGE_SIZE
        )

def scrape_bill_list(self, chamber, session, url):
    if 'joint_resolution' in url:
        bill_type = 'joint resolution'
    elif 'resolution' in url:
        bill_type = 'resolution'
    elif 'bill' in url:
        bill_type = 'bill'

    try:
        data = self.get(url).text
    except scrapelib.HTTPError:
        self.warning('skipping URL %s' % url)
        return

    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
    for b in bill_list:
        bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
        bill_id = bill_url.rsplit('/', 1)[-1]
        bill_id = bill_id.upper()

        title = b.xpath(
            './div[@class="span6"]/text()'
        )[0].replace(' - Relating to: ', '').strip()

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=bill_type,
        )
        bill.subject = list(set(self.subjects[bill_id]))
        yield from self.scrape_bill_history(bill, bill_url, chamber)

        yield bill

def scrape_bill_list(self, chamber, session, url):
    if 'joint_resolution' in url:
        bill_type = 'joint resolution'
    elif 'resolution' in url:
        bill_type = 'resolution'
    elif 'bill' in url:
        bill_type = 'bill'

    try:
        data = self.get(url).text
    except scrapelib.HTTPError:
        self.warning('skipping URL %s' % url)
        return

    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    bill_list = doc.xpath(
        '//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
    for b in bill_list:
        bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
        bill_id = bill_url.rsplit('/', 1)[-1]
        bill_id = bill_id.upper()

        title = b.xpath('./div[@class="span6"]/text()')[0].replace(
            ' - Relating to: ', '').strip()

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=bill_type,
        )
        bill.subject = list(set(self.subjects[bill_id]))
        yield from self.scrape_bill_history(bill, bill_url, chamber)

        yield bill

def test_fix_bill_id():
    create_jurisdiction()
    create_org()
    bill = ScrapeBill('HB1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)
    }

    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])

    IMPORT_TRANSFORMERS['bill'] = {}

    b = Bill.objects.get()
    assert b.identifier == 'HB 1'

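# Standalone demonstration (illustration only, outside the test) of what the
# identifier transformer above does: it re-inserts the missing space between
# prefix and number and strips zero padding.
import re

def fix_bill_id(x):
    return re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, 1)

assert fix_bill_id('HB1') == 'HB 1'
assert fix_bill_id('SB 0004') == 'SB 4'
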
def test_from_organization():
    # none set
    assert (get_pseudo_id(Bill('HB 1', '2014', 'Some Bill').from_organization) ==
            {'classification': 'legislature'})
    # chamber set
    assert (get_pseudo_id(Bill('SB 1', '2014', 'Some Bill',
                               chamber='upper').from_organization) ==
            {'classification': 'upper'})
    # org direct set
    assert (Bill('HB 1', '2014', 'Some Bill',
                 from_organization='test').from_organization == 'test')

    # can't set both
    with pytest.raises(ValueError):
        Bill('HB 1', '2014', 'Some Bill', from_organization='upper', chamber='upper')

def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, title=title,
                    chamber=chamber, classification=bill_type)

        yield from self.scrape_digest(bill, chamber)

        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))

        bill.add_source(url)
        yield bill

def scrape_bill(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
    # TODO: grab summary (none present at time of writing)

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']
    else:
        raise ValueError('unknown bill type ' + bill_id)

    bill = Bill(
        bill_id, legislative_session=session, chamber=chamber, title=title,
        classification=_type)
    bill.add_source(url)

    # process sponsors
    sponsors = _get_td(doc, 'All Sponsors:').text_content()
    sponsors = sponsors.replace('Delegates ', '')
    sponsors = sponsors.replace('Delegate ', '')
    sponsors = sponsors.replace('Senator ', '')
    sponsors = sponsors.replace('Senators ', '')
    sponsor_type = 'primary'

    for sponsor in re.split(', (?:and )?', sponsors):
        sponsor = sponsor.strip()
        if not sponsor:
            continue
        bill.add_sponsorship(
            sponsor,
            sponsor_type,
            primary=sponsor_type == 'primary',
            entity_type='person',
        )
        sponsor_type = 'cosponsor'

    # subjects
    subject_list = []
    for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
        subjects = _get_td(doc, heading).xpath('a/text()')
        subject_list += [s.split(' -see also-')[0] for s in subjects if s]
    bill.subject = subject_list

    # documents
    yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
    # actions
    self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

    yield bill

def scrape_bill_info(self, session, chambers):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    data = self.get(info_url)
    page = open_csv(data)

    chamber_map = {"H": "lower", "S": "upper"}

    for row in page:
        bill_id = row["bill_num"]
        chamber = chamber_map[bill_id[0]]

        if chamber not in chambers:
            continue

        if re.match(r"^(S|H)J", bill_id):
            bill_type = "joint resolution"
        elif re.match(r"^(S|H)R", bill_id):
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            title=row["bill_title"],
            classification=bill_type,
            chamber=chamber,
        )
        bill.add_source(info_url)

        for introducer in self._introducers[bill_id]:
            introducer = string.capwords(
                introducer.decode("utf-8").replace("Rep. ", "").replace("Sen. ", ""))
            if "Dist." in introducer:
                introducer = " ".join(introducer.split()[:-2])
            bill.add_sponsorship(
                name=introducer,
                classification="primary",
                primary=True,
                entity_type="person",
            )

        try:
            for subject in self._subjects[bill_id]:
                bill.subject.append(subject)

            self.bills[bill_id] = [bill, chamber]

            yield from self.scrape_bill_page(bill)
        except SkipBill:
            self.warning("no such bill: " + bill_id)

def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.get(url).text
    page = unicode_csv_reader(StringIO(page), delimiter='|')

    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'}[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(bill_id, legislative_session=session,
                    chamber=chamber, title=row[3],
                    classification=bill_type)
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsorship(primary, classification='primary',
                                 entity_type='person', primary=True)

        # ftp://www.arkleg.state.ar.us/Bills/
        # TODO: Keep an eye on this post-2017 to see if they apply R going forward.
        session_code = '2017R' if session == '2017' else session

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (
                           session_code, bill_id.replace(' ', '')))
        bill.add_version_link(bill_id, version_url, media_type='application/pdf')

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill

def scrape_bill(self, chamber, session):
    url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
    page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter='|')

    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue

        bill_id = "%s%s %s" % (row[0], row[1], row[2])

        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'}[type_spec]

        if row[-1] != self.slug:
            continue

        bill = Bill(bill_id, legislative_session=session,
                    chamber=chamber, title=row[3],
                    classification=bill_type)
        bill.add_source(url)

        primary = row[11]
        if not primary:
            primary = row[12]
        if primary:
            bill.add_sponsorship(primary, classification='primary',
                                 entity_type='person', primary=True)

        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/Searchable/%s.pdf" % (
                           self.slug, bill_id.replace(' ', '')))
        bill.add_version_link(bill_id, version_url, media_type='application/pdf')

        yield from self.scrape_bill_page(bill)
        self.bills[bill_id] = bill

def scrape_bill_info(self, session, chambers):
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    data = self.get(info_url)
    page = open_csv(data)

    chamber_map = {'H': 'lower', 'S': 'upper'}

    for row in page:
        bill_id = row['bill_num']
        chamber = chamber_map[bill_id[0]]

        if chamber not in chambers:
            continue

        # assert that the bill data is from this session, CT is tricky
        assert row['sess_year'] == session

        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    title=row['bill_title'],
                    classification=bill_type,
                    chamber=chamber)
        bill.add_source(info_url)

        for introducer in self._introducers[bill_id]:
            bill.add_sponsorship(name=introducer.decode('utf-8'),
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')

        try:
            for subject in self._subjects[bill_id]:
                bill.subject.append(subject)

            self.bills[bill_id] = [bill, chamber]

            yield from self.scrape_bill_page(bill)
        except SkipBill:
            self.warning('no such bill: ' + bill_id)

def createBill(self, agenda_item):
    title = agenda_item['Title'].replace('\n', ' ')
    title, primary_role, primary_sponsor, secondary_role, secondary_sponsor = re.match(
        agenda_item_title_re, title).groups()

    bill = {
        'identifier': agenda_item['Item No.'],
        'title': title,
        'legislative_session': agenda_item['session'],
        # TODO: Add agenda_item type to OCD
        'classification': 'bill',
        'from_organization': {'name': self.jurisdiction.name},
    }

    b = Bill(**bill)
    b.add_source(agenda_item['url'], note='web')

    if primary_sponsor and secondary_sponsor:
        b.add_sponsorship(primary_sponsor, 'mover', 'person', True)
        b.add_sponsorship(secondary_sponsor, 'seconder', 'person', False)

    return b

def handle_list_item(self, item):
    bill_id = item.text.strip()
    title = item.xpath("string(../following-sibling::td[1])").strip()
    sponsor = item.xpath("string(../following-sibling::td[2])").strip()
    bill_url = item.attrib["href"] + "/ByCategory"

    if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
        bill_type = "bill"
    elif bill_id.startswith(("HR ", "SR ")):
        bill_type = "resolution"
    elif bill_id.startswith(("HJR ", "SJR ")):
        bill_type = "joint resolution"
    elif bill_id.startswith(("SCR ", "HCR ")):
        bill_type = "concurrent resolution"
    elif bill_id.startswith(("SM ", "HM ")):
        bill_type = "memorial"
    else:
        raise ValueError("Failed to identify bill type.")

    bill = Bill(
        bill_id,
        self.kwargs["session"],
        title,
        chamber="lower" if bill_id[0] == "H" else "upper",
        classification=bill_type,
    )
    bill.add_source(bill_url)

    # normalize id from HB 0004 to H4
    subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
    bill.subject = list(self.kwargs["subjects"][subj_bill_id])

    sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
    for sp in sponsor.split(", "):
        sp = sp.strip()
        bill.add_sponsorship(sp, "primary", "person", True)

    yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

    yield bill

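# Standalone check (illustration only) of the id normalization used above:
# the pattern keeps the chamber letter, drops the rest of the prefix, and
# strips zero padding, matching the "HB 0004 to H4" comment.
import re

assert re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', 'HB 0004') == 'H4'
assert re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', 'SJR 1') == 'S1'
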
def scrape_bill_2012(self, chamber, session, bill_id, url):
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # find <a name="Title">, get parent dt, get parent dl, then dd in dl
    title = doc.xpath(
        '//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

    summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

    if 'B' in bill_id:
        _type = ['bill']
    elif 'J' in bill_id:
        _type = ['joint resolution']

    bill = Bill(
        bill_id,
        legislative_session=session,
        classification=_type,
        chamber=chamber,
        title=title,
    )
    bill.add_abstract(summary, note='summary')
    bill.add_source(url)

    self.parse_bill_sponsors(doc, bill)           # sponsors
    self.parse_bill_actions(doc, bill)            # actions
    self.parse_bill_documents(doc, bill)          # documents and versions
    yield from self.parse_bill_votes(doc, bill)   # votes

    # subjects
    subjects = []
    for subj in doc.xpath('//a[contains(@href, "/subjects/")]'):
        subjects.append(subj.text.split('-see also-')[0])
    bill.subject = subjects

    # add bill to collection
    self.save_bill(bill)

def scrape_bill(self, session, chamber, bill_url):
    try:
        page = self.lxmlize('{}{}'.format(CO_URL_BASE, bill_url))
    except scrapelib.HTTPError as e:
        if e.response.status_code == 503:
            self.error('Skipping %s w/ 503', bill_url)
            return
        else:
            raise

    bill_number = page.xpath(
        '//div[contains(@class,"field-name-field-bill-number")]'
        '//div[contains(@class,"field-item even")][1]/text()')[0].strip()

    bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

    bill_summary = page.xpath(
        'string(//div[contains(@class,"field-name-field-bill-summary")])')
    bill_summary = bill_summary.strip()

    bill = Bill(
        bill_number,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
    )
    if bill_summary:
        bill.add_abstract(bill_summary, 'summary')
    bill.add_source('{}{}'.format(CO_URL_BASE, bill_url))

    self.scrape_sponsors(bill, page)
    self.scrape_actions(bill, page)
    self.scrape_versions(bill, page)
    self.scrape_research_notes(bill, page)
    self.scrape_fiscal_notes(bill, page)
    self.scrape_committee_report(bill, page)
    self.scrape_amendments(bill, page)

    yield bill
    yield from self.scrape_votes(bill, page)

def scrape_bill(self, session, chamber, url):
    html = self.get(url).content
    page = lxml.html.fromstring(html)

    title = page.xpath('//div[@id="main_0_header"]//h1/text()')[0].strip()

    parsed = urlparse.urlparse(url)
    bill_id = urlparse.parse_qs(parsed.query)['bId'][0]

    portfolio = self.dd(page, 'Portfolio')
    orig_house = self.dd(page, 'Originating house')

    print(bill_id, title, portfolio, orig_house)

    bill_chamber = self.CHAMBERS[orig_house]

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=title,
                classification='bill')

    sponsor = self.dd(page, 'Sponsor(s)')
    if sponsor:
        bill.add_sponsorship(name=sponsor,
                             classification="Primary",
                             entity_type="person",
                             primary=True)

    self.scrape_bill_actions(page, bill)
    self.scrape_bill_versions(page, bill)
    self.scrape_bill_documents(page, bill)

    bill.add_source(url)
    yield bill

def scrape(self):
    for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)):
        leg_type = BILL_TYPES[leg_summary['Type']]

        bill = Bill(identifier=leg_summary['File\xa0#'],
                    title=leg_summary['Title'],
                    legislative_session=None,
                    classification=leg_type,
                    from_organization={"name": "New York City Council"})
        bill.add_source(leg_summary['url'])

        leg_details = self.legDetails(leg_summary['url'])
        history = self.history(leg_summary['url'])

        bill.add_title(leg_details['Name'],
                       note='created by administrative staff')

        if 'Summary' in leg_details:
            bill.add_abstract(leg_details['Summary'], note='')

        if leg_details['Law number']:
            bill.add_identifier(leg_details['Law number'],
                                note='law number')

        for sponsorship in self._sponsors(leg_details.get('Sponsors', [])):
            sponsor, sponsorship_type, primary = sponsorship
            bill.add_sponsorship(sponsor, sponsorship_type,
                                 'person', primary,
                                 entity_id=_make_pseudo_id(name=sponsor))

        for attachment in leg_details.get('Attachments', []):
            bill.add_document_link(attachment['label'],
                                   attachment['url'],
                                   media_type="application/pdf")

        history = list(history)

        if history:
            earliest_action = min(self.toTime(action['Date'])
                                  for action in history)
            bill.legislative_session = self.sessions(earliest_action)
        else:
            bill.legislative_session = str(self.SESSION_STARTS[0])

        for action in history:
            action_description = action['Action']
            if not action_description:
                continue

            action_class = ACTION_CLASSIFICATION[action_description]

            action_date = self.toDate(action['Date'])
            responsible_org = action['Action\xa0By']
            if responsible_org == 'City Council':
                responsible_org = 'New York City Council'
            elif responsible_org == 'Administration':
                responsible_org = 'Mayor'

            if responsible_org == 'Town Hall Meeting':
                continue
            else:
                act = bill.add_action(action_description,
                                      action_date,
                                      organization={'name': responsible_org},
                                      classification=action_class)

                if 'url' in action['Action\xa0Details']:
                    action_detail_url = action['Action\xa0Details']['url']
                    if action_class == 'committee-referral':
                        action_details = self.actionDetails(action_detail_url)
                        referred_committee = action_details[
                            'Action text'].rsplit(' to the ', 1)[-1]
                        act.add_related_entity(
                            referred_committee,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_committee))

                    result, votes = self.extractVotes(action_detail_url)
                    if votes:
                        action_vote = VoteEvent(
                            legislative_session=bill.legislative_session,
                            motion_text=action_description,
                            organization={'name': responsible_org},
                            classification=action_class,
                            start_date=action_date,
                            result=result,
                            bill=bill)
                        action_vote.add_source(action_detail_url)

                        for option, voter in votes:
                            action_vote.vote(option, voter)

                        yield action_vote

        text = self.text(leg_summary['url'])

        if text:
            bill.extras = {'local_classification': leg_summary['Type'],
                           'full_text': text}
        else:
            bill.extras = {'local_classification': leg_summary['Type']}

        yield bill

def scrape_chamber(self, chamber, session):
    chamber_name = "Senate" if chamber == "upper" else "House"
    chamber_letter = chamber_name[0]

    # perhaps we should save this data so we can make one request for both?
    bill_request = self.get(ksapi.url + "bill_status/").text
    bill_request_json = json.loads(bill_request)
    bills = bill_request_json["content"]

    for bill_data in bills:
        bill_id = bill_data["BILLNO"]

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        if "CR" in bill_id:
            btype = "concurrent resolution"
        elif "R" in bill_id:
            btype = "resolution"
        elif "B" in bill_id:
            btype = "bill"

        title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

        # main
        bill = Bill(bill_id, session, title,
                    chamber=chamber, classification=btype)
        bill.extras = {"status": bill_data["STATUS"]}
        bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

        if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
            bill.add_title(bill_data["LONGTITLE"])

        # An "original sponsor" is the API's expression of "primary sponsor"
        for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
            bill.add_sponsorship(
                name=primary_sponsor,
                entity_type="organization"
                if "committee" in primary_sponsor.lower() else "person",
                primary=True,
                classification="original sponsor",
            )
        for sponsor in bill_data["SPONSOR_NAMES"]:
            if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                continue
            bill.add_sponsorship(
                name=sponsor,
                entity_type="organization"
                if "committee" in sponsor.lower() else "person",
                primary=False,
                classification="cosponsor",
            )

        # history is backwards
        for event in reversed(bill_data["HISTORY"]):
            actor = "upper" if event["chamber"] == "Senate" else "lower"

            date = event["session_date"]
            # append committee names if present
            if "committee_names" in event:
                action = (event["status"] + " " +
                          " and ".join(event["committee_names"]))
            else:
                action = event["status"]

            if event["action_code"] not in ksapi.action_codes:
                self.warning(
                    "unknown action code on %s: %s %s"
                    % (bill_id, event["action_code"], event["status"]))
                atype = None
            else:
                atype = ksapi.action_codes[event["action_code"]]
            bill.add_action(action, date, chamber=actor, classification=atype)

        # Versions are exposed in `bill_data['versions']`,
        # but lack any descriptive text or identifiers;
        # continue to scrape these from the HTML
        yield from self.scrape_html(bill, session)

        yield bill