def scrape_bill(self, chamber, session, bill_id):
    """Fetch one WA bill from the legislature's XML service and save it."""
    # session like "2011-2012" -> biennium "2011-12"
    biennium = "%s-%s" % (session[0:4], session[7:9])
    bill_num = bill_id.split()[1]
    url = "%s/GetLegislation?biennium=%s&billNumber=%s" % (
        self._base_url, biennium, bill_num)
    with self.urlopen(url) as page:
        page = lxml.etree.fromstring(page).xpath(
            "//wa:Legislation", namespaces=self._ns)[0]
        title = page.xpath("string(wa:LongDescription)", namespaces=self._ns)
        bill_type = page.xpath(
            "string(wa:ShortLegislationType/wa:LongLegislationType)",
            namespaces=self._ns).lower()
        # gubernatorial appointments are not bills; skip them
        if bill_type == "gubernatorial appointment":
            return
        bill = Bill(session, chamber, bill_id, title, type=[bill_type])
        sponsor = page.xpath(
            "string(wa:Sponsor)", namespaces=self._ns).strip("() \t\r\n")
        bill.add_sponsor("sponsor", sponsor)
        chamber_name = {"lower": "House", "upper": "Senate"}[chamber]
        # NOTE(review): biennium appears hard-coded to 2011-12 in this URL
        version_url = ("http://www.leg.wa.gov/pub/billinfo/2011-12/"
                       "Htm/Bills/%s %ss/%s.htm" % (
                           chamber_name, bill_type.title(), bill_num))
        bill.add_version(bill_id, version_url)
        self.scrape_actions(bill)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape a single Puerto Rico bill page."""
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()
        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        bill.add_sponsor('primary', author.strip())
        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            cells = row.xpath('td')
            if len(cells) != 2:
                # skip rows that are missing a date cell
                continue
            when = datetime.datetime.strptime(
                cells[0].text_content(), "%m/%d/%Y")
            action = cells[1].text_content()
            bill.add_action(chamber, action, when)
            # a link in the action cell is an associated bill version
            if cells[1].xpath('a'):
                bill.add_version(action, cells[1].xpath('a/@href')[0])
        bill.add_source(url)
        self.save_bill(bill)
def _parse_bill(self, session, chamber, source_url, line):
    """Parse one '\\xe4'-delimited line of the OR bill feed into a Bill.

    Only processes lines whose bill type matches the requested chamber
    (HB -> lower, SB -> upper).
    """
    if line:
        (type, combined_id, number, title, relating_to) = line.split("\xe4")
        if (type == 'HB' and chamber == 'lower') or \
           (type == 'SB' and chamber == 'upper'):
            # basic bill info; bill numbers are zero-padded to 4 digits
            bill_id = "%s %s" % (type, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            # add actions
            # FIX: dict.has_key() is deprecated (removed in Python 3);
            # use the `in` operator instead
            if bill_id in self.actionsByBill:
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versionsSponsors = self.versionsSponsorsParser.fetch_and_parse(
                    self, session, bill_id)
                if versionsSponsors:
                    for ver in versionsSponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])
                    # a lone sponsor is 'primary'; multiple are cosponsors
                    sponsorType = 'primary'
                    if len(versionsSponsors['sponsors']) > 1:
                        sponsorType = 'cosponsor'
                    for name in versionsSponsors['sponsors']:
                        bill.add_sponsor(sponsorType, name)

            # save - writes out JSON
            self.save_bill(bill)
def scrape_bill_info(self, chamber, session):
    """Read CT's bill_info.csv and build Bill objects for one chamber."""
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    page = self.urlopen(info_url)
    page = csv.DictReader(StringIO.StringIO(page))
    abbrev = {'upper': 'S', 'lower': 'H'}[chamber]
    for row in page:
        bill_id = row['bill_num']
        # skip bills belonging to the other chamber
        if not bill_id[0] == abbrev:
            continue
        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'
        bill = Bill(session, chamber, bill_id,
                    row['bill_title'].decode('latin-1'),
                    type=bill_type)
        bill.add_source(info_url)
        self.scrape_bill_page(bill)
        for introducer in self._introducers[bill_id]:
            bill.add_sponsor('introducer', introducer)
        bill['subjects'] = self._subjects[bill_id]
        self.bills[bill_id] = bill
def get_bill_information(self, bill_id, chamber, session):
    """POST for a single bill's info page and return a populated Bill."""
    with self.urlopen(BILL_INFO_URL, 'POST',
                      body="hListBills=" + bill_id) as bill_info_page:
        self.log("Got bill info")
        page = lxml.html.fromstring(bill_info_page)
        # TODO: check whether page is error page and raise custom exception
        # defined above
        for bold in page.xpath('//div/b'):
            parent_div = bold.getparent()
            if bold.text == "BY":
                names = parent_div.text_content().strip(u'BY\xa0').split(',')
                sponsors = [x.strip(' ') for x in names]
            if bold.text.strip(u',\xa0') == "ENTITLED":
                title = parent_div.text_content().lstrip(u'ENTITLED,\xa0')
        # classify the bill by matching its text against known patterns
        bill_type = ""
        for div in page.xpath('//div'):
            text = div.text_content()
            for ind, reg in enumerate(self.type_regs):
                if reg.match(text):
                    bill_type = self.bill_types[ind]
        # NOTE(review): `title`/`sponsors` are unbound if the markers above
        # are never found — presumably the page always has them; confirm
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        for ind, sponsor in enumerate(sponsors):
            # first listed sponsor is primary, the rest are cosponsors
            if ind == 0:
                bill.add_sponsor('primary', sponsor)
            else:
                bill.add_sponsor('cosponsor', sponsor)
        return bill
def get_bill_information(self, bill_id, chamber, session):
    """Fetch one bill's info page via POST and build a Bill from it."""
    with self.urlopen(BILL_INFO_URL, 'POST',
                      body="hListBills=" + bill_id) as bill_info_page:
        self.log("Got bill info")
        page = lxml.html.fromstring(bill_info_page)
        # TODO: check whether page is error page and raise custom exception
        # defined above
        bold_tags = page.xpath('//div/b')
        for tag in bold_tags:
            enclosing = tag.getparent()
            # sponsor list follows a bold "BY" marker
            if tag.text == "BY":
                raw = enclosing.text_content().strip(u'BY\xa0').split(',')
                sponsors = [x.strip(' ') for x in raw]
            # title follows a bold "ENTITLED" marker
            if tag.text.strip(u',\xa0') == "ENTITLED":
                title = enclosing.text_content().lstrip(u'ENTITLED,\xa0')
        bill_type = ""
        for div in page.xpath('//div'):
            text = div.text_content()
            for ind, reg in enumerate(self.type_regs):
                if reg.match(text):
                    bill_type = self.bill_types[ind]
        # NOTE(review): assumes the BY/ENTITLED markers always exist;
        # otherwise `title`/`sponsors` raise NameError — confirm
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        for ind, sponsor in enumerate(sponsors):
            bill.add_sponsor('primary' if ind == 0 else 'cosponsor', sponsor)
        return bill
def scrape2009(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Bill
        name = page.cssselect('#legislation h1')[0].text_content().strip()
        bill_id = name.split(' - ')[0].strip()
        bill = Bill(session, chamberName, bill_id, name)

        # Sponsorships
        for a in page.cssselect("#sponsors a"):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in page.cssselect('#history tr')[1:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            date = datetime.datetime.strptime(date, '%m/%d/%Y')
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        # FIX: the loop variable is `row` but the body referenced `a`
        # (stale from the sponsors loop), attaching the wrong link/name
        # to every version
        for row in page.cssselect('#versions a'):
            bill.add_version(row.text_content(),
                             urlparse.urljoin(url, row.get('href')))

        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, bill_url):
    """Scrape one Hawaii bill status page, its actions, votes, and version."""
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)
        # split "SB1 SD2 HD2" to get SB1
        bill_id = page.xpath('//a[@class="headerlink"]')[0].text.split()[0]
        table = page.xpath('//table[@cellspacing="4px"]')[0]
        title = get_table_text(table, "Measure Title")
        subjects = get_table_text(table, "Report Title").split('; ')
        description = get_table_text(table, "Description")
        sponsors = get_table_text(table, "Introducer(s)")
        bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                    type=bill_type, description=description)
        for sponsor in sponsors.split(', '):
            # strip the "(BR)" by-request marker
            if sponsor.endswith(' (BR)'):
                sponsor = sponsor[:-5]
            bill.add_sponsor('primary', sponsor)

        # actions: collect first, then record (votes are parsed per action)
        actions = []
        grid = page.xpath('//table[contains(@id, "GridView1")]')[0]
        for row in grid.xpath('tr'):
            cells = row.xpath('td')
            if len(cells) == 3:
                entry = {}
                ch = cells[1].xpath('font')[0].text
                entry['actor'] = house[ch]
                entry['action'] = cells[2].xpath('font')[0].text
                action_date = cells[0].xpath('font')[0].text
                entry['date'] = datetime.strptime(action_date, "%m/%d/%Y")
                entry['type'] = categorize_action(entry['action'])
                actions.append(entry)
        for action_params in actions:
            bill.add_action(**action_params)
            self.parse_vote(bill, action_params['action'],
                            action_params['actor'], action_params['date'])

        # Add version document if not on a javascript link.
        try:
            bill_version = page.xpath(
                '//a[contains(@id, "HyperLinkPDF")]')[0].attrib['href']
            bill.add_version('Current version', bill_version)
        except IndexError:
            # href not found.
            pass

        bill.add_source(bill_url)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, doc_type, url):
    """Scrape one Illinois bill: metadata, sponsors, actions, versions, votes."""
    doc = self.url_to_doc(url)

    # bill id, title, synopsis
    bill_num = re.findall('DocNum=(\d+)', url)[0]
    bill_type = DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num
    title = doc.xpath(
        '//span[text()="Short Description:"]/following-sibling::span[1]/text()'
    )[0].strip()
    synopsis = doc.xpath(
        '//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()'
    )[0].strip()
    bill = Bill(session, chamber, bill_id, title,
                type=bill_type, synopsis=synopsis)
    bill.add_source(url)

    # sponsors — don't add just yet; we can make them better using action data
    sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))

    # actions
    action_tds = doc.xpath(
        '//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action in group(action_tds, 3):
        date = datetime.datetime.strptime(
            date.text_content().strip(), "%m/%d/%Y")
        actor = actor.text_content()
        if actor == 'House':
            actor = 'lower'
        elif actor == 'Senate':
            actor = 'upper'
        action = action.text_content()
        bill.add_action(actor, action, date, **_categorize_action(action))
        # sponsor-related actions let us refine the sponsor list
        if action.lower().find('sponsor') != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    for spontype, sponsor, chamber in sponsor_list:
        if chamber:
            bill.add_sponsor(spontype, sponsor, chamber=chamber)
        else:
            bill.add_sponsor(spontype, sponsor)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)

    # if there's more than 1 votehistory link, there are votes to grab
    if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        self.scrape_votes(session, bill, votes_url)
    self.save_bill(bill)
def scrape_bill(self, chamber, session):
    """Read Arkansas's pipe-delimited measures file and build Bills."""
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.urlopen(url).decode("latin-1")
    page = unicode_csv_reader(StringIO.StringIO(page), delimiter="|")
    for row in page:
        bill_chamber = {"H": "lower", "S": "upper"}[row[0]]
        if bill_chamber != chamber:
            continue
        bill_id = "%s%s %s" % (row[0], row[1], row[2])
        # map the letters after H/S (e.g. "JR") to a bill type
        type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
        bill_type = {
            "B": "bill",
            "R": "resolution",
            "JR": "joint resolution",
            "CR": "concurrent resolution",
            "MR": "memorial resolution",
            "CMR": "concurrent memorial resolution",
        }[type_spec]
        # only keep rows for the session we're scraping
        if row[-1] != self.slug:
            continue
        bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
        bill.add_source(url)
        bill.add_sponsor("lead sponsor", row[11])
        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (session,
                                             bill_id.replace(" ", "")))
        bill.add_version(bill_id, version_url)
        self.scrape_bill_page(bill)
        self.bills[bill_id] = bill
def scrape(self, chamber, session):
    """Emit a fixed example bill with votes/actions for testing the pipeline."""
    self.validate_session(session)
    # the example bill lives in whichever chamber was requested
    if chamber == 'upper':
        other_chamber, bill_id = 'lower', 'SB 1'
    else:
        other_chamber, bill_id = 'upper', 'HB 1'
    b1 = Bill(session, chamber, bill_id, 'A super bill')
    b1.add_source('http://example.com/')
    b1.add_version('As Introduced', 'http://example.com/SB1.html')
    b1.add_document('Google', 'http://google.com')
    b1.add_sponsor('primary', 'Bob Smith')
    b1.add_sponsor('secondary', 'Johnson, Sally')

    d1 = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
    v1 = Vote('upper', d1, 'Final passage', True, 2, 0, 0)
    v1.yes('Smith')
    v1.yes('Johnson')

    d2 = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
    v2 = Vote('lower', d2, 'Final passage', False, 0, 1, 1)
    v2.no('Bob Smith')
    v2.other('S. Johnson')

    b1.add_vote(v1)
    b1.add_vote(v2)

    b1.add_action(chamber, 'introduced', d1)
    b1.add_action(chamber, 'read first time', d2)
    b1.add_action(other_chamber, 'introduced', d2)
    self.save_bill(b1)
def scrape_bill_info(self, chamber, session):
    """Parse Connecticut's bill_info.csv and register bills for a chamber."""
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    page = self.urlopen(info_url)
    page = csv.DictReader(StringIO.StringIO(page))
    abbrev = {'upper': 'S', 'lower': 'H'}[chamber]
    for row in page:
        bill_id = row['bill_num']
        # the leading letter marks the originating chamber
        if not bill_id[0] == abbrev:
            continue
        # classify by the prefix: SJ/HJ joint res, SR/HR res, else bill
        if re.match(r'^(S|H)J', bill_id):
            bill_type = 'joint resolution'
        elif re.match(r'^(S|H)R', bill_id):
            bill_type = 'resolution'
        else:
            bill_type = 'bill'
        bill = Bill(session, chamber, bill_id, row['bill_title'],
                    type=bill_type)
        bill.add_source(info_url)
        self.scrape_bill_page(bill)
        for introducer in self._introducers[bill_id]:
            bill.add_sponsor('introducer', introducer)
        self.bills[bill_id] = bill
def scrape_bill(self, chamber, bill):
    """Scrape a single Delaware bill page described by the `bill` dict
    (keys: id, url, session, chamber) and save it with actions and votes."""
    bill_id = bill['id'].replace('w/','with ')

    page = lxml.html.fromstring(self.urlopen(bill['url']))
    page.make_links_absolute(bill['url'])

    title_row = page.xpath('//tr[td/b[contains(font,"Long Title")]]')[0]
    # text_content() == make sure any tags in the title don't cause issues
    title = title_row.xpath('td[@width="79%"]/font')[0].text_content()

    # now we can create a bill object
    b = Bill(bill['session'], bill['chamber'], bill_id, title)
    b.add_source(bill['url'])

    sponsors_row = page.xpath(
        '//tr[td/b[contains(font,"Primary Sponsor")]]')[0]
    sponsor = sponsors_row.xpath('td[@width="31%"]/font')[0].text
    if sponsor != None:
        b.add_sponsor('primary', sponsor)

    # scraping these and co-sponsors, but not doing anything with them until
    # it's decided whether or not to attempt to split 'em up
    additional = sponsors_row.xpath('td[@width="48%"]/font')
    additional_sponsors = additional[0].text if len(additional) > 0 else ""
    additional_sponsors = additional_sponsors.replace('&nbsp&nbsp&nbsp','')
    cosponsors_row = page.xpath('//tr[td/b[contains(font,"CoSponsors")]]')[0]
    cosponsors = cosponsors_row.xpath('td[@width="79%"]/font')[0].text
    cosponsors = cosponsors if cosponsors != '{ NONE...}' else ''

    introduced_row = page.xpath('//tr[td/b[contains(font,"Introduced On")]]')
    if len(introduced_row) > 0:
        # FIX: original called .expath() (typo -> AttributeError) with an
        # absolute '/td...' path; use .xpath() with the relative path that
        # matches the sponsor-cell pattern used above
        introduced = introduced_row[0].xpath(
            'td[@width="31%"]/font')[0].text
        introduced = datetime.strptime(introduced, '%b %d, %Y')
        b.add_action(bill['chamber'], 'introduced', introduced,
                     'bill:introduced')

    actions = page.xpath(
        '//table[preceding-sibling::b[contains(font,"Actions History:")]]'
        '/tr/td[@width="79%"]/font')
    if len(actions) > 0:
        actions = actions[0].text_content().split('\n')
        for act in actions:
            act = act.partition(' - ')
            date = datetime.strptime(act[0], '%b %d, %Y')
            b.add_action(bill['chamber'], act[2], date)

    # resources = page.xpath(
    #     '//tr[td/b[contains(font, "Full text of Legislation")]]')

    # save vote urls for scraping later
    vote_urls = []
    voting_reports = page.xpath(
        '//tr[td/b[contains(font, "Voting Reports")]]')
    if(len(voting_reports) > 0):
        for report in voting_reports[0].xpath('td/font/a'):
            vote_urls.append(report.attrib['href'])

    # Scrape votes
    for url in vote_urls:
        vote = self.scrape_votes(chamber, title, bill_id, url)
        b.add_vote(vote)

    # Save bill
    self.save_bill(b)
def scrape_regular_row(self, chamber, session, row):
    """Returns bill attributes from row."""
    params = {'session': session, 'chamber': chamber}
    link = row.xpath('td/font/a[contains(@id, "HyperLink1")]')
    if link:  # Ignore if no match
        bill_status_url = link[0].attrib['href']
        bill_url = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        params['bill_id'] = link[0].xpath('font')[0].text.split()[0]
        params['title'] = row.xpath(
            'td/font/span[contains(@id, "_Label1")]/u/font')[0].text
        subject = row.xpath(
            'td/font/span[contains(@id, "_Label6")]')[0].text
        subject = subject.replace('RELATING TO ', '')  # Remove lead text
        params['subjects'] = [subject.replace('.', '')]
        params['description'] = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        sponsors = row.xpath(
            'td/font/span[contains(@id, "_Label7")]')[0].text
        params['companion'] = row.xpath(
            'td/font/span[contains(@id, "_Label8")]')[0].text
        bill = Bill(**params)
        # comma-separated sponsor list; all are treated as primary
        for sponsor in sponsors.split(', '):
            bill.add_sponsor('primary', sponsor)
        actions = self.scrape_actions(bill, bill_status_url)
        bill.add_source(bill_status_url)
        self.save_bill(bill)
    return
def parse_senate_billpage(self, bill_url, year):
    """Parse a Missouri Senate bill page and save the resulting Bill."""
    with self.urlopen(bill_url) as bill_page:
        bill_page = BeautifulSoup(bill_page)
        # get all the info needed to record the bill
        bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
        bill_title = bill_page.find(id="lblBillTitle").font.string
        bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
        bill_lr = bill_page.find(id="lblLRNum").font.string

        bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                    bill_lr=bill_lr, official_title=bill_title)
        bill.add_source(bill_url)

        # Get the primary sponsor
        bill_sponsor = bill_page.find(id="hlSponsor").i.font.contents[0]
        # FIX: Tag.href is attribute-style *child tag* lookup in
        # BeautifulSoup (always None here); use .get() to read the
        # HTML attribute
        bill_sponsor_link = bill_page.find(id="hlSponsor").get('href')
        bill.add_sponsor('primary', bill_sponsor,
                         sponsor_link=bill_sponsor_link)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.find(id="hlCoSponsors")
        # FIX: `'href' in tag` tests the tag's *contents*, not its
        # attributes; check the attribute explicitly
        if cosponsor_tag and cosponsor_tag.get('href'):
            self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

        # get the actions
        action_url = bill_page.find(id="hlAllActions")['href']
        self.parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.find(id="hlFullBillText")
        if versions_url:
            self.parse_senate_bill_versions(bill, versions_url['href'])

    self.save_bill(bill)
def scrape_2009RS_row(self, chamber, session, row):
    """Returns bill attributes from row."""
    params = {'session': session, 'chamber': chamber}
    anchor = row.xpath('td/font/a[contains(@id, "HyperLink1")]')
    if anchor:  # Ignore if no match
        bill_status_url = anchor[0].attrib['href']
        bill_url = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        params['bill_id'] = anchor[0].xpath('font')[0].text
        params['title'] = row.xpath(
            'td/font/span[contains(@id, "_Label1")]/u/font')[0].text
        subject = row.xpath(
            'td/font/span[contains(@id, "_Label6")]')[0].text
        subject = subject.replace('RELATING TO ', '')  # Remove lead text
        params['subject'] = subject.replace('.', '')
        params['description'] = row.xpath(
            'td/font/span[contains(@id, "_Label2")]')[0].text
        sponsors = row.xpath(
            'td/font/span[contains(@id, "_Label7")]')[0].text
        params['companion'] = row.xpath(
            'td/font/span[contains(@id, "_Label8")]')[0].text
        bill = Bill(**params)
        # NOTE(review): the whole sponsor string is added as one sponsor
        bill.add_sponsor('primary', sponsors)
        actions = self.scrape_actions(bill, bill_status_url)
        bill.add_source(bill_status_url)
        self.save_bill(bill)
    return
def scrape_current(self, chamber, term):
    """Scrape current-term Kansas bills for one chamber from the JSON API."""
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:
            bill_id = bill_data['BILLNO']
            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # FIX: btype was unbound (NameError) for any id without
                # B/R/CR; fall back to plain 'bill'
                btype = 'bill'

            # main
            bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())
            if bill_data['LONGTITLE']:
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')
                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']
                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            self.scrape_html(bill)
            self.save_bill(bill)
def process_bill(self, data):
    """Convert an OpenCivicData-style bill dict into a Bill and save it."""
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    # unicameral legislatures are treated as 'upper'
    if chamber == 'legislature':
        chamber = 'upper'
    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        legislators = []
        committees = []
        # split related entities by kind
        for rel in action['related_entities']:
            if rel['entity_type'] == 'organization':
                committees.append(rel['name'])
            elif rel['entity_type'] == 'person':
                legislators.append(rel['name'])
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}),
                        )

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'],
                         sponsor['name'],
                         )

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']),
                              **doc.get('extras', {}))

    for title in data['other_titles']:
        bill.add_title(title['title'])

    for related in data['related_bills']:
        bill.add_companion(related['identifier'],
                           related['legislative_session'],
                           chamber
                           )

    self.save_bill(bill)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    # FIX: the context manager result was bound as `lxml` (shadowing the
    # library) while the body referenced an undefined `page`; bind as `page`
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('table')

        # Bill
        name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

        # Sponsorships
        for a in tables[2].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect('tr'):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if '/' not in senate_date and '/' not in house_date:
                continue
            if senate_date:
                bill.add_action('upper', action_text, senate_date)
            if house_date:
                bill.add_action('lower', action_text, house_date)

        self.save_bill(bill)
def scrape_bill(self, chamber, session):
    """Read Arkansas's pipe-delimited measures file and build Bills
    for the requested chamber/session."""
    url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
    page = self.urlopen(url).decode('latin-1')
    page = unicode_csv_reader(StringIO.StringIO(page), delimiter='|')
    for row in page:
        bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]
        if bill_chamber != chamber:
            continue
        bill_id = "%s%s %s" % (row[0], row[1], row[2])
        # letters after H/S (e.g. "JR") select the bill type
        type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
        bill_type = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial resolution',
            'CMR': 'concurrent memorial resolution'}[type_spec]
        # FIX: the session was hard-coded as '2011' even though `session`
        # is a parameter and is already used for the version URL below
        bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
        bill.add_source(url)
        bill.add_sponsor('lead sponsor', row[11])
        version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                       "%s/Public/%s.pdf" % (
                           session, bill_id.replace(' ', '')))
        bill.add_version(bill_id, version_url)
        self.scrape_votes(bill)
        self.bills[bill_id] = bill
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Oklahoma bill page: sponsors, actions, versions, votes."""
    try:
        page = lxml.html.fromstring(self.urlopen(url))
    except scrapelib.HTTPError as e:
        self.warning("error (%s) fetching %s, skipping" % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    # classify from the bill id suffix (JR before CR before R)
    if "JR" in bill_id:
        bill_type = ["joint resolution"]
    elif "CR" in bill_id:
        bill_type = ["concurrent resolution"]
    elif "R" in bill_id:
        bill_type = ["resolution"]
    else:
        bill_type = ["bill"]

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill["subjects"] = self.subject_map[bill_id]

    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()
        # "otherAuth" ids mark coauthors
        if "otherAuth" in link.attrib["id"]:
            bill.add_sponsor("coauthor", name)
        else:
            bill.add_sponsor("author", name)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == "None":
            continue
        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()
        actor = tr.xpath("string(td[4])").strip()
        if actor == "H":
            actor = "lower"
        elif actor == "S":
            actor = "upper"
        bill.add_action(actor, action, date, type=action_type(action))

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in version_table.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib["href"]
        # committee reports are not bill versions
        if "COMMITTEE REPORTS" in version_url:
            continue
        name = link.text.strip()
        bill.add_version(name, version_url)

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib["href"]))

    self.save_bill(bill)
def scrape2003(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect('center table')

        # Bill
        name = tables[0].text_content().split('-', 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect('a'):
            bill.add_sponsor('', a.text_content().strip())

        # Actions
        center = page.cssselect('center table center')[0]
        for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if '/' not in date:
                continue
            if action_text.startswith('Senate'):
                bill.add_action('upper', action_text, date)
            elif action_text.startswith('House'):
                bill.add_action('lower', action_text, date)

        # Versions
        # FIX: the loop iterated `row` but the body used `a`, the stale
        # loop variable from the sponsors loop above, so every version
        # got the last sponsor's name/link; iterate as `a`
        for a in center.cssselect('table')[-1].cssselect('a'):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get('href')))

        self.save_bill(bill)
def scrape_bill_info(self, chamber, session):
    """Build Bill objects for one chamber from CT's bill_info.csv feed."""
    info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
    page = self.urlopen(info_url)
    page = csv.DictReader(StringIO.StringIO(page))
    abbrev = {"upper": "S", "lower": "H"}[chamber]
    for row in page:
        bill_id = row["bill_num"]
        # leading letter identifies the chamber
        if not bill_id[0] == abbrev:
            continue
        # classify by the bill-id prefix
        if re.match(r"^(S|H)J", bill_id):
            bill_type = "joint resolution"
        elif re.match(r"^(S|H)R", bill_id):
            bill_type = "resolution"
        else:
            bill_type = "bill"
        bill = Bill(session, chamber, bill_id,
                    row["bill_title"].decode("latin-1"),
                    type=bill_type)
        bill.add_source(info_url)
        self.scrape_bill_page(bill)
        for introducer in self._introducers[bill_id]:
            bill.add_sponsor("introducer", introducer)
        bill["subjects"] = self._subjects[bill_id]
        self.bills[bill_id] = bill
def scrape2001(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/2001_02/sum/sb1.htm"
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect("table center table")

        # Bill
        name = tables[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Sponsorships
        for a in tables[1].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        # Actions
        center = page.cssselect("table center")[-1]
        for row in center.cssselect("table table")[0].cssselect("tr")[2:]:
            date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            if "/" not in date:
                continue
            if action_text.startswith("Senate"):
                action_text = action_text.split(" ", 1)[1].strip()
                bill.add_action("upper", action_text, date)
            elif action_text.startswith("House"):
                action_text = action_text.split(" ", 1)[1].strip()
                bill.add_action("lower", action_text, date)

        # Versions
        # FIX: the loop iterated `row` but the body used the stale `a`
        # from the sponsors loop, attaching the wrong name/link to every
        # version; iterate as `a`
        for a in center.cssselect("table table")[1].cssselect("a"):
            bill.add_version(a.text_content(),
                             urlparse.urljoin(url, a.get("href")))

        self.save_bill(bill)
def _parse_bill(self, session, chamber, source_url, line):
    """Parse one '\\xe4'-delimited feed line into a Bill for the matching
    chamber (HB -> lower, SB -> upper) and save it."""
    if line:
        (type, combined_id, number, title, relating_to) = line.split("\xe4")
        if (type == 'HB' and chamber == 'lower') or \
           (type == 'SB' and chamber == 'upper'):
            # basic bill info; numbers are zero-padded to 4 digits
            bill_id = "%s %s" % (type, number.zfill(4))
            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(source_url)

            # add actions
            # FIX: dict.has_key() is deprecated (removed in Python 3);
            # use the `in` operator
            if bill_id in self.actionsByBill:
                for a in self.actionsByBill[bill_id]:
                    bill.add_action(a['actor'], a['action'], a['date'])

            if self.load_versions_sponsors:
                # add versions and sponsors
                versionsSponsors = \
                    self.versionsSponsorsParser.fetch_and_parse(
                        self, session, bill_id)
                if versionsSponsors:
                    for ver in versionsSponsors['versions']:
                        bill.add_version(ver['name'], ver['url'])
                    # a lone sponsor is 'primary'; multiple are cosponsors
                    sponsorType = 'primary'
                    if len(versionsSponsors['sponsors']) > 1:
                        sponsorType = 'cosponsor'
                    for name in versionsSponsors['sponsors']:
                        bill.add_sponsor(sponsorType, name)

            # save - writes out JSON
            self.save_bill(bill)
def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
    """Scrape one Illinois bill page; skips bills whose page 500s."""
    try:
        doc = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        assert '500' in e.args[0], \
            "Unexpected error when accessing page: {}".format(e)
        self.warning("500 error for bill page; skipping bill")
        return

    # bill id, title, summary
    bill_num = re.findall('DocNum=(\d+)', url)[0]
    bill_type = bill_type or DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num
    title = doc.xpath(
        '//span[text()="Short Description:"]'
        '/following-sibling::span[1]/text()')[0].strip()
    summary = doc.xpath(
        '//span[text()="Synopsis As Introduced"]'
        '/following-sibling::span[1]/text()')[0].strip()
    bill = Bill(session, chamber, bill_id, title,
                type=bill_type, summary=summary)
    bill.add_source(url)

    # sponsors — don't add just yet; we can make them better using
    # action data
    sponsor_list = build_sponsor_list(doc.xpath('//a[@class="content"]'))

    # actions
    action_tds = doc.xpath(
        '//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action in group(action_tds, 3):
        date = datetime.datetime.strptime(
            date.text_content().strip(), "%m/%d/%Y")
        actor = actor.text_content()
        if actor == 'House':
            actor = 'lower'
        elif actor == 'Senate':
            actor = 'upper'
        action = action.text_content()
        bill.add_action(actor, action, date, **_categorize_action(action))
        # sponsor-related actions refine the sponsor list
        if action.lower().find('sponsor') != -1:
            self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

    # now add sponsors
    for spontype, sponsor, chamber, official_type in sponsor_list:
        if chamber:
            bill.add_sponsor(spontype, sponsor,
                             official_type=official_type, chamber=chamber)
        else:
            bill.add_sponsor(spontype, sponsor,
                             official_type=official_type)

    # versions
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)

    # if there's more than 1 votehistory link, there are votes to grab
    if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        self.scrape_votes(session, bill, votes_url)

    self.save_bill(bill)
def scrape1999(self, url, year, chamberName, session, number):
    "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
    # FIX: the context result was bound as `lxml` (shadowing the library)
    # while the body referenced an undefined `page`; bind it as `page`
    with self.lxml_context(url) as page:
        # Grab the interesting tables on the page.
        tables = page.cssselect("table")

        # Bill
        name = tables[1].cssselect("a")[0].text_content().split("-", 1)[1]
        bill = Bill(session, chamberName, number, name)

        # Versions
        bill.add_version("Current", url.replace("/sum/", "/fulltext/"))

        # Sponsorships
        for a in tables[2].cssselect("a"):
            bill.add_sponsor("", a.text_content().strip())

        # Actions
        for row in tables[-1].cssselect("tr"):
            senate_date = row[0].text_content().strip()
            action_text = row[1].text_content().strip()
            house_date = row[2].text_content().strip()
            if "/" not in senate_date and "/" not in house_date:
                continue
            if senate_date:
                bill.add_action("upper", action_text, senate_date)
            if house_date:
                bill.add_action("lower", action_text, house_date)

        self.save_bill(bill)
def scrape_bills(self, chamber, session, subjects):
    """Page through the RI bill search results and save every bill found.

    ``subjects`` maps bill ids to subject lists gathered elsewhere.

    Fix: the subjects lookup was performed twice (an initial try/except
    whose result was unconditionally overwritten); the dead first lookup
    has been removed.
    """
    idex = START_IDEX[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"

    blocks = "FOO"  # sentinel: any non-empty value starts the loop
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY
        #headers = urllib.urlencode( default_headers )
        blocks = self.parse_results_page(
            self.urlopen(SEARCH_URL, method="POST", body=default_headers))
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            # some bills carry no subjects
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []

            # normalize long bill-id prefixes to their short codes
            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = BILL_NAME_TRANSLATIONS[b] + \
                        billid[len(b)+1:].split()[0]

            b = Bill(session, chamber, billid, title,
                     type=self.get_type_by_name(bill['bill_id']),
                     subjects=subs)

            self.process_actions(bill['actions'], b)

            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill['bill_id_hrefs']:
                b.add_version(href.text, href.attrib['href'],
                              mimetype="application/pdf")

            for sponsor in sponsors:
                b.add_sponsor("primary", sponsor)

            b.add_source(SEARCH_URL)
            self.save_bill(b)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Fetch one Puerto Rico bill status page, build a Bill and save it."""
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # "Titulo" carries an accented i that trips up lxml, so match on
        # the unaccented tail "tulo" instead.
        titles = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not titles:
            raise NoSuchBill()

        bill = Bill(session, chamber, bill_id, titles[0], type=bill_type)

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        bill.add_sponsor('primary', author.strip())

        history_table = doc.xpath('//table')[-1]
        for row in history_table[1:]:
            cells = row.xpath('td')
            # rows lacking the date column are not real actions
            if len(cells) != 2:
                continue
            when = datetime.datetime.strptime(
                cells[0].text_content(), "%m/%d/%Y")
            what = cells[1].text_content()
            bill.add_action(chamber, what, when)
            # a linked document on an action row is a bill version
            if cells[1].xpath('a'):
                bill.add_version(what, cells[1].xpath('a/@href')[0])

        bill.add_source(url)
        self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape the bill listing page for a session and save each bill.

    Bug fix: the bill id was cleaned with ``rstrip('.pdf')``, which strips
    any trailing run of the characters '.', 'p', 'd', 'f' (mangling ids
    ending in those letters); only a literal '.pdf' suffix is removed now.
    """
    year = year_from_session(session)
    url = bills_url(year)
    with self.urlopen(url) as bills_page_html:
        bills_page = lxml.html.fromstring(bills_page_html)
        table_rows = bills_page.cssselect('tr')
        # Eliminate empty rows
        table_rows = table_rows[0:len(table_rows):2]
        for row in table_rows:
            row_elements = row.cssselect('td')

            bill_document = row_elements[0]
            bill_document.make_links_absolute(BASE_URL)
            element, attribute, link, pos = bill_document.iterlinks().next()
            bill_id = element.text_content()
            if bill_id.endswith('.pdf'):
                bill_id = bill_id[:-len('.pdf')]
            bill_document_link = link

            title_and_sponsors = row_elements[1]
            title_match = re.search('([A-Z][a-z]+.+[a-z])[A-Z]',
                                    title_and_sponsors.text_content())
            sponsors_match = re.search('[a-z]([A-Z]+.+)',
                                       title_and_sponsors.text_content())
            title = title_match.group(1)
            sponsors = sponsors_match.group(1)
            separated_sponsors = sponsors.split('--')

            bill = Bill(session, chamber, bill_id, title)
            bill.add_version('current', bill_document_link)

            if separated_sponsors[1] == '(NONE)':
                bill.add_sponsor('primary', separated_sponsors[0])
            else:
                bill.add_sponsor('cosponsor', separated_sponsors[0])
                bill.add_sponsor('cosponsor', separated_sponsors[1])

            versions_page_element = row_elements[2]
            versions_page_element.make_links_absolute(BASE_URL)
            element, attribute, link, pos = versions_page_element.iterlinks().next()
            bill.add_source(link)
            self.scrape_versions(link, bill)

            actions_page_element = row_elements[3]
            element, attribute, link, pos = actions_page_element.iterlinks().next()
            # the real page is hidden behind a frame redirect
            frame_link = BASE_URL + link.split('?Open&target=')[1]
            self.scrape_actions(frame_link, bill)

            votes_page_element = row_elements[7]
            element, attribute, link, pos = votes_page_element.iterlinks().next()
            frame_link = BASE_URL + link.split('?Open&target=')[1]
            self.scrape_votes(frame_link, chamber, bill)
def scrape_assem_bills(self, chamber, insert, session, year):
    """Scrape Nevada Assembly bills for one session.

    ``insert`` is the session path fragment used in leg.state.nv.us URLs.
    Iterates each document type's listing, then each bill page.
    """
    # DoctypeID -> billy bill type
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (
            insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                insert, link)
            page = self.urlopen(page_path)
            # replace non-breaking spaces so text matching behaves
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)

            # bill id and title sit at fixed positions in the first table
            bill_id = root.xpath(
                'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
            title = root.xpath(
                'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)')
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = self.subject_mapping[bill_id]

            bill_text = root.xpath(
                "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)")
            text_url = "http://www.leg.state.nv.us" + bill_text
            bill.add_version("Bill Text", text_url,
                             mimetype='application/pdf')

            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsor('primary', leg)
            for leg in secondary:
                bill.add_sponsor('cosponsor', leg)

            # committee-minutes links: dates are read from the sibling
            # column, tracked by a 1-based row counter starting at 2
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                # assumes the date cell splits into at least 3 tokens
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                bill.add_document(minutes_date, minutes_url)
                minutes_count = minutes_count + 1

            self.scrape_actions(root, bill, "lower")
            self.scrape_votes(page, bill, insert, year)
            bill.add_source(page_path)
            self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.

    Bug fix: ``mimetype`` was being passed to ``urlparse.urljoin`` (a
    TypeError — urljoin accepts no such keyword); it now goes to
    ``bill.add_version`` where it was clearly intended, matching the
    resolution branch above it.
    """
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        bill_type = {'F': 'bill', 'R': 'resolution',
                     'C': 'concurrent resolution'}[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill['subjects'] = self._subject_mapping[bill_id]
        bill.add_source(bill_detail_url)

        # grab sponsors
        sponsors = doc.xpath(
            '//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            kwargs = {}
            if 'committee' in action:
                kwargs['committees'] = action['committees']

            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'],
                            **kwargs)

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        if 'resolution' in version_html.response.url:
            bill.add_version('resolution text', version_html.response.url,
                             mimetype='text/html')
        else:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath(
                    '//a[starts-with(@href, "/bin/getbill.php")]'):
                version_url = urlparse.urljoin(VERSION_URL_BASE, v.get('href'))
                bill.add_version(v.text.strip(), version_url,
                                 mimetype='text/html')

    self.save_bill(bill)
def scrape_bill(self, chamber, session, doc_type, url):
    """Scrape one Illinois bill status page (older variant) and save it.

    ``doc_type`` is the bill-id prefix (e.g. 'HB'); the billy bill type
    is looked up in DOC_TYPES from the prefix minus its first letter.
    """
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    # bill id, title, synopsis
    bill_num = re.findall('DocNum=(\d+)', url)[0]
    bill_type = DOC_TYPES[doc_type[1:]]
    bill_id = doc_type + bill_num

    title = doc.xpath('//span[text()="Short Description:"]/following-sibling::span[1]/text()')[0].strip()
    synopsis = doc.xpath('//span[text()="Synopsis As Introduced"]/following-sibling::span[1]/text()')[0].strip()

    bill = Bill(session, chamber, bill_id, title, type=bill_type,
                synopsis=synopsis)

    # sponsors
    for sponsor in doc.xpath('//a[@class="content"]/text()'):
        bill.add_sponsor('cosponsor', sponsor)

    # actions: table cells come in (date, actor, action) triples
    action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
    for date, actor, action in group(action_tds, 3):
        date = datetime.datetime.strptime(date.text_content().strip(),
                                          "%m/%d/%Y")
        actor = actor.text_content()
        if actor == 'House':
            actor = 'lower'
        elif actor == 'Senate':
            actor = 'upper'

        action = action.text_content()
        bill.add_action(actor, action, date,
                        type=_categorize_action(action))

    # versions live on a separate "Full Text" page
    version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
    self.scrape_documents(bill, version_url)

    # if there's more than 1 votehistory link, there are votes to grab
    if len(doc.xpath('//a[contains(@href, "votehistory")]')) > 1:
        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        self.scrape_votes(bill, votes_url)
        # NOTE(review): votes_url is only bound inside this branch, so the
        # source line must live here too — confirm against upstream history.
        bill.add_source(votes_url)

    bill.add_source(url)
    self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape the Alabama bill listing for one chamber of a session.

    Bug fix: the primary sponsor was added as ``add_sponsor(name,
    'primary')``; every other scraper in this codebase calls
    ``add_sponsor(type, name)``, so the arguments were swapped.
    """
    self.site_id = self.metadata['session_details'][session]['internal_id']
    chamber_piece = {'upper': 'Senate',
                     'lower': 'House+of+Representatives'}[chamber]

    # resolutions
    # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

    url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

    self.refresh_session()

    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # bills are all their own table with cellspacing=4 (skip first)
        bill_tables = doc.xpath('//table[@cellspacing="4"]')
        for bt in bill_tables[1:]:

            # each table has 3 rows: detail row, description, blank
            details, desc, _ = bt.xpath('tr')

            # first <tr> has img, button, sponsor, topic, current house
            #   current status, committee, committee2, last action
            _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

            # pull bill_id out of script tag (gross)
            bill_id = bill_id_re.search(button.text_content()).group()
            oid = btn_re.search(button.text_content()).groups()[0]

            sponsor = sponsor.text_content()
            topic = topic.text_content()
            com1 = com1.text_content()
            com2 = com2.text_content()
            desc = desc.text_content()

            # create bill
            bill = Bill(session, chamber, bill_id, desc.strip(), topic=topic)
            bill.add_sponsor('primary', sponsor)

            self.get_sponsors(bill, oid)
            self.get_actions(bill, oid)

            # craft bill URL
            session_fragment = '2010rs'
            type_fragment = 'bills'
            bill_id_fragment = bill_id.lower()
            bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
                session_fragment, type_fragment, bill_id_fragment)
            bill.add_version('bill text', bill_text_url)

            self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape one Puerto Rico bill, including sponsors, actions and votes.

    Raises NoSuchBill when the page has no title row.
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    with self.urlopen(url) as html:
        doc = lxml.html.fromstring(html)

        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()

        bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

        # authors are a comma-separated list; each becomes a primary sponsor
        author = doc.xpath(
            u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(','):
            bill.add_sponsor('primary', self.clean_name(aname).strip())

        # co-authors, when present, live in the second text node
        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        if len(co_authors) != 0:
            for co_author in co_authors[1].split(','):
                bill.add_sponsor('cosponsor',
                                 self.clean_name(co_author).strip())

        action_table = doc.xpath('//table')[-1]
        for row in action_table[1:]:
            tds = row.xpath('td')
            # ignore row missing date
            if len(tds) != 2:
                continue

            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
            action = tds[1].text_content().strip()
            #parse the text to see if it's a new version or a unrelated document
            #if has - let's *shrug* assume it's a vote document

            #get url of action
            action_url = tds[1].xpath('a/@href')
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)
            if atype == 'bill:passed' and action_url:
                vote_chamber = None
                # for/else: the warning fires only when no pattern matched;
                # note vote_chamber keeps the last tuple's value regardless
                for pattern, vote_chamber in _voteChambers:
                    if re.match(pattern, action):
                        break
                else:
                    self.warning('coudnt find voteChamber pattern')

                if vote_chamber == 'lower' and len(action_url) > 0:
                    vote = self.scrape_votes(action_url[0], action,
                                             date, vote_chamber)
                    if not vote[0] == None:
                        vote[0].add_source(action_url[0])
                        bill.add_vote(vote[0])
                    else:
                        self.warning('Problem Reading vote: %s,%s' %
                                     (vote[1], bill_id))

        bill.add_source(url)
        self.save_bill(bill)
def scrape_xml(self, chamber, session):
    """Scrape Georgia bills from the session's BillSummary.xml feed.

    Fixes: the working variable was named ``type``, shadowing the
    builtin (renamed to ``btype``, behavior unchanged), and
    ``save_bill`` now runs inside the per-bill loop so every bill is
    saved rather than only the last one.
    """
    start_letter = "S" if chamber == "upper" else "H"
    sponsor_type_dict = {"3": "senate cosponsor", "4": "sponsor",
                         "5": "sponsor"}
    version_url = "http://www1.legis.ga.gov/legis/%s/versions/" % session

    summary_url = "http://www1.legis.ga.gov/legis/%s/list/BillSummary.xml" % session
    xml = self.urlopen(summary_url)
    doc = lxml.etree.fromstring(xml)

    for bxml in doc.xpath("//Bill"):
        btype = bxml.get("Type")

        # if this is from the other chamber skip it
        if not btype.startswith(start_letter):
            continue

        bill_id = btype + bxml.get("Num") + bxml.get("Suffix")
        if btype in ("HB", "SB"):
            btype = "bill"
        elif btype in ("HR", "SR"):
            btype = "resolution"
        else:
            raise ValueError("unknown type: %s" % btype)

        # use short_title as title and long as description
        title = bxml.xpath("Short_Title/text()")[0]
        description = bxml.xpath("Title/text()")[0]

        bill = Bill(session, chamber, bill_id, title, type=btype,
                    description=description)
        bill.add_source(summary_url)

        for sponsor in bxml.xpath("Sponsor"):
            sponsor_name, code = sponsor.text.rsplit(" ", 1)
            sponsor_name = sponsor_name.replace(",", ", ")
            bill.add_sponsor(sponsor_type_dict[sponsor.get("Type")],
                             sponsor_name, _code=code)

        for version in bxml.xpath("Versions/Version"):
            # NOTE: it is possible to get PDF versions by using .get('Id')
            # ex. URL: legis.ga.gov/Legislation/20112012/108025.pdf
            # for now we just get HTML
            description, file_id = version.xpath("*/text()")
            bill.add_version(description, version_url + file_id)

        for action in bxml.xpath("StatusHistory/Status"):
            date = datetime.datetime.strptime(action.get("StatusDate"),
                                              "%Y-%m-%dT%H:%M:%S")
            code = action.get("StatusCode")
            if code in ("EFF", "Signed Gov"):
                actor = "executive"
            elif code[0] == "S":
                actor = "upper"
            elif code[0] == "H":
                actor = "lower"
            # NOTE(review): codes matching none of the branches reuse the
            # previous actor (unbound on the first pass) — assumed the feed
            # only emits S*/H*/EFF codes; confirm.

            atype = self._action_codes[code]
            bill.add_action(actor, action.text, date, atype)

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Oklahoma bill page: sponsors, actions, versions, votes."""
    root = lxml.html.fromstring(self.urlopen(url))

    title = root.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    # classify by id substring; order matters ('JR'/'CR' before bare 'R')
    for marker, kind in (('JR', 'joint resolution'),
                         ('CR', 'concurrent resolution'),
                         ('R', 'resolution')):
        if marker in bill_id:
            bill_type = [kind]
            break
    else:
        bill_type = ['bill']

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self.subject_map[bill_id]

    for link in root.xpath("//a[contains(@id, 'Auth')]"):
        legislator = link.xpath("string()").strip()
        sponsor_type = 'coauthor' if 'otherAuth' in link.attrib['id'] else 'author'
        bill.add_sponsor(sponsor_type, legislator)

    history = root.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in history.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        when = tr.xpath("string(td[3])").strip()
        when = datetime.datetime.strptime(when, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        bill.add_action(actor, action, when, type=action_type(action))

    versions = root.xpath("//table[contains(@id, 'Versions')]")[0]
    for link in versions.xpath(".//a[contains(@href, '.DOC')]"):
        version_url = link.attrib['href']
        # committee reports are not bill versions
        if 'COMMITTEE REPORTS' in version_url:
            continue
        bill.add_version(link.text.strip(), version_url)

    for link in root.xpath(".//a[contains(@href, '_VOTES')]"):
        self.scrape_votes(bill, urlescape(link.attrib['href']))

    self.save_bill(bill)
def scrape_bill(self, session, bill_id, chamber):
    """Scrape a single Massachusetts bill page.

    Returns False (after logging a warning) when the page cannot be
    fetched or looks like a server error.

    Fixes: removed a leftover Python-2 debug ``print`` of the URL and an
    unused ``bill_number`` binding.
    """
    # https://malegislature.gov/Bills/189/SD2739
    session_for_url = self.replace_non_digits(session)

    bill_url = u'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

    try:
        response = requests.get(bill_url)
    except requests.exceptions.RequestException:
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    html = response.text

    page = lxml.html.fromstring(html)

    # a missing headline means the site returned an error page
    if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
        self.warning(u'Server Error on {}'.format(bill_url))
        return False

    bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

    bill_summary = ''
    if page.xpath('//p[@id="pinslip"]/text()'):
        bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]

    # normalize e.g. "SD2739" -> chamber letter + digits only
    bill_id = re.sub(r'[^S|H|\d]', '', bill_id)

    bill = Bill(session, chamber, bill_id, bill_title, summary=bill_summary)
    bill.add_source(bill_url)

    # https://malegislature.gov/Bills/189/SD2739 has a presenter
    # https://malegislature.gov/Bills/189/S2168 no sponsor
    # Find the non-blank text of the dt following Sponsor or Presenter,
    # including any child link text.
    sponsor = page.xpath(
        '//dt[text()="Sponsor:" or text()="Presenter:"]/'
        'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
    if sponsor:
        sponsor = sponsor[0].strip()
        bill.add_sponsor('primary', sponsor)

    has_cosponsor = page.xpath('//a[starts-with(normalize-space(.),"Petitioners")]')
    if has_cosponsor:
        self.scrape_cosponsors(bill, bill_url)

    version = page.xpath(
        "//div[contains(@class, 'modalBtnGroup')]/"
        "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
    if version:
        version_url = "https://malegislature.gov{}".format(version[0])
        bill.add_version('Bill Text', version_url,
                         mimetype='application/pdf')

    self.scrape_actions(bill, bill_url)

    self.save_bill(bill)
def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
    """Extracts all the requested info for a given bill.

    Calls the parent's methods to enter the results into JSON files.
    """
    # normalize the human-readable chamber name to billy's codes
    if chamber == "House":
        chamber = 'lower'
    else:
        chamber = 'upper'

    with self.urlopen(bill_detail_url) as bill_html:
        doc = lxml.html.fromstring(bill_html)

        bill_id = doc.xpath('//title/text()')[0].split()[0]
        bill_title = doc.xpath('//font[@size=-1]/text()')[0]
        # second character of the id encodes the bill type
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution'
        }[bill_id[1]]
        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
        bill['subjects'] = self._subject_mapping[bill_id]
        bill.add_source(bill_detail_url)

        # grab sponsors: first link is primary, the rest cosponsors
        sponsors = doc.xpath(
            '//table[@summary="Show Authors"]/descendant::a/text()')
        if sponsors:
            primary_sponsor = sponsors[0].strip()
            bill.add_sponsor('primary', primary_sponsor)
            cosponsors = sponsors[1:]
            for leg in cosponsors:
                bill.add_sponsor('cosponsor', leg.strip())

        # Add Actions performed on the bill.
        bill_actions = self.extract_bill_actions(doc, chamber)
        for action in bill_actions:
            bill.add_action(action['action_chamber'],
                            action['action_text'],
                            action['action_date'],
                            type=action['action_type'])

    # Get all versions of the bill.
    # Versions of a bill are on a separate page, linked to from the column
    # labeled, "Bill Text", on the search results page.
    with self.urlopen(version_list_url) as version_html:
        # resolutions have no version list; the page itself is the text
        if 'resolution' in version_html.response.url:
            bill.add_version('resolution text', version_html.response.url)
        else:
            version_doc = lxml.html.fromstring(version_html)
            for v in version_doc.xpath(
                    '//a[starts-with(@href, "/bin/getbill.php")]'):
                version_url = urlparse.urljoin(VERSION_URL_BASE,
                                               v.get('href'))
                bill.add_version(v.text.strip(), version_url)

    self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape the Alabama bill listing (cookie/URL-opener variant).

    Bug fix: the primary sponsor was added as ``add_sponsor(name,
    'primary')``; every other scraper in this codebase calls
    ``add_sponsor(type, name)``, so the arguments were swapped.
    """
    self.log(self.metadata['session_details'])
    self.site_id = self.metadata['session_details'][session]['internal_id']
    chamber_piece = {'upper': 'Senate',
                     'lower': 'House+of+Representatives'}[chamber]

    # resolutions
    # http://alisondb.legislature.state.al.us/acas/SESSResosBySelectedMatterTransResults.asp?WhichResos=Senate&TransCodes={All}&LegDay={All}%22&GetBillsTrans=Get+Resolutions+by+Transaction

    url = 'http://alisondb.legislature.state.al.us/acas/SESSBillsBySelectedMatterTransResults.asp?TransCodes={All}&LegDay={All}&WhichBills=%s' % chamber_piece

    cookie = self.refresh_session()

    agent = FakeFirefoxURLopener()
    agent.addheader('Cookie', cookie)
    page = agent.open(url)
    doc = lxml.html.fromstring(page.read())

    # bills are all their own table with cellspacing=4 (skip first)
    bill_tables = doc.xpath('//table[@cellspacing="4"]')
    for bt in bill_tables[1:]:

        # each table has 3 rows: detail row, description, blank
        details, desc, _ = bt.xpath('tr')

        # first <tr> has img, button, sponsor, topic, current house
        #   current status, committee, committee2, last action
        _, button, sponsor, topic, _, _, com1, com2, _ = details.xpath('td')

        # pull bill_id out of script tag (gross)
        bill_id = bill_id_re.search(button.text_content()).group()
        self.log(bill_id)
        oid = btn_re.search(button.text_content()).groups()[0]

        sponsor = sponsor.text_content()
        topic = topic.text_content()
        com1 = com1.text_content()
        com2 = com2.text_content()
        desc = desc.text_content()

        # create bill
        bill = Bill(session, chamber, bill_id, desc.strip(), topic=topic)
        bill.add_sponsor('primary', sponsor)

        self.get_sponsors(bill, oid)
        self.get_actions(bill, oid)

        # craft bill URL
        session_fragment = '2010rs'
        type_fragment = 'bills'
        bill_id_fragment = bill_id.lower()
        bill_text_url = 'http://alisondb.legislature.state.al.us/acas/searchableinstruments/%s/%s/%s.htm' % (
            session_fragment, type_fragment, bill_id_fragment)
        bill.add_version('bill text', bill_text_url)

        self.save_bill(bill)
def scrape_bill(self, term, bill_url):
    """Scrape one Tennessee bill page.

    The sponsor spans hold the two chamber ids ("*HB 123" / "SB 456");
    the '*' marks the originating chamber's id, which becomes bill_id,
    with the other becoming secondary_bill_id.
    """
    with self.urlopen(bill_url) as page:
        page = lxml.html.fromstring(page)

        chamber1 = page.xpath('//span[@id="lblBillSponsor"]/a[1]')[0].text
        # a companion bill exists when the co-sponsor span has a link
        if len(page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')) > 0:

            chamber2 = page.xpath('//span[@id="lblCoBillSponsor"]/a[1]')[0].text

            if '*' in chamber1:
                # chamber1 leads: strip spaces and the leading '*'
                bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
                secondary_bill_id = chamber2.replace(' ', '')
            else:
                bill_id = chamber2.replace(' ', '')[1:len(chamber2)]
                secondary_bill_id = chamber1.replace(' ', '')

            primary_chamber = 'lower' if 'H' in bill_id else 'upper'

        else:
            primary_chamber = 'lower' if 'H' in chamber1 else 'upper'
            bill_id = chamber1.replace(' ', '')[1:len(chamber1)]
            secondary_bill_id = None

        title = page.xpath("//span[@id='lblAbstract']")[0].text

        bill = Bill(term, primary_chamber, bill_id, title,
                    secondary_bill_id=secondary_bill_id)
        bill.add_source(bill_url)

        # Primary Sponsor: the text after "by" in the sponsor span
        sponsor = page.xpath("//span[@id='lblBillSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*','').strip()
        bill.add_sponsor('primary',sponsor)

        # Co-sponsors unavailable for scraping (loaded into page via AJAX)

        # Full summary doc
        summary = page.xpath("//span[@id='lblBillSponsor']/a")[0]
        bill.add_document('Full summary', summary.get('href'))

        # Actions
        tables = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")
        actions_table = tables[0]
        action_rows = actions_table.xpath("tr[position()>1]")
        for ar in action_rows:
            action_taken = ar.xpath("td")[0].text
            action_date = datetime.datetime.strptime(ar.xpath("td")[1].text.strip(), '%m/%d/%Y')
            #NEED TO ADD SECONDARY ACTIONS
            bill.add_action(primary_chamber, action_taken, action_date)

        votes_link = page.xpath("//span[@id='lblBillVotes']/a")
        if(len(votes_link) > 0):
            votes_link = votes_link[0].get('href')
            # scrape_votes returns the (possibly augmented) bill
            bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, bill_type):
    """Scrape one Puerto Rico bill (newest variant), with actions and votes.

    Raises NoSuchBill when the page has no title row; ASP error pages are
    skipped with a warning. (Fix: dropped a stray trailing semicolon after
    the cosponsor add_sponsor call.)
    """
    url = '%s?r=%s' % (self.base_url, bill_id)
    html = self.urlopen(url)

    if "error '80020009'" in html:
        self.warning('asp error on page, skipping %s', bill_id)
        return

    doc = lxml.html.fromstring(html)

    # search for Titulo, accent over i messes up lxml, so use 'tulo'
    title = doc.xpath(u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
    if not title:
        raise NoSuchBill()

    bill = Bill(session, chamber, bill_id, title[0], type=bill_type)

    author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
    for aname in author.split(','):
        aname = self.clean_name(aname).strip()
        if aname:
            bill.add_sponsor('primary', aname)

    co_authors = doc.xpath(u'//td/b[contains(text(),"Co-autor")]/../text()')
    if len(co_authors) != 0:
        for co_author in co_authors[1].split(','):
            bill.add_sponsor('cosponsor', self.clean_name(co_author).strip())

    action_table = doc.xpath('//table')[-1]
    for row in action_table[1:]:
        tds = row.xpath('td')
        # ignore row missing date
        if len(tds) != 2:
            continue

        # continuation rows can have an empty date cell; the date from
        # the previous row carries over in that case
        if tds[0].text_content():
            date = datetime.datetime.strptime(tds[0].text_content(),
                                              "%m/%d/%Y")
        action = tds[1].text_content().strip()

        #parse the text to see if it's a new version or a unrelated document
        #if has - let's *shrug* assume it's a vote document

        #get url of action
        action_url = tds[1].xpath('a/@href')
        atype, action = self.parse_action(chamber, bill, action,
                                          action_url, date)
        if atype == 'bill:passed' and action_url:
            vote_chamber = None
            for pattern, vote_chamber in _voteChambers:
                if re.match(pattern, action):
                    break
            else:
                self.warning('coudnt find voteChamber pattern')

            if vote_chamber == 'lower' and len(action_url) > 0:
                vote = self.scrape_votes(action_url[0], action, date,
                                         vote_chamber)
                if not vote[0] == None:
                    vote[0].add_source(action_url[0])
                    bill.add_vote(vote[0])
                else:
                    self.warning('Problem Reading vote: %s,%s' %
                                 (vote[1], bill_id))

    bill.add_source(url)
    self.save_bill(bill)
def scrape_bill_page(self, chamber, session, bill_url, bill_type):
    """Scrape one Louisiana bill page: authors, digests, versions, actions.

    ``bill_type`` arrives as the raw prefix (e.g. 'SB', 'HCR') and is
    mapped to a billy type from its tail.
    """
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(page,
                                "//a[@id='ctl00_PageBody_LinkAuthor']"
                                "/text()")

    # follow the named link ("Authors"/"Digests"/"Text") to its bare page
    sbp = lambda x: self.scrape_bare_page(page.xpath(
        "//a[contains(text(), '%s')]" % (x))[0].attrib["href"])

    authors = [x.text for x in sbp("Authors")]

    try:
        digests = sbp("Digests")
    except IndexError:
        # no Digests link on this page
        digests = []

    try:
        versions = sbp("Text")
    except IndexError:
        versions = []

    title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']"
                       "/text()")[0]
    actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']/"
                         "/table[@style='font-size:small']/tr")

    bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']"
                         "/text()")[0]

    bill_type = {"B": "bill", "CR": "concurrent resolution"}[bill_type[1:]]
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(bill_url)

    # the primary author also appears in the authors list; keep one copy
    authors.remove(author)
    bill.add_sponsor("primary", author)
    for author in authors:
        bill.add_sponsor("cosponsor", author)

    for digest in digests:
        bill.add_document(digest.text, digest.attrib["href"],
                          mimetype="application/pdf")

    for version in versions:
        bill.add_version(version.text, version.attrib["href"],
                         mimetype="application/pdf")

    flags = {
        "prefiled": ["bill:filed"],
        "referred to the committee": ["committee:referred"]
    }

    # NOTE(review): this loop rebinds the ``chamber`` and ``page``
    # parameters; safe only because neither is used after the loop.
    for action in actions:
        date, chamber, page, text = [x.text for x in action.xpath(".//td")]
        # Session is April --> June. Prefiles
        # look like they're in January at earliest.
        date += "/%s" % (session)
        date = dt.datetime.strptime(date, "%m/%d/%Y")
        chamber = {"S": "upper", "H": "lower", "J": "joint"}[chamber]

        # categorize via substring flags; default to "other"
        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]

        if cat == []:
            cat = ["other"]

        bill.add_action(chamber, text, date, cat)

    self.save_bill(bill)
def scrape_bills(self, chamber, session, subjects):
    """Page through the RI bill search results (requests variant) and save
    every bill found.

    ``subjects`` maps bill ids to subject lists gathered elsewhere.

    Fix: the subjects lookup was performed twice (an initial try/except
    whose result was unconditionally overwritten); the dead first lookup
    has been removed.
    """
    idex = bill_start_numbers(session)[chamber]
    FROM = "ctl00$rilinContent$txtBillFrom"
    TO = "ctl00$rilinContent$txtBillTo"
    YEAR = "ctl00$rilinContent$cbYear"

    blocks = "FOO"  # sentinel: any non-empty value starts the loop
    while len(blocks) > 0:
        default_headers = get_default_headers(SEARCH_URL)
        default_headers[FROM] = idex
        default_headers[TO] = idex + MAXQUERY
        default_headers[YEAR] = session
        idex += MAXQUERY
        #headers = urllib.urlencode( default_headers )
        blocks = self.parse_results_page(
            self.post(SEARCH_URL, data=default_headers).text)
        blocks = blocks[1:-1]
        blocks = self.digest_results_page(blocks)

        for block in blocks:
            bill = blocks[block]
            title = bill['title'][len("ENTITLED, "):]
            billid = bill['bill_id']
            # some bills carry no subjects
            try:
                subs = subjects[bill['bill_id']]
            except KeyError:
                subs = []

            # normalize long bill-id prefixes to their short codes
            for b in BILL_NAME_TRANSLATIONS:
                if billid[:len(b)] == b:
                    billid = BILL_NAME_TRANSLATIONS[b] + \
                        billid[len(b)+1:].split()[0]

            b = Bill(session, chamber, billid, title,
                     type=self.get_type_by_name(bill['bill_id']),
                     subjects=subs)

            self.process_actions(bill['actions'], b)

            sponsors = bill['sponsors'][len("BY"):].strip()
            sponsors = sponsors.split(",")
            sponsors = [s.strip() for s in sponsors]

            for href in bill['bill_id_hrefs']:
                b.add_version(href.text, href.attrib['href'],
                              mimetype="application/pdf")

            for sponsor in sponsors:
                b.add_sponsor("primary", sponsor)

            b.add_source(SEARCH_URL)
            self.save_bill(b)
def scrape_bill(self, chamber, term, bill_id, url, title, subject=None):
    """Scrape a single Indiana bill page: sponsors, actions and documents."""
    self.logger.info('GET ' + url)
    html = self.get(url).text
    tree = lxml.html.fromstring(html)
    tree.make_links_absolute(url)

    bill = Bill(term, chamber, bill_id, title)
    bill.add_source(url)
    if subject is not None:
        bill['subjects'] = [subject]

    # Sponsors: map the page's role labels onto billy sponsor types.
    role_to_type = {
        'author': 'primary',
        'co-author': 'cosponsor',
        'sponsor': 'cosponsor',
        'co-sponsor': 'cosponsor',
    }
    for box in tree.xpath('//div[contains(@class, "bill-author-info")]'):
        legislator = box.xpath('string(b)').strip()
        role = box.xpath('string(p)').strip().lower()
        bill.add_sponsor(role_to_type[role], legislator)

    # Actions are listed newest-first on the page, hence the reversal.
    for item in tree.xpath('//div[@id="bill-actions"]//li')[::-1]:
        if item.text_content() == 'None currently available.':
            continue
        abbr = item.xpath('string(strong)').strip()
        action_chamber = dict(H='lower', S='upper')[abbr]
        raw_date = item.xpath('string(span[@class="document-date"])')
        action_date = datetime.datetime.strptime(raw_date.strip(),
                                                 '%m/%d/%Y')
        action_text = item.xpath('string(span[2])').strip()
        if not action_text.strip():
            continue
        details = dict(date=action_date, actor=action_chamber,
                       action=action_text)
        details.update(self.categorizer.categorize(action_text))
        bill.add_action(**details)

    # Documents (including votes)
    for doc_kind, doc_meta in BillDocuments(self, tree):
        label = doc_meta.title or doc_meta.text
        if doc_kind == 'version':
            bill.add_version(label, url=doc_meta.url,
                             mimetype='application/pdf')
        elif doc_kind == 'document':
            bill.add_document(label, url=doc_meta.url,
                              mimetype='application/pdf')
        elif doc_kind == 'rollcall':
            self.add_rollcall(chamber, bill, doc_meta)

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, url):
    """Scrape one Utah bill page: sponsor, versions, documents, status.

    Fixes: a bill id matching none of the known prefixes previously left
    ``bill_type`` unbound (NameError downstream) — it now defaults to
    'bill'; the local ``next`` no longer shadows the builtin; the
    whitespace regex is a raw string.
    """
    try:
        page = self.urlopen(url)
    except scrapelib.HTTPError:
        self.warning("couldn't open %s, skipping bill" % url)
        return
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    # NOTE(review): likely normalizing a non-breaking space; confirm the
    # first literal's exact character against the upstream source.
    header = page.xpath('//h3/br')[0].tail.replace(' ', ' ')
    title, primary_sponsor = header.split(' -- ')

    if bill_id.startswith('H.B.') or bill_id.startswith('S.B.'):
        bill_type = ['bill']
    elif bill_id.startswith('H.R.') or bill_id.startswith('S.R.'):
        bill_type = ['resolution']
    elif bill_id.startswith('H.C.R.') or bill_id.startswith('S.C.R.'):
        bill_type = ['concurrent resolution']
    elif bill_id.startswith('H.J.R.') or bill_id.startswith('S.J.R.'):
        bill_type = ['joint resolution']
    else:
        # unknown prefix: fall back to a plain bill instead of crashing
        bill_type = ['bill']

    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_sponsor('primary', primary_sponsor)
    bill.add_source(url)

    for link in page.xpath(
            '//a[contains(@href, "bills/") and text() = "HTML"]'):
        name = link.getprevious().tail.strip()
        bill.add_version(name, link.attrib['href'], mimetype="text/html")
        # a sibling "PDF" link points at the same version in PDF form
        pdf_link = link.getnext()
        if pdf_link.text == "PDF":
            bill.add_version(name, pdf_link.attrib['href'],
                             mimetype="application/pdf")

    for link in page.xpath(
            "//a[contains(@href, 'fnotes') and text() = 'HTML']"):
        bill.add_document("Fiscal Note", link.attrib['href'])

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill['subjects'] = subjects

    status_link = page.xpath('//a[contains(@href, "billsta")]')[0]
    self.parse_status(bill, status_link.attrib['href'])

    self.save_bill(bill)
def scrape_assem_bills(self, chamber, insert, session, year):
    """Scrape all Nevada Assembly measures for a session.

    Iterates the per-document-type history listings, then scrapes each
    bill page for id/title, versions, sponsors, committee minutes,
    actions and votes.
    """
    # DoctypeID values used by the NV site for lower-chamber measures.
    doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                6: 'joint resolution', 9: 'petition'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0  # NOTE(review): incremented but never read
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            page = self.get(page_path).text
            # Normalize non-breaking spaces before parsing/searching.
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            root.make_links_absolute("http://www.leg.state.nv.us/")
            bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
            # Title lives in the <em> of the row preceded by the
            # sponsor ("By:") row.
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = list(set(self.subject_mapping[bill_id]))
            # Version links sit in the element right after the
            # "Bill Text" label's parent.
            billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
            text_urls = billtext.xpath("./a")
            for text_url in text_urls:
                version_name = text_url.text.strip()
                version_url = text_url.attrib['href']
                bill.add_version(version_name, version_url,
                                 mimetype='application/pdf')
            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsor('primary', leg)
            for leg in secondary:
                bill.add_sponsor('cosponsor', leg)
            # Minutes table: link is in column 3; the matching date is
            # read from column 2 of the same row, addressed by the
            # 1-based row counter (data rows start at tr[2]).
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                # First three whitespace-separated tokens form the date.
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                bill.add_document(minutes_date, minutes_url)
                minutes_count = minutes_count + 1
            self.scrape_actions(root, bill, "lower")
            self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            self.save_bill(bill)
def scrape_bill(self, chamber, session, billid, histurl, year):
    """Scrape one bill from its history page (BeautifulSoup-based).

    Pulls title/sponsor from the "bhistleft" summary block, versions
    from its table, and actions from the "bhisttab" history table.
    """
    # NOTE(review): when year[0] == 'R' the else branch does
    # int(year[0]) which would fail on 'R' — presumably special
    # sessions never start with 'R'; confirm against the metadata.
    if year[0] != 'R':
        session = year
    else:
        session = self.metadata['session_details'][year]['sub_sessions'][
            int(year[0]) - 1]

    with self.urlopen(histurl) as data:
        soup = BeautifulSoup(cleansource(data))
        basicinfo = soup.findAll('div', id='bhistleft')[0]
        hist = basicinfo.table

        sponsor = None
        title = None
        # Walk the <b> labels: text after "SUMMARY" is the title;
        # anchors after "SPONSOR" are sponsors until the first
        # non-sponsor link. Note only the LAST sponsor link is kept.
        for b in basicinfo.findAll('b'):
            if b.next.startswith('SUMMARY'):
                title = b.findNextSiblings(text=True)[0].strip()
            elif b.next.startswith('SPONSOR'):
                for a in b.findNextSiblings('a'):
                    if not issponsorlink(a):
                        break
                    sponsor = cleansponsor(a.contents[0])

        bill = Bill(session, chamber, billid, title)

        if sponsor:
            bill.add_sponsor('primary', sponsor)

        # Each row of the summary table links one version.
        for row in hist.findAll('tr'):
            link = row.td.a
            vlink = urlbase % link['href']
            vname = link.contents[0].strip()
            bill.add_version(vname, vlink)

        history = soup.findAll('div', id='bhisttab')[0].table
        rows = history.findAll('tr')[1:]  # skip header row
        for row in rows:
            tds = row.findAll('td')
            if len(tds) < 2:
                # This is not actually an action
                continue
            date, action = row.findAll('td')[:2]
            date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
            action = action.contents[0].strip()
            if 'House' in action:
                actor = 'lower'
            elif 'Senate' in action:
                actor = 'upper'
            else:  # for lack of a better
                actor = chamber
            bill.add_action(actor, action, date)

    self.save_bill(bill)
def scrape_senate_bills(self, chamber, insert, session, year):
    """Scrape all Nevada Senate measures for a session.

    Mirrors scrape_assem_bills but with upper-chamber DoctypeIDs,
    a single "Bill Text" version per bill, and "Agenda" documents.
    """
    # DoctypeID values used by the NV site for upper-chamber measures.
    doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                8: 'joint resolution'}
    for docnum, bill_type in doc_type.iteritems():
        parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
        links = self.scrape_links(parentpage_url)
        count = 0  # NOTE(review): incremented but never read
        for link in links:
            count = count + 1
            page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
            page = self.get(page_path).text
            # Normalize non-breaking spaces before parsing/searching.
            page = page.replace(u"\xa0", " ")
            root = lxml.html.fromstring(page)
            bill_id = root.xpath('string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)')
            # Title lives in the <em> of the row preceded by the
            # sponsor ("By:") row.
            title = self.get_node(
                root,
                '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                'b[contains(text(), "By:")]]/td/em/text()')
            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = list(set(self.subject_mapping[bill_id]))
            # Find the table containing the "Bill Text" link; its href
            # is relative, so prepend the site root.
            for table in root.xpath('//div[@id="content"]/table'):
                if 'Bill Text' in table.text_content():
                    bill_text = table.xpath("string(tr/td[2]/a/@href)")
                    text_url = "http://www.leg.state.nv.us" + bill_text
                    bill.add_version("Bill Text", text_url,
                                     mimetype='application/pdf')
            primary, secondary = self.scrape_sponsors(page)
            for leg in primary:
                bill.add_sponsor('primary', leg)
            for leg in secondary:
                bill.add_sponsor('cosponsor', leg)
            # Agenda table: link is in column 3; the matching date is
            # read from column 2 of the same row, addressed by the
            # 1-based row counter (data rows start at tr[2]).
            minutes_count = 2
            for mr in root.xpath('//table[4]/tr/td[3]/a'):
                minutes = mr.xpath("string(@href)")
                minutes_url = "http://www.leg.state.nv.us" + minutes
                minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                minutes_date = mr.xpath(minutes_date_path).split()
                # First three whitespace-separated tokens form the date.
                minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                bill.add_document(minutes_date, minutes_url)
                minutes_count = minutes_count + 1
            self.scrape_actions(root, bill, "upper")
            self.scrape_votes(page, page_path, bill, insert, year)
            bill.add_source(page_path)
            self.save_bill(bill)
def parse_bill(self, chamber, session, bill_id, bill_info_url):
    """Parse a Kentucky-style bill-info page.

    Records the most recent version, primary sponsors, the action
    list (with the session year patched into each date), and the
    vote-history PDF when present. Withdrawn bills (no version link)
    are skipped silently.
    """
    with self.urlopen(bill_info_url) as bill_info_data:
        bill_info = self.soup_parser(bill_info_data)

        version_url = '%s/bill.doc' % bill_id
        version_link = bill_info.find(href=version_url)
        if not version_link:
            # This bill was withdrawn
            return

        bill_title = version_link.findNext('p').contents[0].strip()

        bill = Bill(session, chamber, bill_id, bill_title)
        bill.add_version("Most Recent Version",
                         session_url(session) + version_url)
        bill.add_source(bill_info_url)

        # Fix: raw string so \d is a regex digit class, not a
        # (deprecated) invalid string escape.
        sponsor_links = bill_info.findAll(href=re.compile(
            r'legislator/[SH]\d+\.htm'))
        for sponsor_link in sponsor_links:
            bill.add_sponsor('primary', sponsor_link.contents[0].strip())

        # Actions are the text nodes of the last <p> after the
        # version link, formatted "Mon DD-action text".
        action_p = version_link.findAllNext('p')[-1]
        for action in action_p.findAll(text=True):
            action = action.strip()
            if (not action or action == 'last action' or
                    'Prefiled' in action):
                continue

            action_date = action.split('-')[0]
            # "%b %d" has no year, so strptime defaults to 1900;
            # patch in the session year ("20" + 2 digits of session).
            action_date = dt.datetime.strptime(action_date, '%b %d')
            action_date = action_date.replace(
                year=int('20' + session[2:4]))

            action = '-'.join(action.split('-')[1:])

            if action.endswith('House') or action.endswith('(H)'):
                actor = 'lower'
            elif action.endswith('Senate') or action.endswith('(S)'):
                actor = 'upper'
            else:
                actor = chamber

            bill.add_action(actor, action, action_date)

        # Fix: raw string for the regex (unescaped '.' is intentional
        # enough here; behavior unchanged).
        vote_link = bill_info.find(href=re.compile(r'.*/vote_history.pdf'))
        if vote_link:
            bill.add_document(
                'vote_history.pdf',
                bill_info_url.replace('.htm', '') + "/vote_history.pdf")

        self.save_bill(bill)
def scrape_bill(self, session, chamber, bill_type, bill_url):
    """Scrape one Hawaii measure page: metadata, sponsors, actions
    (with per-action vote parsing), and versions, then save it."""
    with self.urlopen(bill_url) as html:
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(bill_url)

        # "SB1 SD2 HD2" -> keep only the base id "SB1"
        bill_id = doc.xpath('//a[@id="LinkButtonMeasure"]')[0].text_content().split()[0]
        title = doc.xpath('//span[@id="ListView1_ctrl0_measure_titleLabel"]')[0].text
        raw_subjects = doc.xpath('//span[@id="ListView1_ctrl0_report_titleLabel"]')[0].text.split('; ')
        subjects = [s.strip() for s in raw_subjects if s.strip()]
        description = doc.xpath('//span[@id="ListView1_ctrl0_descriptionLabel"]')[0].text
        sponsors = doc.xpath('//span[@id="ListView1_ctrl0_introducerLabel"]')[0].text
        referral = doc.xpath('//span[contains(@id, "referral")]/text()')[0]

        bill = Bill(session, chamber, bill_id, title, subjects=subjects,
                    type=bill_type, description=description,
                    referral=referral)

        for name in sponsors.split(', '):
            # drop the "(BR)" by-request marker from the name
            if name.endswith(' (BR)'):
                name = name[:-5]
            bill.add_sponsor('primary', name)

        # Actions: only 3-cell rows of the status grid are real
        # actions (date / chamber code / text).
        actions = []
        status_table = doc.xpath('//table[@id="GridViewStatus"]')[0]
        for row in status_table.xpath('tr'):
            cells = row.xpath('td')
            if len(cells) != 3:
                continue
            action_text = cells[2].xpath('font')[0].text
            when = datetime.strptime(cells[0].xpath('font')[0].text,
                                     "%m/%d/%Y")
            actions.append({
                'actor': house[cells[1].xpath('font')[0].text],
                'action': action_text,
                'date': when,
                'type': categorize_action(action_text),
            })

        for params in actions:
            bill.add_action(**params)
            # every action may carry an associated roll call
            self.parse_vote(bill, params['action'], params['actor'],
                            params['date'])

        # versions
        try:
            for version in doc.xpath('//a[contains(@id, "StatusLink")]'):
                bill.add_version(version.text.replace('_', ' '),
                                 version.get('href'))
        except IndexError:  # href not found.
            pass

        bill.add_source(bill_url)
        self.save_bill(bill)
def parse_senate_billpage(self, bill_url, year):
    """Parse a Missouri Senate bill page.

    Builds the Bill (typed from the first three characters of its id,
    with subjects looked up from self.subjects), records the primary
    sponsor, and delegates cosponsors, actions and versions to their
    own sub-page parsers.
    """
    bill_page = self.urlopen(bill_url)
    bill_page = lxml.html.fromstring(bill_page)

    # get all the info needed to record the bill
    # TODO probably still needs to be fixed
    bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
    # NOTE(review): bill_title is scraped but unused — the brief
    # description below is what is stored as the title.
    bill_title = bill_page.xpath('//*[@id="lblBillTitle"]')[0].text_content()
    bill_desc = bill_page.xpath('//*[@id="lblBriefDesc"]')[0].text_content()
    bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

    # Classify from the first three characters of the id.
    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]

    subs = []
    bid = bill_id.replace(" ", "")
    if bid in self.subjects:
        subs = self.subjects[bid]
        self.log("With subjects for this bill")
        self.log(bid)

    bill = Bill(year, 'upper', bill_id, bill_desc, bill_lr=bill_lr,
                type=bill_type, subjects=subs)
    bill.add_source(bill_url)

    # Get the primary sponsor
    sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
    bill_sponsor = sponsor.text_content()
    bill_sponsor_link = sponsor.attrib.get('href')
    bill.add_sponsor('primary', bill_sponsor,
                     sponsor_link=bill_sponsor_link)

    # cosponsors show up on their own page, if they exist.
    # Fix: 'href' in attrib replaces attrib.has_key('href') —
    # has_key was removed in Python 3; `in` works on both 2 and 3.
    cosponsor_tag = bill_page.xpath('//*[@id="hlCoSponsors"]')
    if len(cosponsor_tag) > 0 and 'href' in cosponsor_tag[0].attrib:
        self.parse_senate_cosponsors(bill, cosponsor_tag[0].attrib['href'])

    # get the actions
    action_url = bill_page.xpath('//*[@id="hlAllActions"]')
    if len(action_url) > 0:
        action_url = action_url[0].attrib['href']
        self.parse_senate_actions(bill, action_url)

    # stored on a separate page
    versions_url = bill_page.xpath('//*[@id="hlFullBillText"]')
    if len(versions_url) > 0 and 'href' in versions_url[0].attrib:
        self.parse_senate_bill_versions(bill,
                                        versions_url[0].attrib['href'])

    self.save_bill(bill)