# Tests for LegistarScraper (Python 2, nose-style) plus the hosted-site
# wrapper that consumes it. Import paths for the legistar package are
# assumed from its layout; adjust if your tree differs.
import datetime

from BeautifulSoup import BeautifulSoup
from nose.tools import (assert_equal, assert_greater, assert_in,
                        assert_is_instance, assert_is_none, assert_not_in)

from legistar.config import Config, DEFAULT_CONFIG
from legistar.scraper import LegistarScraper


def fail(message):
    # Local helper: nose.tools exposes the unittest assert_* helpers, but
    # not ``fail`` itself.
    raise AssertionError(message)
def link_address_is_none():
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    link = BeautifulSoup('<html><a></a></html>').find('a')
    address = scraper._get_link_address(link)
    assert_is_none(address)
def supports_fetching_council_members():
    config = Config(hostname='phila.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    members = scraper.councilMembers()
    try:
        members.next()
    except StopIteration:
        fail('no council members found')
def paging_through_legislation(): config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) summaries = list(scraper.searchLegislation("pub")) # Making summaries a list forces the scraper to iterate completely through # the generator for s in summaries: print s["Record #"] assert_greater(len(summaries), 100)
def supports_fetching_calendar():
    config = Config(hostname='phila.legistar.com',
                    fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    events = scraper.councilCalendar('all')
    try:
        events.next()
    except StopIteration:
        fail('no events found')
def supports_simple_initial_search_form():
    config = Config(hostname='phila.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation('')
    try:
        summaries.next()
    except StopIteration:
        fail('no legislation found')
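# The three smoke tests above repeat the same "generator yields at least
# one row" pattern. A minimal helper that would collapse it (hypothetical,
# not part of the scraper API):
def assert_yields_something(generator, message):
    try:
        generator.next()  # Python 2 iterator protocol
    except StopIteration:
        fail(message)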
def attachments_list():
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    detail = scraper.expandLegislationSummary({
        'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1243262&GUID=01021C5A-3624-4E5D-AA32-9822D1F5DA29&Options=ID|Text|&Search='
    })
    # Attachments value should be a list
    assert_is_instance(detail[0]['Attachments'], list)
def parse_sponsors():
    # Parses a saved copy of the Chicago detail page so the test runs
    # offline.
    with open('tests/local/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F') as f:
        soup = BeautifulSoup(f)
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    legislation_details = scraper.parseLegislationDetail(soup)
    assert_equal(legislation_details[0]['Sponsors'][1], u'Moreno, Proco Joe')
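# A hedged sketch of how such a fixture can be (re)captured, assuming the
# tests/local file name mirrors the request path (urllib2 is the Python 2
# stdlib HTTP client):
def recapture_parse_sponsors_fixture():
    import urllib2
    url = ('http://chicago.legistar.com/LegislationDetail.aspx'
           '?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F')
    fixture = ('tests/local/LegislationDetail.aspx'
               '?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F')
    with open(fixture, 'w') as f:
        f.write(urllib2.urlopen(url).read())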
def supports_advanced_initial_search_form():
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation('')
    try:
        summaries.next()
    except StopIteration:
        fail('no legislation found')
def paging_through_legislation():
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Making summaries a list forces the scraper to iterate completely
    # through the generator, following every page of results.
    summaries = list(scraper.searchLegislation('pub'))
    assert_greater(len(summaries), 100)
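# Because searchLegislation returns a generator, callers that only need a
# few rows can stop early instead of forcing the full pagination exercised
# above; a sketch (the 10-row cutoff is arbitrary):
def sketch_first_rows_only():
    from itertools import islice
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Only the first page or so of results is ever fetched.
    first_ten = list(islice(scraper.searchLegislation('pub'), 10))
    assert_equal(len(first_ten), 10)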
def link_address_is_onclick(): config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) from BeautifulSoup import BeautifulSoup link = BeautifulSoup("<html><a onclick=\"radopen('http://www.google.com');\"></a></html>").find("a") address = scraper._get_link_address(link) assert_equal(address, "http://www.google.com")
def parse_detail_keys():
    config = Config(hostname='phila.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1265815&GUID=97CBBF7C-A123-4808-9D50-A1E340BE5BC1'
    }
    detail = scraper.expandLegislationSummary(summary)
    assert_in(u'Version', detail[0].keys())
    assert_not_in(u'CITY COUNCIL', detail[0].keys())
def no_attachments_list():
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    detail = scraper.expandLegislationSummary({
        'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1254964&GUID=AF8A4E91-4DF6-41A2-80B4-EFC94A2AFF89&Options=ID|Text|&Search='
    })
    # Legislation with no attachments should have no attachment key
    assert_not_in('Attachments', detail[0])
def recognize_dates():
    config = Config(
        hostname='phila.legistar.com',
        sponsor_links=False,
        date_format='%m/%d/%Y',
    ).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation('')
    summary = summaries.next()
    assert_is_instance(summary['File Created'], datetime.datetime)
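# The date_format option follows stdlib strptime conventions; an offline
# companion showing exactly what '%m/%d/%Y' accepts:
def sketch_date_format():
    parsed = datetime.datetime.strptime('10/05/2012', '%m/%d/%Y')
    assert_equal(parsed, datetime.datetime(2012, 10, 5))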
def link_address_is_href():
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    link = BeautifulSoup(
        '<html><a href="http://www.google.com"></a></html>').find('a')
    address = scraper._get_link_address(link)
    assert_equal(address, 'http://www.google.com')
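# Taken together, the three link_address_* tests pin down a lookup order
# for LegistarScraper._get_link_address: an explicit href wins, then a URL
# embedded in a radopen(...) onclick handler, then None. A minimal sketch
# of that contract (assumed, not the scraper's actual implementation):
def sketch_get_link_address(link):
    import re
    if link.get('href'):
        return link['href']
    onclick = link.get('onclick')
    if onclick:
        match = re.search(r"radopen\('([^']+)'\)", onclick)
        if match:
            return match.group(1)
    return None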
def no_attachments_list(): config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) detail = scraper.expandLegislationSummary( { "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1254964&GUID=AF8A4E91-4DF6-41A2-80B4-EFC94A2AFF89&Options=ID|Text|&Search=" } ) # Legislation with no attachments should have no attachment key assert_not_in("Attachments", detail[0])
def philly_sponsors():
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    legislation_summary = {
        'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1233260&GUID=DC103FB6-FF9D-4250-B0CE-111B80E8B80C'
    }
    legislation_details = scraper.expandLegislationSummary(legislation_summary)
    assert_equal(legislation_details[0]['Sponsors'][0],
                 u'Councilmember DiCicco')
def history_row_url():
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    detail = scraper.expandLegislationSummary({
        'URL': 'http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F&Options=Advanced&Search='
    })
    assert_equal(
        detail[1][0]['Action Details']['url'],
        'https://chicago.legistar.com/HistoryDetail.aspx?ID=6534991&GUID=253AA818-B592-4594-8237-0A617AA41766'
    )
def history_row_url(): config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) detail = scraper.expandLegislationSummary( { "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F&Options=Advanced&Search=" } ) assert_equal( detail[1][0]["Action Details"]["url"], "https://chicago.legistar.com/HistoryDetail.aspx?ID=6534991&GUID=253AA818-B592-4594-8237-0A617AA41766", )
def history_row_url(): config = {"hostname": "chicago.legistar.com", "fulltext": True} scraper = LegistarScraper(config) detail = scraper.expandLegislationSummary( { "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F&Options=Advanced&Search=" } ) assert_equal( detail[1][0]["URL"], "http://chicago.legistar.com/HistoryDetail.aspx?ID=6534991&GUID=253AA818-B592-4594-8237-0A617AA41766", )
def can_get_history_detail_using_summary_row(): config = {"hostname": "phila.legistar.com", "fulltext": True} scraper = LegistarScraper(config) legislation_summary = { "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1236768&GUID=EB92A4C2-469A-4D73-97C0-A620BBDDD5BE&Options=ID|Text|&Search=" } legislation_details = scraper.expandLegislationSummary(legislation_summary) history_summary = legislation_details[1][2] attrs, votes = scraper.expandHistorySummary(history_summary) ayes = [vote for vote in votes if vote["Vote"] == "Ayes"] assert_equal(len(ayes), 14) assert_equal(attrs["Result"], "Pass")
def can_get_history_detail_using_summary_row(): config = Config(hostname="chicago.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) legislation_summary = { "URL": "https://chicago.legistar.com/LegislationDetail.aspx?ID=1450228&GUID=97689689-D0EA-47A2-8474-09B3A71C221B&Options=Advanced&Search=" } legislation_details = scraper.expandLegislationSummary(legislation_summary) history_summary = legislation_details[1][0]["Action Details"] attrs, votes = scraper.expandHistorySummary(history_summary) ayes = [vote for vote in votes if vote["Vote"] == "Yea"] assert_equal(len(ayes), 49) assert_equal(attrs["Result"], "Pass")
def philly_topics(): """Tests that scraper works for Philly legislation with and without topics""" config = Config(hostname="philly.legistar.com").defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) legislation_with_topics = { "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1433307&GUID=773A9C3F-ABA5-4D6C-B901-A9EEE3B1B8B0" } legislation_details = scraper.expandLegislationSummary(legislation_with_topics) assert_equal(legislation_details[0]["Topics"], [u"LIQUOR BY THE DRINK TAX", u"SCHOOL TAX AUTHORIZATION"]) legislation_no_topics = { "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1426307&GUID=E9EC8885-0DDD-4B64-AB2D-EA0503284268" } legislation_details = scraper.expandLegislationSummary(legislation_no_topics) assert_equal(legislation_details[0]["Topics"], [])
def philly_can_get_history_detail_using_summary_row():
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    legislation_summary = {
        'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1236768&GUID=EB92A4C2-469A-4D73-97C0-A620BBDDD5BE&Options=ID|Text|&Search='
    }
    legislation_details = scraper.expandLegislationSummary(legislation_summary)
    history_summary = legislation_details[1][2]
    attrs, votes = scraper.expandHistorySummary(history_summary)
    ayes = [vote for vote in votes if vote['Vote'] == 'Ayes']
    assert_equal(len(ayes), 14)
    assert_equal(attrs['Result'], 'Pass')
def chicago_can_get_history_detail_using_summary_row():
    config = Config(hostname='chicago.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    legislation_summary = {
        'URL': 'https://chicago.legistar.com/LegislationDetail.aspx?ID=1450228&GUID=97689689-D0EA-47A2-8474-09B3A71C221B&Options=Advanced&Search='
    }
    legislation_details = scraper.expandLegislationSummary(legislation_summary)
    history_summary = legislation_details[1][0]['Action Details']
    attrs, votes = scraper.expandHistorySummary(history_summary)
    ayes = [vote for vote in votes if vote['Vote'] == 'Yea']
    assert_equal(len(ayes), 49)
    assert_equal(attrs['Result'], 'Pass')
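# The vote rows expanded above are plain dicts, so tallies beyond the
# single 'Yea' count fall out of one pass with a Counter; a sketch against
# the same Chicago record:
def sketch_chicago_vote_tally():
    from collections import Counter
    config = Config(hostname='chicago.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    details = scraper.expandLegislationSummary({
        'URL': 'https://chicago.legistar.com/LegislationDetail.aspx?ID=1450228&GUID=97689689-D0EA-47A2-8474-09B3A71C221B&Options=Advanced&Search='
    })
    attrs, votes = scraper.expandHistorySummary(
        details[1][0]['Action Details'])
    # One pass over the rows counts every vote value at once.
    tally = Counter(vote['Vote'] for vote in votes)
    assert_equal(tally['Yea'], 49)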
def chicago_topics(): """Tests that scraper works for Chicago for legislation with and without topics""" config = Config(hostname="chicago.legistar.com").defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) legislation_with_topics = { "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1319481&GUID=40B01792-C9D8-4E8C-BADE-2D27BFC8284D" } legislation_details = scraper.expandLegislationSummary(legislation_with_topics) print legislation_details[0] assert_equal(legislation_details[0]["Topics"], [u"PUBLIC WAY USAGE - Awnings"]) legislation_no_topics = { "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1429779&GUID=118DDF75-D698-4526-BA54-B560BB6CCB04" } legislation_details = scraper.expandLegislationSummary(legislation_no_topics) assert_equal(legislation_details[0]["Topics"], [])
def default_legislation_and_calendar_uris():
    config = Config(hostname='synecdoche.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    assert_equal(scraper._legislation_uri,
                 'https://synecdoche.legistar.com/Legislation.aspx')
    assert_equal(scraper._calendar_uri,
                 'https://synecdoche.legistar.com/Calendar.aspx')
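# A hedged sketch of the hostname-to-URI mapping asserted above; the real
# attributes are built inside LegistarScraper and may be derived
# differently.
def sketch_build_uris(hostname):
    base = 'https://%s' % hostname
    return base + '/Legislation.aspx', base + '/Calendar.aspx'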
def philly_topics(): """Tests that scraper works for Philly legislation with and without topics""" config = Config(hostname='philly.legistar.com').defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) legislation_with_topics = { 'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1433307&GUID=773A9C3F-ABA5-4D6C-B901-A9EEE3B1B8B0' } legislation_details = scraper.expandLegislationSummary( legislation_with_topics) assert_equal(legislation_details[0]["Topics"], [u'LIQUOR BY THE DRINK TAX', u'SCHOOL TAX AUTHORIZATION']) legislation_no_topics = { 'URL': 'http://phila.legistar.com/LegislationDetail.aspx?ID=1426307&GUID=E9EC8885-0DDD-4B64-AB2D-EA0503284268' } legislation_details = scraper.expandLegislationSummary( legislation_no_topics) assert_equal(legislation_details[0]["Topics"], [])
def chicago_topics(): """Tests that scraper works for Chicago for legislation with and without topics""" config = Config(hostname='chicago.legistar.com').defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) legislation_with_topics = { 'URL': 'http://chicago.legistar.com/LegislationDetail.aspx?ID=1319481&GUID=40B01792-C9D8-4E8C-BADE-2D27BFC8284D' } legislation_details = scraper.expandLegislationSummary( legislation_with_topics) print legislation_details[0] assert_equal(legislation_details[0]["Topics"], [u'PUBLIC WAY USAGE - Awnings']) legislation_no_topics = { 'URL': 'http://chicago.legistar.com/LegislationDetail.aspx?ID=1429779&GUID=118DDF75-D698-4526-BA54-B560BB6CCB04' } legislation_details = scraper.expandLegislationSummary( legislation_no_topics) assert_equal(legislation_details[0]["Topics"], [])
def paging_through_calendar():
    config = Config(hostname='alexandria.legistar.com',
                    fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    events = list(scraper.councilCalendar('all'))
    assert_greater(len(events), 100)
# ---------------------------------------------------------------------------
# Hosted-site wrapper. In the source tree this class lives in its own
# module; the imports, module-level logger, and ScrapeError exception it
# relies on are reproduced here (locations assumed) so that it reads as a
# self-contained unit.
import logging
import time
import urllib2
import urlparse

log = logging.getLogger(__name__)


class ScrapeError(Exception):
    """Raised when a legislation record cannot be assembled."""


class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data. It is
    responsible for scraping data out of the site. The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        # Column labels vary between Legistar deployments; all of them can
        # be overridden through keyword options.
        self.id_label = options.pop('id_label', 'Record #')
        self.url_label = options.pop('url_label', 'URL')
        self.type_label = options.pop('type_label', 'Type')
        self.status_label = options.pop('status_label', 'Status')
        self.title_label = options.pop('title_label', 'Title')
        self.topics_label = options.pop('topics_label', 'Topic')
        self.intro_date_label = options.pop('intro_date_label', 'Intro Date')
        self.final_date_label = options.pop('final_date_label', 'Final Date')
        self.controlling_body_label = options.pop(
            'controlling_body_label', 'Current Controlling Legislative Body')
        self.version_label = options.pop('version_label', 'Version')

        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for
        the sake of record-keeping. It is the key passed to the site URL.'''
        # Keep trying until a summary row expands cleanly; on failure, skip
        # ahead to the next summary rather than giving up.
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            except AttributeError as e:
                log.warning(e)
                log.warning('skipping to next leg record')

            while True:
                try:
                    summary = self.legislation_summaries.next()
                    break
                except urllib2.URLError as e:
                    log.warning(e)
                    log.warning('sleeping for five minutes')
                    time.sleep(300)

        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # Re-order each sponsor name to '[First] [Last]' instead of
        # '[Last], [First]'.
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.split(',')
                name_list.reverse()
                sponsor = ' '.join(name_list).strip()
            first_name_first_sponsors.append(sponsor)

        # Prefer a parsed Topics list; fall back to splitting the
        # comma-joined topics column.
        topics = legislation_attrs.get('Topics', None)
        if topics is None:
            joined_topics = legislation_attrs.get(self.topics_label, '')
            topics = ([topic.strip() for topic in joined_topics.split(',')]
                      if joined_topics else [])

        try:
            record = {
                'key': key,
                'id': summary[self.id_label],
                'url': summary[self.url_label],
                'type': summary[self.type_label],
                'status': summary[self.status_label],
                'title': summary[self.title_label],
                'topics': topics,
                'controlling_body':
                    legislation_attrs[self.controlling_body_label],
                'intro_date':
                    self.convert_date(summary[self.intro_date_label]),
                'final_date': self.convert_date(
                    summary.setdefault(self.final_date_label, '')),
                'version': summary.setdefault(self.version_label, ''),
                #'contact': None,
                'sponsors': first_name_first_sponsors,
                # probably remove this from the model as well
                'minutes_url': None
            }
        except KeyError as e:
            raise ScrapeError('Failed to find key %s in either summary keys '
                              '(%r) or attrs (%r)' %
                              (e, summary.keys(), legislation_attrs.keys()))

        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = \
                    self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                log.warning(e)
                log.warning(summary)
                continue
            try:
                # 'Action By' is either a plain string or a link dict with
                # a 'label' key.
                acting_body = act['Action By']
                if not isinstance(acting_body, basestring):
                    acting_body = acting_body['label']
                action = {
                    'key': key,
                    'date_taken': self.convert_date(act['Date']),
                    'acting_body': acting_body,
                    'motion': act['Result'],
                    'description': act['Action'],
                    'notes': '',
                    'votes': [{'voter': vote['Person Name'],
                               'value': vote['Vote']}
                              for vote in act_votes]
                }
            except TypeError as e:
                log.warning(e)
                log.warning(summary)
                continue
            except KeyError as e:
                log.warning(act)
                log.warning(e)
                log.warning(summary)
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key,))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''

    def check_for_new_content(self, last_key):
        '''Grab the next legislation summary row. Doesn't use the last_key
        parameter; just starts at the beginning for each instance of the
        scraper.'''
        try:
            next_summary = self.legislation_summaries.next()
            return 0, next_summary
        except StopIteration:
            return None, None

    def init_pdf_cache(self, pdf_mapping):
        pass
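# A hedged usage sketch for the wrapper above: pull one summary row and
# expand it into a full record. The hostname and options mirror the tests
# earlier in the file.
def sketch_scrape_one_record():
    wrapper = HostedLegistarSiteWrapper(hostname='phila.legistar.com',
                                        fulltext=True)
    key, summary = wrapper.check_for_new_content(None)
    if summary is not None:
        record, attachments, actions, minutes = \
            wrapper.scrape_legis_file(key, summary)
        log.info('scraped %s with %d actions' % (record['id'], len(actions)))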
def paging_through_calendar(): config = Config(hostname="alexandria.legistar.com", fulltext=False).defaults(DEFAULT_CONFIG) scraper = LegistarScraper(config) events = list(scraper.councilCalendar("all")) assert_greater(len(events), 100)
def paging_through_council_members():
    config = Config(hostname='a2gov.legistar.com',
                    fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    members = list(scraper.councilMembers(follow_links=False))
    assert_greater(len(members), 100)