def parse_sponsors():
    """Parse sponsor names out of a locally saved Chicago legislation detail page."""
    fixture = "tests/local/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F"
    with open(fixture) as f:
        soup = BeautifulSoup(f)
    config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    details = scraper.parseLegislationDetail(soup)
    sponsors = details[0]["Sponsors"]
    assert_equal(sponsors[1], u"Moreno, Proco Joe")
def supports_simple_initial_search_form():
    """An empty search on a 'simple' Legistar site (Philadelphia) yields results.

    Fails the test if the summaries generator is immediately exhausted.
    """
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    try:
        # next() builtin (Python 2.6+) instead of the generator's .next() method.
        next(summaries)
    except StopIteration:
        fail("no legislation found")
def philly_sponsors():
    """Sponsor names on a Philadelphia detail page come through as plain strings."""
    config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1233260&GUID=DC103FB6-FF9D-4250-B0CE-111B80E8B80C"
    }
    details = scraper.expandLegislationSummary(summary)
    first_sponsor = details[0]["Sponsors"][0]
    assert_equal(first_sponsor, u"Councilmember DiCicco")
def supports_simple_initial_search_form():
    """An empty search against Philadelphia's simple search form yields results."""
    config = Config(hostname="phila.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    try:
        next(summaries)  # next() builtin instead of the .next() method
    except StopIteration:
        fail("no legislation found")
def supports_fetching_calendar():
    """The council calendar for 'all' bodies should yield at least one event."""
    config = Config(hostname="phila.legistar.com", fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    events = scraper.councilCalendar("all")
    try:
        next(events)  # next() builtin instead of the .next() method
    except StopIteration:
        fail("no events found")
def supports_fetching_council_members():
    """The council member roster should yield at least one member."""
    config = Config(hostname="phila.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    members = scraper.councilMembers()
    try:
        next(members)  # next() builtin instead of the .next() method
    except StopIteration:
        fail("no council members found")
# --- example 7 marker (was stray "示例#7" extraction residue) ---
def link_address_is_none():
    """A link tag with neither href nor onclick has no extractable address."""
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    from BeautifulSoup import BeautifulSoup
    anchor = BeautifulSoup('<html><a></a></html>').find('a')
    assert_is_none(scraper._get_link_address(anchor))
def recognize_dates():
    """With date_format configured, 'File Created' is parsed into a datetime."""
    config = {"hostname": "phila.legistar.com", "date_format": "%m/%d/%Y", "fulltext": True}
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    summary = next(summaries)  # next() builtin instead of the .next() method
    import datetime

    assert_is_instance(summary["File Created"], datetime.datetime)
def link_address_is_none():
    """An anchor with no href and no onclick yields no address at all."""
    config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    from BeautifulSoup import BeautifulSoup

    bare_anchor = BeautifulSoup("<html><a></a></html>").find("a")
    result = scraper._get_link_address(bare_anchor)
    assert_is_none(result)
def parse_sponsors():
    """The second sponsor on a Chicago detail page is parsed as 'Last, First'."""
    config = {"hostname": "chicago.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F"
    }
    details = scraper.expandLegislationSummary(summary)
    assert_equal(details[0]["Sponsors"][1], u"Moreno, Proco Joe")
def recognize_dates():
    """With date_format set on a Config, date fields come back as datetimes."""
    config = Config(hostname="phila.legistar.com", sponsor_links=False, date_format="%m/%d/%Y").defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    summary = next(summaries)  # next() builtin instead of the .next() method
    import datetime

    assert_is_instance(summary["File Created"], datetime.datetime)
def supports_advanced_initial_search_form():
    """An empty search on an 'advanced' Legistar site (Chicago) yields results."""
    config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation("")
    try:
        next(summaries)  # next() builtin instead of the .next() method
    except StopIteration:
        # Report a descriptive failure, consistent with the sibling tests,
        # instead of a bare `assert False`.
        fail("no legislation found")
def paging_through_results():
    """A search for a common term ('pub') pages through more than 100 results."""
    config = {"hostname": "chicago.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    # Making summaries a list forces the scraper to iterate completely through
    # the generator, following every "next page" link.
    summaries = list(scraper.searchLegislation("pub"))
    for s in summaries:
        # print() call form works on both Python 2 and 3 for a single argument.
        print(s["Record #"])
    assert_greater(len(summaries), 100)
def link_address_is_href():
    """The address of a link comes from its href attribute when present."""
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)

    from BeautifulSoup import BeautifulSoup

    anchor = BeautifulSoup('<html><a href="http://www.google.com"></a></html>').find("a")
    assert_equal(scraper._get_link_address(anchor), "http://www.google.com")
def parse_detail_keys():
    """Detail attribute keys are cleaned-up labels, not raw page header text."""
    config = Config(hostname="phila.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1265815&GUID=97CBBF7C-A123-4808-9D50-A1E340BE5BC1"
    }
    detail = scraper.expandLegislationSummary(summary)
    # Membership tests work directly on the dict; calling .keys() is redundant.
    assert_in(u"Version", detail[0])
    assert_not_in(u"CITY COUNCIL", detail[0])
def link_address_is_none():
    """An empty anchor produces a None address."""
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)

    from BeautifulSoup import BeautifulSoup

    anchor = BeautifulSoup("<html><a></a></html>").find("a")
    result = scraper._get_link_address(anchor)
    assert_is_none(result)
def link_address_is_onclick():
    """When a link has no href, its address is pulled from the onclick handler."""
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)

    from BeautifulSoup import BeautifulSoup

    markup = "<html><a onclick=\"radopen('http://www.google.com');\"></a></html>"
    anchor = BeautifulSoup(markup).find("a")
    assert_equal(scraper._get_link_address(anchor), "http://www.google.com")
# --- example 18 marker (was stray "示例#18" extraction residue) ---
def supports_fetching_council_members():
    """The Philadelphia council member roster should yield at least one member."""
    config = Config(hostname='phila.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    members = scraper.councilMembers()
    try:
        next(members)  # next() builtin instead of the .next() method
    except StopIteration:
        fail('no council members found')
def paging_through_legislation():
    """A search for 'pub' pages through more than 100 Chicago results."""
    config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Making summaries a list forces the scraper to iterate completely through
    # the generator, following every "next page" link.
    summaries = list(scraper.searchLegislation("pub"))
    for s in summaries:
        # print() call form works on both Python 2 and 3 for a single argument.
        print(s["Record #"])
    assert_greater(len(summaries), 100)
# --- example 20 marker (was stray "示例#20" extraction residue) ---
def supports_fetching_calendar():
    """The council calendar for 'all' bodies should yield at least one event."""
    config = Config(hostname='phila.legistar.com',
                    fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    events = scraper.councilCalendar('all')
    try:
        next(events)  # next() builtin instead of the .next() method
    except StopIteration:
        fail('no events found')
def supports_simple_initial_search_form():
    """An empty search on Philadelphia's simple search form yields results."""
    config = Config(hostname='phila.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation('')
    try:
        next(summaries)  # next() builtin instead of the .next() method
    except StopIteration:
        fail('no legislation found')
# --- example 22 marker (was stray "示例#22" extraction residue) ---
def attachments_list():
    """The Attachments value on a detail record is always a list."""
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        'URL':
        'http://phila.legistar.com/LegislationDetail.aspx?ID=1243262&GUID=01021C5A-3624-4E5D-AA32-9822D1F5DA29&Options=ID|Text|&Search='
    }
    detail = scraper.expandLegislationSummary(summary)
    assert_is_instance(detail[0]['Attachments'], list)
def parse_sponsors():
    """Sponsors on a Chicago detail page keep their 'Last, First' form."""
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    url = ('http://chicago.legistar.com/LegislationDetail.aspx'
           '?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F')
    details = scraper.expandLegislationSummary({'URL': url})
    assert_equal(details[0]["Sponsors"][1], u'Moreno, Proco Joe')
def attachments_list():
    """'Attachments' on a detail record is a list when attachments exist."""
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1243262&GUID=01021C5A-3624-4E5D-AA32-9822D1F5DA29&Options=ID|Text|&Search="
    }
    detail = scraper.expandLegislationSummary(summary)
    assert_is_instance(detail[0]["Attachments"], list)
def attachments_list():
    """With sponsor links disabled, attachments still come back as a list."""
    config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1243262&GUID=01021C5A-3624-4E5D-AA32-9822D1F5DA29&Options=ID|Text|&Search="
    }
    detail = scraper.expandLegislationSummary(summary)
    attachments = detail[0]["Attachments"]
    assert_is_instance(attachments, list)
# --- example 26 marker (was stray "示例#26" extraction residue) ---
def parse_sponsors():
    """parseLegislationDetail extracts sponsors from a saved Chicago page."""
    path = ('tests/local/LegislationDetail.aspx'
            '?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F')
    with open(path) as f:
        soup = BeautifulSoup(f)
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    details = scraper.parseLegislationDetail(soup)
    assert_equal(details[0]["Sponsors"][1], u'Moreno, Proco Joe')
# --- example 27 marker (was stray "示例#27" extraction residue) ---
def supports_advanced_initial_search_form():
    """An empty search on Chicago's advanced search form yields results."""
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation('')
    try:
        next(summaries)  # next() builtin instead of the .next() method
    except StopIteration:
        # Use a descriptive failure like the sibling tests, rather than a
        # bare `assert False`.
        fail('no legislation found')
def no_attachments_list():
    """Legislation without attachments has no 'Attachments' key at all."""
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1254964&GUID=AF8A4E91-4DF6-41A2-80B4-EFC94A2AFF89&Options=ID|Text|&Search="
    }
    detail = scraper.expandLegislationSummary(summary)
    assert_not_in("Attachments", detail[0])
# --- example 29 marker (was stray "示例#29" extraction residue) ---
def paging_through_legislation():
    """A search for 'pub' pages through more than 100 Chicago results."""
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Making summaries a list forces the scraper to iterate completely through
    # the generator, following every "next page" link.
    summaries = list(scraper.searchLegislation('pub'))
    for s in summaries:
        # print() call form works on both Python 2 and 3 for a single argument.
        print(s['Record #'])
    assert_greater(len(summaries), 100)
def link_address_is_onclick():
    """With sponsor links disabled, onclick URLs are still extracted."""
    config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG)

    scraper = LegistarScraper(config)

    from BeautifulSoup import BeautifulSoup

    markup = "<html><a onclick=\"radopen('http://www.google.com');\"></a></html>"
    anchor = BeautifulSoup(markup).find("a")
    assert_equal(scraper._get_link_address(anchor), "http://www.google.com")
# --- example 31 marker (was stray "示例#31" extraction residue) ---
def parse_detail_keys():
    """Detail keys are cleaned labels; raw page headers are excluded."""
    config = Config(hostname='phila.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        'URL':
        'http://phila.legistar.com/LegislationDetail.aspx?ID=1265815&GUID=97CBBF7C-A123-4808-9D50-A1E340BE5BC1'
    }
    detail = scraper.expandLegislationSummary(summary)
    # Membership tests work directly on the dict; calling .keys() is redundant.
    assert_in(u'Version', detail[0])
    assert_not_in(u'CITY COUNCIL', detail[0])
# --- example 32 marker (was stray "示例#32" extraction residue) ---
def no_attachments_list():
    """A record with no attachments omits the 'Attachments' key entirely."""
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)

    scraper = LegistarScraper(config)
    summary = {
        'URL':
        'http://phila.legistar.com/LegislationDetail.aspx?ID=1254964&GUID=AF8A4E91-4DF6-41A2-80B4-EFC94A2AFF89&Options=ID|Text|&Search='
    }
    detail = scraper.expandLegislationSummary(summary)
    assert_not_in('Attachments', detail[0])
# --- example 33 marker (was stray "示例#33" extraction residue) ---
def recognize_dates():
    """With date_format configured, 'File Created' parses to a datetime."""
    config = Config(
        hostname='phila.legistar.com',
        sponsor_links=False,
        date_format='%m/%d/%Y',
    ).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summaries = scraper.searchLegislation('')
    summary = next(summaries)  # next() builtin instead of the .next() method
    import datetime
    assert_is_instance(summary['File Created'], datetime.datetime)
# --- example 34 marker (was stray "示例#34" extraction residue) ---
def link_address_is_href():
    """An href attribute is used directly as the link address."""
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)

    scraper = LegistarScraper(config)

    from BeautifulSoup import BeautifulSoup
    markup = '<html><a href="http://www.google.com"></a></html>'
    anchor = BeautifulSoup(markup).find('a')
    assert_equal(scraper._get_link_address(anchor), 'http://www.google.com')
def no_attachments_list():
    """No attachment rows on the page means no 'Attachments' key in the record."""
    config = Config(hostname="phila.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG)

    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1254964&GUID=AF8A4E91-4DF6-41A2-80B4-EFC94A2AFF89&Options=ID|Text|&Search="
    }
    detail = scraper.expandLegislationSummary(summary)
    assert_not_in("Attachments", detail[0])
# --- example 36 marker (was stray "示例#36" extraction residue) ---
def philly_sponsors():
    """Philadelphia sponsor entries are title-prefixed plain strings."""
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    url = ('http://phila.legistar.com/LegislationDetail.aspx'
           '?ID=1233260&GUID=DC103FB6-FF9D-4250-B0CE-111B80E8B80C')
    details = scraper.expandLegislationSummary({'URL': url})
    assert_equal(details[0]["Sponsors"][0], u'Councilmember DiCicco')
# --- example 37 marker (was stray "示例#37" extraction residue) ---
def history_row_url():
    """Action Details cells in the history table carry absolute URLs."""
    config = Config(hostname='chicago.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        'URL':
        'http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F&Options=Advanced&Search='
    }
    detail = scraper.expandLegislationSummary(summary)
    expected = 'https://chicago.legistar.com/HistoryDetail.aspx?ID=6534991&GUID=253AA818-B592-4594-8237-0A617AA41766'
    assert_equal(detail[1][0]['Action Details']['url'], expected)
def history_row_url():
    """Each history row's Action Details link resolves to an absolute HTTPS URL."""
    config = Config(hostname="chicago.legistar.com", fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F&Options=Advanced&Search="
    }
    detail = scraper.expandLegislationSummary(summary)
    expected = "https://chicago.legistar.com/HistoryDetail.aspx?ID=6534991&GUID=253AA818-B592-4594-8237-0A617AA41766"
    assert_equal(detail[1][0]["Action Details"]["url"], expected)
def history_row_url():
    """The 'URL' field of a history row points at the HistoryDetail page."""
    config = {"hostname": "chicago.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1255978&GUID=8051C1E6-DED6-433B-AC9A-0FE436051C9F&Options=Advanced&Search="
    }
    detail = scraper.expandLegislationSummary(summary)
    expected = "http://chicago.legistar.com/HistoryDetail.aspx?ID=6534991&GUID=253AA818-B592-4594-8237-0A617AA41766"
    assert_equal(detail[1][0]["URL"], expected)
def can_get_history_detail_using_summary_row():
    """A Philly history row expands into attrs and votes: 14 ayes, result Pass."""
    config = {"hostname": "phila.legistar.com", "fulltext": True}
    scraper = LegistarScraper(config)
    summary = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1236768&GUID=EB92A4C2-469A-4D73-97C0-A620BBDDD5BE&Options=ID|Text|&Search="
    }
    details = scraper.expandLegislationSummary(summary)
    history_summary = details[1][2]

    attrs, votes = scraper.expandHistorySummary(history_summary)
    aye_votes = [v for v in votes if v["Vote"] == "Ayes"]
    assert_equal(len(aye_votes), 14)
    assert_equal(attrs["Result"], "Pass")
def can_get_history_detail_using_summary_row():
    """Expanding a Chicago Action Details cell yields attrs and roll-call votes."""
    config = Config(hostname="chicago.legistar.com", sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    summary = {
        "URL": "https://chicago.legistar.com/LegislationDetail.aspx?ID=1450228&GUID=97689689-D0EA-47A2-8474-09B3A71C221B&Options=Advanced&Search="
    }
    details = scraper.expandLegislationSummary(summary)

    history_summary = details[1][0]["Action Details"]

    attrs, votes = scraper.expandHistorySummary(history_summary)
    yea_votes = [v for v in votes if v["Vote"] == "Yea"]
    assert_equal(len(yea_votes), 49)
    assert_equal(attrs["Result"], "Pass")
def philly_topics():
    """Tests that scraper works for Philly legislation with and without topics"""
    # NOTE(review): hostname is "philly.legistar.com" while the detail URLs
    # below use "phila.legistar.com" -- confirm this mismatch is intentional.
    config = Config(hostname="philly.legistar.com").defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Legislation known to carry two topic entries.
    legislation_with_topics = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1433307&GUID=773A9C3F-ABA5-4D6C-B901-A9EEE3B1B8B0"
    }
    legislation_details = scraper.expandLegislationSummary(legislation_with_topics)
    assert_equal(legislation_details[0]["Topics"], [u"LIQUOR BY THE DRINK TAX", u"SCHOOL TAX AUTHORIZATION"])
    # Legislation known to carry no topics; the list should be empty, not missing.
    legislation_no_topics = {
        "URL": "http://phila.legistar.com/LegislationDetail.aspx?ID=1426307&GUID=E9EC8885-0DDD-4B64-AB2D-EA0503284268"
    }
    legislation_details = scraper.expandLegislationSummary(legislation_no_topics)
    assert_equal(legislation_details[0]["Topics"], [])
def can_get_history_detail_using_summary_row():
    """A Philadelphia history row expands to vote details: 14 ayes, result Pass."""
    config = Config(hostname='phila.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    url = ('http://phila.legistar.com/LegislationDetail.aspx'
           '?ID=1236768&GUID=EB92A4C2-469A-4D73-97C0-A620BBDDD5BE&Options=ID|Text|&Search=')
    details = scraper.expandLegislationSummary({'URL': url})
    history_summary = details[1][2]

    attrs, votes = scraper.expandHistorySummary(history_summary)
    aye_votes = [v for v in votes if v['Vote'] == 'Ayes']
    assert_equal(len(aye_votes), 14)
    assert_equal(attrs['Result'], 'Pass')
    def __init__(self, **options):
        """Pop field-label overrides out of *options*, then build the scraper.

        Whatever remains in *options* after the pops becomes the scraper's
        configuration.
        """
        # Labels used to read values out of legislation summary rows; each
        # can be overridden with a keyword argument.
        self.id_label = options.pop('id_label', 'Record #')
        self.url_label = options.pop('url_label', 'URL')
        self.type_label = options.pop('type_label', 'Type')
        self.status_label = options.pop('status_label', 'Status')
        self.title_label = options.pop('title_label', 'Title')
        self.topics_label = options.pop('topics_label', 'Topic')
        self.intro_date_label = options.pop('intro_date_label', 'Intro Date')
        self.final_date_label = options.pop('final_date_label', 'Final Date')
        self.controlling_body_label = options.pop(
            'controlling_body_label', 'Current Controlling Legislative Body')
        self.version_label = options.pop('version_label', 'Version')

        # Seed a generator over legislation created before 2012-10-05.
        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')
# --- example 45 marker (was stray "示例#45" extraction residue) ---
def can_get_history_detail_using_summary_row():
    """A Chicago Action Details cell expands to 49 'Yea' votes with result Pass."""
    config = Config(hostname='chicago.legistar.com',
                    sponsor_links=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    url = ('https://chicago.legistar.com/LegislationDetail.aspx'
           '?ID=1450228&GUID=97689689-D0EA-47A2-8474-09B3A71C221B&Options=Advanced&Search=')
    details = scraper.expandLegislationSummary({'URL': url})

    history_summary = details[1][0]['Action Details']

    attrs, votes = scraper.expandHistorySummary(history_summary)
    yea_votes = [v for v in votes if v['Vote'] == 'Yea']
    assert_equal(len(yea_votes), 49)
    assert_equal(attrs['Result'], 'Pass')
def chicago_topics():
    """Tests that scraper works for Chicago for legislation with and without topics"""
    config = Config(hostname="chicago.legistar.com").defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Legislation known to carry a single topic entry.
    legislation_with_topics = {
        "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1319481&GUID=40B01792-C9D8-4E8C-BADE-2D27BFC8284D"
    }
    legislation_details = scraper.expandLegislationSummary(legislation_with_topics)

    # print() call form works on both Python 2 and 3 for a single argument.
    print(legislation_details[0])
    assert_equal(legislation_details[0]["Topics"], [u"PUBLIC WAY USAGE - Awnings"])

    # Legislation known to carry no topics; the list should be empty, not missing.
    legislation_no_topics = {
        "URL": "http://chicago.legistar.com/LegislationDetail.aspx?ID=1429779&GUID=118DDF75-D698-4526-BA54-B560BB6CCB04"
    }
    legislation_details = scraper.expandLegislationSummary(legislation_no_topics)
    assert_equal(legislation_details[0]["Topics"], [])
# --- example 47 marker (was stray "示例#47" extraction residue) ---
def default_legislation_and_calendar_uris():
    """Legislation and calendar URIs default to HTTPS pages on the host."""
    config = Config(hostname='synecdoche.legistar.com',
                    fulltext=True).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    host = 'https://synecdoche.legistar.com'
    assert_equal(scraper._legislation_uri, host + '/Legislation.aspx')
    assert_equal(scraper._calendar_uri, host + '/Calendar.aspx')
# --- example 48 marker (was stray "示例#48" extraction residue) ---
def philly_topics():
    """Tests that scraper works for Philly legislation with and without topics"""
    # NOTE(review): hostname is 'philly.legistar.com' while the detail URLs
    # below use 'phila.legistar.com' -- confirm this mismatch is intentional.
    config = Config(hostname='philly.legistar.com').defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    # Legislation known to carry two topic entries.
    legislation_with_topics = {
        'URL':
        'http://phila.legistar.com/LegislationDetail.aspx?ID=1433307&GUID=773A9C3F-ABA5-4D6C-B901-A9EEE3B1B8B0'
    }
    legislation_details = scraper.expandLegislationSummary(
        legislation_with_topics)
    assert_equal(legislation_details[0]["Topics"],
                 [u'LIQUOR BY THE DRINK TAX', u'SCHOOL TAX AUTHORIZATION'])
    # Legislation known to carry no topics; the list should be empty, not missing.
    legislation_no_topics = {
        'URL':
        'http://phila.legistar.com/LegislationDetail.aspx?ID=1426307&GUID=E9EC8885-0DDD-4B64-AB2D-EA0503284268'
    }
    legislation_details = scraper.expandLegislationSummary(
        legislation_no_topics)
    assert_equal(legislation_details[0]["Topics"], [])
# --- example 49 marker (was stray "示例#49" extraction residue) ---
def chicago_topics():
    """Tests that scraper works for Chicago for legislation with and without topics"""
    config = Config(hostname='chicago.legistar.com').defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    with_topics = {
        'URL':
        'http://chicago.legistar.com/LegislationDetail.aspx?ID=1319481&GUID=40B01792-C9D8-4E8C-BADE-2D27BFC8284D'
    }
    details = scraper.expandLegislationSummary(with_topics)

    print(details[0])
    assert_equal(details[0]["Topics"], [u'PUBLIC WAY USAGE - Awnings'])

    without_topics = {
        'URL':
        'http://chicago.legistar.com/LegislationDetail.aspx?ID=1429779&GUID=118DDF75-D698-4526-BA54-B560BB6CCB04'
    }
    details = scraper.expandLegislationSummary(without_topics)
    assert_equal(details[0]["Topics"], [])
    def __init__(self, **options):
        """Pop field-label overrides out of *options*, then build the scraper.

        Whatever remains in *options* after the pops becomes the scraper's
        configuration.
        """
        # Labels used to read values out of legislation summary rows; each
        # can be overridden with a keyword argument.
        self.id_label = options.pop('id_label', 'Record #')
        self.url_label = options.pop('url_label', 'URL')
        self.type_label = options.pop('type_label', 'Type')
        self.status_label = options.pop('status_label', 'Status')
        self.title_label = options.pop('title_label', 'Title')
        self.indexes_label = options.pop('indexes_label', 'Indexes')
        self.intro_date_label = options.pop('intro_date_label', 'Intro Date')
        self.final_date_label = options.pop('final_date_label', 'Final Date')
        self.controlling_body_label = options.pop('controlling_body_label', 'Current Controlling Legislative Body')
        self.version_label = options.pop('version_label', 'Version')

        # Seed a generator over legislation created before 2012-10-05.
        self.scraper = LegistarScraper(options)
        self.legislation_summaries =  self.scraper.searchLegislation('', created_before='2012-10-5')
# --- example 51 marker (was stray "示例#51" extraction residue) ---
def paging_through_calendar():
    """Alexandria's full calendar contains well over one page of events."""
    config = Config(hostname='alexandria.legistar.com',
                    fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    all_events = list(scraper.councilCalendar('all'))
    assert_greater(len(all_events), 100)
class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data.  It is
    responsible for scraping data out of the site.  The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        """Pop field-label overrides out of *options*, then build the scraper.

        Whatever remains in *options* after the pops becomes the scraper's
        configuration.
        """
        # Labels used to read values out of legislation summary rows; each
        # can be overridden with a keyword argument.
        self.id_label = options.pop('id_label', 'Record #')
        self.url_label = options.pop('url_label', 'URL')
        self.type_label = options.pop('type_label', 'Type')
        self.status_label = options.pop('status_label', 'Status')
        self.title_label = options.pop('title_label', 'Title')
        self.topics_label = options.pop('topics_label', 'Topic')
        self.intro_date_label = options.pop('intro_date_label', 'Intro Date')
        self.final_date_label = options.pop('final_date_label', 'Final Date')
        self.controlling_body_label = options.pop(
            'controlling_body_label', 'Current Controlling Legislative Body')
        self.version_label = options.pop('version_label', 'Version')

        # Seed a generator over legislation created before 2012-10-05.
        self.scraper = LegistarScraper(options)
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for the
           sake of record-keeping.  It is the key passed to the site URL.

           Returns a (record, attachments, actions, minutes) tuple.'''

        # Expand the summary; on a URL or parse error, log it and advance to
        # the next summary from the search generator, retrying until one works.
        while True:
            try:
                legislation_attrs, legislation_history = self.scraper.expandLegislationSummary(
                    summary)
                break
            except urllib2.URLError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            except AttributeError as e:
                log.warning(e)
                log.warning('skipping to next leg record')
            while True:
                try:
                    summary = next(self.legislation_summaries)
                    break
                except urllib2.URLError as e:
                    log.warning(e)
                    log.warning('sleeping for five minutes')
                    # BUG FIX: time.sleep() takes a number of seconds, not the
                    # string '360' (which raises TypeError); per the log
                    # message above, five minutes is 300 seconds.
                    time.sleep(300)

        # The record key is the ID query parameter of the summary URL.
        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # re-order the sponsor name by '[First] [Last]' instead of '[Last], [First]'
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.split(',')
                name_list.reverse()
                sponsor = ' '.join(name_list).strip()
            first_name_first_sponsors.append(sponsor)

        # Prefer an already-split 'Topics' list; otherwise split the
        # comma-joined string stored under the configured topics label.
        topics = legislation_attrs.get('Topics', None)
        if topics is None:
            joined_topics = legislation_attrs.get(self.topics_label, '')
            topics = [topic.strip() for topic in joined_topics.split(',')]

        try:
            record = {
                'key':
                key,
                'id':
                summary[self.id_label],
                'url':
                summary[self.url_label],
                'type':
                summary[self.type_label],
                'status':
                summary[self.status_label],
                'title':
                summary[self.title_label],
                'topics':
                topics,
                'controlling_body':
                legislation_attrs[self.controlling_body_label],
                'intro_date':
                self.convert_date(summary[self.intro_date_label]),
                'final_date':
                self.convert_date(summary.setdefault(self.final_date_label,
                                                     '')),
                'version':
                summary.setdefault(self.version_label, ''),
                #'contact' : None,
                'sponsors':
                first_name_first_sponsors,
                # probably remove this from the model as well
                'minutes_url':
                None
            }
        except KeyError as e:  # modern syntax (was legacy `except KeyError, e`)
            raise ScrapeError('Failed to find key %s in either summary keys '
                              '(%r) or attrs (%r)' %
                              (e, summary.keys(), legislation_attrs.keys()))

        # Attachments are optional; normalize each one's 'label' into
        # separate 'file' and 'description' fields.
        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        # Expand each history row into an action with its roll-call votes.
        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                print(e)
                print(summary)
                continue
            try:
                acting_body = act['Action By']
                if not isinstance(acting_body, basestring):
                    acting_body = acting_body['label']

                action = {
                    'key':
                    key,
                    'date_taken':
                    self.convert_date(act['Date']),
                    'acting_body':
                    acting_body,
                    'motion':
                    act['Result'],
                    'description':
                    act['Action'],
                    'notes':
                    '',
                    'votes': [{
                        'voter': vote['Person Name'],
                        'value': vote['Vote']
                    } for vote in act_votes]
                }
            except TypeError as e:
                print(e)
                print(summary)
                continue
            except KeyError as e:
                print(act)
                print(e)
                print(summary)
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key, ))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes
def paging_through_calendar():
    """The full Alexandria calendar spans more than one page of events."""
    config = Config(hostname="alexandria.legistar.com", fulltext=False).defaults(DEFAULT_CONFIG)
    scraper = LegistarScraper(config)
    fetched_events = list(scraper.councilCalendar("all"))
    assert_greater(len(fetched_events), 100)
 def __init__(self, **options):
     """Build the scraper from *options* and start an initial search."""
     self.scraper = LegistarScraper(options)
     # Pre-fetch a generator over legislation created before 2012-10-05.
     self.legislation_summaries = self.scraper.searchLegislation(
         '', created_before='2012-10-5')
class HostedLegistarSiteWrapper(object):
    """
    A facade over the Philadelphia city council legistar site data.  It is
    responsible for scraping data out of the site.  The main external point
    of interaction is scrape_legis_file.

    requires: BeautifulSoup, mechanize
    """

    def __init__(self, **options):
        self.scraper = LegistarScraper(options)
        # Empty query string means "all legislation"; bounded by a fixed
        # cutoff date so repeated runs see a stable result set.
        self.legislation_summaries = self.scraper.searchLegislation(
            '', created_before='2012-10-5')

    def scrape_legis_file(self, key, summary):
        '''Extract a record from the given document (soup). The key is for the
           sake of record-keeping.  It is the key passed to the site URL.

           Returns a (record, attachments, actions, minutes) tuple.'''

        # Keep retrying until a legislation detail page expands successfully,
        # advancing to the next summary row after each failure.
        while True:
            try:
                legislation_attrs, legislation_history = \
                    self.scraper.expandLegislationSummary(summary)
                break
            except urllib2.URLError as e:
                print(e)
                print('skipping to next leg record')
            except AttributeError as e:
                print(e)
                print('skipping to next leg record')
            # Fetch the next summary, retrying on transient network errors.
            while True:
                try:
                    summary = next(self.legislation_summaries)
                    break
                except urllib2.URLError as e:
                    print(e)
                    print('sleeping for five minutes')
                    # BUG FIX: was time.sleep('360') — a string argument,
                    # which raises TypeError in time.sleep().  Sleep 300s to
                    # match the "five minutes" message above.
                    time.sleep(300)

        # The summary may have been replaced above, so re-derive the key from
        # the detail-page URL (this deliberately shadows the `key` parameter).
        parsed_url = urlparse.urlparse(summary['URL'])
        key = urlparse.parse_qs(parsed_url.query)['ID'][0]

        # re-order the sponsor name by '[First] [Last]' instead of '[Last], [First]'
        sponsors = legislation_attrs['Sponsors']
        first_name_first_sponsors = []
        for sponsor in sponsors:
            if ',' in sponsor:
                name_list = sponsor.split(',')
                name_list.reverse()
                sponsor = ' '.join(name_list).strip()
            first_name_first_sponsors.append(sponsor)

        record = {
            'key': key,
            'id': summary['Record #'],
            'url': summary['URL'],
            'type': summary['Type'],
            'status': summary['Status'],
            'title': summary['Title'],
            'controlling_body':
                legislation_attrs['Current Controlling Legislative Body'],
            'intro_date': self.convert_date(summary['Intro Date']),
            # setdefault: some summary rows lack these columns entirely.
            'final_date': self.convert_date(summary.setdefault('Final Date', '')),
            'version': summary.setdefault('Version', ''),
            #'contact' : None,
            'sponsors': first_name_first_sponsors,
            # probably remove this from the model as well
            'minutes_url': None
        }

        # Normalize attachment rows: tag with the legfile key and rename
        # 'label' to 'file'/'description'.  Files without attachments simply
        # lack the 'Attachments' key.
        try:
            attachments = legislation_attrs['Attachments']
            for attachment in attachments:
                attachment['key'] = key
                attachment['file'] = attachment['label']
                attachment['description'] = attachment['label']
                del attachment['label']
        except KeyError:
            attachments = []

        actions = []
        for act in legislation_history:
            try:
                act_details, act_votes = self.scraper.expandHistorySummary(act)
            except (KeyError, AttributeError) as e:
                print(e)
                print(summary)
                continue
            try:
                # 'Action By' may be a plain string or a {'label': ...}
                # mapping depending on the page.  BUG FIX: the bare
                # ['label'] lookup raised TypeError on strings, silently
                # dropping the action row; accept both shapes, as the
                # equivalent code elsewhere in this file already does.
                acting_body = act['Action By']
                if isinstance(acting_body, dict):
                    acting_body = acting_body['label']
                action = {
                    'key': key,
                    'date_taken': self.convert_date(act['Date']),
                    'acting_body': acting_body,
                    'motion': act['Result'],
                    'description': act['Action'],
                    'notes': ''
                }
            except TypeError as e:
                print(e)
                print(summary)
                continue
            except KeyError as e:
                # Unexpected row shape — dump context and fail loudly.
                print(act)
                print(e)
                print(summary)
                raise
            actions.append(action)

        # we should probably remove this from the model since the hosted
        # legistar does not have minutes
        minutes = []

        log.info('Scraped legfile with key %r' % (key, ))
        log.debug("%r %r %r %r" % (record, attachments, actions, minutes))

        return record, attachments, actions, minutes

    def convert_date(self, orig_date):
        '''Parse an 'MM/DD/YYYY' string into a datetime.date; return '' for
           falsy (missing) input.'''
        if orig_date:
            return datetime.datetime.strptime(orig_date, '%m/%d/%Y').date()
        else:
            return ''

    def check_for_new_content(self, last_key):
        '''Grab the next legislation summary row. Doesn't use the last_key
           parameter; just starts at the beginning for each instance of the
           scraper.
        '''
        try:
            print('next leg record')
            next_summary = next(self.legislation_summaries)
            return 0, next_summary
        except StopIteration:
            # Iterator exhausted — signal "nothing new" to the caller.
            return None, None

    def init_pdf_cache(self, pdf_mapping):
        # No-op: hosted legistar needs no PDF cache priming.
        pass
# 示例 #56 (Example #56) — extraction artifact from the snippet source;
# the stray bare tokens ("示例#56" and "0") broke the module as Python, so
# they are preserved here as a comment only.
def paging_through_council_members():
    """Ann Arbor's council-member roster should page well past 100 entries."""
    cfg = Config(hostname='a2gov.legistar.com', fulltext=False)
    scraper = LegistarScraper(cfg.defaults(DEFAULT_CONFIG))
    roster = list(scraper.councilMembers(follow_links=False))
    assert_greater(len(roster), 100)