Python PhillyLegistarSiteWrapper примеры использования

Язык программирования: Python

Пространство имен/Пакет: phillyleg.management.scraper_wrappers

Примеров на hotexamples.com: 23

Python PhillyLegistarSiteWrapper — найдено 23 примера. Это лучшие примеры кода на Python для phillyleg.management.scraper_wrappers.PhillyLegistarSiteWrapper, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

PhillyLegistarSiteWrapper(12)

extract_pdf_text(3)

urlopen(3)

get_minutes_date(2)

check_for_new_content(1)

collect_minutes(1)

convert_date(1)

extract_xml_text(1)

get_minutes_doc(1)

is_error_page(1)

scrape_legis_file(1)

Пример #1

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_MinutesDateParsedCorrectly(self):
        # The URL carries a yy-mm-dd stamp ("83-12-06"); the wrapper is
        # expected to turn it into a full date in the 2000s.
        scraper = PhillyLegistarSiteWrapper()
        minutes_url = 'http://www.bogus.com/path/mydoc_83-12-06_bill.pdf'

        parsed = scraper.get_minutes_date(minutes_url)

        # They learned nothing from Y2K: two-digit year 83 -> 2083.
        self.assertEqual(parsed, dt.date(2083, 12, 6))

Пример #2

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_detectsErrorsCorrectly(self):
        # Fixture '12000' is a page the wrapper must flag as an error page;
        # fixture '73' is a page it must accept as a regular one.
        wrapper = PhillyLegistarSiteWrapper(root_url='')

        error_soup = bs.BeautifulSoup(self.open_legfile('12000').read())
        self.assertTrue(wrapper.is_error_page(error_soup))

        # assertFalse states the expectation directly and gives a clearer
        # failure message than assertTrue(not ...).
        valid_soup = bs.BeautifulSoup(self.open_legfile('73').read())
        self.assertFalse(wrapper.is_error_page(valid_soup))

Пример #3

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_ExitsSilentlyOnNoNewContent(self):
        # Every fetch is answered with the error-page fixture, so
        # check_for_new_content should return without raising after
        # exactly 10 fetches.
        wrapper = PhillyLegistarSiteWrapper()
        error_html = self.open_legfile('12000').read()

        def serve_error_page(*args, **kwargs):
            # Hand out a fresh stream on every call so each read starts
            # at the beginning of the page.
            return StringIO(error_html)

        wrapper.urlopen = mock.Mock(side_effect=serve_error_page)

        wrapper.check_for_new_content(73)
        self.assertEqual(wrapper.urlopen.call_count, 10)

Пример #4

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_MinutesDateParsedCorrectly(self):
        # The yy-mm-dd stamp embedded in the document URL ("83-12-06") is
        # expected to come back as a date in the year 2083.
        wrapper = PhillyLegistarSiteWrapper(root_url='')
        source_url = 'http://www.bogus.com/path/mydoc_83-12-06_bill.pdf'

        # They learned nothing from Y2K
        self.assertEqual(wrapper.get_minutes_date(source_url),
                         dt.date(2083, 12, 6))

Пример #5

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_RaisesErrorOnTooMany404(self):
        # When every fetch raises BadStatusLine, the error must reach the
        # caller, and exactly 10 fetches must have been attempted by then.
        from httplib import BadStatusLine

        wrapper = PhillyLegistarSiteWrapper()
        failing_urlopen = mock.Mock(side_effect=BadStatusLine(500))
        wrapper.urlopen = failing_urlopen

        self.assertRaises(BadStatusLine, wrapper.check_for_new_content, 73)
        self.assertEqual(failing_urlopen.call_count, 10)

Пример #6

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_detectsErrorsCorrectly(self):
        # Fixture '12000' must be recognised as an error page, while
        # fixture '73' must not.
        wrapper = PhillyLegistarSiteWrapper()

        error_soup = bs.BeautifulSoup(self.open_legfile('12000').read())
        self.assertTrue(wrapper.is_error_page(error_soup))

        # assertFalse expresses the negative expectation directly and
        # reports a clearer message on failure than assertTrue(not ...).
        valid_soup = bs.BeautifulSoup(self.open_legfile('73').read())
        self.assertFalse(wrapper.is_error_page(valid_soup))

Пример #7

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_RaisesErrorOnTooMany404(self):
        # A urlopen that always raises BadStatusLine must make
        # check_for_new_content give up and propagate the error.
        from httplib import BadStatusLine

        wrapper = PhillyLegistarSiteWrapper(root_url='')
        always_failing = mock.Mock(side_effect=BadStatusLine(500))
        wrapper.urlopen = always_failing

        with self.assertRaises(BadStatusLine):
            wrapper.check_for_new_content(73)

        # Check that we've retried the URL 10 times
        self.assertEqual(always_failing.call_count, 10)

Пример #8

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_ExitsSilentlyOnNoNewContent(self):
        # Every fetch is answered with the error-page fixture, so
        # check_for_new_content should return without raising.
        wrapper = PhillyLegistarSiteWrapper(root_url='')
        error_html = self.open_legfile('12000').read()

        def serve_error_page(*args, **kwargs):
            # A new stream per call keeps each read independent.
            return StringIO(error_html)

        wrapper.urlopen = mock.Mock(side_effect=serve_error_page)

        wrapper.check_for_new_content(73)

        # Check that we've tried 100 additional items
        self.assertEqual(wrapper.urlopen.call_count, 100)

Пример #9

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_DealsWith404PdfAddressesCorrectly(self):
        # I don't know why they'd be deleting these files, but when they do
        # (and they do) we have to handle it: extracting text from such an
        # attachment URL is expected to yield an empty string.
        # NOTE(review): this hits the live site -- presumably needs network.
        wrapper = PhillyLegistarSiteWrapper(root_url='')
        missing_pdf_url = 'http://legislation.phila.gov/attachments/115954.pdf'

        extracted = wrapper.extract_pdf_text(missing_pdf_url)

        self.assertEqual(extracted, '')

Пример #10

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_DealsWith404PdfAddressesCorrectly(self):
        # I don't know why they'd be deleting these files, but when they do
        # (and they do) we have to handle it: the extracted text for such an
        # attachment URL is expected to be the empty string.
        # NOTE(review): this hits the live site -- presumably needs network.
        wrapper = PhillyLegistarSiteWrapper()
        missing_pdf_url = 'http://legislation.phila.gov/attachments/115954.pdf'

        self.assertEqual(wrapper.extract_pdf_text(missing_pdf_url), '')

Пример #11

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_PdfDataIsCached(self):
        # Three actions point at only two distinct minutes URLs, so the
        # wrapper is expected to fetch exactly twice.
        wrapper = PhillyLegistarSiteWrapper()
        canned_response = StringIO('<doc><pdf2xml></pdf2xml></doc>')
        wrapper.urlopen = mock.Mock(return_value=canned_response)
        wrapper.extract_xml_text = mock.Mock()

        minutes_urls = ['http://www.sample.com/file.pdf',
                        'http://www.sample.com/file.pdf',
                        'http://www.sample.com/other/file.pdf']
        actions = [{'minutes_url': url} for url in minutes_urls]

        # Only the number of fetches matters; the return value is ignored.
        wrapper.collect_minutes(actions)

        self.assertEqual(wrapper.urlopen.call_count, 2)

Пример #12

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_MinutesDocumentConstructedCorrectly(self):
        # With date parsing and PDF extraction stubbed out, get_minutes_doc
        # should combine the URL, the text and the date into a single dict.
        doc_url = 'http://www.example.com/doc.pdf'
        stub_date = dt.date(2083, 12, 6)
        stub_text = 'This is the text'

        wrapper = PhillyLegistarSiteWrapper()
        wrapper.get_minutes_date = mock.Mock(return_value=stub_date)
        wrapper.extract_pdf_text = mock.Mock(return_value=stub_text)

        built_doc = wrapper.get_minutes_doc(doc_url)

        self.assertEqual(built_doc, {'url': doc_url,
                                     'fulltext': stub_text,
                                     'date_taken': stub_date})

Пример #13

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_RecognizeNotesRow(self):
        # The history on some filings (like key=73) have notes.  These need to
        # be detected.
        page_soup = bs.BeautifulSoup(self.open_legfile('73').read())

        wrapper = PhillyLegistarSiteWrapper(root_url='')
        # Only the action records matter here; the unpacking still checks
        # that four values come back.
        _record, _attachments, actions, _minutes = \
            wrapper.scrape_legis_file(73, page_soup)

        # Exactly two of the scraped actions should carry a non-empty note.
        noted_actions = [action for action in actions if action['notes']]
        self.assertEqual(len(noted_actions), 2)

Пример #14

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_RecognizeNotesRow(self):
        # The history on some filings (like key=73) have notes.  These need to
        # be detected.
        raw_html = self.open_legfile('73').read()
        page_soup = bs.BeautifulSoup(raw_html)

        wrapper = PhillyLegistarSiteWrapper()
        # The other three results are unpacked (to keep the 4-tuple check)
        # but not inspected.
        _record, _attachments, actions, _minutes = \
            wrapper.scrape_legis_file(73, page_soup)

        # Count the actions whose 'notes' entry is truthy; two are expected.
        notes_count = sum(1 for action in actions if action['notes'])
        self.assertEqual(notes_count, 2)

Пример #15

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_MinutesDocumentConstructedCorrectly(self):
        # Stub the two collaborators so that only the assembly of the
        # minutes document dict is under test.
        minutes_url = 'http://www.example.com/doc.pdf'
        fake_date = dt.date(2083, 12, 6)
        fake_text = 'This is the text'

        wrapper = PhillyLegistarSiteWrapper(root_url='')
        wrapper.get_minutes_date = mock.Mock(return_value=fake_date)
        wrapper.extract_pdf_text = mock.Mock(return_value=fake_text)

        expected_doc = dict(url=minutes_url,
                            fulltext=fake_text,
                            date_taken=fake_date)

        self.assertEqual(wrapper.get_minutes_doc(minutes_url), expected_doc)

Пример #16

Показать файл

Файл: updatelegfiles.py Проект: joannecheng/councilmatic

    def _get_new_files(self):
        """Scrape and store legislative files newer than the latest stored key.

        Starting from the data store's latest key, keeps asking the source
        for new content; each hit is scraped and saved. The loop stops as
        soon as the source returns ``None`` as the content object.
        """
        # Create a datastore wrapper object
        store = CouncilmaticDataStoreWrapper()
        site = PhillyLegistarSiteWrapper()

        # Get the latest filings
        key = store.get_latest_key()

        key, page = site.check_for_new_content(key)
        while page is not None:
            legfile, attachment_recs, action_recs, minutes_recs = \
                site.scrape_legis_file(key, page)
            store.save_legis_file(legfile, attachment_recs, action_recs,
                                  minutes_recs)

            # Advance from the key the source just handed back.
            key, page = site.check_for_new_content(key)

Пример #17

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_PdfDataIsCached(self):
        # Two of the three actions share a minutes URL, so only two fetches
        # are expected.
        wrapper = PhillyLegistarSiteWrapper(root_url='')
        canned_response = StringIO('<doc><pdf2xml></pdf2xml></doc>')
        wrapper.urlopen = mock.Mock(return_value=canned_response)
        wrapper.extract_xml_text = mock.Mock()

        actions = [
            {'minutes_url': minutes_url}
            for minutes_url in ('http://www.sample.com/file.pdf',
                                'http://www.sample.com/file.pdf',
                                'http://www.sample.com/other/file.pdf')
        ]

        # The collected minutes themselves are not inspected here.
        wrapper.collect_minutes(actions)

        self.assertEqual(wrapper.urlopen.call_count, 2)

Пример #18

Показать файл

    def handle(self, *args, **options):
        """Run the scrape: fetch new files and, optionally, updated ones.

        Sets the root logger to INFO, creates the data store and site
        wrappers (kept on ``self.ds`` / ``self.source``), seeds the source's
        PDF cache from the data store, then calls ``_get_new_files`` and,
        when ``options['update_files']`` is truthy, ``_get_updated_files``.
        Exits the process with status 0 if ``TooManyGeocodeRequests`` is
        raised along the way.
        """
        logging.getLogger().setLevel(logging.INFO)

        # Create a datastore wrapper object
        datastore = self.ds = CouncilmaticDataStoreWrapper()
        site = self.source = PhillyLegistarSiteWrapper(
            settings.LEGISLATION['ROOT'])

        # Seed the PDF cache with already-downloaded content.
        #
        # Downloading and parsing PDF content really slows down the scraping
        # process.  If we had to redownload all of them every time we scraped,
        # it would take a really long time to refresh all of the old stuff.  So
        # that PDFs that have already been downloaded won't be again, seed the
        # source cache with that data.
        #
        # Hopefully this won't be too much of a burden on memory :).
        site.init_pdf_cache(datastore.pdf_mapping)

        # Read the option up front so a missing key fails before any scraping.
        should_update = options['update_files']

        try:
            self._get_new_files()
            if should_update:
                self._get_updated_files()
        except TooManyGeocodeRequests:
            # Hitting the geocoding limit ends the run with a success status.
            sys.exit(0)

Пример #19

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_ResolutionPdfParsesCorrectly(self):
        # The same resolution PDF (11530.pdf) is passed to extract_pdf_text
        # in three forms -- raw file contents, a file:// URL and an http://
        # URL -- and must produce identical text each time.
        wrapper = PhillyLegistarSiteWrapper()
        expected_text = """\n\n\n\n\n\n\n\n\nCity of Philadelphia \n \n \n \n \nCity of Philadelphia \n- 1 - \n \n \n \nCity Council \nChief Clerk's Office \n402 City Hall \nPhiladelphia, PA 19107 \nRESOLUTION NO. 110406 \n \n \nIntroduced May 12, 2011 \n \n \nCouncilmember DiCicco \n \n \nReferred to the \nCommittee of the Whole   \n \n \nRESOLUTION \n \nAppointing David Campoli to the Board of Directors of the Center City District. \n \n \n \nRESOLVED, BY THE COUNCIL OF THE CITY OF PHILADELPHIA, \nTHAT David Campoli is hereby appointed as a member of the Board of Directors of the \nCenter City District, to serve in a term ending December 31, 2012. \n \n \n\n\n\nCity of Philadelphia \n \nRESOLUTION NO. 110406 continued \n \n \n \n \n \nCity of Philadelphia \n- 2 - \n \n \n \n \n\n"""

        pdf_path = os.path.join(self.pdfs_dir, '11530.pdf')

        # Raw stream
        # Read in binary mode (a PDF is not text) and close the handle
        # deterministically instead of leaking it.
        with open(pdf_path, 'rb') as pdf_file:
            resolution_pdf = pdf_file.read()
        resolution_text = wrapper.extract_pdf_text(resolution_pdf)
        self.assertEqual(resolution_text, expected_text)

        # File URL
        resolution_pdf = 'file://' + pdf_path
        resolution_text = wrapper.extract_pdf_text(resolution_pdf)
        self.assertEqual(resolution_text, expected_text)

        # Web URL -- This will only work if you're online.
        resolution_pdf = 'http://legislation.phila.gov/attachments/11530.pdf'
        resolution_text = wrapper.extract_pdf_text(resolution_pdf)
        self.assertEqual(resolution_text, expected_text)

Пример #20

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_ResolutionPdfParsesCorrectly(self):
        # The same resolution PDF (11530.pdf) is passed to extract_pdf_text
        # in three forms -- raw file contents, a file:// URL and an http://
        # URL -- and must produce identical text each time.
        wrapper = PhillyLegistarSiteWrapper(root_url='')
        expected_text = """\n\n\n\n\n\n\n\n\nCity of Philadelphia \n \n \n \n \nCity of Philadelphia \n- 1 - \n \n \n \nCity Council \nChief Clerk's Office \n402 City Hall \nPhiladelphia, PA 19107 \nRESOLUTION NO. 110406 \n \n \nIntroduced May 12, 2011 \n \n \nCouncilmember DiCicco \n \n \nReferred to the \nCommittee of the Whole   \n \n \nRESOLUTION \n \nAppointing David Campoli to the Board of Directors of the Center City District. \n \n \n \nRESOLVED, BY THE COUNCIL OF THE CITY OF PHILADELPHIA, \nTHAT David Campoli is hereby appointed as a member of the Board of Directors of the \nCenter City District, to serve in a term ending December 31, 2012. \n \n \n\n\n\nCity of Philadelphia \n \nRESOLUTION NO. 110406 continued \n \n \n \n \n \nCity of Philadelphia \n- 2 - \n \n \n \n \n\n"""

        pdf_path = os.path.join(self.pdfs_dir, '11530.pdf')

        # Raw stream
        # Read in binary mode (a PDF is not text) and close the handle
        # deterministically instead of leaking it.
        with open(pdf_path, 'rb') as pdf_file:
            resolution_pdf = pdf_file.read()
        resolution_text = wrapper.extract_pdf_text(resolution_pdf)
        self.assertEqual(resolution_text, expected_text)

        # File URL
        resolution_pdf = 'file://' + pdf_path
        resolution_text = wrapper.extract_pdf_text(resolution_pdf)
        self.assertEqual(resolution_text, expected_text)

        # Web URL -- This will only work if you're online.
        resolution_pdf = 'http://legislation.phila.gov/attachments/11530.pdf'
        resolution_text = wrapper.extract_pdf_text(resolution_pdf)
        self.assertEqual(resolution_text, expected_text)

Пример #21

Показать файл

Файл: management_tests.py Проект: citizennerd/councilmatic

    def test_ConvertDateIsEmptyWhenNoDateGiven(self):
        # A missing date (None) should convert to the empty string.
        converted = PhillyLegistarSiteWrapper().convert_date(None)

        self.assertEqual(converted, '')

Пример #22

Показать файл

Файл: models.py Проект: corydissinger/councilmatic

 def get_data_source(self):
     """Return a newly constructed PhillyLegistarSiteWrapper."""
     data_source = PhillyLegistarSiteWrapper()
     return data_source

Пример #23

Показать файл

Файл: management_tests.py Проект: phxdata/mesa-councilmatic

    def test_ConvertDateIsEmptyWhenNoDateGiven(self):
        # Passing None as the date is expected to give back ''.
        wrapper = PhillyLegistarSiteWrapper(root_url='')
        no_date = None

        result = wrapper.convert_date(no_date)

        self.assertEqual(result, '')