Example #1
    def legislation_detail_url(self, matter_id):
        gateway_url = self.BASE_WEB_URL + '/gateway.aspx?m=l&id={0}'.format(matter_id)

        # We want to suppress any session-level params for this HEAD request,
        # since they could lead to an additional level of redirect.
        #
        # Per
        # http://docs.python-requests.org/en/master/user/advanced/, we
        # have to do this by setting the session-level params to None.
        response = self.head(
            gateway_url,
            params={k: None for k in self.params}
        )

        # If the gateway URL redirects, the matter is publicly viewable. Grab
        # its detail URL from the response headers.
        if response.status_code == 302:
            legislation_detail_route = response.headers['Location']
            return urljoin(self.BASE_WEB_URL, legislation_detail_route)

        # If the gateway URL returns a 200, it has not redirected, i.e., the
        # matter is not publicly viewable. Raise an unauthorized (403) error.
        elif response.status_code == 200:
            response.status_code = 403
            raise scrapelib.HTTPError(response)

        # If the status code is anything but a 200 or 302, something is wrong.
        # Raise an HTTPError to interrupt the scrape.
        else:
            self.error('{0} returned an unexpected status code: {1}'.format(gateway_url, response.status_code))
            response.status_code = 500
            raise scrapelib.HTTPError(response)
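
The params trick in the comment above is worth isolating. Here is a minimal sketch of the requests behavior it relies on; the session, URL, and parameter keys are placeholders, not part of the scraper:

import requests

session = requests.Session()
session.params = {'token': 'abc123', 'locale': 'en'}  # session-level params

# Method-level params merge with the session-level ones, and any key
# explicitly set to None is dropped from the merged result, so this
# HEAD request goes out with no query string at all.
response = session.head(
    'https://example.com/gateway.aspx',
    params={k: None for k in session.params},
)

# Session.head() defaults to allow_redirects=False, which is what lets
# the caller inspect the 302 itself instead of following it.
print(response.status_code)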
Example #2
    def _check_errors(self, response, payload):
        if response.url.endswith('Error.aspx'):
            response.status_code = 503
            raise scrapelib.HTTPError(response)

        if not response.text:
            response.status_code = 520
            raise scrapelib.HTTPError(response)

        if payload:
            self._range_error(response, payload)
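
A hypothetical call site, for context: the check is meant to run right after each fetch, so a masked Legistar failure raises immediately (and scrapelib's retry machinery can react) rather than surfacing later as a bad parse. The helper name and control flow here are illustrative, not taken from the scraper:

import lxml.html

def fetch_page(scraper, url, payload=None):
    # Illustrative only: fetch, vet the response, then parse.
    response = scraper.post(url, data=payload) if payload else scraper.get(url)
    scraper._check_errors(response, payload)
    return lxml.html.fromstring(response.text)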
Example #3
    def _range_error(self, response, payload):
        '''Legistar intermittently fails to return the expected response
        when a time range is selected while searching for events. For now,
        we only handle the 'All' range.
        '''

        if self._range_is_all(payload):

            expected_range = 'All Years'

            page = lxml.html.fromstring(response.text)
            returned_range, = page.xpath(
                "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']")

            returned_range = returned_range.value

            if returned_range != expected_range:
                response.status_code = 520
                # In the event of a retry, the new request would not
                # contain the correct payload data unless we refresh it
                # via sessionSecrets, so we do that here.
                payload.update(self.sessionSecrets(page))

                raise scrapelib.HTTPError(response)
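
`sessionSecrets` itself is not shown above. Legistar is an ASP.NET application, so a plausible reading is that it re-scrapes the hidden postback inputs that the next request must echo back; the field names in this sketch are an assumption, not the scraper's actual implementation:

import lxml.html

def sessionSecrets(page):
    # Assumption: the 'secrets' are the standard ASP.NET hidden inputs
    # that every postback has to echo back. Missing fields are skipped.
    secrets = {}
    for name in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
        values = page.xpath("//input[@name='%s']/@value" % name)
        if values:
            secrets[name] = values[0]
    return secrets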
Example #4
    def _check_errors(self, response, payload):
        if response.url.endswith('Error.aspx'):
            response.status_code = 503
            raise scrapelib.HTTPError(response)

        if not response.text:
            if response.request.method.lower() in {'get', 'post'}:
                response.status_code = 520
                raise scrapelib.HTTPError(response)

        if 'This record no longer exists. It might have been deleted.' in response.text:
            response.status_code = 410
            raise scrapelib.HTTPError(response)

        if payload:
            self._range_error(response, payload)
Example #5
    def _check_errors(self, response):
        if response.url.endswith('Error.aspx'):
            response.status_code = 503
        elif not response.text:
            response.status_code = 520
        else:
            return None

        raise scrapelib.HTTPError(response)
Example #6
    def _check_errors(self, response):
        if response.url.endswith('Error.aspx'):
            response.status_code = 503
            raise scrapelib.HTTPError(response)
Example #7
def test_house():
    scraper = MyMOBillScraper({})
    m = Mox()
    m.StubOutWithMock(scraper, 'urlopen')
    # first, get the list of all house bills for the given year.
    scraper.urlopen(StrContains('BillList.aspx?year=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/bills-house.html' % os.getcwd()))

    # then the details...
    # the first one in the list has funky (non-standard) text
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail2-house.html' % os.getcwd()))
    scraper.urlopen(Regex(r'^.*biltxt/intro/HB.*$')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billtext2-house.html' % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html' % os.getcwd()))

    # the second one doesn't have text:
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail-house.html' % os.getcwd()))
    scraper.urlopen(Regex(r'^.*biltxt/intro/HB.*$')) \
            .AndRaise(scrapelib.HTTPError(scrapelib.Response('url', 'url'), None))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html' % os.getcwd()))

    # the third one doesn't even have a summary page to begin with:
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail3-house.html' % os.getcwd()))

    # the fourth one in the list has funky (non-standard) text for the cosponsors
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail2-house.html' % os.getcwd()))
    scraper.urlopen(Regex(r'^.*biltxt/intro/HB.*$')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billtext3-house.html' % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html' % os.getcwd()))

    # the fifth one was withdrawn - no real text
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail2-house.html' % os.getcwd()))
    scraper.urlopen(Regex(r'^.*biltxt/intro/HB.*$')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billtext4-house.html' % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html' % os.getcwd()))

    # the rest are normal/fine:
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
            .MultipleTimes() \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail-house.html' % os.getcwd()))
    scraper.urlopen(Regex(r'^.*biltxt/intro/HB.*$')) \
            .MultipleTimes() \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billtext-house.html' % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
            .MultipleTimes() \
            .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html' % os.getcwd()))

    m.ReplayAll()
    scraper.scrape_house('2011')

    eq_(1143, len(scraper.bills))
    eq_(2, len(scraper.bad_urls))

    # the first bill is mostly standard.
    eq_('HB 26', scraper.bills[0]['bill_id'])
    eq_(
        'http://www.house.mo.gov/billsummaryprn.aspx?bill=HB1&year=2011&code=R',
        scraper.bills[0]['bill_url'])
    eq_('HB 26', scraper.bills[0]['official_title'])
    eq_(2, len(scraper.bills[0]['sponsors']))
    eq_('Jones, Tishaura', scraper.bills[0]['sponsors'][0]['name'])
    eq_('Curls, Shalonn', scraper.bills[0]['sponsors'][1]['name'])
    eq_('http://www.house.mo.gov/member.aspx?district=063&year=2011',
        scraper.bills[0]['sponsors'][0]['sponsor_link'])

    # the second bill doesn't have any cosponsor info.
    eq_('HB 45', scraper.bills[1]['bill_id'])
    eq_('SS SCS HCS HB 45', scraper.bills[1]['official_title'])
    eq_('Hoskins, Denny', scraper.bills[1]['sponsors'][0]['name'])
    eq_('http://www.house.mo.gov/member.aspx?district=121&year=2011',
        scraper.bills[1]['sponsors'][0]['sponsor_link'])

    # the third bill doesn't have any info at all. It didn't get logged as a
    # bill, but it's recorded in another data structure for later fixing.

    # 4th

    # the rest of the bills are pretty detailed
    therestindex = 4
    eq_(8, len(scraper.bills[therestindex]['sponsors']))
    eq_('http://www.house.mo.gov/member.aspx?district=121&year=2011',
        scraper.bills[therestindex]['sponsors'][0]['sponsor_link'])
    eq_('primary', scraper.bills[therestindex]['sponsors'][0]['type'])
    eq_('cosponsor', scraper.bills[therestindex]['sponsors'][1]['type'])
    eq_('Allen, Sue', scraper.bills[therestindex]['sponsors'][1]['name'])
    eq_('SCHARNHORST', scraper.bills[therestindex]['sponsors'][-1]['name'])

    eq_(42, len(scraper.bills[0]['actions']))
    eq_(1, len(scraper.bills[0]['versions']))
    eq_(5, len(scraper.bills[1]['versions']))
    eq_([
        'Introduced', 'Committee', 'Perfected', 'Senate Comm Sub',
        'Truly Agreed'
    ], [x['name'] for x in scraper.bills[1]['versions']])
    eq_(
        'http://www.house.mo.gov/billtracking/bills111/biltxt/intro/HB0026I.htm',
        scraper.bills[0]['versions'][0]['url'])
    eq_('Introduced', scraper.bills[0]['versions'][-1]['name'])
    eq_('Truly Agreed', scraper.bills[1]['versions'][-1]['name'])
    eq_(
        'http://www.house.mo.gov/billtracking/bills111/biltxt/intro/HB0045I.htm',
        scraper.bills[1]['versions'][0]['url'])
    eq_(
        'http://www.house.mo.gov/billtracking/bills111/biltxt/truly/HB0045T.htm',
        scraper.bills[1]['versions'][-1]['url'])
    m.UnsetStubs()
    m.VerifyAll()
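
For readers unfamiliar with Mox, the record/replay/verify cycle this test leans on looks like this in miniature; the class and URL are stand-ins, not part of the scraper:

from mox import Mox

class Fetcher(object):
    '''A stand-in for the scraper; no real network access happens.'''
    def urlopen(self, url):
        raise RuntimeError('no network in tests')

fetcher = Fetcher()
m = Mox()
m.StubOutWithMock(fetcher, 'urlopen')
fetcher.urlopen('http://example.com/').AndReturn('<html/>')  # record phase

m.ReplayAll()                                   # switch to replay mode
assert fetcher.urlopen('http://example.com/') == '<html/>'
m.UnsetStubs()                                  # restore the real method
m.VerifyAll()                                   # fail on unmet expectations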