def legislation_detail_url(self, matter_id):
    """Resolve the public detail-page URL for a matter via the Legistar gateway.

    Issues a HEAD request against the gateway endpoint and inspects the
    response by hand rather than following redirects.

    Returns the absolute detail URL when the matter is publicly viewable.
    Raises scrapelib.HTTPError with a 403 when the matter is not public,
    or with a 500 when the gateway answers with an unexpected status.
    """
    gateway_url = self.BASE_WEB_URL + '/gateway.aspx?m=l&id={0}'.format(matter_id)

    # Suppress every session-level param for this HEAD request, since
    # leaving them in can introduce an extra redirect hop. Per the
    # requests docs (http://docs.python-requests.org/en/master/user/advanced/),
    # a session param is removed by explicitly sending it as None.
    suppressed_params = {key: None for key in self.params}
    response = self.head(gateway_url, params=suppressed_params)

    status = response.status_code

    # A 302 means the gateway redirected, i.e. the matter is publicly
    # viewable; the redirect target is the detail page.
    if status == 302:
        return urljoin(self.BASE_WEB_URL, response.headers['Location'])

    # A 200 means no redirect happened, i.e. the matter is not publicly
    # viewable. Re-tag the response as unauthorized and raise.
    if status == 200:
        response.status_code = 403
        raise scrapelib.HTTPError(response)

    # Any other status is unexpected: log it and interrupt the scrape.
    self.error('{0} returned an unexpected status code: {1}'.format(gateway_url, status))
    response.status_code = 500
    raise scrapelib.HTTPError(response)
def _check_errors(self, response, payload): if response.url.endswith('Error.aspx'): response.status_code = 503 raise scrapelib.HTTPError(response) if not response.text: response.status_code = 520 raise scrapelib.HTTPError(response) if payload: self._range_error(response, payload)
def _range_error(self, response, payload): '''Legistar intermittently does not return the expected response when selecting a time range when searching for events. Right now we are only handling the 'All' range ''' if self._range_is_all(payload): expected_range = 'All Years' page = lxml.html.fromstring(response.text) returned_range, = page.xpath( "//input[@id='ctl00_ContentPlaceHolder1_lstYears_Input']") returned_range = returned_range.value if returned_range != expected_range: response.status_code = 520 # In the event of a retry, the new request does not # contain the correct payload data. This comes as a # result of not updating the payload via sessionSecrets: # so, we do that here. payload.update(self.sessionSecrets(page)) raise scrapelib.HTTPError(response)
def _check_errors(self, response, payload): if response.url.endswith('Error.aspx'): response.status_code = 503 raise scrapelib.HTTPError(response) if not response.text: if response.request.method.lower() in {'get', 'post'}: response.status_code = 520 raise scrapelib.HTTPError(response) if 'This record no longer exists. It might have been deleted.' in response.text: response.status_code = 410 raise scrapelib.HTTPError(response) if payload: self._range_error(response, payload)
def _check_errors(self, response): if response.url.endswith('Error.aspx'): response.status_code = 503 elif not response.text: response.status_code = 520 else: return None raise scrapelib.HTTPError(response)
def _check_errors(self, response): if response.url.endswith('Error.aspx'): response.status_code = 503 raise scrapelib.HTTPError(response)
def test_house():
    """Scrape the 2011 MO House bill list from canned fixture HTML and verify parsing.

    ``scraper.urlopen`` is stubbed out with Mox, so every fetch performed
    by scrape_house() is served from a file under openstates/mo/tests/.
    NOTE(review): Mox expectations are order-sensitive — the stubbed
    returns below are listed in exactly the order the scraper is expected
    to request them (bill list first, then summary/text/actions per bill).
    """
    scraper = MyMOBillScraper({})
    m = Mox()
    m.StubOutWithMock(scraper, 'urlopen')

    # first, get the list of all house bills for the given year.
    scraper.urlopen(StrContains('BillList.aspx?year=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/bills-house.html'
                            % os.getcwd()))

    # then the details...
    # the first one in the list has a funky text (non standard)
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail2-house.html'
                            % os.getcwd()))
    scraper.urlopen(Regex('^.*biltxt\/intro\/HB.*$')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billtext2-house.html'
                            % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html'
                            % os.getcwd()))

    # the second one doesn't have text: the text fetch raises an
    # HTTPError, which should be recorded by the scraper as a bad URL.
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail-house.html'
                            % os.getcwd()))
    scraper.urlopen(Regex('^.*biltxt\/intro\/HB.*$')) \
        .AndRaise(scrapelib.HTTPError(scrapelib.Response('url', 'url'), None))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html'
                            % os.getcwd()))

    # the third one doesn't even have a summary page to begin with:
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail3-house.html'
                            % os.getcwd()))

    # the fourth one in the list has a funky text (non standard) for the
    # cosponsors
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail2-house.html'
                            % os.getcwd()))
    scraper.urlopen(Regex('^.*biltxt\/intro\/HB.*$')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billtext3-house.html'
                            % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html'
                            % os.getcwd()))

    # the fifth one was withdrawn - no real text
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail2-house.html'
                            % os.getcwd()))
    scraper.urlopen(Regex('^.*biltxt\/intro\/HB.*$')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billtext4-house.html'
                            % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html'
                            % os.getcwd()))

    # the rest are normal/fine:
    scraper.urlopen(StrContains('billsummaryprn.aspx?bill=')) \
        .MultipleTimes() \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billdetail-house.html'
                            % os.getcwd()))
    scraper.urlopen(Regex('^.*biltxt\/intro\/HB.*$')) \
        .MultipleTimes() \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billtext-house.html'
                            % os.getcwd()))
    scraper.urlopen(StrContains('BillActionsPrn.aspx?bill=')) \
        .MultipleTimes() \
        .AndReturn(openFile('file://%s/openstates/mo/tests/billactions-house.html'
                            % os.getcwd()))

    m.ReplayAll()
    scraper.scrape_house('2011')

    eq_(1143, len(scraper.bills))
    # NOTE(review): one bad URL comes from the AndRaise above; the second
    # presumably comes from the fixture content — confirm against fixtures.
    eq_(2, len(scraper.bad_urls))

    # the first bill is mostly standard.
    eq_('HB 26', scraper.bills[0]['bill_id'])
    eq_(
        'http://www.house.mo.gov/billsummaryprn.aspx?bill=HB1&year=2011&code=R',
        scraper.bills[0]['bill_url'])
    eq_('HB 26', scraper.bills[0]['official_title'])
    eq_(2, len(scraper.bills[0]['sponsors']))
    eq_('Jones, Tishaura', scraper.bills[0]['sponsors'][0]['name'])
    eq_('Curls, Shalonn', scraper.bills[0]['sponsors'][1]['name'])
    eq_('http://www.house.mo.gov/member.aspx?district=063&year=2011',
        scraper.bills[0]['sponsors'][0]['sponsor_link'])

    # the second bill doesn't have any cosponsor info.
    eq_('HB 45', scraper.bills[1]['bill_id'])
    eq_('SS SCS HCS HB 45', scraper.bills[1]['official_title'])
    eq_('Hoskins, Denny', scraper.bills[1]['sponsors'][0]['name'])
    eq_('http://www.house.mo.gov/member.aspx?district=121&year=2011',
        scraper.bills[1]['sponsors'][0]['sponsor_link'])

    # the third bill doesn't have any info at all. It didn't get logged,
    # but it is recorded in another data structure for later fixing maybe.
    # 4th
    # the rest of the bills are pretty detailed
    therestindex = 4
    eq_(8, len(scraper.bills[therestindex]['sponsors']))
    eq_('http://www.house.mo.gov/member.aspx?district=121&year=2011',
        scraper.bills[therestindex]['sponsors'][0]['sponsor_link'])
    eq_('primary', scraper.bills[therestindex]['sponsors'][0]['type'])
    eq_('cosponsor', scraper.bills[therestindex]['sponsors'][1]['type'])
    eq_('Allen, Sue', scraper.bills[therestindex]['sponsors'][1]['name'])
    eq_('SCHARNHORST', scraper.bills[therestindex]['sponsors'][-1]['name'])

    eq_(42, len(scraper.bills[0]['actions']))

    # version lists: HB 26 has only the introduced text; HB 45 went all
    # the way to truly-agreed.
    eq_(1, len(scraper.bills[0]['versions']))
    eq_(5, len(scraper.bills[1]['versions']))
    eq_([
        'Introduced', 'Committee', 'Perfected', 'Senate Comm Sub',
        'Truly Agreed'
    ], [x['name'] for x in scraper.bills[1]['versions']])
    eq_(
        'http://www.house.mo.gov/billtracking/bills111/biltxt/intro/HB0026I.htm',
        scraper.bills[0]['versions'][0]['url'])
    eq_('Introduced', scraper.bills[0]['versions'][-1]['name'])
    eq_('Truly Agreed', scraper.bills[1]['versions'][-1]['name'])
    eq_(
        'http://www.house.mo.gov/billtracking/bills111/biltxt/intro/HB0045I.htm',
        scraper.bills[1]['versions'][0]['url'])
    eq_(
        'http://www.house.mo.gov/billtracking/bills111/biltxt/truly/HB0045T.htm',
        scraper.bills[1]['versions'][-1]['url'])

    m.UnsetStubs()
    m.VerifyAll()