def test_url_matcher(self):
    """Exercise url_matcher with %-wildcard patterns against one URL."""
    url = "http://www.google.com.ar"
    # Patterns that must match the URL.
    matching = [
        "%www.google.com%",
        "http://www.google.com%",
        "%www.google.com.ar",
        "http://www.google.com.ar",
    ]
    # Patterns that must NOT match (wrong anchor or wrong host).
    non_matching = [
        "%www.google.com",
        "www.google.com%",
        "%www.goo.com%",
        "http://www.goo.com.ar",
    ]
    for pattern in matching:
        self.assertTrue(url_matcher(url, pattern))
    for pattern in non_matching:
        self.assertFalse(url_matcher(url, pattern))
def _search_in_urls_list(self, urls_list, url, default=True):
    """Return True if *url* matches any pattern in *urls_list*.

    An empty (or falsy) *urls_list* yields *default* instead.
    """
    if not urls_list:
        return default
    # any() short-circuits on the first matching pattern, exactly like
    # the explicit first-match loop it replaces.
    return any(url_matcher(url, pattern) for pattern in urls_list)
def _get_response(self, url, data=None):
    """Return the response data for *url*.

    params:
        data: if this param is present (or a configured POST pattern
              matches the url) the request is made as a POST.
    """
    # Scan every configured (pattern, payload) pair; deliberately no
    # break, so the LAST matching pattern's payload wins.
    for candidate, payload in self.post_urls:
        if url_matcher(url, candidate):
            data = payload
    return self._make_request(url, data)
def _validate(self, response):
    """
    Override this method in order to provide more validations before
    the data extraction with the given scraper class.

    Returns silently when any pattern in self.matching_urls matches
    the response URL; otherwise delegates to self.on_cannot_scrape().
    """
    for pattern in self.matching_urls:
        if url_matcher(response.url, pattern):
            if self.debug:
                # print(...) with a single argument is valid and
                # behaves identically on Python 2 and Python 3,
                # unlike the bare `print` statement it replaces.
                print("%s matches the url %s"
                      % (self.__class__.__name__, response.url))
            return
    # No pattern matched: hand off to the cannot-scrape hook.
    self.on_cannot_scrape(response)
def _test_url_matcher_with_regex(self):
    """Exercise url_matcher with full regex patterns against one URL."""
    url = "http://www.google.com.ar"
    # Regexes expected to match the whole URL.
    accepted = [
        "http://([a-z.]+)",
        "http://(([a-z]+.){4})",
        "[a-z/:.]+",
    ]
    # Regexes expected to fail (missing dots, slashes, or repetitions).
    rejected = [
        "http://([a-z]+)",
        "http://(([a-z]+.){1})",
        "[a-z:.]+",
    ]
    for regex in accepted:
        self.assertTrue(url_matcher(url, regex))
    for regex in rejected:
        self.assertFalse(url_matcher(url, regex))
def _validate(self, response):
    """
    Override this method in order to provide more validations before
    the data extraction with the given scraper class.

    Returns silently when any pattern in self.matching_urls matches
    the response URL.

    Raises:
        ScraperCantParseError: when no pattern matches the URL.
    """
    for pattern in self.matching_urls:
        if url_matcher(response.url, pattern):
            if self.debug:
                # print(...) with a single argument is valid and
                # behaves identically on Python 2 and Python 3,
                # unlike the bare `print` statement it replaces.
                print("%s matches the url %s"
                      % (self.__class__.__name__, response.url))
            return
    raise ScraperCantParseError(
        "The Scraper %s can't parse the html from %s"
        % (self.__class__.__name__, response.url))
def test_url_matcher(self):
    """Check %-wildcard matching and non-matching patterns for one URL."""
    url = "http://www.google.com.ar"
    cases = [
        ("%www.google.com%", True),
        ("http://www.google.com%", True),
        ("%www.google.com.ar", True),
        ("http://www.google.com.ar", True),
        ("%www.google.com", False),
        ("www.google.com%", False),
        ("%www.goo.com%", False),
        ("http://www.goo.com.ar", False),
    ]
    for pattern, should_match in cases:
        if should_match:
            self.assertTrue(url_matcher(url, pattern))
        else:
            self.assertFalse(url_matcher(url, pattern))
def _test_url_matcher_with_regex(self):
    """Check regex-based matching and non-matching patterns for one URL."""
    url = "http://www.google.com.ar"
    cases = [
        ("http://([a-z.]+)", True),
        ("http://(([a-z]+.){4})", True),
        ("[a-z/:.]+", True),
        ("http://([a-z]+)", False),
        ("http://(([a-z]+.){1})", False),
        ("[a-z:.]+", False),
    ]
    for regex, should_match in cases:
        if should_match:
            self.assertTrue(url_matcher(url, regex))
        else:
            self.assertFalse(url_matcher(url, regex))