def test_40(self):
    """Randomized check: iterloc yields the locs, and locs + alternates with alt=True."""
    # Population of "safe" characters: printable minus punctuation and whitespace.
    usable = ''.join(
        ch for ch in string.printable
        if ch not in string.punctuation and ch not in string.whitespace
    )
    entries = []
    for _ in range(randint(0, 100)):  # Not too many tests
        entries.append({
            "loc": sample(usable, randint(8, 16)),
            "alternate": [
                sample(usable, randint(8, 16)) for _ in range(randint(0, 10))
            ],
        })
    # Build the expected results from the raw entries.
    expected_locs = sorted([entry['loc'] for entry in entries])
    all_alternates = [alt for entry in entries for alt in entry['alternate']]
    expected_full = sorted(expected_locs + all_alternates)
    # And get the iterloc ones
    got_locs = sorted(list(iterloc(entries)))
    got_full = sorted(list(iterloc(entries, alt=True)))
    # Check the results
    self.assertEqual(got_locs, expected_locs)
    self.assertEqual(got_full, expected_full)
def _parse_sitemap(self, response):
    """Parse a robots.txt or sitemap response and yield follow-up Requests.

    robots.txt responses are scanned for sitemap URLs; sitemap indexes are
    recursed into (filtered by self._follow); urlset entries are dispatched
    to the first matching (regex, callback) pair from self._cbs.
    """
    self.logger.info(f"_parse_sitemap, response: {response.url}")
    if response.url.endswith("/robots.txt"):
        for url in sitemap_urls_from_robots(response.text, base_url=response.url):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            # NOTE(review): the method uses self.logger above but a bare
            # module-level `logger` was used here — presumably the same
            # logger; unified on self.logger for consistency. Confirm no
            # module-level `logger` with different config was intended.
            self.logger.warning(
                "Ignoring invalid sitemap: %(response)s",
                {"response": response},
                extra={"spider": self},
            )
            return
        s = Sitemap(body)
        it = self.sitemap_filter(s)
        # BUG FIX: the original lowered s.type only for the "sitemapindex"
        # comparison but not for "urlset"; compare case-insensitively in both
        # branches so mixed-case urlset documents are not silently dropped.
        doc_type = s.type.lower()
        if doc_type == "sitemapindex":
            for loc in iterloc(it, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif doc_type == "urlset":
            for loc in iterloc(it, self.sitemap_alternate_links):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break
def _parse_sitemap(self, response):
    """Walk robots.txt / sitemap documents, queueing only doctors missing from the db."""
    if response.url.endswith('/robots.txt'):
        for sitemap_url in sitemap_urls_from_robots(response.body):
            yield Request(sitemap_url, callback=self._parse_sitemap)
        return
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.warning(
            "Ignoring invalid sitemap: %(response)s",
            {'response': response},
            extra={'spider': self},
        )
        return
    sitemap = Sitemap(body)
    if sitemap.type == 'sitemapindex':
        # Recurse into every child sitemap whose URL matches a follow pattern.
        for loc in iterloc(sitemap, self.sitemap_alternate_links):
            if any(pattern.search(loc) for pattern in self._follow):
                yield Request(loc, callback=self._parse_sitemap)
    elif sitemap.type == 'urlset':
        for loc in iterloc(sitemap):
            for pattern, callback in self._cbs:
                if pattern.search(loc):
                    # Only fetch doctors we have not already stored.
                    if doctor_exists(loc):
                        self.logger.debug("Doctor's url found in db. Passing it on")
                    else:
                        self.logger.debug("Doctor's url not found in db. Fetching data")
                        yield Request(loc, callback=callback)
                    break
def test_39(self):
    """A single entry with one alternate: alt=False yields only the loc."""
    entries = [{"loc": "location", "alternate": ["alternate_location"]}]
    without_alt = list(iterloc(entries))
    with_alt = list(iterloc(entries, alt=True))
    self.assertEqual(without_alt, ["location"])
    self.assertEqual(sorted(with_alt), sorted(["location", "alternate_location"]))
def _parse_sitemap(self, response):
    """Parse a robots.txt or sitemap response.

    Sitemap indexes are recursed into (filtered by self._follow); urlset
    entries matching a (regex, callback) pair are appended to self.urls
    together with their parsed lastmod date.
    """
    logging.info("Parsing sitemap %s", response)
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            # BUG FIX: logging.warning(..., response=response) raised
            # TypeError (logging accepts only exc_info/stack_info/extra
            # keywords); pass the mapping positionally so %(response)s is
            # interpolated by the logging module.
            logging.warning("Ignoring invalid sitemap: %(response)s",
                            {'response': response})
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for url in iter(s):
                loc = url['loc']
                # lastmod is optional in sitemaps; parse it when present.
                lastmod = url.get('lastmod', None)
                if lastmod is not None:
                    lastmod = parse_w3c_datetime(lastmod)
                for r, c in self._cbs:
                    if r.search(loc):
                        self.urls.append({"url": loc, "lastmod": lastmod})
                        break
def _parse_sitemap(self, response):
    """This is adapted from scrapy.spiders.sitemap"""
    if response.url.endswith('/robots.txt'):
        for sitemap_url in sitemap_urls_from_robots(response.text,
                                                    base_url=response.url):
            yield Request(sitemap_url, callback=self._parse_sitemap)
        return
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.warning("Ignoring invalid sitemap: %(response)s",
                            {'response': response}, extra={'spider': self})
        return
    sitemap = NewsSitemap(body)
    if sitemap.type == 'sitemapindex':
        # Recurse into child sitemaps whose URL matches a follow pattern.
        for loc in iterloc(sitemap, self.sitemap_alternate_links):
            if any(rx.search(loc) for rx in self._follow):
                yield self.url_to_request(loc, callback=self._parse_sitemap)
    elif sitemap.type == 'urlset':
        for loc, meta in self.iterurlset(sitemap):
            for rx, cb in self._cbs:
                if rx.search(loc):
                    # On success we break out; on failure we fall through to
                    # try the remaining (regex, callback) pairs.
                    try:
                        self.logger.debug(f'Queuing {loc}')
                        yield self.url_to_request(loc, callback=cb, meta=meta)
                        break
                    except Exception as e:
                        self.logger.error(
                            f'Failed to queue {loc}: {e}')
def _parse_sitemap(self, response):
    """Parse a urlset sitemap, yielding a CategoryItem and a page Request per URL.

    Each loc that matches a (regex, callback) pair is parsed for its
    toptenreviews category path; non-matching or skipped categories are
    dropped.
    """
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.warning("Ignoring invalid sitemap: %(response)s",
                            {'response': response}, extra={'spider': self})
        return
    s = Sitemap(body)
    if s.type != 'urlset':
        return
    # PERF FIX: the pattern was rebuilt and searched inside the inner
    # (regex, callback) loop; compile it once outside both loops.
    category_re = re.compile(r'toptenreviews\.com/(.*)/$')
    for loc in iterloc(s):
        for r, c in self._cbs:
            if r.search(loc):
                match = category_re.search(loc)
                # The URL pattern must be changed if there is no category match.
                if not match:
                    break
                category = CategoryItem()
                category['category_path'] = match.group(1)
                category['category_url'] = loc
                if self.should_skip_category(category):
                    break
                yield category
                request = Request(loc, callback=c)
                request.meta['category'] = category
                yield request
                break
def _parse_sitemap(self, response):
    """Parse a robots.txt or sitemap response, collecting matching URLs.

    Sitemap indexes are recursed into (filtered by self._follow); urlset
    entries matching a (regex, callback) pair are appended to self.urls
    together with their parsed lastmod date.
    """
    logging.info("Parsing sitemap %s", response)
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            # BUG FIX: logging.warning(..., response=response) raised
            # TypeError (logging accepts only exc_info/stack_info/extra
            # keywords); pass the mapping positionally so %(response)s is
            # interpolated by the logging module.
            logging.warning("Ignoring invalid sitemap: %(response)s",
                            {'response': response})
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            urls = list(iter(s))
            logging.info("Checking {0} sitemap URLs".format(len(urls)))
            for url in urls:
                loc = url['loc']
                # lastmod is optional in sitemaps; parse it when present.
                lastmod = url.get('lastmod', None)
                if lastmod is not None:
                    lastmod = parse_w3c_datetime(lastmod)
                for r, c in self._cbs:
                    if r.search(loc):
                        self.urls.append({"url": loc, "lastmod": lastmod})
                        logging.info("Adding sitemap URL {0}".format(loc))
                        break
def test_38(self):
    """iterloc over an empty iterable yields nothing, with or without alternates."""
    no_entries = []
    self.assertEqual(list(iterloc(no_entries)), [])
    self.assertEqual(list(iterloc(no_entries, alt=True)), [])