def __init__(self, *a, **kw):
    """Set up the per-instance sitemap state.

    All positional and keyword arguments are forwarded untouched to the
    parent class initialiser. Afterwards the following attributes are
    populated:

    * ``_cbs`` -- list of ``(regex(rule), callback)`` pairs built from
      ``self.sitemap_rules``; a callback given as a string is looked up
      as an attribute of this instance.
    * ``_follow`` -- ``regex(...)`` applied to every ``self.sitemap_follow``
      entry.
    * ``_current_sitemap_url`` -- starts as ``None``.
    * ``_sitemap_urls`` -- a slice copy of ``self.sitemap_urls``.
    * ``_site_urls`` -- starts as an empty list.
    """
    super(SitemapSpider, self).__init__(*a, **kw)

    self._cbs = []
    for pattern, callback in self.sitemap_rules:
        # Rules may name the callback instead of referencing it directly.
        if isinstance(callback, basestring):
            callback = getattr(self, callback)
        pair = (regex(pattern), callback)
        self._cbs.append(pair)

    self._follow = [regex(entry) for entry in self.sitemap_follow]
    self._current_sitemap_url = None
    # Slice copy so later changes do not touch the sitemap_urls attribute.
    self._sitemap_urls = self.sitemap_urls[:]
    self._site_urls = []
def __init__(self, allow=None, deny=None, allow_domains=None,
             deny_domains=None, tags=['a', 'area', 'link'], attrs=['href'],
             unique=True, deny_extensions=None, filter_mobile=True):
    """Store the filtering configuration on the instance.

    :param allow: value(s) passed through ``arg_to_iter`` and ``regex``
        to build ``allow_res``.
    :param deny: same treatment as ``allow``, stored as ``deny_res``.
    :param allow_domains: value(s) stored as the set ``allow_domains``.
    :param deny_domains: value(s) stored as the set ``deny_domains``.
    :param tags: tag names accepted by ``tag_func``.
    :param attrs: attribute names accepted by ``attr_func``.
    :param unique: stored as-is on ``self.unique``.
    :param deny_extensions: extensions without the leading dot; when
        ``None`` the module-level ``IGNORED_EXTENSIONS`` is used.
    :param filter_mobile: stored as-is on ``self.filter_mobile``.
    """
    self.allow_res = [regex(pattern) for pattern in arg_to_iter(allow)]
    self.deny_res = [regex(pattern) for pattern in arg_to_iter(deny)]

    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))

    self.unique = unique

    if deny_extensions is None:
        extensions = IGNORED_EXTENSIONS
    else:
        extensions = deny_extensions
    # Stored with a leading dot.
    self.deny_extensions = set('.' + ext for ext in extensions)

    self.filter_mobile = filter_mobile

    # Private copies: the predicates must not observe later changes to the
    # caller's (or the default) sequences.
    tag_names = list(arg_to_iter(tags))
    self.tag_func = lambda x: x in tag_names

    attr_names = list(arg_to_iter(attrs))
    self.attr_func = lambda x: x in attr_names
def test_regex(self):
    """regex() must hand back a compiled-pattern object for every input kind.

    Covers a raw byte/str pattern, a unicode pattern, and an object that
    is already compiled.
    """
    # The compiled-pattern type has no portable public name, so derive it.
    pattern_type = type(re.compile(''))
    samples = (r'.*', u'abc', re.compile('.'))
    for sample in samples:
        self.assertIsInstance(regex(sample), pattern_type)