def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'), attrs=('href'),
             canonicalize=True, unique=True, process_value=None,
             check_url=True):
    """Set up the link extractor and remember the ``check_url`` flag.

    ``check_url`` is the only option introduced by this initialiser; it is
    stored on the instance as ``self.check_url``. Every other argument is
    passed on unchanged, by keyword, to ``SgmlLinkExtractor.__init__``.

    NOTE(review): the default ``attrs=('href')`` is the string ``'href'``,
    not a one-element tuple. It is kept as written -- confirm that the
    parent class handles a bare string the way callers expect.
    """
    # Recorded before the parent initialiser runs, as in the original order.
    self.check_url = check_url

    # Options understood by the parent class; ``check_url`` is deliberately
    # left out because it is not forwarded.
    parent_options = dict(
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        tags=tags,
        attrs=attrs,
        canonicalize=canonicalize,
        unique=unique,
        process_value=process_value,
    )
    SgmlLinkExtractor.__init__(self, **parent_options)
def __init__(
    self,
    allow=(),
    deny=(),
    allow_domains=(),
    deny_domains=(),
    restrict_xpaths=(),
    tags=('a', 'area'),
    attrs=('href'),
    canonicalize=True,
    unique=True,
    process_value=None,
    check_url=True,
):
    """Initialise the extractor, adding a ``check_url`` switch.

    The value of ``check_url`` is kept on the instance as
    ``self.check_url``; it is not handed to the parent class. All remaining
    arguments are forwarded by keyword to ``SgmlLinkExtractor.__init__``
    without modification.

    NOTE(review): ``attrs=('href')`` evaluates to the plain string
    ``'href'`` (there is no trailing comma). Left untouched here -- verify
    against the parent class whether a tuple was intended.
    """
    # The extra parameter this subclass adds on top of the parent signature.
    self.check_url = check_url

    SgmlLinkExtractor.__init__(
        self,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        tags=tags,
        attrs=attrs,
        canonicalize=canonicalize,
        unique=unique,
        process_value=process_value,
    )
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'), attrs=('href'),
             canonicalize=True, unique=True, process_value=None,
             ignore_set=None):
    """Initialise the extractor and attach an ``ignore_set`` to the instance.

    ``ignore_set`` is stored as ``self.ignore_set``. When the caller does
    not supply one (or passes ``None``), a new empty ``set`` is created for
    this instance. A set passed in by the caller is stored as-is, not
    copied. All other arguments are forwarded by keyword to
    ``SgmlLinkExtractor.__init__``.

    NOTE(review): the default ``attrs=('href')`` is the string ``'href'``,
    not a one-element tuple; kept unchanged -- confirm the parent class
    treats it as intended.
    """
    # A ``set()`` literal in the signature is evaluated once at definition
    # time, so every instance created without ``ignore_set`` would share
    # (and mutate) the same object. Build a fresh set per instance instead.
    self.ignore_set = set() if ignore_set is None else ignore_set

    SgmlLinkExtractor.__init__(
        self,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        tags=tags,
        attrs=attrs,
        canonicalize=canonicalize,
        unique=unique,
        process_value=process_value,
    )
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'), attrs=('href'),
             canonicalize=True, unique=True, process_value=None,
             deny_extensions=None, seen_urls=()):
    """Initialise the extractor and pre-register already-seen URLs.

    The standard options, including ``deny_extensions``, are forwarded by
    keyword to ``SgmlLinkExtractor.__init__``. Afterwards every entry of
    ``seen_urls`` is recorded in ``self.seen_urls`` with the value ``True``.

    ``seen_urls`` may be any iterable of hashable values; it is only read,
    never modified. Its default is an immutable empty tuple rather than a
    list, so no mutable object is shared between calls.

    NOTE(review): ``self.seen_urls`` is not created in this method; it must
    already exist as a mapping when the loop runs -- presumably a
    class-level dict, confirm against the class body.

    NOTE(review): the default ``attrs=('href')`` is the string ``'href'``,
    not a one-element tuple; kept unchanged -- confirm the parent class
    treats it as intended.
    """
    SgmlLinkExtractor.__init__(
        self,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        tags=tags,
        attrs=attrs,
        canonicalize=canonicalize,
        unique=unique,
        process_value=process_value,
        deny_extensions=deny_extensions,
    )

    # Mark each supplied URL as seen; done after the parent initialiser,
    # in the same order as the original code.
    for url in seen_urls:
        self.seen_urls[url] = True
def __init__(self, *args, **kwargs):
    """Initialise the extractor, accepting two extra keyword options.

    ``allow_range`` and ``deny_range`` are removed from ``kwargs`` and
    stored on the instance under the same names, each defaulting to
    ``None`` when absent. The remaining positional and keyword arguments
    are passed straight through to ``SgmlLinkExtractor.__init__``.
    """
    # Pop the subclass-only options first so the parent initialiser never
    # receives keywords that are not forwarded to it.
    for option in ('allow_range', 'deny_range'):
        setattr(self, option, kwargs.pop(option, None))

    SgmlLinkExtractor.__init__(self, *args, **kwargs)
def __init__(self, allow, restrict_xpaths=()):
    """Initialise the extractor with a reduced set of options.

    Only ``allow`` (required) and ``restrict_xpaths`` (defaults to an empty
    tuple) are exposed. Both are forwarded by keyword to
    ``BuggySgmlLinkExtractor.__init__``; no other options are passed.
    """
    forwarded = dict(allow=allow, restrict_xpaths=restrict_xpaths)
    BuggySgmlLinkExtractor.__init__(self, **forwarded)