def priority(self, link, method=DEPTH):
    """Return the crawl priority for ``link``.

    Links whose URL contains a ``?`` (i.e. a query string) get a priority
    of 0.0; every other link is ranked by ``Spider.priority``.
    """
    if "?" not in link.url:
        # No query string: fall back to the default priority ranker,
        # whose result depends on the DEPTH or BREADTH crawl mode.
        return Spider.priority(self, link, method)
    # Query-string links are ignored.
    return 0.0
def priority(self, link, method=DEPTH):
    """Return the crawl priority for ``link``.

    A link is ranked by ``Spider.priority`` only when its URL contains a
    path fragment of the form ``/<4 digits>/<3 word chars>/<2 digits>/``
    (looks like a ``/YYYY/mon/DD/`` date path -- presumably article URLs)
    and does not contain the substring ``media``. Every other link gets a
    priority of 0.0.
    """
    # Raw string so that \d and \w reach the regex engine unchanged instead
    # of being treated as (invalid) string escape sequences.
    if not re.search(r"/\d{4}/\w{3}/\d{2}/", link.url):
        return 0.0
    # URLs mentioning "media" are skipped even if they match the pattern.
    if "media" in link.url:
        return 0.0
    return Spider.priority(self, link, method)
def __init__(self, whoosh):
    """Set up the spider for www.theguardian.com and keep ``whoosh``.

    ``whoosh`` is stored unchanged on the instance as ``self.whoosh``.
    """
    seed_links = ["http://www.theguardian.com/"]
    allowed_domains = ["www.theguardian.com"]
    Spider.__init__(
        self,
        links=seed_links,
        domains=allowed_domains,
        delay=0.0,
    )
    self.whoosh = whoosh
def priority(self, link, method=DEPTH):
    """Return the crawl priority for ``link``.

    A link is ranked by ``Spider.priority`` only when its URL contains
    ``huffingtonpost.co.uk/`` followed by path segments of 4, 2 and 2
    digits (looks like a ``YYYY/MM/DD`` date -- presumably article URLs).
    Every other link gets a priority of 0.0.
    """
    # Raw string keeps \d intact for the regex engine; the dots in the host
    # name are escaped so they match a literal "." rather than any character.
    if re.search(r"huffingtonpost\.co\.uk/\d{4}/\d{2}/\d{2}/", link.url):
        return Spider.priority(self, link, method)
    return 0.0
def priority(self, link, method=DEPTH):
    """Return the crawl priority for ``link``.

    A link is ranked by ``Spider.priority`` only when its URL contains
    ``in.reuters.com/article/`` followed by path segments of 4, 2 and 2
    digits (looks like a ``YYYY/MM/DD`` date -- presumably article URLs).
    Every other link gets a priority of 0.0.
    """
    # Raw string keeps \d intact for the regex engine; the dots in the host
    # name are escaped so they match a literal "." rather than any character.
    if re.search(r"in\.reuters\.com/article/\d{4}/\d{2}/\d{2}/", link.url):
        return Spider.priority(self, link, method)
    return 0.0
def __init__(self, whoosh):
    """Set up the spider for huffingtonpost.co.uk and keep ``whoosh``.

    ``whoosh`` is stored unchanged on the instance as ``self.whoosh``.
    """
    seed_links = ["http://www.huffingtonpost.co.uk/"]
    allowed_domains = ["huffingtonpost.co.uk"]
    Spider.__init__(
        self,
        links=seed_links,
        domains=allowed_domains,
        delay=0.0,
    )
    self.whoosh = whoosh
def __init__(self, whoosh):
    """Set up the spider for in.reuters.com and keep ``whoosh``.

    ``whoosh`` is stored unchanged on the instance as ``self.whoosh``.
    """
    seed_links = ["http://in.reuters.com/"]
    allowed_domains = ["in.reuters.com"]
    Spider.__init__(
        self,
        links=seed_links,
        domains=allowed_domains,
        delay=0.0,
    )
    self.whoosh = whoosh
def __init__(self, links, domains, delay, whoosh):
    """Set up the spider with caller-supplied settings and keep ``whoosh``.

    ``links``, ``domains`` and ``delay`` are forwarded as keyword arguments
    to ``Spider.__init__``; ``whoosh`` is stored unchanged on the instance
    as ``self.whoosh``.
    """
    spider_options = {"links": links, "domains": domains, "delay": delay}
    Spider.__init__(self, **spider_options)
    self.whoosh = whoosh