예제 #1
0
 def priority(self, link, method=DEPTH):
     """Return the crawl priority for ``link``.

     A link whose URL contains a "?" (i.e. carries a query string) gets
     priority 0.0, which is how this spider ignores it. Every other link
     is ranked by ``Spider.priority``, whose result depends on the DEPTH
     or BREADTH crawl mode passed in ``method``.
     """
     has_query_string = "?" in link.url
     if not has_query_string:
         # No query string: fall back to the default priority ranker.
         return Spider.priority(self, link, method)
     return 0.0
예제 #2
0
 def priority(self, link, method=DEPTH):
     """Rank ``link``, giving 0.0 to any URL that has a query string.

     Links without a "?" in their URL are handed to the default ranker,
     ``Spider.priority``, so their priority follows the DEPTH or BREADTH
     crawl mode given by ``method``.
     """
     # Guard clause: a "?" anywhere in the URL means the link is ignored.
     if "?" in link.url:
         return 0.0
     return Spider.priority(self, link, method)
예제 #3
0
 def priority(self, link, method=DEPTH):
     """Return the crawl priority for ``link``.

     The link is only ranked by ``Spider.priority`` when its URL contains
     a path segment of the form ``/<4 digits>/<3 word chars>/<2 digits>/``
     (presumably a date such as ``/2014/jan/05/`` -- confirm against the
     target site) and does not contain the substring "media". Every other
     link gets priority 0.0.
     """
     # Raw string so that \d and \w reach the regex engine untouched instead
     # of being parsed as (invalid) Python string escapes.
     if not re.search(r"/\d{4}/\w{3}/\d{2}/", link.url):
         return 0.0
     # URLs that match the pattern but mention "media" are skipped as well.
     if re.search("media", link.url):
         return 0.0
     return Spider.priority(self, link, method)
예제 #4
0
 def __init__(self, whoosh):
     """Initialise the spider for www.theguardian.com.

     ``Spider.__init__`` is called with the Guardian front page as the only
     link, ``www.theguardian.com`` as the only domain and a delay of 0.0.
     ``whoosh`` is stored on the instance as ``self.whoosh``.
     """
     start_links = ["http://www.theguardian.com/"]
     allowed_domains = ["www.theguardian.com"]
     Spider.__init__(
         self,
         links=start_links,
         domains=allowed_domains,
         delay=0.0,
     )
     self.whoosh = whoosh
예제 #5
0
 def priority(self, link, method=DEPTH):
     """Return the crawl priority for ``link``.

     Only URLs containing ``huffingtonpost.co.uk/<4 digits>/<2 digits>/<2 digits>/``
     are ranked by ``Spider.priority``; every other link gets 0.0.
     """
     # Raw string keeps \d as a regex escape, and the dots of the host name
     # are escaped so they match a literal "." rather than any character.
     pattern = r"huffingtonpost\.co\.uk/\d{4}/\d{2}/\d{2}/"
     if re.search(pattern, link.url):
         return Spider.priority(self, link, method)
     return 0.0
예제 #6
0
 def priority(self, link, method=DEPTH):
     """Return the crawl priority for ``link``.

     Only URLs containing
     ``in.reuters.com/article/<4 digits>/<2 digits>/<2 digits>/`` are ranked
     by ``Spider.priority``; every other link gets 0.0.
     """
     # Raw string keeps \d as a regex escape, and the dots of the host name
     # are escaped so they match a literal "." rather than any character.
     pattern = r"in\.reuters\.com/article/\d{4}/\d{2}/\d{2}/"
     if re.search(pattern, link.url):
         return Spider.priority(self, link, method)
     return 0.0
예제 #7
0
 def __init__(self, whoosh):
     """Initialise the spider for huffingtonpost.co.uk.

     ``Spider.__init__`` is called with the Huffington Post UK front page as
     the only link, ``huffingtonpost.co.uk`` as the only domain and a delay
     of 0.0. ``whoosh`` is stored on the instance as ``self.whoosh``.
     """
     start_links = ["http://www.huffingtonpost.co.uk/"]
     allowed_domains = ["huffingtonpost.co.uk"]
     Spider.__init__(
         self,
         links=start_links,
         domains=allowed_domains,
         delay=0.0,
     )
     self.whoosh = whoosh
예제 #8
0
 def __init__(self, whoosh):
     """Initialise the spider for in.reuters.com.

     ``Spider.__init__`` is called with the Reuters India front page as the
     only link, ``in.reuters.com`` as the only domain and a delay of 0.0.
     ``whoosh`` is stored on the instance as ``self.whoosh``.
     """
     start_links = ["http://in.reuters.com/"]
     allowed_domains = ["in.reuters.com"]
     Spider.__init__(
         self,
         links=start_links,
         domains=allowed_domains,
         delay=0.0,
     )
     self.whoosh = whoosh
예제 #9
0
 def __init__(self, links, domains, delay, whoosh):
     """Initialise the spider with caller-supplied crawl settings.

     ``links``, ``domains`` and ``delay`` are forwarded unchanged, by keyword,
     to ``Spider.__init__``. ``whoosh`` is stored on the instance as
     ``self.whoosh``.
     """
     spider_options = {
         "links": links,
         "domains": domains,
         "delay": delay,
     }
     Spider.__init__(self, **spider_options)
     self.whoosh = whoosh