def __init__(self, url, depth, index):
    """Initialize a depth-limited indexing crawler rooted at *url*.

    Args:
        url: Seed URL the crawl starts from.
        depth: Maximum crawl depth; links deeper than this are not queued.
        index: Mapping used to accumulate the word index being built.
    """
    Crawler.__init__(self, links=[url])
    # Characters accepted as "text": Cyrillic + Latin letters, space, hyphen.
    # BUG FIX: the uppercase Cyrillic set was missing 'Ы' (its lowercase
    # counterpart 'ы' was already present), so uppercase words containing it
    # were split incorrectly by the cleaner regex.
    re_text = u"""[абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-]+"""
    # Compile once here; the pattern is reused for every crawled page.
    self._clear_text_re = re.compile(re_text)
    self._index = index
    # -1 means "no page visited yet"; crawl() reads this before the first visit.
    self._current_depth = -1
    # Map each queued link to the depth at which it was discovered.
    self._depths = {self.next: 0}
    self._max_depth = depth
def priority(self, link, method=DEPTH):
    """Rank *link* for crawling; URLs carrying a querystring are dropped.

    A URL containing '?' gets priority 0.0 (never crawled); anything else
    is ranked by the base Crawler, i.e. by DEPTH or BREADTH crawl mode.
    """
    has_querystring = "?" in link.url
    if not has_querystring:
        # Defer to the default ranker for plain URLs.
        return Crawler.priority(self, link, method)
    return 0.0
def __init__(self, links=None, domains=None, delay=20.0, parse=HTMLLinkParser().parse, sort=FIFO):
    """Initialize a results crawler seeded with *links*.

    Args:
        links: Seed URLs; the first one becomes the crawl root. Defaults
            to an empty list (note: an empty seed list makes ``links[0]``
            raise IndexError, exactly as before).
        domains: Domains the crawler is restricted to.
        delay: Politeness delay between requests, in seconds.
        parse: Link-extraction callback (default: HTML link parser).
        sort: Queue ordering strategy for the frontier.
    """
    # BUG FIX: mutable default arguments ([]) are shared across all calls;
    # use None sentinels and create fresh lists per instance instead.
    links = [] if links is None else links
    domains = [] if domains is None else domains
    Crawler.__init__(self, links, domains, delay, parse, sort)
    # First seed link is the crawl root; register this crawl session.
    self.root_url = links[0]
    self.crawl_id = save_crawl(self.root_url)
    # Raw strings so '\w', '\d', '\Z' are regex escapes rather than invalid
    # string escapes (silences SyntaxWarning on modern Python; identical value).
    # Matches the end of rank urls like
    # http://palatinusbridge.hu/mezhon/eredmenyek/2014palaered/hetfo/ph140120.htm
    self.target_pattern = r'p\w\d{6}\.htm'
    # Matches the end of day urls like
    # http://palatinusbridge.hu/mezhon/eredmenyek/2014palaered/hetfo/
    self.day_pattern = r'[a-z]{4,9}/\Z'
    # Matches the end of year urls like .../2014palaered/
    self.year_pattern = r'[0-9]{4}palaered/\Z'
def crawl(self, method=BREADTH, **kwargs):
    """Crawl one step while tracking the depth of the page being visited.

    Returns the base crawler's result, or False when the underlying crawl
    raises — the error is reported to stdout rather than propagated.
    """
    pending = self.next
    if pending:
        # Record how deep the page we are about to visit sits.
        self._current_depth = self._depths[pending]
    print('Crawling %dth page at depth %d' % (len(self.visited), self._current_depth))
    try:
        return Crawler.crawl(self, method, **kwargs)
    except Exception as e:
        # Message is Russian for "Error while building the index".
        print('Ошибка при построении индекса: %s' % e)
        return False
def push(self, link, priority=1.0, sort=FILO):
    """Queue *link* for crawling, enforcing the depth limit.

    Links pushed from an ``__init__`` frame (the seed) bypass the limit;
    any other link is accepted only while the next depth level is still
    below ``self._max_depth``, and its depth is recorded first.
    """
    # Fragile-by-design hack: identify the seed push by inspecting the name
    # of the function two frames up the call stack.
    caller_name = inspect.stack()[2][3]
    if caller_name == '__init__':
        Crawler.push(self, link, priority, sort)
        return
    child_depth = self._current_depth + 1
    if child_depth < self._max_depth:
        self._depths[link] = child_depth
        Crawler.push(self, link, priority, sort)
def priority(self, link, method=None):
    """Rank *link*: URLs flagged by the bad-link detector get a fixed 0.1.

    The detector typically matches unwanted hosts (e.g. social-network
    domains); flagged links are demoted — not dropped — while everything
    else is ranked by the base Crawler.
    """
    if not self.badLink.detect(link.url):
        return Crawler.priority(self, link, method)
    return 0.1