def page_crawled(self, response, links):
    """Mark ``response`` as crawled and score its not-yet-seen links.

    A link is treated as new when ``link.meta['state']`` is ``None``.
    Each new link is canonicalized through ``self.canonicalsolver``,
    scored with ``self.get_score`` and moved to the QUEUED state.

    :param response: object exposing a mutable ``meta`` mapping.
    :param links: iterable of objects exposing a ``meta`` mapping that
        contains a ``'state'`` key.
    :return: dict mapping each new link's fingerprint to its score.
    """
    response.meta['state'] = _state.get_id('CRAWLED')

    new_scores = {}
    for candidate in links:
        # Links that already carry a state have been handled before.
        if candidate.meta['state'] is not None:
            continue
        canonical_url, fp, _ = self.canonicalsolver.get_canonical_url(
            candidate)
        new_scores[fp] = self.get_score(canonical_url)
        candidate.meta['state'] = _state.get_id('QUEUED')
    return new_scores
def page_crawled(self, response, links):
    """Record a crawled response and enqueue the links it yielded.

    Sets ``response.meta['state']`` to the CRAWLED id, then, for every
    link whose ``meta['state']`` is ``None``, stores
    ``self.get_score(url)`` under the link's fingerprint and marks the
    link as QUEUED.

    :param response: object exposing a mutable ``meta`` mapping.
    :param links: iterable of link objects with a ``meta['state']`` entry.
    :return: dict of ``{fingerprint: score}`` for the newly queued links.
    """
    result = {}
    response.meta['state'] = _state.get_id('CRAWLED')

    # Lazy filter: each link is checked right before it is processed, so a
    # link object that appears twice is only handled the first time.
    pending = (item for item in links if item.meta['state'] is None)
    for item in pending:
        target_url, key, _ = self.canonicalsolver.get_canonical_url(item)
        result[key] = self.get_score(target_url)
        item.meta['state'] = _state.get_id('QUEUED')
    return result
def add_seeds(self, seeds):
    """Queue every seed whose state is still unset, scoring each at 1.0.

    Seeds whose ``meta['state']`` is not ``None`` are left untouched and
    do not appear in the result.

    :param seeds: iterable of objects exposing a ``meta`` mapping that
        contains a ``'state'`` key.
    :return: dict mapping the fingerprint of each newly queued seed to 1.0.
    """
    seed_scores = {}
    for entry in seeds:
        if entry.meta['state'] is not None:
            continue
        # Only the fingerprint of the canonical form is needed here.
        _, fp, _ = self.canonicalsolver.get_canonical_url(entry)
        seed_scores[fp] = 1.0
        entry.meta['state'] = _state.get_id('QUEUED')
    return seed_scores
def add_seeds(self, seeds):
    """Assign the fixed score 1.0 to each new seed and mark it QUEUED.

    A seed counts as new when ``seed.meta['state']`` is ``None``; any
    other seed is skipped without being canonicalized.

    :param seeds: iterable of seed objects with a ``meta['state']`` entry.
    :return: dict of ``{fingerprint: 1.0}`` for the seeds that were queued.
    """
    queued = {}
    # Lazy filter keeps the check-then-mutate order of a plain loop.
    fresh = (item for item in seeds if item.meta['state'] is None)
    for item in fresh:
        _, key, _ = self.canonicalsolver.get_canonical_url(item)
        queued[key] = 1.0
        item.meta['state'] = _state.get_id('QUEUED')
    return queued
def page_crawled(self, response, links):
    """Record a crawled page and decide whether to follow its links.

    The page is "drilled down" when ``response.meta`` carries a
    ``'p_score'`` and ``self.classifier.classify_paragraphs`` returns a
    truthy value for it. In that case the page is stored in
    ``self.results`` and each new link is scored from its URL path, so
    that shorter and shallower paths score higher. Otherwise each new
    link gets a score of ``None`` and the ``S_NOT_CRAWLED`` state.

    :param response: object with a mutable ``meta`` mapping; when
        ``'p_score'`` is present and the page is drilled down, ``'title'``,
        ``'descr'`` and ``'keywords'`` are read from it as well.
    :param links: iterable of link objects with a ``meta['state']`` entry.
    :return: dict mapping fingerprints of new links to a float score or
        ``None``.
    """
    scores = {}
    response.meta['state'] = _state.get_id('CRAWLED')
    page_url, page_fp, _ = self.canonicalsolver.get_canonical_url(response)

    drill_down = False
    if 'p_score' in response.meta:
        page_score = response.meta['p_score']
        drill_down = self.classifier.classify_paragraphs(page_score)
        if drill_down:
            self.results[page_fp] = [
                page_score,
                page_url,
                response.meta['title'],
                response.meta['descr'],
                response.meta['keywords'],
            ]
            # NOTE(review): assumed to count stored results only, i.e. to
            # sit inside this branch -- confirm against the original layout.
            self.results_collected += 1

    scheduled = 0
    for link in links:
        # Links that already carry a state were handled earlier.
        if link.meta['state'] is not None:
            continue
        link_url, link_fp, _ = self.canonicalsolver.get_canonical_url(link)
        if not drill_down:
            scores[link_fp] = None
            link.meta['state'] = self.S_NOT_CRAWLED
        else:
            parsed = urlparse(link_url)
            segment_count = len(parsed.path.split('/'))
            # split('/') always yields at least one item, so the divisor
            # is never zero.
            scores[link_fp] = 1.0 / (segment_count + len(parsed.path) * 0.1)
            link.meta['state'] = self.S_QUEUED
        # NOTE(review): the flattened source does not show how deeply this
        # counter is nested; it is placed here as "once per new link,
        # whichever branch ran" -- confirm.
        scheduled += 1

    self.stats['downloaded'] += 1
    self.stats['scheduled'] += scheduled
    return scores
def page_error(self, request, error):
    """Flag ``request`` as failed and give its fingerprint a zero score.

    :param request: object accepted by
        ``self.canonicalsolver.get_canonical_url`` and exposing a mutable
        ``meta`` mapping.
    :param error: not used by this implementation.
    :return: single-entry dict ``{fingerprint: 0.0}``.
    """
    # Only the fingerprint of the canonical form is needed.
    _, failed_fp, _ = self.canonicalsolver.get_canonical_url(request)
    request.meta['state'] = _state.get_id('ERROR')
    return {failed_fp: 0.0}