Пример #1
0
 def page_crawled(self, response, links):
     """Flag ``response`` as crawled and score the links that have no state.

     Links whose ``meta['state']`` is not ``None`` are skipped untouched.
     Every other link is resolved through ``self.canonicalsolver``, scored
     with ``self.get_score`` on its canonical URL, and then tagged with the
     ``QUEUED`` state id.

     Returns a dict mapping each processed link's fingerprint to its score.
     """
     response.meta['state'] = _state.get_id('CRAWLED')
     link_scores = {}
     for candidate in links:
         if candidate.meta['state'] is not None:
             # This link already carries a state; leave it alone.
             continue
         canonical_url, fp, _ = self.canonicalsolver.get_canonical_url(candidate)
         link_scores[fp] = self.get_score(canonical_url)
         candidate.meta['state'] = _state.get_id('QUEUED')
     return link_scores
Пример #2
0
 def page_crawled(self, response, links):
     """Record the crawl of ``response`` and score its stateless links.

     The response's ``meta['state']`` is set to the ``CRAWLED`` state id.
     Each link whose ``meta['state']`` is ``None`` gets a score from
     ``self.get_score`` (keyed by the link's fingerprint) and is then moved
     to the ``QUEUED`` state id.

     Returns the fingerprint -> score dict.
     """
     response.meta['state'] = _state.get_id('CRAWLED')
     # Lazy filter: the state check runs per link as the loop advances.
     stateless = (item for item in links if item.meta['state'] is None)
     result = {}
     for item in stateless:
         solved = self.canonicalsolver.get_canonical_url(item)
         target_url, key, _ = solved
         result[key] = self.get_score(target_url)
         item.meta['state'] = _state.get_id('QUEUED')
     return result
Пример #3
0
 def add_seeds(self, seeds):
     """Give every seed without a state the score 1.0 and queue it.

     Seeds whose ``meta['state']`` is not ``None`` are ignored. For the
     rest, the fingerprint comes from ``self.canonicalsolver`` and the
     seed's state becomes the ``QUEUED`` state id.

     Returns a dict mapping seed fingerprint to ``1.0``.
     """
     seed_scores = {}
     for entry in seeds:
         if entry.meta['state'] is not None:
             continue
         # Only the fingerprint is needed; the URL and third item are unused.
         _url, fp, _ = self.canonicalsolver.get_canonical_url(entry)
         seed_scores[fp] = 1.0
         entry.meta['state'] = _state.get_id('QUEUED')
     return seed_scores
Пример #4
0
 def add_seeds(self, seeds):
     """Score stateless seeds with 1.0 and mark them as queued.

     A seed is processed only when its ``meta['state']`` is ``None``; its
     fingerprint is obtained from ``self.canonicalsolver`` and its state is
     then set to the ``QUEUED`` state id.

     Returns the fingerprint -> ``1.0`` dict for the processed seeds.
     """
     # Lazy filter: each seed's state is checked as the loop reaches it.
     fresh_seeds = (s for s in seeds if s.meta['state'] is None)
     result = {}
     for s in fresh_seeds:
         solved = self.canonicalsolver.get_canonical_url(s)
         _url, key, _ = solved
         result[key] = 1.0
         s.meta['state'] = _state.get_id('QUEUED')
     return result
Пример #5
0
    def page_crawled(self, response, links):
        """Record a crawled page and decide whether to follow its links.

        The response is marked with the ``CRAWLED`` state id. If its meta
        holds a ``'p_score'``, that score is passed to
        ``self.classifier.classify_paragraphs``; a truthy answer stores the
        page in ``self.results`` and enables "drill down" for its links.

        Each link whose ``meta['state']`` is ``None`` is then either:

        * scored by an inverse function of its URL path (segment count plus
          a tenth of the path length) and set to ``self.S_QUEUED`` when
          drilling down, or
        * given a ``None`` score and set to ``self.S_NOT_CRAWLED`` otherwise.

        ``self.stats['downloaded']`` grows by one and
        ``self.stats['scheduled']`` by the number of links processed.

        Returns a dict mapping link fingerprint to score (or ``None``).
        """
        response.meta['state'] = _state.get_id('CRAWLED')
        page_url, page_fp, _ = self.canonicalsolver.get_canonical_url(response)

        # Without a paragraph score there is nothing to classify.
        drill_down = False
        if 'p_score' in response.meta:
            page_score = response.meta['p_score']
            drill_down = self.classifier.classify_paragraphs(page_score)
            if drill_down:
                self.results[page_fp] = [
                    page_score,
                    page_url,
                    response.meta['title'],
                    response.meta['descr'],
                    response.meta['keywords'],
                ]
                self.results_collected += 1

        link_scores = {}
        processed = 0
        for link in links:
            if link.meta['state'] is not None:
                # This link already has a state; do not touch or count it.
                continue
            link_url, link_fp, _ = self.canonicalsolver.get_canonical_url(link)
            if not drill_down:
                link_scores[link_fp] = None
                link.meta['state'] = self.S_NOT_CRAWLED
            else:
                parsed = urlparse(link_url)
                segment_count = len(parsed.path.split('/'))
                # More segments and longer paths both lower the score.
                link_scores[link_fp] = 1.0 / (segment_count + len(parsed.path)*0.1)
                link.meta['state'] = self.S_QUEUED
            processed += 1

        self.stats['downloaded'] += 1
        self.stats['scheduled'] += processed
        return link_scores
Пример #6
0
 def page_error(self, request, error):
     """Mark ``request`` as errored and give its fingerprint the score 0.0.

     ``error`` is accepted but not used here.

     Returns a single-entry dict ``{fingerprint: 0.0}``.
     """
     # Only the fingerprint is needed from the canonical solver's answer.
     _url, fp, _ = self.canonicalsolver.get_canonical_url(request)
     request.meta['state'] = _state.get_id('ERROR')
     failed_score = {}
     failed_score[fp] = 0.0
     return failed_score
Пример #7
0
 def page_error(self, request, error):
     """Set the ``ERROR`` state id on ``request`` and score it 0.0.

     The fingerprint is taken from ``self.canonicalsolver``; the ``error``
     argument is not inspected.

     Returns a dict with the request fingerprint mapped to ``0.0``.
     """
     solved = self.canonicalsolver.get_canonical_url(request)
     _url, key, _ = solved
     request.meta['state'] = _state.get_id('ERROR')
     return dict([(key, 0.0)])