예제 #1
0
 def page_crawled(self, response, links):
     """Record a crawled response and score its not-yet-tracked links.

     ``response.meta['state']`` is set to the CRAWLED state id. Each link
     whose ``meta['state']`` is ``None`` is resolved through
     ``self.canonicalsolver``, scored with ``self.get_score`` and switched
     to the QUEUED state id; links that already carry a state are skipped.

     :param response: crawled response object exposing a ``meta`` dict.
     :param links: iterable of link objects, each exposing a ``meta`` dict.
     :return: dict mapping each newly queued link's fingerprint to its score.
     """
     response.meta['state'] = _state.get_id('CRAWLED')
     link_scores = {}
     for candidate in links:
         # Links that already carry a state are left untouched.
         if candidate.meta['state'] is not None:
             continue
         canonical_url, fprint, _ = self.canonicalsolver.get_canonical_url(candidate)
         link_scores[fprint] = self.get_score(canonical_url)
         candidate.meta['state'] = _state.get_id('QUEUED')
     return link_scores
예제 #2
0
 def page_crawled(self, response, links):
     """Mark the response as crawled and score its not-yet-tracked links.

     Sets ``response.meta['state']`` to the CRAWLED state id. Every link
     whose ``meta['state']`` is ``None`` is canonicalized via
     ``self.canonicalsolver``, scored with ``self.get_score`` and moved to
     the QUEUED state id. A debug line with the response URL and the links
     is printed before returning.

     :param response: crawled response; its ``meta`` and ``url`` are used.
     :param links: links extracted from the response, each with a ``meta`` dict.
     :return: dict mapping link fingerprint to its score.
     """
     scores = {}
     response.meta['state'] = _state.get_id('CRAWLED')
     for link in links:
         if link.meta['state'] is None:
             url, fingerprint, _ = self.canonicalsolver.get_canonical_url(link)
             scores[fingerprint] = self.get_score(url)
             link.meta['state'] = _state.get_id('QUEUED')
     # Debug trace. Written as a single-argument call so it parses on
     # Python 3 and still prints the identical text on Python 2.
     print("$$ inside page_crawled : value -> {0}, links -> {1}".format(response.url, links))
     return scores
예제 #3
0
 def add_seeds(self, seeds):
     """Queue seeds that have no state yet, giving each a score of 1.0.

     For every seed whose ``meta['state']`` is ``None`` the fingerprint
     returned by ``self.canonicalsolver.get_canonical_url`` is recorded
     with score ``1.0`` and the seed is switched to the QUEUED state id.
     Seeds that already carry a state are skipped.

     :param seeds: iterable of seed objects, each exposing a ``meta`` dict.
     :return: dict mapping each newly queued seed's fingerprint to ``1.0``.
     """
     seed_scores = {}
     for entry in seeds:
         # Seeds that already carry a state are left untouched.
         if entry.meta['state'] is not None:
             continue
         _url, fprint, _ = self.canonicalsolver.get_canonical_url(entry)
         seed_scores[fprint] = 1.0
         entry.meta['state'] = _state.get_id('QUEUED')
     return seed_scores
예제 #4
0
    def add_seeds(self, seeds):
        """Queue seeds that have no state yet, giving each a score of 1.0.

        For every seed whose ``meta['state']`` is ``None`` the fingerprint
        returned by ``self.canonicalsolver.get_canonical_url`` is recorded
        with score ``1.0`` and the seed is switched to the QUEUED state id.
        Debug output listing the seeds, the scores and each seed's ``meta``
        is printed before returning.

        :param seeds: iterable of seed objects, each exposing a ``meta`` dict.
        :return: dict mapping each newly queued seed's fingerprint to ``1.0``.
        """
        scores = {}
        for seed in seeds:
            if seed.meta['state'] is None:
                # Only the fingerprint is needed; the other tuple members
                # are discarded.
                _, fingerprint, _ = self.canonicalsolver.get_canonical_url(seed)
                scores[fingerprint] = 1.0
                seed.meta['state'] = _state.get_id('QUEUED')

        # Debug traces. Each is a single-argument call so it parses on
        # Python 3 and still prints the identical text on Python 2.
        print("$$ inside add_seeds : seeds -> {0}, scores -> {1}".format(seeds, scores))
        print("$$ printing seed.meta")
        # NOTE(review): `seeds` is iterated a second time here; if a caller
        # passes a one-shot iterator this loop prints nothing -- confirm
        # that callers always pass a re-iterable sequence.
        for seed in seeds:
            print(seed.meta)

        return scores
예제 #5
0
 def page_error(self, request, error):
     """Mark a failed request as errored and give it a score of 0.0.

     The request is resolved through ``self.canonicalsolver`` to obtain
     its fingerprint, ``request.meta['state']`` is set to the ERROR state
     id, and a debug line with the request URL and the error is printed.

     :param request: failed request; its ``meta`` and ``url`` are used.
     :param error: error value, used only in the debug output.
     :return: single-entry dict mapping the request fingerprint to ``0.0``.
     """
     # Only the fingerprint is needed; the other tuple members are discarded.
     _, fingerprint, _ = self.canonicalsolver.get_canonical_url(request)
     request.meta['state'] = _state.get_id('ERROR')
     # Debug trace. Written as a single-argument call so it parses on
     # Python 3 and still prints the identical text on Python 2.
     print("$$ inside page_error : url -> {0}, error_reason -> {1}".format(request.url, error))
     return {fingerprint: 0.0}
예제 #6
0
 def page_error(self, request, error):
     """Flag a failed request with the ERROR state and score it 0.0.

     The fingerprint is taken from the tuple returned by
     ``self.canonicalsolver.get_canonical_url(request)``. ``error`` is
     accepted but not used by this method.

     :param request: failed request object exposing a ``meta`` dict.
     :param error: error value (unused here).
     :return: single-entry dict mapping the request fingerprint to ``0.0``.
     """
     resolved = self.canonicalsolver.get_canonical_url(request)
     _url, fprint, _ = resolved
     request.meta['state'] = _state.get_id('ERROR')
     error_score = {fprint: 0.0}
     return error_score