Example #1
# Sequential version: fetch the page itself, then fetch every linked
# page one at a time, accumulating the combined size in total_bytes.
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    extractor = LinkExtractor(base_url=self.url)
    extractor.feed(html_data)
    for link in extractor.links:
        extra_data = self._get_html(url=link)
        if extra_data:
            self.total_bytes += len(extra_data)
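
For context, a minimal scaffold that these run() methods could sit on might look like the sketch below. The LinkExtractor implementation and the urllib-based _get_html helper are assumptions for illustration; only the names come from the examples themselves.

import urllib.request
from html.parser import HTMLParser
from urllib.parse import urljoin

class LinkExtractor(HTMLParser):
    # Collects absolute URLs from every <a href=...> tag in the fed HTML.
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.add(urljoin(self.base_url, value))

class PageSizer:
    def __init__(self, url):
        self.url = url
        self.total_bytes = 0

    def _get_html(self, url):
        # Return the decoded page body, or None if the fetch fails.
        # (Decoded so it can be fed to HTMLParser; len() then counts
        # characters rather than raw bytes, which is close enough here.)
        try:
            with urllib.request.urlopen(url, timeout=10) as response:
                return response.read().decode(errors='replace')
        except Exception:
            return None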
Example #2
# Threaded version: spawn one PageSizer thread per link; after join(),
# each child's total_bytes can be read directly, because threads share
# the parent's memory.
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    if self.go_ahead:
        extractor = LinkExtractor(base_url=self.url)
        extractor.feed(html_data)
        # go_ahead=False keeps the children from recursing further.
        sizers = [PageSizer(url=link, go_ahead=False) for link in extractor.links]
        for sizer in sizers:
            sizer.start()
        for sizer in sizers:
            sizer.join()
        for sizer in sizers:
            self.total_bytes += sizer.total_bytes
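
For the threaded variant to work, PageSizer would subclass threading.Thread. A plausible constructor and driver are sketched below; they are assumptions, not the original class body.

import threading

class PageSizer(threading.Thread):
    def __init__(self, url, go_ahead=True):
        super().__init__()
        self.url = url
        self.go_ahead = go_ahead   # False in children: crawl one level only
        self.total_bytes = 0

# Driver: start the top-level sizer, wait for it, then read its result.
sizer = PageSizer(url='http://www.example.com')
sizer.start()
sizer.join()
print(sizer.total_bytes)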
# Process-based version of the same method: a child process's total_bytes
# lives in a separate address space and cannot be read from the parent,
# so each sizer reports its result through a shared multiprocessing.Queue
# (assumes "import multiprocessing" at module level).
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    if self.go_ahead:
        extractor = LinkExtractor(base_url=self.url)
        extractor.feed(html_data)
        # Children write their results into this queue instead of a field.
        collector = multiprocessing.Queue()
        sizers = [PageSizer(url=link, go_ahead=False, collector=collector)
                  for link in extractor.links]
        for sizer in sizers:
            sizer.start()
        for sizer in sizers:
            sizer.join()
        while not collector.empty():
            data = collector.get()
            self.total_bytes += data['total_bytes']
    # Report this sizer's own total to the queue handed in by its parent.
    self.collector.put(dict(url=self.url, total_bytes=self.total_bytes))
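
A minimal driver for the process version, assuming PageSizer subclasses multiprocessing.Process and accepts the collector queue in its constructor (the constructor shown is a sketch, not the original code):

import multiprocessing

class PageSizer(multiprocessing.Process):
    def __init__(self, url, collector, go_ahead=True):
        super().__init__()
        self.url = url
        self.go_ahead = go_ahead
        self.collector = collector   # queue shared with the parent process
        self.total_bytes = 0

if __name__ == '__main__':
    collector = multiprocessing.Queue()
    sizer = PageSizer(url='http://www.example.com', collector=collector)
    sizer.start()
    sizer.join()
    # The top-level sizer puts exactly one aggregate result on the queue.
    result = collector.get()
    print(result['url'], result['total_bytes'])

Draining the queue only after join() is safe here because each child puts a single small dict; with larger payloads, get() should be interleaved with join() to avoid the well-known multiprocessing.Queue/join deadlock.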