def run(self):
    self.total_bytes = 0
    # Fetch the page itself; bail out if the request failed.
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    # Fetch every linked page sequentially and add its size to the total.
    extractor = LinkExtractor(base_url=self.url)
    extractor.feed(html_data)
    for link in extractor.links:
        extra_data = self._get_html(url=link)
        if extra_data:
            self.total_bytes += len(extra_data)
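This first version does everything in one thread: it fetches the page, then fetches every linked page one after another. It leans on a LinkExtractor helper that is not shown in this section; a minimal sketch of what it might look like, assuming it is built on html.parser.HTMLParser and that base_url and links are the only attributes run() relies on:

import urllib.parse
from html.parser import HTMLParser


class LinkExtractor(HTMLParser):
    """Collect the absolute URL of every <a href=...> tag on a page."""

    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href')
            if href:
                # Resolve relative links against the page's own URL.
                self.links.append(urllib.parse.urljoin(self.base_url, href))

The second version parallelizes the per-link fetches, running one PageSizer per link: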
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    if self.go_ahead:
        # Spawn one worker per link; each worker fetches a single page
        # because it is created with go_ahead=False.
        extractor = LinkExtractor(base_url=self.url)
        extractor.feed(html_data)
        sizers = [PageSizer(url=link, go_ahead=False)
                  for link in extractor.links]
        for sizer in sizers:
            sizer.start()
        for sizer in sizers:
            sizer.join()
        # Threads share memory, so each worker's total can be read
        # directly once it has finished.
        for sizer in sizers:
            self.total_bytes += sizer.total_bytes
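This implies a constructor that accepts go_ahead, and a class that subclasses threading.Thread so that start() and join() exist. A minimal skeleton under those assumptions, with a plausible _get_html built on urllib.request (the timeout and error handling here are illustrative, not taken from the original):

import threading
import urllib.error
import urllib.request


class PageSizer(threading.Thread):
    """Measures one page; the top-level sizer fans out over its links."""

    def __init__(self, url, go_ahead=True):
        super().__init__()
        self.url = url
        self.go_ahead = go_ahead  # False for per-link workers
        self.total_bytes = 0

    def _get_html(self, url):
        # Return the page body as text, or None if the fetch fails.
        try:
            with urllib.request.urlopen(url, timeout=10) as response:
                return response.read().decode('utf-8', errors='replace')
        except (urllib.error.URLError, ValueError):
            return None

    # run() is the threaded version shown above.

Reading sizer.total_bytes after join() works here only because threads share the parent's memory. Moving to multiprocessing breaks that assumption: each worker process gets its own copy of total_bytes, so the parent must collect results through a queue instead: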
def run(self):
    self.total_bytes = 0
    html_data = self._get_html(url=self.url)
    if html_data is None:
        return
    self.total_bytes += len(html_data)
    if self.go_ahead:
        extractor = LinkExtractor(base_url=self.url)
        extractor.feed(html_data)
        # Processes do not share memory, so workers report their totals
        # through a queue instead of an attribute.
        collector = multiprocessing.Queue()
        sizers = [PageSizer(url=link, go_ahead=False, collector=collector)
                  for link in extractor.links]
        for sizer in sizers:
            sizer.start()
        for sizer in sizers:
            sizer.join()
        while not collector.empty():
            data = collector.get()
            self.total_bytes += data['total_bytes']
    # Every sizer, worker or top-level, reports its own total upstream.
    self.collector.put(dict(url=self.url, total_bytes=self.total_bytes))
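Note that the final put() sits outside the if self.go_ahead: block, so the per-link workers report their totals through the shared queue too, and the top-level sizer reports its grand total through its own collector. A sketch of the assumed multiprocessing.Process skeleton and a driver (the URL is a placeholder):

import multiprocessing


class PageSizer(multiprocessing.Process):
    """Measures one page; results travel back over a shared queue."""

    def __init__(self, url, go_ahead=True, collector=None):
        super().__init__()
        self.url = url
        self.go_ahead = go_ahead
        self.collector = collector
        self.total_bytes = 0

    # run() is the multiprocessing version above; _get_html() as before.


if __name__ == '__main__':
    collector = multiprocessing.Queue()
    sizer = PageSizer(url='https://example.com', collector=collector)
    sizer.start()
    # Read the result before join(): the multiprocessing docs warn that
    # joining a process whose queued data has not all been consumed
    # can deadlock.
    result = collector.get()
    sizer.join()
    print(result['url'], result['total_bytes'])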