Example #1
 def spider_from(self, url, source_url=None, limit=20000):
     print('%d links crawled, %d links in queue. url %s' % (
         len(self.hrefs), len(self.queue), url))
     if url not in self.hrefs:
         self.hrefs.add(url)
         # Extract links and page metadata, then store the page as a story
         data = LinkExtractor(url,
                              source_url=source_url).extract(get_meta=True)
         data.update({
             'guid': url,
             'stories_id': hashlib.md5(url.encode('utf-8')).hexdigest()
         })
         story = self.db.addStory(data)
         limit -= 1
         # Add the inlinks to a queue so we get the closest links first
         self.queue |= {
             link['href'] for link in data['story_links']
             if link['inlink'] is True and link['href'] not in self.hrefs
             and not any(subl in link['href'] for subl in ('/topics/', ))
         }
     else:
         print('--> Already crawled')
     if self.queue and limit > 0:
         return self.spider_from(self.queue.pop(),
                                 source_url=source_url,
                                 limit=limit)
     return len(self.hrefs)
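
Note that each crawled URL adds a stack frame, so the recursive call above will hit Python's default recursion limit (roughly 1000 frames) long before limit=20000 is reached. Below is a minimal iterative sketch of the same loop; the method name is hypothetical, and it reuses the hrefs, queue, and db attributes and the LinkExtractor interface exactly as they appear in the example.

 def spider_from_iterative(self, url, source_url=None, limit=20000):
     # Same crawl as above, driven by a while-loop instead of recursion
     self.queue.add(url)
     while self.queue and limit > 0:
         current = self.queue.pop()
         if current in self.hrefs:
             continue
         self.hrefs.add(current)
         data = LinkExtractor(current, source_url=source_url).extract(get_meta=True)
         data.update({
             'guid': current,
             'stories_id': hashlib.md5(current.encode('utf-8')).hexdigest()
         })
         self.db.addStory(data)
         limit -= 1
         self.queue |= {link['href'] for link in data['story_links']
                        if link['inlink'] is True and link['href'] not in self.hrefs
                        and '/topics/' not in link['href']}
     return len(self.hrefs)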
Example #2
 def run(self):
     self.total_bytes = 0
     html_data = self._get_html(url=self.url)
     if html_data is None:
         return
     self.total_bytes += len(html_data)
     # Fetch every linked page sequentially and add its size to the total
     extractor = LinkExtractor(base_url=self.url)
     extractor.feed(html_data)
     for link in extractor.links:
         extra_data = self._get_html(url=link)
         if extra_data:
             self.total_bytes += len(extra_data)
Example #3
 def run(self):
     self.total_bytes = 0
     html_data = self._get_html(url=self.url)
     if html_data is None:
         return
     self.total_bytes += len(html_data)
     if self.go_ahead:
         # Spawn one PageSizer per extracted link; go_ahead=False keeps
         # the children from recursing any further.
         extractor = LinkExtractor(base_url=self.url)
         extractor.feed(html_data)
         sizers = [PageSizer(url=link, go_ahead=False) for link in extractor.links]
         for sizer in sizers:
             sizer.start()
         for sizer in sizers:
             sizer.join()
         # Reading sizer.total_bytes after join() works because threads share memory
         for sizer in sizers:
             self.total_bytes += sizer.total_bytes
Example #4
 def run(self):
     self.total_bytes = 0
     html_data = self._get_html(url=self.url)
     if html_data is None:
         return
     self.total_bytes += len(html_data)
     if self.go_ahead:
         extractor = LinkExtractor(base_url=self.url)
         extractor.feed(html_data)
         # Child processes cannot share attributes directly, so each one
         # reports its total through a shared multiprocessing.Queue
         collector = multiprocessing.Queue()
         sizers = [PageSizer(url=link, go_ahead=False, collector=collector)
                   for link in extractor.links]
         for sizer in sizers:
             sizer.start()
         for sizer in sizers:
             sizer.join()
         while not collector.empty():
             data = collector.get()
             self.total_bytes += data['total_bytes']
     self.collector.put(dict(url=self.url, total_bytes=self.total_bytes))
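
For context, a minimal driver for this multiprocessing variant could look like the sketch below. It assumes PageSizer is a multiprocessing.Process subclass that stores the url, go_ahead, and collector constructor arguments, exactly as the run() body above implies; the start URL is only a placeholder.

 import multiprocessing

 if __name__ == '__main__':
     collector = multiprocessing.Queue()
     root = PageSizer(url='https://www.python.org', go_ahead=True, collector=collector)
     root.start()
     root.join()
     # Only the root's aggregate is left: run() drains the children's
     # entries before putting its own total on the queue.
     while not collector.empty():
         result = collector.get()
         print('%s -> %d bytes' % (result['url'], result['total_bytes']))

The collector queue is what carries the totals across process boundaries; in the threaded variant of Example #3 the parent reads sizer.total_bytes directly because threads share memory.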
Example #5
import functools
import queue


class Provider:
    # The class name and constructor signature are inferred from the
    # Provider(queue, _base.Output('links.txt')) call below.

    def __init__(self, queue, output):
        self._output = output
        self._queue = queue
        self._used_links = set()

    def process(self, data):
        url, links = data
        fresh = set(links) - self._used_links
        self._used_links.update(fresh)
        for link in fresh:
            self._queue.put(link)
        data = (url, fresh)
        self._output.write(functools.partial(self._save, data))

    @staticmethod
    def _save(data, fp):
        url, links = data
        fp.write('--URL crawled---:' + url + '\n')
        print('URL crawled---:' + url)
        for link in links:
            fp.write('\t' + link + '\n')
            print(link)


url_queue = queue.Queue()
url_queue.put('https://github.com')

provider = Provider(url_queue, _base.Output('links.txt'))
looper = Looper(url_queue, [LinkExtractor(provider)])
looper.run(timeout=1)

url_queue.join()
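
The Provider above hands self._output.write() a callable that expects an open file object, so _base.Output presumably just manages the output file (and, with concurrent workers, a lock). A hypothetical sketch of such a class, inferred only from the usage in this example:

import threading


class Output:
    # Hypothetical stand-in for _base.Output, inferred from how it is used
    # above; the real class may be implemented differently.

    def __init__(self, path):
        self._path = path
        self._lock = threading.Lock()  # assumed: several workers may write at once

    def write(self, writer):
        # writer is a callable such as functools.partial(Provider._save, data);
        # it receives the open file object and performs the actual writing.
        with self._lock, open(self._path, 'a') as fp:
            writer(fp)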