def test_google_crawl(self):
    ''' test the Google top-10 web search crawler against the live service '''
    keywords = [['new', 'york', 'university'], ['Torsten', 'Suel'],
                ['Amazon', 'Full', 'Time'], ['Google', 'Ann', 'Arbor']]
    for key in keywords:
        gs = GoogleWebCrawler(key, fake=False)
        urls = gs.query()
        self.assertTrue(self.validate_urls(urls))

def test_google_crawl_fake(self):
    ''' test the Google crawler in fake mode, which returns canned results offline '''
    keywords = [['new', 'york', 'university'], ['Torsten', 'Suel'],
                ['Amazon', 'Full', 'Time'], ['Google', 'Ann', 'Arbor']]
    for key in keywords:
        gs = GoogleWebCrawler(key, fake=True)
        urls = gs.query()
        # fake mode returns the same fixed URL list for every query
        self.assertEqual(
            urls, ['http://engineering.nyu.edu', 'http://www.nyu.edu'])
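
The tests above call a validate_urls helper that is not part of this snippet. A minimal sketch of what it could look like, assuming it only needs to confirm that the crawler returned a non-empty list of well-formed http(s) URLs (the body here is an assumption, not the project's actual implementation):

from urllib.parse import urlparse

def validate_urls(self, urls):
    ''' return True iff urls is a non-empty list of http(s) URLs (assumed check) '''
    if not urls:
        return False
    for url in urls:
        parts = urlparse(url)
        # require an http/https scheme and a non-empty host
        if parts.scheme not in ('http', 'https') or not parts.netloc:
            return False
    return True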
Example #5
    def run(self):
        ''' run the dispatcher '''

        # crawl google web search engine
        gs = GoogleWebCrawler(self.keywords, self.args.fake)

        urls = gs.query()
        # bail out early on a network-level failure
        if not urls and gs.error > 0:
            print('Network error. Please check the network connection.')
            return

        # fall back to the Bing web crawler if Google returned nothing
        if not urls:
            bs = BingWebCrawler(self.keywords, self.args.fake)
            urls = bs.query()

        # both search engines failed to produce seed URLs
        if not urls:
            print('Seed crawl failed. Please check the network connection '
                  'or contact the author.')
            return

        # seed the work queue with the search-engine results
        self.bulk_url_enqueue(urls)

        # launch the crawler thread
        t_crawler = threading.Thread(target=self.run_page_crawler)
        t_crawler.daemon = True
        t_crawler.start()

        # launch the log writer thread
        t_logger = threading.Thread(target=self.run_log_writter)
        t_logger.daemon = True
        t_logger.start()

        # launch the progress reporter
        t_reporter = threading.Thread(target=self.run_progress_reporter)
        t_reporter.daemon = True
        t_reporter.start()

        # wait for the workers to finish
        t_crawler.join()
        t_logger.join()

        # finalize statistical metrics
        self.stats.finalize()

        # wait for the progress reporter to finish
        t_reporter.join()

        # close logger
        self.logger.close()
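
run() only references its worker methods; their bodies are not shown. A minimal sketch of the queue-draining loop that run_page_crawler presumably implements, where self.url_queue and self.fetch_page are hypothetical names standing in for whatever queue and fetch logic the real dispatcher uses:

import queue

def run_page_crawler(self):
    ''' fetch pages until the URL queue stays empty (assumed pattern) '''
    while True:
        try:
            # self.url_queue is assumed to be a queue.Queue filled by
            # bulk_url_enqueue(); the timeout lets the loop terminate
            url = self.url_queue.get(timeout=5)
        except queue.Empty:
            break  # no work left; run() can join this thread
        try:
            self.fetch_page(url)  # hypothetical per-page fetch helper
        finally:
            self.url_queue.task_done()

Because t_crawler, t_logger, and t_reporter are daemon threads, a crash in run() cannot leave them blocking interpreter shutdown; the explicit join() calls are what make the normal path wait for them.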