def test_accepted_hosts(self):
    """Accepted hosts must include hosts from start URLs and from -H."""
    # A single start URL: its host is accepted implicitly.
    sys.argv = ['pylinkvalidator', 'http://www.example.com/']
    config = Config()
    config.parse_cli_config()
    self.assertTrue('www.example.com' in config.accepted_hosts)

    # Hosts given via -H (bare names or full URLs) are accepted alongside
    # the hosts of the start URLs.
    sys.argv = [
        'pylinkvalidator', '-H', 'www.example.com', 'http://example.com',
        'foo.com', 'http://www.example.com/', 'baz.com']
    config = Config()
    config.parse_cli_config()
    for expected_host in (
            'www.example.com', 'example.com', 'foo.com', 'baz.com'):
        self.assertTrue(expected_host in config.accepted_hosts)
def test_accepted_hosts(self):
    """Accepted hosts must include hosts from start URLs and from -H."""
    # A single start URL: its host is accepted implicitly.
    sys.argv = ['pylinkvalidator', 'http://www.example.com/']
    config = Config()
    config.parse_cli_config()
    self.assertTrue('www.example.com' in config.accepted_hosts)

    # Hosts given via -H (bare names or full URLs) are accepted alongside
    # the hosts of the start URLs.
    sys.argv = [
        'pylinkvalidator', '-H', 'www.example.com', 'http://example.com',
        'foo.com', 'http://www.example.com/', 'baz.com']
    config = Config()
    config.parse_cli_config()
    for expected_host in (
            'www.example.com', 'example.com', 'foo.com', 'baz.com'):
        self.assertTrue(expected_host in config.accepted_hosts)
def _run_crawler_plain(
        self, crawler_class, other_options=None, url="/index.html"):
    """Crawl ``url`` with ``crawler_class`` in process mode.

    Extra CLI flags can be passed via ``other_options``. Returns the
    crawled site object.
    """
    target_url = self.get_url(url)
    # Build argv as the CLI would receive it, then parse the config.
    sys.argv = ['pylinkvalidator', "-m", "process", target_url]
    sys.argv.extend(other_options or [])
    config = Config()
    config.parse_cli_config()

    crawler = crawler_class(config, get_logger())
    crawler.crawl()
    # Multi-site runs aggregate per-site results before returning.
    if config.options.multi:
        crawler.site.collect_multi_sites()
    return crawler.site
def _run_crawler_plain(
        self, crawler_class, other_options=None, url="/index.html"):
    """Crawl ``url`` with ``crawler_class`` in process mode.

    Extra CLI flags can be passed via ``other_options``. Returns the
    crawled site object.
    """
    target_url = self.get_url(url)
    # Build argv as the CLI would receive it, then parse the config.
    sys.argv = ['pylinkvalidator', "-m", "process", target_url]
    sys.argv.extend(other_options or [])
    config = Config()
    config.parse_cli_config()

    crawler = crawler_class(config, get_logger())
    crawler.crawl()
    # Multi-site runs aggregate per-site results before returning.
    if config.options.multi:
        crawler.site.collect_multi_sites()
    return crawler.site
def test_url_file_path(self):
    """Crawling from --url-file-path visits the pages of every listed URL."""
    (fd, temp_file_path) = mkstemp()
    # mkstemp() hands back an OPEN OS-level descriptor; the original code
    # discarded it, leaking one fd per run. Close it — we reopen the path
    # with open() below.
    os.close(fd)
    try:
        url = self.get_url("/index.html")
        url2 = self.get_url("/robots.txt")
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(url + "\n")
            temp_file.write(url2 + "\n")

        sys.argv = [
            "pylinkvalidator", "-m", "process", "--url-file-path",
            temp_file_path]
        config = Config()
        config.parse_cli_config()

        crawler = ThreadSiteCrawler(config, get_logger())
        crawler.crawl()

        site = crawler.site
        self.assertEqual(12, len(site.pages))
        self.assertEqual(1, len(site.error_pages))
    finally:
        # Remove the temp file even when an assertion above fails;
        # the original only unlinked on success.
        os.unlink(temp_file_path)
def test_url_file_path(self):
    """Crawling from --url-file-path visits the pages of every listed URL."""
    (fd, temp_file_path) = mkstemp()
    # mkstemp() hands back an OPEN OS-level descriptor; the original code
    # discarded it, leaking one fd per run. Close it — we reopen the path
    # with open() below.
    os.close(fd)
    try:
        url = self.get_url("/index.html")
        url2 = self.get_url("/robots.txt")
        with open(temp_file_path, "w") as temp_file:
            temp_file.write(url + "\n")
            temp_file.write(url2 + "\n")

        sys.argv = [
            "pylinkvalidator", "-m", "process", "--url-file-path",
            temp_file_path]
        config = Config()
        config.parse_cli_config()

        crawler = ThreadSiteCrawler(config, get_logger())
        crawler.crawl()

        site = crawler.site
        self.assertEqual(12, len(site.pages))
        self.assertEqual(1, len(site.error_pages))
    finally:
        # Remove the temp file even when an assertion above fails;
        # the original only unlinked on success.
        os.unlink(temp_file_path)
def execute_from_command_line():
    """Run the crawler with the configuration taken from the command line.

    Exits with status 1 when the crawled site has errors or when any
    exception is raised during configuration or crawling.
    """
    try:
        start = time.time()
        config = Config()
        config.parse_cli_config()

        logger = configure_logger(config)
        crawler = execute_from_config(config, logger)
        stop = time.time()

        # Report on failure, or always when the user asked for it.
        if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
            report(crawler.site, config, stop - start, logger)

        if not crawler.site.is_ok:
            sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: swallow the traceback but emit the error
        # on stderr (not stdout) so scripts consuming the report are not
        # polluted, and exit non-zero. (sys.exit's SystemExit derives from
        # BaseException and is NOT caught here.)
        print(e, file=sys.stderr)
        sys.exit(1)
def execute_from_command_line():
    """Run the crawler with the configuration taken from the command line.

    Exits with status 1 when the crawled site has errors or when any
    exception is raised during configuration or crawling.
    """
    try:
        start = time.time()
        config = Config()
        config.parse_cli_config()

        logger = configure_logger(config)
        crawler = execute_from_config(config, logger)
        stop = time.time()

        # Report on failure, or always when the user asked for it.
        if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
            report(crawler.site, config, stop - start, logger)

        if not crawler.site.is_ok:
            sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: swallow the traceback but emit the error
        # on stderr (not stdout) so scripts consuming the report are not
        # polluted, and exit non-zero. (sys.exit's SystemExit derives from
        # BaseException and is NOT caught here.)
        print(e, file=sys.stderr)
        sys.exit(1)