Example 1
def crawl_with_options(urls, options_dict=None, logger_builder=None):
    """Crawls URLs with provided options and logger.

    :param options_dict: Must contain the long name of the command line
            options. (optional)

    :param logger_builder: Function that will be called to instantiate a
            logger. (optional)

    :rtype: A pylinkchecker.crawler.Site instance
    """
    config = Config()
    config.parse_api_config(urls, options_dict)

    # Fall back to the default logger configuration when no builder is given.
    logger = logger_builder() if logger_builder else configure_logger(config)

    # TODO In the future, we will pass the logger builder and not the logger
    # to enable the ProcessSiteCrawler to instantiate its own custom logger.
    crawler = execute_from_config(config, logger)

    return crawler.site
Example 2
def crawl(url):
    """Crawls a URL and returns a pylinkchecker.crawler.Site instance.

    :rtype: A pylinkchecker.crawler.Site instance
    """
    # Single-URL convenience wrapper around the standard config/crawl flow.
    config = Config()
    config.parse_api_config([url])

    crawler = execute_from_config(config, configure_logger(config))
    return crawler.site
Example 3
    def _run_crawler_plain(self, crawler_class, other_options=None):
        """Runs crawler_class against the test index page in process mode."""
        target_url = self.get_url("/index.html")
        extra_args = list(other_options) if other_options else []
        # Simulate the command line invocation expected by parse_cli_config().
        sys.argv = ['pylinkchecker', "-m", "process", target_url] + extra_args

        config = Config()
        config.parse_cli_config()

        crawler = crawler_class(config, get_logger())
        crawler.crawl()
        return crawler.site
Example 4
    def _run_crawler_plain(self, crawler_class, other_options=None):
        """Crawls the test index page with crawler_class and returns the site."""
        # Build the fake command line before parsing the CLI configuration.
        argv = ['pylinkchecker', "-m", "process", self.get_url("/index.html")]
        if other_options:
            argv.extend(other_options)
        sys.argv = argv

        config = Config()
        config.parse_cli_config()

        crawler = crawler_class(config, get_logger())
        crawler.crawl()
        return crawler.site
Example 5
def execute_from_command_line():
    """Runs the crawler and retrieves the configuration from the command line.

    Exits with status 1 when the crawled site has problems or when an
    unexpected error occurs while crawling.
    """
    try:
        start = time.time()
        config = Config()
        config.parse_cli_config()

        logger = configure_logger(config)
        crawler = execute_from_config(config, logger)

        stop = time.time()

        # Report when problems were found, or always if configured to do so.
        if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
            report(crawler.site, config, stop - start, logger)

        if not crawler.site.is_ok:
            sys.exit(1)
    except Exception as e:
        # Errors go to stderr so they are not mixed with the report on
        # stdout (py2/py3-safe, unlike print(..., file=...)).
        sys.stderr.write("{0}\n".format(e))
        sys.exit(1)
Example 6
def execute_from_command_line():
    """Runs the crawler and retrieves the configuration from the command line.

    Exits with status 1 when the crawled site has problems or when an
    unexpected error occurs while crawling.
    """
    try:
        start = time.time()
        config = Config()
        config.parse_cli_config()

        logger = configure_logger(config)
        crawler = execute_from_config(config, logger)

        stop = time.time()

        # Report when problems were found, or always if configured to do so.
        if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
            report(crawler.site, config, stop - start, logger)

        if not crawler.site.is_ok:
            sys.exit(1)
    except Exception as e:
        # Errors go to stderr so they are not mixed with the report on
        # stdout (py2/py3-safe, unlike print(..., file=...)).
        sys.stderr.write("{0}\n".format(e))
        sys.exit(1)
Example 7
def execute_from_command_line():
    """Parses the config, runs the crawler and reports the results.

    Exits with status 1 when no start URL is supplied, when the crawl mode
    is invalid, or when the crawled site has problems.
    """
    start = time.time()
    config = Config()
    config.parse_config()

    if not config.start_urls:
        print("At least one starting URL must be supplied.")
        sys.exit(1)

    # Map the verbosity option to a logging level.
    if config.options.verbose == VERBOSE_QUIET:
        logging.basicConfig(level=logging.CRITICAL)
    elif config.options.verbose == VERBOSE_NORMAL:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.DEBUG)

    logger = get_logger()

    # crawler must be pre-initialized: without this, an unrecognized mode
    # raised UnboundLocalError at the "if not crawler" check below instead
    # of printing the intended error message.
    crawler = None
    if config.options.mode == MODE_THREAD:
        crawler = ThreadSiteCrawler(config, logger)
    elif config.options.mode == MODE_PROCESS:
        crawler = ProcessSiteCrawler(config, logger)
    elif config.options.mode == MODE_GREEN:
        crawler = GreenSiteCrawler(config, logger)

    if not crawler:
        print("Invalid crawling mode supplied.")
        sys.exit(1)

    crawler.crawl()

    stop = time.time()

    if not crawler.site.is_ok or config.options.when == WHEN_ALWAYS:
        report(crawler.site, config, stop - start, logger)

    if not crawler.site.is_ok:
        sys.exit(1)
Example 8
    def test_accepted_hosts(self):
        """Verifies that start URLs and -H hosts populate accepted_hosts."""
        # A lone start URL contributes its own host.
        sys.argv = ['pylinkchecker', 'http://www.example.com/']
        config = Config()
        config.parse_cli_config()
        self.assertTrue('www.example.com' in config.accepted_hosts)

        # Hosts passed via -H and every start URL are all accepted.
        sys.argv = ['pylinkchecker', '-H', 'www.example.com',
                'http://example.com', 'foo.com', 'http://www.example.com/',
                'baz.com']
        config = Config()
        config.parse_cli_config()

        expected = ('www.example.com', 'example.com', 'foo.com', 'baz.com')
        for host in expected:
            self.assertTrue(host in config.accepted_hosts)
Example 9
    def test_accepted_hosts(self):
        """Checks that accepted_hosts is built from start URLs and -H flags."""

        def parse_argv(argv):
            # Re-parse the CLI configuration for the given fake argv.
            sys.argv = argv
            cfg = Config()
            cfg.parse_cli_config()
            return cfg

        # The host of a single start URL is accepted.
        config = parse_argv(['pylinkchecker', 'http://www.example.com/'])
        self.assertTrue('www.example.com' in config.accepted_hosts)

        # -H hosts and all start-URL hosts are accepted.
        config = parse_argv(['pylinkchecker', '-H', 'www.example.com',
                'http://example.com', 'foo.com', 'http://www.example.com/',
                'baz.com'])
        for host in ('www.example.com', 'example.com', 'foo.com', 'baz.com'):
            self.assertTrue(host in config.accepted_hosts)