def test_url_plugin(self):
    conf_urls = {'this_wont_be_crawled': True}
    c = Crawler('/', conf_urls=conf_urls)
    c.run()
    logs = open('crawler_log')
    output = logs.read()
    self.assertTrue(output.find(
        'These patterns were not matched during the crawl: '
        'this_wont_be_crawled') != -1)
def test_time_plugin(self):
    # This isn't testing much, but we can't know in advance how long
    # the crawl will take.
    c = Crawler('/')
    c.run()
    logs = open('crawler_log')
    output = logs.read()
    self.assertTrue(output.find('Time taken:') != -1)
def test_memory_plugin(self):
    from test_utils.crawler.plugins.memory_plugin import Memory
    Memory.active = True
    c = Crawler('/')
    c.run()
    logs = open('crawler_log')
    output = logs.read()
    self.assertTrue(output.find('Memory consumed:') != -1)
class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option('-p', '--pdb', action='store_true', dest='pdb',
                    default=False,
                    help='Pass -p to drop into pdb on an error'),
        make_option('-f', '--fixture', action='store_true', dest='fixtures',
                    default=False,
                    help='Pass -f to create a fixture for the data.'),
        make_option('-s', '--safe', action='store_true', dest='html',
                    default=False,
                    help='Pass -s to check for html fragments in your pages.'),
        make_option('-t', '--time', action='store_true', dest='time',
                    default=False,
                    help='Pass -t to time your requests.'),
        make_option('-r', '--response', action='store_true', dest='response',
                    default=False,
                    help='Pass -r to store the response objects.'),
        #TODO
        make_option('-e', '--each', action='store', dest='each', type='int',
                    help='TODO: Pass -e NUM to specify how many times each '
                         'URLConf entry should be hit.'),
    )
    help = "Crawls the site, starting from the given relative start url."
    args = "[relative start url]"

    def handle(self, *args, **options):
        verbosity = int(options.get('verbosity', 1))

        if len(args) > 1:
            raise CommandError('Only one start url is currently supported.')
        else:
            start_url = args[0] if args else '/'

        if settings.ADMIN_FOR:
            settings_modules = [__import__(m, {}, {}, [''])
                                for m in settings.ADMIN_FOR]
        else:
            settings_modules = [settings]

        conf_urls = {}
        for settings_mod in settings_modules:
            try:
                urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
            except Exception as e:
                print("Error occurred while trying to load %s: %s"
                      % (settings_mod.ROOT_URLCONF, str(e)))
                continue

            view_functions = extract_views_from_urlpatterns(
                urlconf.urlpatterns)
            for (func, regex) in view_functions:
                # Get the function name and add it to the hash of URLConf urls.
                func_name = (hasattr(func, '__name__') and func.__name__
                             or repr(func))
                conf_urls[regex] = [func.__module__, func_name]

        # Now we have all of our URLs to test.
        c = Crawler(start_url, conf_urls=conf_urls, verbosity=verbosity)
        c.run()
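# extract_views_from_urlpatterns is imported from elsewhere in the project
# and is not shown in this section. Below is a rough sketch of the
# (func, regex) form used above, assuming the RegexURLPattern /
# RegexURLResolver API of Django of this era; the project's real helper
# may differ in details.
def extract_views_from_urlpatterns(urlpatterns, base=''):
    """Recursively flatten urlpatterns into (callback, regex) pairs."""
    views = []
    for p in urlpatterns:
        if hasattr(p, 'url_patterns'):
            # An include()/resolver: recurse, prefixing the parent regex.
            views.extend(extract_views_from_urlpatterns(
                p.url_patterns, base + p.regex.pattern))
        elif hasattr(p, 'callback') and p.callback:
            views.append((p.callback, base + p.regex.pattern))
    return views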
class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option('-p', '--pdb', action='store_true', dest='pdb',
                    default=False,
                    help='Pass -p to drop into pdb on an error'),
        make_option('-d', '--depth', action='store', dest='depth',
                    default=3,
                    help='Specify the depth to crawl.'),
        make_option('-s', '--safe', action='store_true', dest='html',
                    default=False,
                    help='Pass -s to check for html fragments in your pages.'),
        make_option('-r', '--response', action='store_true', dest='response',
                    default=False,
                    help='Pass -r to store the response objects.'),
        make_option('-t', '--time', action='store_true', dest='time',
                    default=False,
                    help='Pass -t to time your requests.'),
        make_option('--enable-plugin', action='append', dest='plugins',
                    default=[],
                    help='Enable the specified plugin'),
        make_option('-o', '--output-dir', action='store', dest='output_dir',
                    default=None,
                    help='If specified, store plugin output in the provided '
                         'directory'),
        make_option('--no-parent', action='store_true', dest='no_parent',
                    default=False,
                    help='Do not crawl URLs which do not start with your '
                         'base URL'),
        make_option('-a', '--auth', action='store', dest='auth',
                    default=None,
                    help='Authenticate (login:user,password:secret) '
                         'before crawl'),
    )
    help = "Crawls the site, starting from the given relative start url."
    args = "[relative start url]"

    def handle(self, *args, **options):
        verbosity = int(options.get('verbosity', 1))
        depth = int(options.get('depth', 3))
        auth = _parse_auth(options.get('auth'))

        if verbosity > 1:
            log_level = logging.DEBUG
        elif verbosity:
            log_level = logging.INFO
        else:
            log_level = logging.WARN

        crawl_logger = logging.getLogger('crawler')
        crawl_logger.setLevel(logging.DEBUG)
        crawl_logger.propagate = 0

        log_stats = LogStatsHandler()
        crawl_logger.addHandler(log_stats)

        console = logging.StreamHandler()
        console.setLevel(log_level)
        console.setFormatter(logging.Formatter(
            "%(name)s [%(levelname)s] %(module)s: %(message)s"))
        crawl_logger.addHandler(console)

        if len(args) > 1:
            raise CommandError('Only one start url is currently supported.')
        else:
            start_url = args[0] if args else '/'

        if getattr(settings, 'ADMIN_FOR', None):
            settings_modules = [__import__(m, {}, {}, [''])
                                for m in settings.ADMIN_FOR]
        else:
            settings_modules = [settings]

        conf_urls = {}

        # Build the list of URLs to test from the urlpatterns:
        for settings_mod in settings_modules:
            try:
                urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
            except Exception as e:
                logging.exception("Error occurred while trying to load %s: %s",
                                  settings_mod.ROOT_URLCONF, str(e))
                continue

            view_functions = extract_views_from_urlpatterns(
                urlconf.urlpatterns)
            for (func, regex, namespace, name) in view_functions:
                # Get the function name and add it to the hash of URLConf urls.
                func_name = (hasattr(func, '__name__') and func.__name__
                             or repr(func))
                conf_urls[regex] = [func.__module__, func_name]

        c = Crawler(
            start_url,
            conf_urls=conf_urls,
            verbosity=verbosity,
            output_dir=options.get("output_dir"),
            ascend=not options.get("no_parent"),
            auth=auth,
        )

        # Load plugins:
        for p in options['plugins']:
            # This nested try is somewhat unsightly, but it allows the short
            # Pythonic usage ("--enable-plugin=tidy") instead of the
            # Java-esque "--enable-plugin=test_utils.crawler.plugins.tidy".
            try:
                try:
                    plugin_module = __import__(p)
                except ImportError:
                    if "." not in p:
                        plugin_module = __import__(
                            "test_utils.crawler.plugins.%s" % p,
                            fromlist=["test_utils.crawler.plugins"])
                    else:
                        raise
                c.plugins.append(plugin_module.PLUGIN())
            except (ImportError, AttributeError) as e:
                crawl_logger.critical("Unable to load plugin %s: %s", p, e)
                sys.exit(3)
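# The loader above only requires that a plugin module expose a PLUGIN
# attribute it can instantiate. A minimal sketch of such a module
# (enabled with "--enable-plugin=example"); the hook method name and
# signature a real crawler would call on the instance are assumptions.
import logging


class ExamplePlugin(object):
    """Hypothetical plugin that just logs each URL handed to it."""

    def __init__(self):
        self.log = logging.getLogger('crawler.example')

    def urls_parsed(self, urls, **kwargs):
        # Assumed hook; the real crawler defines the actual contract.
        for url in urls:
            self.log.debug("Parsed URL: %s", url)


# The loader does plugin_module.PLUGIN(), so the module must expose PLUGIN:
PLUGIN = ExamplePlugin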
def test_relative_crawling(self):
    c = Crawler('/1')
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})
def test_basic_crawling(self):
    c = Crawler('/')
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
# Load plugins:
for p in options['plugins']:
    # This nested try is somewhat unsightly, but it allows the short
    # Pythonic usage ("--enable-plugin=tidy") instead of the Java-esque
    # "--enable-plugin=test_utils.crawler.plugins.tidy".
    try:
        try:
            plugin_module = __import__(p)
        except ImportError:
            if "." not in p:
                plugin_module = __import__(
                    "test_utils.crawler.plugins.%s" % p,
                    fromlist=["test_utils.crawler.plugins"])
            else:
                raise
        plugin_module.active = True
    except ImportError as e:
        crawl_logger.critical("Unable to load plugin %s: %s", p, e)
        sys.exit(3)

c = Crawler(start_url, conf_urls=conf_urls, verbosity=verbosity)
c.run(max_depth=depth)

# We'll exit with a non-zero status if we had any errors.
max_log_level = max(log_stats.stats.keys())
if max_log_level >= logging.ERROR:
    sys.exit(2)
elif max_log_level >= logging.WARNING:
    sys.exit(1)
else:
    sys.exit(0)
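# LogStatsHandler is used above but not defined in this section. A minimal
# sketch consistent with the exit-code logic (stats must map log levels to
# counts, since max(log_stats.stats.keys()) is compared against
# logging.ERROR); the real implementation may track more than this.
import logging
from collections import defaultdict


class LogStatsHandler(logging.Handler):
    """Tally how many records were emitted at each log level."""

    def __init__(self):
        logging.Handler.__init__(self)
        self.stats = defaultdict(int)

    def emit(self, record):
        self.stats[record.levelno] += 1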
def handle(self, *args, **options):
    verbosity = int(options.get('verbosity', 1))
    depth = int(options.get('depth', 3))
    auth = _parse_auth(options.get('auth'))

    if verbosity > 1:
        log_level = logging.DEBUG
    elif verbosity:
        log_level = logging.INFO
    else:
        log_level = logging.WARN

    crawl_logger = logging.getLogger('crawler')
    crawl_logger.setLevel(logging.DEBUG)
    crawl_logger.propagate = 0

    log_stats = LogStatsHandler()
    crawl_logger.addHandler(log_stats)

    console = logging.StreamHandler()
    console.setLevel(log_level)
    console.setFormatter(logging.Formatter(
        "%(name)s [%(levelname)s] %(module)s: %(message)s"))
    crawl_logger.addHandler(console)

    if len(args) > 1:
        raise CommandError('Only one start url is currently supported.')
    else:
        start_url = args[0] if args else '/'

    if getattr(settings, 'ADMIN_FOR', None):
        settings_modules = [__import__(m, {}, {}, [''])
                            for m in settings.ADMIN_FOR]
    else:
        settings_modules = [settings]

    conf_urls = {}

    # Build the list of URLs to test from the urlpatterns:
    for settings_mod in settings_modules:
        try:
            urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
        except Exception as e:
            logging.exception("Error occurred while trying to load %s: %s",
                              settings_mod.ROOT_URLCONF, str(e))
            continue

        view_functions = extract_views_from_urlpatterns(
            urlconf.urlpatterns)
        for (func, regex, namespace, name) in view_functions:
            # Get the function name and add it to the hash of URLConf urls.
            func_name = (hasattr(func, '__name__') and func.__name__
                         or repr(func))
            conf_urls[regex] = [func.__module__, func_name]

    c = Crawler(
        start_url,
        conf_urls=conf_urls,
        verbosity=verbosity,
        output_dir=options.get("output_dir"),
        ascend=not options.get("no_parent"),
        auth=auth,
    )

    # Load plugins:
    for p in options['plugins']:
        # This nested try is somewhat unsightly, but it allows the short
        # Pythonic usage ("--enable-plugin=tidy") instead of the Java-esque
        # "--enable-plugin=test_utils.crawler.plugins.tidy".
        try:
            try:
                plugin_module = __import__(p)
            except ImportError:
                if "." not in p:
                    plugin_module = __import__(
                        "test_utils.crawler.plugins.%s" % p,
                        fromlist=["test_utils.crawler.plugins"])
                else:
                    raise
            c.plugins.append(plugin_module.PLUGIN())
        except (ImportError, AttributeError) as e:
            crawl_logger.critical("Unable to load plugin %s: %s", p, e)
            sys.exit(3)

    c.run(max_depth=depth)

    # We'll exit with a non-zero status if we had any errors.
    max_log_level = max(log_stats.stats.keys())
    if max_log_level >= logging.ERROR:
        sys.exit(2)
    elif max_log_level >= logging.WARNING:
        sys.exit(1)
    else:
        sys.exit(0)
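# _parse_auth is called above but not shown. A sketch derived only from the
# --auth option's help text ('login:user,password:secret'); the project's
# actual parser may validate or transform these values differently.
def _parse_auth(auth):
    """Turn 'login:user,password:secret' into a dict, or None if unset."""
    if not auth:
        return None
    return dict(part.split(':', 1) for part in auth.split(','))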
def test_basic_crawling(self):
    conf_urls = {}
    verbosity = 1
    c = Crawler('/', conf_urls=conf_urls, verbosity=verbosity)
    c.run()
    self.assertEqual(c.crawled, {'/': True, u'/1': True, u'/2': True})
def test_relative_crawling(self):
    conf_urls = {}
    verbosity = 1
    c = Crawler('/1', conf_urls=conf_urls, verbosity=verbosity)
    c.run()
    self.assertEqual(c.crawled, {u'/1': True})
class Command(BaseCommand):
    option_list = BaseCommand.option_list + (
        make_option('-p', '--pdb', action='store_true', dest='pdb',
                    default=False,
                    help='Pass -p to drop into pdb on an error'),
        make_option('-f', '--fixture', action='store_true', dest='fixtures',
                    default=False,
                    help='Pass -f to create a fixture for the data.'),
        make_option('-s', '--safe', action='store_true', dest='html',
                    default=False,
                    help='Pass -s to check for html fragments in your pages.'),
        make_option('-t', '--time', action='store_true', dest='time',
                    default=False,
                    help='Pass -t to time your requests.'),
        make_option('-r', '--response', action='store_true', dest='response',
                    default=False,
                    help='Pass -r to store the response objects.'),
        #TODO
        make_option('-e', '--each', action='store', dest='each', type='int',
                    help='TODO: Pass -e NUM to specify how many times each '
                         'URLConf entry should be hit.'),
    )
    help = "Crawls the site, starting from the given relative start url."
    args = "[relative start url]"

    def handle(self, *args, **options):
        verbosity = int(options.get('verbosity', 1))

        if verbosity > 1:
            log_level = logging.DEBUG
        elif verbosity:
            log_level = logging.INFO
        else:
            log_level = logging.WARN

        crawl_logger = logging.getLogger('crawler')
        crawl_logger.setLevel(logging.DEBUG)
        crawl_logger.propagate = 0

        log_stats = LogStatsHandler()
        crawl_logger.addHandler(log_stats)

        console = logging.StreamHandler()
        console.setLevel(log_level)
        console.setFormatter(logging.Formatter(
            '%(name)-12s: %(levelname)-8s %(message)s'))
        crawl_logger.addHandler(console)

        if len(args) > 1:
            raise CommandError('Only one start url is currently supported.')
        else:
            start_url = args[0] if args else '/'

        if settings.ADMIN_FOR:
            settings_modules = [__import__(m, {}, {}, [''])
                                for m in settings.ADMIN_FOR]
        else:
            settings_modules = [settings]

        conf_urls = {}
        for settings_mod in settings_modules:
            try:
                urlconf = __import__(settings_mod.ROOT_URLCONF, {}, {}, [''])
            except Exception as e:
                logging.exception("Error occurred while trying to load %s: %s",
                                  settings_mod.ROOT_URLCONF, str(e))
                continue

            view_functions = extract_views_from_urlpatterns(
                urlconf.urlpatterns)
            for (func, regex) in view_functions:
                # Get the function name and add it to the hash of URLConf urls.
                func_name = (hasattr(func, '__name__') and func.__name__
                             or repr(func))
                conf_urls[regex] = [func.__module__, func_name]

        # Now we have all of our URLs to test.
        c = Crawler(start_url, conf_urls=conf_urls, verbosity=verbosity)
        c.run()

        # We'll exit with a non-zero status if we had any errors.
        max_log_level = max(log_stats.stats.keys())
        if max_log_level >= logging.ERROR:
            sys.exit(2)
        elif max_log_level >= logging.WARNING:
            sys.exit(1)
        else:
            sys.exit(0)
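# Typical invocations of the command above, assuming it is installed as a
# management command named 'crawlurls' (the name is an assumption; it is
# whatever filename the module has under management/commands/):
#
#   python manage.py crawlurls            # crawl starting from '/'
#   python manage.py crawlurls /blog/ -t  # start at /blog/, time requests
#   python manage.py crawlurls / -p       # drop into pdb on errors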