Example #1
 def test_build_root_config_overwrite(self):
     cfg = build_root_config('tests.files.settings_overwrite')
     for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
         if key == 'spider_modules':
             self.assertEqual(cfg['global'][key], ['zzz'])
         else:
             self.assertEqual(cfg['global'][key], val)
Example #2
    def test_build_spider_config_empty(self):
        class TestSpider(Spider):
            pass

        root_cfg = build_root_config('test.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        self.assertEqual(cfg, DEFAULT_SPIDER_GLOBAL_CONFIG)
Example #3
    def test_build_spider_config_empty(self):
        class TestSpider(Spider):
            pass

        root_cfg = build_root_config('tests.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        self.assertEqual(cfg, DEFAULT_SPIDER_GLOBAL_CONFIG)
Example #4
 def test_build_root_config_overwrite(self):
     cfg = build_root_config('test.files.settings_overwrite')
     for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
         if key == 'spider_modules':
             self.assertEqual(cfg['global'][key], ['zzz'])
         else:
             self.assertEqual(cfg['global'][key], val)
Example #5
 def test_build_spider_config_overwrite(self):
     class TestSpider(Spider):
         pass

     root_cfg = build_root_config('test.files.settings_test_spider')
     cfg = build_spider_config(TestSpider, root_cfg)
     for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
         if key == 'spider_modules':
             self.assertEqual(cfg[key], ['zzz'])
         elif key == 'thread_number':
             self.assertEqual(cfg[key], 777)
         else:
             self.assertEqual(cfg[key], val)
Example #6
    def test_setup_spider_config(self):
        class TestSpider(Spider):
            @classmethod
            def setup_spider_config(cls, config):
                config['foo'] = 'bar'

        root_cfg = build_root_config('tests.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
            if key == 'foo':
                self.assertEqual(cfg[key], 'bar')
            else:
                self.assertEqual(cfg[key], val)
Example #7
    def test_setup_spider_config(self):
        class TestSpider(Spider):
            @classmethod
            def setup_spider_config(cls, config):
                config['foo'] = 'bar'

        root_cfg = build_root_config('test.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
            if key == 'foo':
                self.assertEqual(cfg[key], 'bar')
            else:
                self.assertEqual(cfg[key], val)
Example #8
File: views.py Project: sergithon/grab
def grab_control(request):
    form = ControlForm(request.GET or None)
    spider_registry = build_spider_registry(build_root_config())
    spider_choices = [(x, x) for x in spider_registry.keys()]
    form.fields['spider'].choices = spider_choices
    form.fields['spider'].widget.choices = spider_choices

    command_choices = [(x, x) for x in Spider.get_available_command_names()]
    form.fields['command'].choices = command_choices
    form.fields['command'].widget.choices = command_choices

    context = {
        'form': form,
    }
    return render(request, 'grabstat/control_form.html', context)
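
ControlForm itself is not shown in the snippet; judging only from the field names used above, a rough, hypothetical sketch of such a form could look like this (field types and defaults are assumptions, not taken from the source):

# Hypothetical sketch of the form populated by grab_control; only the field
# names 'spider' and 'command' come from the view, everything else is assumed.
from django import forms

class ControlForm(forms.Form):
    spider = forms.ChoiceField(choices=[])
    command = forms.ChoiceField(choices=[])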
Example #9
File: views.py Project: sergithon/grab
def grab_control_api(request, command):
    args = request.GET 
    cls = load_spider_class(build_root_config(), args['spider'])
    spider = cls()
    iface = spider.controller.add_interface('redis')
    if command == 'put_command':
        result_id = iface.put_command({'name': args['command']})
        return {'result_id': result_id}
    elif command == 'pop_result':
        result = iface.pop_result(args['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        else:
            return {'data': result.get('data', ''),
                    'error': result.get('error', ''),
                    }
    else:
        return {'error': 'unknown-command'}
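
This view implements a small two-step protocol over the spider's Redis command interface: put_command queues a command and returns a result_id, and pop_result polls for its outcome. A rough sketch of driving it directly with Django's RequestFactory follows (the spider name and command are placeholders, and whatever wraps the returned dicts into JSON responses is not shown in the snippet):

# Hedged usage sketch; 'example_spider' and 'stop' are assumed values.
from django.test import RequestFactory

factory = RequestFactory()

# Step 1: queue a command and remember its result_id.
put_req = factory.get('/', {'spider': 'example_spider', 'command': 'stop'})
put_resp = grab_control_api(put_req, 'put_command')      # {'result_id': ...}

# Step 2: poll until the spider has processed the command.
poll_req = factory.get('/', {'spider': 'example_spider',
                             'result_id': put_resp['result_id']})
poll_resp = grab_control_api(poll_req, 'pop_result')      # 'not-ready' or data/error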
Example #10
def main(spider_name,
         thread_number=None,
         slave=False,
         settings='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         *args,
         **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(
            spider_config.get('network_try_limit',
                              deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(
            spider_config.get('task_try_limit',
                              deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example #11
def process_command_line():
    # Add current directory to python path
    cur_dir = os.path.realpath(os.getcwd())
    sys.path.insert(0, cur_dir)

    parser = ArgumentParser()
    parser.add_argument('action', type=str)
    parser.add_argument('--logging-level', default='debug')
    parser.add_argument('--lock-key')
    #parser.add_argument('--ignore-lock', action='store_true', default=False)
    parser.add_argument('--settings', type=str, default='settings')
    parser.add_argument('--env', type=str)
    parser.add_argument('--profile', action='store_true', default=False)

    args, trash = parser.parse_known_args()

    config = build_root_config()
    if config and config['GRAB_DJANGO_SETTINGS']:
        os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
        # Turn off DEBUG to prevent memory leaks
        from django.conf import settings
        settings.DEBUG = False

    # Setup logging
    logging_level = getattr(logging, args.logging_level.upper())
    #if args.positional_args:
        #command_key = '_'.join([args.action] + args.positional_args)
    #else:
        #command_key = args.action
    # TODO: enable logs
    setup_logging(args.action, logging_level, clear_handlers=True)

    # Setup action handler
    action_name = args.action
    try:
        # First, try to import script from the grab package
        action_mod = __import__('grab.script.%s' % action_name, None, None,
                                ['foo'])
    except ImportError as ex:
        if (unicode(ex).startswith('No module named')
                and action_name in unicode(ex)):
            pass
        else:
            logging.error('', exc_info=ex)
        # If grab does not provide the script,
        # try to import it from the current project
        try:
            action_mod = __import__('script.%s' % action_name, None, None,
                                    ['foo'])
        except ImportError as ex:
            logging.error('', exc_info=ex)
            sys.stderr.write('Could not import %s script' % action_name)
            sys.exit(1)

    if hasattr(action_mod, 'setup_arg_parser'):
        action_mod.setup_arg_parser(parser)
    args, trash = parser.parse_known_args()

    # TODO: enable lock-file processing
    #lock_key = None
    #if not args.slave:
        #if not args.ignore_lock:
            #if not args.lock_key:
                #if hasattr(action_mod, 'setup_lock_key'):
                    #lock_key = action_mod.setup_lock_key(action_name, args)
                #else:
                    #lock_key = command_key
            #else:
                #lock_key = args.lock_key
    #if lock_key is not None:
        #lock_path = 'var/run/%s.lock' % lock_key
        #print 'Trying to lock file: %s' % lock_path
        #assert_lock(lock_path)

    #logger.debug('Executing %s action' % action_name)
    try:
        if args.profile:
            import cProfile
            import pyprof2calltree
            import pstats

            profile_file = 'var/%s.prof' % action_name
            profile_tree_file = 'var/%s.prof.out' % action_name

            prof = cProfile.Profile()
            prof.runctx('action_mod.main(**vars(args))', globals(), locals())
            stats = pstats.Stats(prof)
            stats.strip_dirs()
            pyprof2calltree.convert(stats, profile_tree_file)
        else:
            action_mod.main(**vars(args))
    except Exception as ex:
        logging.error('Unexpected exception from action handler:', exc_info=ex)
Example #12
def build_spider_instance(cls, settings_module):
    root_config = build_root_config(settings_module)
    spider_config = build_spider_config(cls, root_config)
    return cls(config=spider_config)
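
A minimal usage sketch for the helper above; ExampleSpider and the 'settings' module name are placeholders rather than part of the source:

# Hedged sketch: build a configured spider instance and run it.
class ExampleSpider(Spider):
    pass

bot = build_spider_instance(ExampleSpider, 'settings')
bot.run()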
Example #13
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
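
For reference, a hypothetical direct call to this entry point; in practice the grab command-line runner gathers these arguments with argparse and invokes main(**vars(args)). The spider name and settings module below are placeholders:

# Hedged sketch: invoking the crawl entry point directly.
if __name__ == '__main__':
    result = main(
        'example_spider',            # placeholder spider name
        thread_number=2,
        settings_module='settings',  # module importable from the project root
    )
    print(result['spider_stats'])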
Example #14
File: cli.py Project: Scaurus/grab
def process_command_line():
    # Add current directory to python path
    cur_dir = os.path.realpath(os.getcwd())
    sys.path.insert(0, cur_dir)

    parser = ArgumentParser()
    parser.add_argument('action', type=str)
    parser.add_argument('--logging-level', default='debug')
    parser.add_argument('--lock-key')
    #parser.add_argument('--ignore-lock', action='store_true', default=False)
    parser.add_argument('--settings', type=str, default='settings')
    parser.add_argument('--env', type=str)
    parser.add_argument('--profile', action='store_true', default=False)

    args, trash = parser.parse_known_args()

    config = build_root_config()
    if config and config['GRAB_DJANGO_SETTINGS']:
        os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
        # Turn off DEBUG to prevent memory leaks
        from django.conf import settings
        settings.DEBUG = False

    # Setup logging
    logging_level = getattr(logging, args.logging_level.upper())
    #if args.positional_args:
        #command_key = '_'.join([args.action] + args.positional_args)
    #else:
        #command_key = args.action
    # TODO: enable logs
    setup_logging(args.action, logging_level, clear_handlers=True)

    # Setup action handler
    action_name = args.action
    try:
        # First, try to import script from the grab package
        action_mod = __import__('grab.script.%s' % action_name, None, None,
                                ['foo'])
    except ImportError as ex:
        if (unicode(ex).startswith('No module named') and
                    action_name in unicode(ex)):
            pass
        else:
            logging.error('', exc_info=ex)
        # If grab does not provide the script,
        # try to import it from the current project
        try:
            action_mod = __import__('script.%s' % action_name, None, None,
                                    ['foo'])
        except ImportError as ex:
            logging.error('', exc_info=ex)
            sys.stderr.write('Could not import %s script' % action_name)
            sys.exit(1)

    if hasattr(action_mod, 'setup_arg_parser'):
        action_mod.setup_arg_parser(parser)
    args, trash = parser.parse_known_args()

    # TODO: enable lock-file processing
    #lock_key = None
    #if not args.slave:
        #if not args.ignore_lock:
            #if not args.lock_key:
                #if hasattr(action_mod, 'setup_lock_key'):
                    #lock_key = action_mod.setup_lock_key(action_name, args)
                #else:
                    #lock_key = command_key
            #else:
                #lock_key = args.lock_key
    #if lock_key is not None:
        #lock_path = 'var/run/%s.lock' % lock_key
        #print 'Trying to lock file: %s' % lock_path
        #assert_lock(lock_path)

    #logger.debug('Executing %s action' % action_name)
    try:
        if args.profile:
            import cProfile
            import pyprof2calltree
            import pstats

            profile_file = 'var/%s.prof' % action_name
            profile_tree_file = 'var/%s.prof.out' % action_name

            prof = cProfile.Profile()
            prof.runctx('action_mod.main(**vars(args))',
                        globals(), locals())
            stats = pstats.Stats(prof)
            stats.strip_dirs()
            pyprof2calltree.convert(stats, profile_tree_file)
        else:
            action_mod.main(**vars(args))
    except Exception as ex:
        logging.error('Unexpected exception from action handler:', exc_info=ex)
Example #15
File: crawl.py Project: abael/grab
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example #16
def build_spider_instance(cls, settings_module, **kwargs):
    root_config = build_root_config(settings_module)
    spider_config = build_spider_config(cls, root_config)
    return cls(config=spider_config)
Example #17
 def test_build_root_config_minimal_settings(self):
     cfg = build_root_config('tests.files.settings_minimal')
     self.assertEqual(cfg['global'], DEFAULT_SPIDER_GLOBAL_CONFIG)
Example #18
File: crawl.py Project: bodja/grab
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, 
         disable_report=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example #19
 def test_build_root_config_minimal_settings(self):
     cfg = build_root_config('test.files.settings_minimal')
     self.assertEqual(cfg['global'], DEFAULT_SPIDER_GLOBAL_CONFIG)