示例#1
0
 def test_build_spider_config1(self):
         """Check the global CACHE option after a spider config is built.

         The defaults used here define no SPIDER_CONFIG_FOO section.
         """
         settings_module = setup_settings_file({})
         default_config.default_config = {
             'CACHE': {'backend': 'mysql'},
             'VAR1': 'val1',
         }
         global_config = build_global_config(settings_module)
         # The spider-level config is built but not inspected; the assertion
         # below reads the global config.
         # NOTE(review): confirm whether the spider config was meant to be
         # checked instead.
         build_spider_config('foo', global_config)
         expected_cache = {'backend': 'mysql'}
         self.assertEqual(global_config['CACHE'], expected_cache)
示例#2
0
 def test_build_spider_config1(self):
     """Global CACHE must equal the default after building config for 'foo'.

     No spider-specific section is present in the defaults.
     """
     module_name = setup_settings_file({})
     default_config.default_config = {
         'CACHE': {'backend': 'mysql'},
         'VAR1': 'val1',
     }
     global_cfg = build_global_config(module_name)
     # Built for its effect only: the result is never read by this test.
     # NOTE(review): confirm the assertion was not meant to target it.
     build_spider_config('foo', global_cfg)
     actual_cache = global_cfg['CACHE']
     self.assertEqual(actual_cache, {'backend': 'mysql'})
示例#3
0
 def test_build_spider_config2(self):
         """Check that SPIDER_CONFIG_FOO replaces CACHE for spider 'foo'.

         The global CACHE backend is 'mysql'; the spider section sets
         'tokyo', and the spider config is expected to report 'tokyo'.
         """
         settings_module = setup_settings_file({})
         spider_section = {
             'CACHE': {'backend': 'tokyo'},
         }
         default_config.default_config = {
             'CACHE': {'backend': 'mysql'},
             'SPIDER_CONFIG_FOO': spider_section,
         }
         global_config = build_global_config(settings_module)
         foo_config = build_spider_config('foo', global_config)
         self.assertEqual(foo_config['CACHE'], {'backend': 'tokyo'})
示例#4
0
文件: crawl.py 项目: vasia123/grab
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    """Run the spider named `spider_name` and save its final reports.

    Steps, in order: configure logging, take the per-spider lock file
    (skipped for slaves), build the global and spider configs, create and
    run the spider, then log and save the statistics.

    :param spider_name: name handed to `load_spider_class`; also used in
        the lock file name `var/run/crawl.<spider_name>.lock`.
    :param thread_number: thread count for the spider; when None the
        `GRAB_THREAD_NUMBER` option of the spider config is used.
    :param slave: forwarded to the spider; a true value also disables the
        lock file.
    :param force_url: accepted but not used by this function.
    :param settings: argument for `build_global_config`.
    :param kwargs: must contain `propagate_network_logger`, which is passed
        to `default_logging`; a missing key raises KeyError.
    :returns: None
    """
    default_logging(propagate_network_logger=kwargs['propagate_network_logger'])

    # Slave processes run without the per-spider lock file.
    if not slave:
        lock_key = 'crawl.%s' % spider_name
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])

    # Ctrl-C stops the crawl but must not prevent the reports below.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        # The context manager flushes and closes the file deterministically
        # instead of relying on garbage collection of the file object.
        with open('var/stats-%d.txt' % pid, 'wb') as out:
            out.write(stats)
示例#5
0
def grab_control(request):
    """Render the spider control form.

    The form receives `request.GET`, or None when that is empty. Choices of
    its `spider` field are the keys of the spider registry; choices of its
    `command` field come from `Spider.get_available_command_names()`.
    Each choice uses the same value for both items of the pair.

    :returns: result of `render` for 'grabstat/control_form.html'.
    """
    form = ControlForm(request.GET or None)

    def assign_choices(field_name, values):
        # Set the same pairs on the field and on its widget.
        pairs = [(value, value) for value in values]
        form.fields[field_name].choices = pairs
        form.fields[field_name].widget.choices = pairs

    registry = build_spider_registry(build_global_config())
    assign_choices('spider', registry.keys())
    assign_choices('command', Spider.get_available_command_names())

    return render(request, 'grabstat/control_form.html', {'form': form})
示例#6
0
 def test_build_spider_config2(self):
     """Spider 'foo' must see the CACHE value from SPIDER_CONFIG_FOO.

     Defaults carry a global 'mysql' backend and a spider-level 'tokyo'
     backend; the spider config is expected to hold the latter.
     """
     module_name = setup_settings_file({})
     global_cache = {'backend': 'mysql'}
     foo_overrides = {'CACHE': {'backend': 'tokyo'}}
     default_config.default_config = {
         'CACHE': global_cache,
         'SPIDER_CONFIG_FOO': foo_overrides,
     }
     global_cfg = build_global_config(module_name)
     foo_cfg = build_spider_config('foo', global_cfg)
     self.assertEqual(foo_cfg['CACHE'], {'backend': 'tokyo'})
示例#7
0
文件: views.py 项目: ArturFis/grab
def grab_control(request):
    """Build the control form and render it.

    `ControlForm` is created from `request.GET` (None when it is empty).
    The `spider` field and its widget get one choice per key of the spider
    registry; the `command` field and its widget get one choice per name
    returned by `Spider.get_available_command_names()`.

    :returns: result of `render` for 'grabstat/control_form.html'.
    """
    form = ControlForm(request.GET or None)

    registry = build_spider_registry(build_global_config())
    spider_pairs = [(name, name) for name in registry.keys()]
    spider_field = form.fields['spider']
    spider_field.choices = spider_pairs
    spider_field.widget.choices = spider_pairs

    command_names = Spider.get_available_command_names()
    command_pairs = [(name, name) for name in command_names]
    command_field = form.fields['command']
    command_field.choices = command_pairs
    command_field.widget.choices = command_pairs

    template_context = {'form': form}
    return render(request, 'grabstat/control_form.html', template_context)
示例#8
0
文件: views.py 项目: ArturFis/grab
def grab_control_api(request, command):
    """Send a command to a spider, or fetch a command result, via 'redis'.

    The spider class named by the `spider` GET parameter is loaded and
    instantiated, and a 'redis' interface is added to its controller
    before `command` is examined.

    :param request: object whose `GET` mapping supplies `spider` and,
        depending on `command`, `command` or `result_id`.
    :param command: 'put_command' or 'pop_result'.
    :returns: a dict: `{'result_id': ...}` for 'put_command';
        `{'status': 'not-ready'}` or `{'data': ..., 'error': ...}` for
        'pop_result'; `{'error': 'unknown-command'}` otherwise.
    """
    params = request.GET
    spider_cls = load_spider_class(build_global_config(), params['spider'])
    iface = spider_cls().controller.add_interface('redis')

    if command == 'put_command':
        return {'result_id': iface.put_command({'name': params['command']})}

    if command == 'pop_result':
        result = iface.pop_result(params['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        return {
            'data': result.get('data', ''),
            'error': result.get('error', ''),
        }

    return {'error': 'unknown-command'}
示例#9
0
def grab_control_api(request, command):
    """Handle one control API call for the spider named in the request.

    A spider instance is created from the class selected by the `spider`
    GET parameter and a 'redis' interface is added to its controller; this
    happens for every call, including unknown commands.

    :param request: object exposing the query parameters as `GET`.
    :param command: 'put_command' queues the command named by the
        `command` parameter; 'pop_result' fetches the result identified by
        the `result_id` parameter.
    :returns: a dict describing the outcome (see the branches below).
    """
    query = request.GET
    spider_class = load_spider_class(build_global_config(), query['spider'])
    spider = spider_class()
    redis_iface = spider.controller.add_interface('redis')

    if command == 'put_command':
        payload = {'name': query['command']}
        return {'result_id': redis_iface.put_command(payload)}

    if command == 'pop_result':
        outcome = redis_iface.pop_result(query['result_id'])
        # None means no result is available yet.
        if outcome is not None:
            return {
                'data': outcome.get('data', ''),
                'error': outcome.get('error', ''),
            }
        return {'status': 'not-ready'}

    return {'error': 'unknown-command'}
示例#10
0
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False,
         *args, **kwargs):
    """Run one spider, optionally save a report, and return its statistics.

    :param spider_name: name handed to `load_spider_class`; also used in
        the lock file name `var/run/crawl.<spider_name>.lock`.
    :param thread_number: thread count for the spider; when None the
        `GRAB_THREAD_NUMBER` option of the spider config is used.
    :param slave: forwarded to the spider; a true value disables the lock.
    :param settings: argument for `build_global_config`.
    :param network_logs: passed to `default_logging` as
        `propagate_network_logger`.
    :param disable_proxy: when true, `GRAB_PROXY_LIST` is not loaded.
    :param kwargs: `dump_spider_stats` and `stats_object` are copied onto
        the spider instance (None when absent).
    :returns: dict with keys `spider_stats` and `spider_timing`.
    """
    default_logging(propagate_network_logger=network_logs)

    # A slave process does not take the per-spider lock.
    lock_key = None if slave else 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    # Spiders may declare their own command-line options.
    if hasattr(spider_class, 'setup_extra_args'):
        extra_parser = ArgumentParser()
        spider_class.setup_extra_args(extra_parser)
        known_opts, _unknown = extra_parser.parse_known_args()
        spider_config['extra_args'] = vars(known_opts)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    # Read from kwargs but not used anywhere below.
    stat_task_object = kwargs.get('stat_task_object', None)

    network_try_limit = spider_config.getint('GRAB_NETWORK_TRY_LIMIT')
    task_try_limit = spider_config.getint('GRAB_TASK_TRY_LIMIT')
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=network_try_limit,
        task_try_limit=task_try_limit,
    )

    # Each option, when set, is expanded into keyword arguments of the
    # matching setup method.
    for option, method_name in (('GRAB_QUEUE', 'setup_queue'),
                                ('GRAB_CACHE', 'setup_cache')):
        if spider_config.get(option):
            getattr(bot, method_name)(**spider_config[option])

    if spider_config.get('GRAB_PROXY_LIST'):
        if not disable_proxy:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
        else:
            logger.debug('Proxy servers disabled via command line')

    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_options in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_options)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    # Ctrl-C stops the crawl; the reporting below still runs.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_REPORT'):
        # (list name, path template) pairs, saved in this order.
        report_lists = (
            ('fatal', '%s/fatal.txt'),
            ('task-count-rejected', '%s/task_count_rejected.txt'),
            ('network-count-rejected', '%s/network_count_rejected.txt'),
            ('task-with-invalid-url', '%s/task_with_invalid_url.txt'),
        )
        # The report is written twice: under the pid and under 'last'.
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if os.path.exists(dir_):
                clear_directory(dir_)
            else:
                os.mkdir(dir_)
            for list_name, path_template in report_lists:
                bot.save_list(list_name, path_template % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
示例#11
0
文件: cli.py 项目: signaldetect/grab
import os
from argparse import ArgumentParser
import logging
from grab.tools.lock import assert_lock
from grab.tools.logs import default_logging
import sys 

from grab.util.config import build_global_config

from grab.util.py3k_support import *

# Logger for this command-line module.
logger = logging.getLogger('grab.cli')

# Global configuration, built once at import time with the default arguments
# of build_global_config(); activate_env() reads it.
config = build_global_config()

def activate_env(env_path):
    """Execute the `bin/activate_this.py` script of a virtualenv.

    The virtualenv directory is taken from the `GRAB_ACTIVATE_VIRTUALENV`
    entry of the module-level `config`. The script runs with `__file__`
    set to its own path.

    :param env_path: not read by the current implementation.
        NOTE(review): confirm whether the directory should come from this
        argument instead of the config.
    """
    activate_script = os.path.join(config['GRAB_ACTIVATE_VIRTUALENV'],
                                   'bin/activate_this.py')
    script_globals = dict(__file__=activate_script)
    # py3 hack: there is no execfile(), so read, compile and exec instead.
    if PY3K:
        # The context manager closes the file as soon as it has been read.
        with open(activate_script) as script_file:
            source = script_file.read()
        exec(compile(source, activate_script, 'exec'), script_globals)
    else:
        execfile(activate_script, script_globals)


def setup_logging(action, level):
    """Set the root logger's level to DEBUG.

    Neither `action` nor `level` is read by the visible body.
    """
    # Removing the handlers already attached to the root logger is left
    # disabled.
    logging.getLogger().setLevel(logging.DEBUG)
示例#12
0
 def test_build_global_config3(self):
         """Check that the default CACHE option reaches the global config.

         The settings file is set up from an empty dict.
         """
         settings_module = setup_settings_file({})
         defaults = {'CACHE': {'backend': 'mysql'}}
         default_config.default_config = defaults
         global_config = build_global_config(settings_module)
         self.assertEqual(global_config['CACHE'], {'backend': 'mysql'})
示例#13
0
文件: cli.py 项目: enchantner/grab
def process_command_line():
    """Parse the command line and run the requested action script.

    The first positional argument names the action. Its module is imported
    from `grab.script.<action>` and, if that fails, from `script.<action>`
    (the current directory is put first on `sys.path`). When the module
    defines `setup_arg_parser`, it may register its own options before the
    final parse; the parsed options are then passed to the module's
    `main()` as keyword arguments.

    The process exits with status 1 when neither import succeeds.
    Exceptions raised by the action's `main()` are logged, not propagated.
    """
    # Add current directory to python path
    cur_dir = os.path.realpath(os.getcwd())
    sys.path.insert(0, cur_dir)

    process_env_option()

    parser = ArgumentParser()
    parser.add_argument('action', type=str)
    #parser.add_argument('positional_args', nargs='*')
    #parser.add_argument('-t', '--thread-number', help='Number of network threads',
    #default=1, type=int)
    parser.add_argument('--logging-level', default='debug')
    #parser.add_argument('--slave', action='store_true', default=False)
    parser.add_argument('--lock-key')
    parser.add_argument('--ignore-lock', action='store_true', default=False)
    parser.add_argument('--settings', type=str, default='settings')
    parser.add_argument('--env', type=str)

    # First pass: options of the action script are not registered yet, so
    # unknown arguments are tolerated here.
    args, _unknown = parser.parse_known_args()

    config = build_global_config()
    if config and config['GRAB_DJANGO_SETTINGS']:
        os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
        # Turn off DEBUG to prevent memory leaks
        from django.conf import settings
        settings.DEBUG = False

    # Setup logging
    logging_level = getattr(logging, args.logging_level.upper())
    #if args.positional_args:
    #command_key = '_'.join([args.action] + args.positional_args)
    #else:
    #command_key = args.action
    # TODO: enable logs
    setup_logging(args.action, logging_level, clear_handlers=True)

    # Setup action handler
    action_name = args.action
    try:
        # First, try to import script from the grab package
        action_mod = __import__('grab.script.%s' % action_name, None, None,
                                ['foo'])
    except ImportError as ex:
        # Exceptions have no `.message` attribute on Python 3; str(ex)
        # yields the error text on both Python 2 and Python 3.
        message = str(ex)
        # Stay quiet when the script itself is missing; log anything else
        # (e.g. a broken import inside the script).
        if not (message.startswith('No module named')
                and action_name in message):
            logging.error('', exc_info=ex)
        # If grab does not provide the script
        # try to import it from the current project
        try:
            action_mod = __import__('script.%s' % action_name, None, None,
                                    ['foo'])
        except ImportError as ex:
            logging.error('', exc_info=ex)
            sys.stderr.write('Could not import %s script' % action_name)
            sys.exit(1)

    # Second pass: strict parse, now including the action's own options.
    if hasattr(action_mod, 'setup_arg_parser'):
        action_mod.setup_arg_parser(parser)
    args = parser.parse_args()

    # TODO: enable lock-file processing
    #lock_key = None
    #if not args.slave:
    #if not args.ignore_lock:
    #if not args.lock_key:
    #if hasattr(action_mod, 'setup_lock_key'):
    #lock_key = action_mod.setup_lock_key(action_name, args)
    #else:
    #lock_key = command_key
    #else:
    #lock_key = args.lock_key
    #if lock_key is not None:
    #lock_path = 'var/run/%s.lock' % lock_key
    #print 'Trying to lock file: %s' % lock_path
    #assert_lock(lock_path)

    logger.debug('Executing %s action' % action_name)
    try:
        action_mod.main(**vars(args))
    except Exception as ex:
        logging.error('Unexpected exception from action handler:', exc_info=ex)
示例#14
0
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         *args, **kwargs):
    """Run one spider, save the configured reports, return its statistics.

    :param spider_name: name handed to `load_spider_class`; also used in
        the lock file name `var/run/crawl.<spider_name>.lock`.
    :param thread_number: thread count for the spider; when None the
        `GRAB_THREAD_NUMBER` option of the spider config is used.
    :param slave: forwarded to the spider; a true value disables the lock.
    :param settings: argument for `build_global_config`.
    :param network_logs: passed to `default_logging` as
        `propagate_network_logger`.
    :param kwargs: `dump_spider_stats` and `stats_object` are copied onto
        the spider instance (None when absent); other keys are ignored.
    :returns: dict with `spider_stats` (`bot.render_stats(timing=False)`)
        and `spider_timing` (`bot.render_timing()`).
    """
    default_logging(propagate_network_logger=network_logs)

    # Slave processes run without the per-spider lock file.
    if not slave:
        lock_key = 'crawl.%s' % spider_name
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    # Spiders may declare their own command-line options; arguments the
    # spider's parser does not know are ignored.
    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _unknown = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    # Ctrl-C stops the crawl; the reporting below still runs.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        # The context manager flushes and closes the file deterministically
        # instead of relying on garbage collection of the file object.
        with open('var/stats-%d.txt' % pid, 'wb') as out:
            out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
示例#15
0
 def test_build_global_config3(self):
     """Default CACHE must appear unchanged in the built global config.

     The settings file is created from an empty dict.
     """
     module_name = setup_settings_file({})
     default_config.default_config = {
         'CACHE': {'backend': 'mysql'},
     }
     global_cfg = build_global_config(module_name)
     expected_cache = {'backend': 'mysql'}
     self.assertEqual(global_cfg['CACHE'], expected_cache)
示例#16
0
文件: crawl.py 项目: Kuznitsin/grab
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         *args, **kwargs):
    """Run one spider, optionally save a report, and return its statistics.

    Options are read from the spider config under their lower-case names,
    with the old `GRAB_*` names passed as `deprecated_key`.

    :param spider_name: name handed to `load_spider_class`; also used in
        the lock file name `var/run/crawl.<spider_name>.lock`.
    :param thread_number: thread count for the spider; when None the
        `thread_number` option of the spider config is used.
    :param slave: forwarded to the spider; a true value disables the lock.
    :param settings: argument for `build_global_config`.
    :param network_logs: passed to `default_logging` as
        `propagate_network_logger`.
    :param disable_proxy: when true, the `proxy_list` option is not loaded.
    :param ignore_lock: when true, no lock file is taken.
    :param kwargs: `dump_spider_stats` and `stats_object` are copied onto
        the spider instance (None when absent); other keys are ignored.
    :returns: dict with `spider_stats` (`bot.render_stats(timing=False)`)
        and `spider_timing` (`bot.render_timing()`).
    """
    default_logging(propagate_network_logger=network_logs)

    # The lock is skipped for slaves and when explicitly ignored.
    if not ignore_lock and not slave:
        lock_key = 'crawl.%s' % spider_name
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    # Spiders may declare their own command-line options; arguments the
    # spider's parser does not know are ignored.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _unknown = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = int(spider_config.get('thread_number', deprecated_key='GRAB_THREAD_NUMBER'))

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get('network_try_limit',
                                                deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get('task_try_limit',
                                             deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    # Ctrl-C stops the crawl; the reporting below still runs.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
        # The report is written twice: under the pid and under 'last'.
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            # Only the keys are needed. Iterating the mapping directly
            # works on Python 2 and 3, unlike the Python 2-only iteritems().
            # NOTE(review): assumes bot.items is a dict-like mapping.
            for key in bot.items:
                fname_key = key.replace('-', '_')
                bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
示例#17
0
文件: cli.py 项目: enchantner/grab
def process_command_line():
    """Parse the command line and run the requested action script.

    The first positional argument names the action. Its module is imported
    from `grab.script.<action>` and, if that fails, from `script.<action>`
    (the current directory is put first on `sys.path`). When the module
    defines `setup_arg_parser`, it may register its own options before the
    final parse; the parsed options are then passed to the module's
    `main()` as keyword arguments.

    The process exits with status 1 when neither import succeeds.
    Exceptions raised by the action's `main()` are logged, not propagated.
    """
    # Add current directory to python path
    cur_dir = os.path.realpath(os.getcwd())
    sys.path.insert(0, cur_dir)

    process_env_option()

    parser = ArgumentParser()
    parser.add_argument('action', type=str)
    #parser.add_argument('positional_args', nargs='*')
    #parser.add_argument('-t', '--thread-number', help='Number of network threads',
                        #default=1, type=int)
    parser.add_argument('--logging-level', default='debug')
    #parser.add_argument('--slave', action='store_true', default=False)
    parser.add_argument('--lock-key')
    parser.add_argument('--ignore-lock', action='store_true', default=False)
    parser.add_argument('--settings', type=str, default='settings')
    parser.add_argument('--env', type=str)

    # First pass: options of the action script are not registered yet, so
    # unknown arguments are tolerated here.
    args, _unknown = parser.parse_known_args()

    config = build_global_config()
    if config and config['GRAB_DJANGO_SETTINGS']:
        os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
        # Turn off DEBUG to prevent memory leaks
        from django.conf import settings
        settings.DEBUG = False

    # Setup logging
    logging_level = getattr(logging, args.logging_level.upper())
    #if args.positional_args:
        #command_key = '_'.join([args.action] + args.positional_args)
    #else:
        #command_key = args.action
    # TODO: enable logs
    setup_logging(args.action, logging_level, clear_handlers=True)

    # Setup action handler
    action_name = args.action
    try:
        # First, try to import script from the grab package
        action_mod = __import__('grab.script.%s' % action_name, None, None, ['foo'])
    except ImportError as ex:
        # Exceptions have no `.message` attribute on Python 3; str(ex)
        # yields the error text on both Python 2 and Python 3.
        error_text = str(ex)
        script_is_missing = (error_text.startswith('No module named') and
                             action_name in error_text)
        # A missing script is expected here; anything else (e.g. a broken
        # import inside the script) is worth logging.
        if not script_is_missing:
            logging.error('', exc_info=ex)
        # If grab does not provide the script
        # try to import it from the current project
        try:
            action_mod = __import__('script.%s' % action_name, None, None, ['foo'])
        except ImportError as ex:
            logging.error('', exc_info=ex)
            sys.stderr.write('Could not import %s script' % action_name)
            sys.exit(1)

    # Second pass: strict parse, now including the action's own options.
    if hasattr(action_mod, 'setup_arg_parser'):
        action_mod.setup_arg_parser(parser)
    args = parser.parse_args()

    # TODO: enable lock-file processing
    #lock_key = None
    #if not args.slave:
        #if not args.ignore_lock:
            #if not args.lock_key:
                #if hasattr(action_mod, 'setup_lock_key'):
                    #lock_key = action_mod.setup_lock_key(action_name, args)
                #else:
                    #lock_key = command_key
            #else:
                #lock_key = args.lock_key
    #if lock_key is not None:
        #lock_path = 'var/run/%s.lock' % lock_key
        #print 'Trying to lock file: %s' % lock_path
        #assert_lock(lock_path)

    logger.debug('Executing %s action' % action_name)
    try:
        action_mod.main(**vars(args))
    except Exception as ex:
        logging.error('Unexpected exception from action handler:', exc_info=ex)
示例#18
0
文件: cli.py 项目: signaldetect/grab
import os
from argparse import ArgumentParser
import logging
from grab.tools.lock import assert_lock
from grab.tools.logs import default_logging
import sys

from grab.util.config import build_global_config

from grab.util.py3k_support import *

# Logger for this command-line module.
logger = logging.getLogger('grab.cli')

# Global configuration, built once at import time with the default arguments
# of build_global_config(); activate_env() reads it.
config = build_global_config()


def activate_env(env_path):
    """Run the `bin/activate_this.py` script of the configured virtualenv.

    The virtualenv directory comes from the `GRAB_ACTIVATE_VIRTUALENV`
    entry of the module-level `config`; the script is executed with
    `__file__` set to its own path.

    :param env_path: not read by the current implementation.
        NOTE(review): confirm whether the directory should come from this
        argument instead of the config.
    """
    activate_script = os.path.join(config['GRAB_ACTIVATE_VIRTUALENV'],
                                   'bin/activate_this.py')
    namespace = dict(__file__=activate_script)
    # py3 hack: there is no execfile(), so read, compile and exec instead.
    if PY3K:
        # Close the script file right after reading instead of leaking
        # the handle until garbage collection.
        with open(activate_script) as script_file:
            code = compile(script_file.read(), activate_script, 'exec')
        exec(code, namespace)
    else:
        execfile(activate_script, namespace)


def setup_logging(action, level):
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    #for hdl in root.handlers: