Exemplo n.º 1
0
Arquivo: crawl.py Projeto: lorien/grab
def main(spider_name, thread_number=None,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs): # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,

    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
Exemplo n.º 2
0
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
Exemplo n.º 3
0
import logging
from django.contrib import messages
from grab import Grab, DataNotFound
from grab.util.log import default_logging
from .list_portals import list_portals

default_logging()
LOGGER = logging.getLogger('grab')
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)

# Init grab
GRAB = Grab()


def auth_portal(portal, log_pass, request):
    """ Authenticate selected portal """
    try:
        url_login = portal['url_auth']
        GRAB.setup(timeout=10, connect_timeout=10)
        GRAB.go(url_login, log_file='templates/grab/bug_auth.html')
        GRAB.doc.text_search(portal['auth_by'])
        try:
            GRAB.doc.set_input(portal['inp_login'], log_pass['login'])
            GRAB.doc.set_input(portal['inp_password'], log_pass['password'])
            GRAB.doc.submit()
            auth_form = GRAB.doc.text_search(portal['auth_complete'])
            if auth_form is True:
                messages.success(request, "Аутентификация прошла успешно!")
                return True
            else: