Example #1
# imports inferred from the other examples in this listing
from cocrawler.urls import URL
import cocrawler.dns as dns
import cocrawler.config as config


async def test_prefetch():
    url = URL('http://example.com/')

    config.config(None, None)
    resolver = dns.get_resolver()

    iplist = await dns.prefetch(url, resolver)
    assert len(iplist) > 0
    iplist2 = await dns.prefetch(url, resolver)
    assert iplist == iplist2
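test_prefetch is a coroutine, so something has to run it on an event loop; in a pytest run that is usually handled by an asyncio plugin such as pytest-asyncio (an assumption here, not stated in the listing). A standalone way to drive it, using the same run_until_complete pattern as the later examples:

import asyncio

asyncio.get_event_loop().run_until_complete(test_prefetch())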
Example #2
# imports inferred from the other examples in this listing
import os
import tempfile

import cocrawler
import cocrawler.config as config
from cocrawler.urls import URL


async def test_cocrawler(capsys):
    config.config(None, None)

    # we have to get around the useragent checks
    config.write('pytest', 'UserAgent', 'MyPrefix')
    config.write('http://example.com/pytest-test-cocrawler.py', 'UserAgent', 'URL')
    # and configure url_allowed
    config.write('AllDomains', 'Plugins', 'url_allowed')

    crawler = cocrawler.Crawler()

    crawler.add_url(0, {'url': URL('http://example1.com/')})
    crawler.add_url(0, {'url': URL('http://example2.com/')})
    crawler.add_url(0, {'url': URL('http://example3.com/')})

    assert crawler.qsize == 3

    f = tempfile.NamedTemporaryFile(delete=False)
    name = f.name

    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0

    crawler.add_url(0, {'url': URL('http://example4.com/')})
    assert crawler.qsize == 1

    with open(name, 'rb') as f:
        crawler.load(f)

    assert crawler.qsize == 3

    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()

    crawler.summarize()

    out, err = capsys.readouterr()

    assert err == ''
    assert len(out) >= 200  # not a very good test, but at least it is something

    await crawler.close()  # needed for smooth shutdown
Example #3
import argparse
import asyncio

import cocrawler.dns as dns
import cocrawler.config as config

ARGS = argparse.ArgumentParser(description='CoCrawler dns fetcher')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--no-confighome', action='store_true')
ARGS.add_argument('--type', default='A')
ARGS.add_argument('hosts', nargs='+', help='list of hostnames to query')

args = ARGS.parse_args()

config.config(args.configfile, args.config, confighome=not args.no_confighome)

ns = config.read('Fetcher', 'Nameservers')
if not isinstance(ns, list):
    ns = [ns]

dns.setup_resolver(ns)
print('set nameservers to', ns)


async def main(hosts):
    for host in hosts:
        try:
            result = await dns.query(host, args.type)
            print(host, result)
        except Exception as e:
            # the listing is truncated here; this handler is an assumption
            print(host, 'failed:', repr(e))
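The listing breaks off inside the try/except; a minimal driver for the main() coroutine above might look like this (a sketch, not the original script's ending):

loop = asyncio.get_event_loop()
loop.run_until_complete(main(args.hosts))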
Example #4
def main():
    '''
    Main program: parse args, read config, set up event loop, run the crawler.
    '''

    args = ARGS.parse_args()

    if args.printdefault:
        config.print_default()
        sys.exit(1)

    loglevel = os.getenv('COCRAWLER_LOGLEVEL') or args.loglevel
    logging.basicConfig(level=loglevel)

    config.config(args.configfile, args.config)

    if args.printfinal:
        config.print_final()
        sys.exit(1)

    memory.limit_resources()

    if os.getenv('PYTHONASYNCIODEBUG') is not None:
        logging.captureWarnings(True)
        warnings.simplefilter('default', category=ResourceWarning)
        if LOGGER.getEffectiveLevel() > logging.WARNING:
            LOGGER.setLevel(logging.WARNING)
            LOGGER.warning(
                'Lowered logging level to WARNING because PYTHONASYNCIODEBUG env var is set'
            )
        LOGGER.warning(
            'Configured logging system to show ResourceWarning because PYTHONASYNCIODEBUG env var is set'
        )
        LOGGER.warning(
            'Note that this does have a significant impact on asyncio overhead'
        )
    if os.getenv('COCRAWLER_GC_DEBUG') is not None:
        LOGGER.warning('Configuring gc debugging')
        gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_UNCOLLECTABLE)

    kwargs = {}
    if args.load:
        kwargs['load'] = args.load
    if args.no_test:
        kwargs['no_test'] = True

    crawler = cocrawler.Crawler(**kwargs)
    loop = asyncio.get_event_loop()
    slow_callback_duration = os.getenv('ASYNCIO_SLOW_CALLBACK_DURATION')
    if slow_callback_duration:
        loop.slow_callback_duration = float(slow_callback_duration)
        # log the float, not the env string, so %f formats correctly
        LOGGER.warning('set slow_callback_duration to %f',
                       loop.slow_callback_duration)

    if config.read('CarbonStats'):
        timer.start_carbon()

    if config.read('REST'):
        app = webserver.make_app()
    else:
        app = None

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting cleanly.\n')
        crawler.cancel_workers()
    finally:
        loop.run_until_complete(crawler.close())
        if app:
            webserver.close(app)
        if config.read('CarbonStats'):
            timer.close()
        # apparently this is needed for full aiohttp cleanup -- or is it cargo cult
        loop.stop()
        loop.run_forever()
        loop.close()
Example #5
import argparse
import asyncio
import random
import os

from cocrawler.urls import URL
import cocrawler.dns as dns
import cocrawler.config as config

ARGS = argparse.ArgumentParser(description='CoCrawler dns benchmark')
ARGS.add_argument('--config', action='append')
ARGS.add_argument('--configfile', action='store')
ARGS.add_argument('--count', type=int, default=1000)
ARGS.add_argument('--expect-not-suitable', action='store_true')

args = ARGS.parse_args()

config.config(args.configfile, args.config)
max_workers = config.read('Crawl', 'MaxWorkers')
ns = config.read('Fetcher', 'Nameservers')
if isinstance(ns, str):
    ns = [ns]
    config.write(ns, 'Fetcher', 'Nameservers')

exit_value = 0

resolver = dns.get_resolver()


def create_queue():
    queue = asyncio.Queue()

    # add a fake domain to make sure the dns doesn't send unknown hosts to a search
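    # --- the original listing is truncated at this point; what follows is a
    # rough sketch, not the project's code. The fake hostname lets a resolver
    # that redirects unknown hosts to a search page be flagged as unsuitable
    # (compare the --expect-not-suitable flag above). ---
    queue.put_nowait('nonexistent-host-for-dns-check.invalid')

    for _ in range(args.count):
        queue.put_nowait('host{}.example.com'.format(random.randrange(1000000)))
    return queue


async def worker(queue):
    # drain the queue, resolving each host with the prefetch() call from Example #1
    while not queue.empty():
        host = queue.get_nowait()
        try:
            await dns.prefetch(URL('http://' + host + '/'), resolver)
        except Exception:
            pass
        queue.task_done()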
Example #6
def main():
    '''
    Main program: parse args, read config, set up event loop, run the crawler.
    '''

    args = ARGS.parse_args()

    if args.printdefault:
        config.print_default()
        sys.exit(1)

    loglevel = os.getenv('COCRAWLER_LOGLEVEL')
    if loglevel is None and args.loglevel:
        loglevel = args.loglevel
    if loglevel is None and args.verbose:
        loglevel = 'DEBUG'

    logging.basicConfig(level=loglevel)

    config.config(args.configfile, args.config)

    if args.printfinal:
        config.print_final()
        sys.exit(1)

    memory.limit_resources()

    if os.getenv('PYTHONASYNCIODEBUG') is not None:
        logging.captureWarnings(True)
        warnings.simplefilter('default', category=ResourceWarning)
        if LOGGER.getEffectiveLevel() > logging.WARNING:
            LOGGER.setLevel(logging.WARNING)
            LOGGER.warning('Lowered logging level to WARNING because PYTHONASYNCIODEBUG env var is set')
        LOGGER.warning('Configured logging system to show ResourceWarning because PYTHONASYNCIODEBUG env var is set')
        LOGGER.warning('Note that this does have a significant impact on asyncio overhead')
    if os.getenv('COCRAWLER_GC_DEBUG') is not None:
        LOGGER.warning('Configuring gc debugging')
        gc.set_debug(gc.DEBUG_STATS | gc.DEBUG_UNCOLLECTABLE)

    kwargs = {}
    if args.load:
        kwargs['load'] = args.load
    if args.no_test:
        kwargs['no_test'] = True

    crawler = cocrawler.Crawler(**kwargs)
    loop = asyncio.get_event_loop()
    slow_callback_duration = os.getenv('ASYNCIO_SLOW_CALLBACK_DURATION')
    if slow_callback_duration:
        loop.slow_callback_duration = float(slow_callback_duration)
        # log the float, not the env string, so %f formats correctly
        LOGGER.warning('set slow_callback_duration to %f', loop.slow_callback_duration)

    if config.read('CarbonStats'):
        timer.start_carbon()

    if config.read('REST'):
        app = webserver.make_app()
    else:
        app = None

    try:
        loop.run_until_complete(crawler.crawl())
    except KeyboardInterrupt:
        sys.stderr.flush()
        print('\nInterrupt. Exiting cleanly.\n')
        crawler.cancel_workers()
    finally:
        loop.run_until_complete(crawler.close())
        if app:
            webserver.close(app)
        if config.read('CarbonStats'):
            timer.close()
        # voodoo recommended by advanced aiohttp docs for graceful shutdown
        # https://github.com/aio-libs/aiohttp/issues/1925
        loop.run_until_complete(asyncio.sleep(0.250))
        loop.close()
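Both main() variants above only read configuration keys such as 'CarbonStats' and 'REST'; such keys can also be set programmatically with the config.write() call used in the test and benchmark examples (value first, then the key path). A small sketch with purely illustrative values:

import cocrawler.config as config

config.config(None, None)
config.write(10, 'Crawl', 'MaxWorkers')               # read by the dns benchmark
config.write(['8.8.8.8'], 'Fetcher', 'Nameservers')   # read before dns.setup_resolver()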
Example #7
import sys

import cocrawler
import cocrawler.config as config

f = sys.argv[1]
config.config(None, None)
crawler = cocrawler.Crawler(load=f)

# at this point the crawler won't start until we call loop.run_until_complete ...

if sys.argv[2] == 'frontier':
    crawler.scheduler.dump_frontier()
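As the comment in the script notes, constructing the Crawler with load= only restores the saved state; actually resuming the crawl would take the same event-loop calls the main() examples use. A sketch of that step (not part of the original script):

import asyncio

loop = asyncio.get_event_loop()
loop.run_until_complete(crawler.crawl())
loop.run_until_complete(crawler.close())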