Example No. 1
def generator():
    # per-greenlet storage, analogous to threading.local
    import gevent.local
    lo = gevent.local.local()
    lo.tid = 1
    # store, Host, pool and worker are defined in the surrounding module
    with store.begin(write=True):
        Host.by_unchecked.find()
    with store.begin():
        for host in Host.by_unchecked.values(max=1000):
            pool.wait_available()                 # block until a pool slot is free
            pool.add(gevent.spawn(worker, host))  # track the new greenlet in the pool
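The gevent.local.local() object gives each greenlet its own attribute namespace, analogous to threading.local for threads, so the tid set above is private to this greenlet. A minimal sketch of that behavior (the names here are illustrative, not from the example):

import gevent
import gevent.local

ctx = gevent.local.local()  # one attribute namespace per greenlet

def show(tid):
    ctx.tid = tid    # visible only to the current greenlet
    gevent.sleep(0)  # yield so the greenlets interleave
    print('greenlet %s sees tid=%s' % (tid, ctx.tid))

# each greenlet prints its own tid; values never leak between greenlets
gevent.joinall([gevent.spawn(show, n) for n in range(3)])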
Example No. 2

def crawl(start_url, concurrency_level, visited_link_limit):
    """
    Main crawling function. Uses a pool of greenlets to get the job done
    :param start_url: URL to start crawling from
    :param concurrency_level: number of concurrent downloads
    :param visited_link_limit: maximum number of links to crawl
    :return: None
    """

    print('start crawling from %s' % start_url)
    print('concurrency level: %s, visited link limit: %s' %
          (concurrency_level, visited_link_limit))

    # init our pending links with start_url
    pending_links.append(start_url)

    pool = gevent.pool.Pool(concurrency_level)

    # limit number of visited links, just for testing purposes
    while len(visited_links) < visited_link_limit and (
            len(pending_links) > 0 or len(crawlin_links) > 0):
        # if there is nothing more to schedule, then wait for current jobs to complete and try again
        if not pending_links:
            pool.join()
            continue

        link = pending_links.pop(0)
        crawlin_links.add(link)

        pool.wait_available()
        pool.add(gevent.spawn(crawl_one, link))

        # print('%s - current visited: %s' % (threading.currentThread(), visited_links))

    pool.join()

    # print('%s - visited links: %s' % (threading.currentThread(), visited_links))
    # print('%s - pending links: %s' % (threading.currentThread(), pending_links))

    print('Done. %s links visited.' % len(visited_links))
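The core idiom in crawl() is the wait_available()/spawn pair: wait_available() blocks until the pool has a free slot, so the scheduling loop can never have more than concurrency_level downloads in flight. A minimal self-contained sketch of just that idiom, assuming nothing beyond gevent itself:

import gevent
import gevent.pool

pool = gevent.pool.Pool(3)  # at most 3 tasks in flight

def task(n):
    gevent.sleep(0.1)  # stand-in for one download
    print('finished %s' % n)

for n in range(10):
    pool.wait_available()            # block while all 3 slots are busy
    pool.add(gevent.spawn(task, n))  # register the greenlet with the pool

pool.join()  # drain the remaining tasks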
Example No. 3
def wait_available(pool, pool_name):
    statsd = stats.get_statsd_client()
    if pool.full():
        # record the saturation event before blocking on a free slot
        statsd.incr('%s.pool.full' % pool_name)
        pool.wait_available()
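This helper only adds instrumentation: when the pool is saturated it increments a '<pool_name>.pool.full' counter before blocking on a free slot, so saturation becomes visible in metrics. A hypothetical caller (handle and jobs are stand-ins, not from the example):

for job in jobs:
    wait_available(pool, 'download')  # blocks, and counts, only when the pool is full
    pool.spawn(handle, job)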
Example No. 4
def schedule():
    while True:
        pool.wait_available()
        print('Starting greenlet')
        pool.apply_async(main)
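On a bounded Pool, apply_async returns immediately even when the pool is full (the work is handed off to a helper greenlet), so the explicit wait_available() call is what actually paces this loop. A runnable completion, with pool and main as assumed stand-ins:

import gevent
import gevent.pool

pool = gevent.pool.Pool(5)

def main():
    gevent.sleep(1)  # placeholder workload

def schedule():
    while True:
        pool.wait_available()   # block until a slot frees up
        print('Starting greenlet')
        pool.apply_async(main)  # returns a Greenlet immediately

gevent.with_timeout(3, schedule, timeout_value=None)  # bound the demo loop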
Example No. 5
def _run(ToolImplementation, tool_reads_stdin):
    '''
    tool_reads_stdin - the tool uses stdin for some purpose other
                       than obtaining startup arguments.
    '''
    args = sys.argv[:]
    action_name = args[0]
    #args = list(args)[1:]

    _E_INVALID_ARGUMENT_STDIN = \
        '-: Invalid argument, target hosts may not be specified on stdin.'

    _usage = __usage__.replace('{{tool}}', action_name)
    if hasattr(ToolImplementation, '__itemname__'):
        itemname = ToolImplementation.__itemname__
        _usage = _usage.replace('item', itemname)
    parser = XNetOptionParser(ToolImplementation.cmdline_options, usage=_usage)
    (options, args) = parser.parse_args(args)
    SignalHandler.setup(options)
    cmdlineinputs = args[1:]

    #vprint = VerbosityPrinter(options)
    vprint_stderr = VerbosityPrinter(options, sys.stderr)
    #
    # chain input sources, handle options
    # precedence order: cmdline, -r, stdin
    #
    # If --nr-processes=n and n > 1, then let this process
    # produce for a number of child processes.
    #
    _inputs = []
    if '-' in cmdlineinputs:
        if tool_reads_stdin:
            raise Exception(_E_INVALID_ARGUMENT_STDIN)
        cmdlineinputs.remove('-')
        _inputs.append(sys.stdin)
    if options.read:
        f = open(options.read)
        _inputs.insert(0, f)
    if len(cmdlineinputs):
        _inputs.insert(0, cmdlineinputs)
    inputchain = itertools.chain(*_inputs)

    _nr_processes = 1
    if options.nr_processes:
        _nr_processes = int(options.nr_processes)

    if options.pdb:
        import xnet.debug
        xnet.debug.interactive_debugger_on_exception(True)

    if options.print_source:
        ToolImplementation.print_source()
        return

    _nr_microthreads = 256
    if options.nr_microthreads is not None:
        _nr_microthreads = int(options.nr_microthreads)

    _wait = None
    if options.wait is not None:
        _wait = float(options.wait)

    _interval = 0.0
    if options.interval is not None:
        _interval = float(options.interval)

    _repeat = 1
    if options.repeat is not None:
        _repeat = int(options.repeat)

    if options.format_help:
        print(ToolImplementation.__format_help__())
        sys.exit(0)

    if options.split_tee:
        options.split_output = options.split_tee

    _outfile = sys.stdout

    #
    # Handle SSH-distributed execution.
    # If SSH-dist., main process returns on do_ssh() here.
    #
    if options.ssh_nodes_file:
        #
        # __massage__() expands wildcard IP-ranges such as 10.0.0.*.
        # This should be done in parent in order to split the expanded
        # set of IP:s across its children.
        #
        inputchain = ToolImplementation.__massage__(inputchain, options)
        if _repeat != 1:
            inputchain = repeaterator(inputchain, _repeat)
        return do_ssh(options, ToolImplementation, args, inputchain)

    #
    # Handle multiproc.
    # If multiproc, main process returns on do_fork() here.
    #
    if _nr_processes == 1:
        pass
    elif _nr_processes < 1:
        errmsg = 'invalid number of processes: {0}'.format(_nr_processes)
        sys.stderr.write(errmsg)
        sys.exit(1)
    else:
        #
        # __massage__() expands wildcard IP-ranges such as 10.0.0.*.
        # This should be done in parent in order to split the expanded
        # set of IP:s across its children.
        #
        inputchain = ToolImplementation.__massage__(inputchain, options)
        if _repeat != 1:
            inputchain = repeaterator(inputchain, _repeat)
        return do_fork(options, ToolImplementation, tool_reads_stdin, args,
                       _nr_processes, _nr_microthreads, inputchain)

    #
    # verify commandline options and do preparations
    #
    ToolImplementation.__setup__(options)

    #
    # Errors if pool is too small.
    #
    pool = gevent.pool.Pool(_nr_microthreads)
    #pool = gevent.pool.Pool(options.nr_microthreads or DEFAULT_NR_MICROTHREADS)
    greenlets = []

    wkpool = None
    if _wait:
        wkpool = gevent.pool.Pool(_nr_microthreads)

    def waitkill(g, killtime):
        from xnet.tools import WaitTimeout
        sleeptime = killtime - time.time()
        if sleeptime > 0:
            gevent.sleep(sleeptime)
        if not g.ready():
            vprint_stderr(3, '[*] xx - kill {0}:{1}\n'.format(
                os.getpid(), g.action._greenlet_id
            ))
            gevent.kill(g, WaitTimeout)
            tstart = time.time()
            g.join()
            tdiff = time.time() - tstart
            vprint_stderr(3, '[*] yy - joined {0}:{1} ({2:.1f}s)\n'.format(
                os.getpid(), g.action._greenlet_id, tdiff
            ))
            assert g.ready()

    inputchain = ToolImplementation.__massage__(inputchain, options)

    kwargs = {}

    if tool_reads_stdin:
        gevent.spawn(stdin_disperser_greenlet, pool)

    _t = 0.0
    killtime = 0

    if _repeat != 1:
        inputchain = repeaterator(inputchain, _repeat)

    greenlet_id = 0
    for (i, line) in enumerate(inputchain):
        action = ToolImplementation(options, greenlet_id=greenlet_id, **kwargs)
        greenlet_id += 1
        pool.wait_available()
        if _wait:
            killtime = time.time() + _wait
        vprint_stderr(2, '[*] ++ spawning greenlet {0}:{1}\n'.format(os.getpid(), greenlet_id))
        g = pool.spawn(action, line, inputchain)
        g.action = action
        greenlets.append(g)
        #
        # Timeout seems unreliable on debian squeeze, use waitkill instead.
        #
        if _wait:
            wkpool.wait_available()
            wkpool.spawn(waitkill, g, killtime)

        #
        # handle finished actions
        #
        vprint_stderr(3, '[*] ii len(greenlets) = {0}'.format(len(greenlets)))
        vprint_stderr(3, ', not_ready={0}\n'.format(
            len([g for g in greenlets if not g.ready()])
        ))

        #while len(greenlets) and greenlets[0].ready():
        #    action = greenlets[0].action
        #    vprint_stderr(2, '[*] -- collecting greenlet: {0}:{1} (running: {2})\n'.format(
        #        os.getpid(), action._greenlet_id, len(greenlets)
        #    ))
        #    output_action_result(action, options, _outfile)
        #    del action
        #    del greenlets[0]

        del_indexes = []
        for (g_index, g) in enumerate(greenlets[:]):
            if not g.ready():
                continue
            vprint_stderr(2, '[*] -- collecting greenlet: {0}:{1} (running: {2})\n'.format(
                os.getpid(), g.action._greenlet_id, len(greenlets)
            ))
            output_action_result(g.action, options, _outfile)
            del_indexes.append(g_index)
        while len(del_indexes):
            di = del_indexes.pop()
            del greenlets[di]

        #
        # handle interval
        #
        _this_interval = _interval - (time.time() - _t)
        if _this_interval > 0:
            gevent.sleep(_this_interval)
        _t = time.time()

    if wkpool:
        wkpool.join()
    gevent.joinall(greenlets)  # timeout=timeout
    not_done = [g for g in greenlets if not g.ready()]
    if len(not_done) > 0 and _wait is not None:
        vprint_stderr(0, 'ERROR: not_done has contents in spite of _wait and grace time\n')
        vprint_stderr(0, not_done)

    #
    # Force-kill greenlets that didn't die in spite of 1 sec of grace time.
    #
    gevent.killall(not_done, block=True)
    gevent.joinall(not_done)

    #
    # cleanups
    #
    ToolImplementation.__teardown__(options)

    #
    # print results
    #
    for g in greenlets:
        action = g.action
        output_action_result(action, options, _outfile)
        del action
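The waitkill helper above is a per-greenlet watchdog: instead of relying on gevent's Timeout (which the comment calls unreliable on Debian squeeze), it sleeps until the deadline and then raises an exception inside the worker with gevent.kill. A stripped-down sketch of the same pattern, assuming only gevent; the WaitTimeout class here is a stand-in for the one imported from xnet.tools:

import time
import gevent
import gevent.pool

class WaitTimeout(Exception):
    """Stand-in: raised inside a worker greenlet when its deadline passes."""

def worker(n):
    try:
        gevent.sleep(10)  # simulates a job that takes too long
        return 'done %s' % n
    except WaitTimeout:
        return 'killed %s' % n

def waitkill(g, killtime):
    gevent.sleep(max(0, killtime - time.time()))
    if not g.ready():                # still running past the deadline
        gevent.kill(g, WaitTimeout)  # raise WaitTimeout inside g
        g.join()

pool = gevent.pool.Pool(4)
wkpool = gevent.pool.Pool(4)  # one watchdog per worker
g = pool.spawn(worker, 1)
wkpool.spawn(waitkill, g, time.time() + 0.5)
wkpool.join()
pool.join()
print(g.value)  # 'killed 1'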
Example No. 6
def traverse_delete(up, start_path, datelist, authstr):
    """
    遍历指定bucket从某个start_path开始的目录,删除之下的所有文件或空目录
    """
    global job_files, jobs, deleted_bytes, deleted_files, pool, logger
    children = getlist(up, start_path)  # first, list the files and subdirectories under this path
    if children is None:
        return
    if len(children) == 0:  # empty directory: delete it unless it is the root
        if start_path != u'/':
            pool.spawn(up_delete, up, start_path)
        return
    files = []
    for f in children:
        try:
            if f['type'] == u'N':
                files.append(f)
        except KeyError:
            logger.error('KeyError: no type in f %s' % str(f))
            sys.exit(-1)
    if len(files) != 0:  # the directory contains files; delete them in batch
        if start_path == u'/':
            jobs.extend(
                [pool.spawn(up_delete, up, '/%s' % f['name']) for f in files])
        else:
            jobs.extend([
                pool.spawn(async_delete_file, up, authstr, start_path, f)
                for f in files
            ])
        job_files += len(files)
        try:
            deleted_bytes += sum([int(f['size']) for f in files])
        except ValueError:
            # e.g. {'time': u'1491871202', 'type': u'N', 'name': u'1491871201.853000.jpg', 'size': u'undefined'}
            traceback.print_exc()
            print(f)  # relies on Python 2 leaking the comprehension variable, so f is the offending entry
        deleted_files += len(files)
        if job_files >= 5000:  # more than 5000 queued: log progress and reset the batch
            logger.warning(u'deleted %d MB, storage usage %d MB' %
                           (deleted_bytes / 1024.0 / 1024,
                            int(up.usage()) / 1024.0 / 1024))
            pool.wait_available()
            job_files = 0
            jobs = []
    # Recurse into the subdirectories next.
    # Note: at the root, only recurse into subdirectories whose names appear
    # in datelist, the list of dates scheduled for deletion.
    if start_path == u'/':
        folders = [
            f for f in children if f['type'] == u'F' and f['name'] in datelist
        ]
    else:
        folders = [f for f in children if f['type'] == u'F']
    for folder in folders:
        if start_path == u'/':
            traverse_delete(up, '/%s' % folder['name'], datelist, authstr)
        else:
            traverse_delete(up, '%s/%s' % (start_path, folder['name']),
                            datelist, authstr)
    # any other node types?
    others = [f for f in children if f['type'] not in (u'F', u'N')]
    if len(others) != 0:
        logger.error('others: %s' % str(others))
        sys.exit(0)
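The batch bookkeeping above uses wait_available() as coarse backpressure: after roughly 5000 queued deletions, the traversal pauses until the pool has at least one free slot, then resets its counters. Note that this waits for a single slot rather than for the whole batch; a full drain would use gevent.joinall(jobs) instead. A reduced sketch of the pattern, with illustrative names:

import gevent
import gevent.pool

pool = gevent.pool.Pool(50)
jobs = []
job_files = 0

def delete_one(path):
    gevent.sleep(0.01)  # stand-in for one DELETE request

for n in range(20000):
    jobs.append(pool.spawn(delete_one, '/file-%d' % n))
    job_files += 1
    if job_files >= 5000:
        pool.wait_available()  # back off until a slot frees up
        job_files = 0          # reset the batch, as in the example
        jobs = []
pool.join()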
Example No. 7
def wait_available(pool, pool_name):
    statsd = stats.get_statsd_client()
    if pool.full():
        statsd.incr('%s.pool.full' % pool_name)
        pool.wait_available()
    return not STATE['shutdown']  # False tells the caller to stop producing work
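Unlike Example No. 3, this variant also reports whether a shutdown has been requested, so one call both throttles the producer and tells it when to stop. A hypothetical producer loop on top of it (process and jobs are stand-ins, not from the example):

for job in jobs:
    if not wait_available(pool, 'worker'):  # False once STATE['shutdown'] is set
        break
    pool.spawn(process, job)
pool.join()  # let in-flight work finish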