Example #1
from urllib.parse import urljoin


def extract_async(url):
    # fetch_async, canonicalize, same_domain, and URL_EXPR are helpers
    # defined elsewhere in the crawler; a sketch of them follows below
    data = yield from fetch_async(url)
    found_urls = set()
    for match in URL_EXPR.finditer(data):
        found = canonicalize(match.group('url'))
        if same_domain(url, found):
            found_urls.add(urljoin(url, found))
    return url, data, sorted(found_urls)
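Example #1 leans on several helpers the snippet doesn't show. Here is a minimal sketch of plausible implementations in the same pre-3.7 generator-coroutine style; the URL_EXPR pattern, the normalization rules, and the thread-pool fetch are all assumptions, not the original code:

import asyncio
import re
from urllib.parse import urlsplit, urlunsplit
from urllib.request import urlopen

URL_EXPR = re.compile(r'href="(?P<url>[^"]+)"')  # assumed link pattern

def canonicalize(url):
    # Assumed normalization: lowercase scheme and host, drop the fragment
    parts = urlsplit(url)
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path or '/', parts.query, ''))

def same_domain(base, other):
    # Relative links carry no host, so count them as same-domain
    other_host = urlsplit(other).netloc
    return not other_host or other_host == urlsplit(base).netloc

@asyncio.coroutine  # matches the yield-from style here; removed in Python 3.11
def fetch_async(url):
    # Offload the blocking download to the default executor thread pool
    loop = asyncio.get_event_loop()
    data = yield from loop.run_in_executor(
        None, lambda: urlopen(url).read().decode('utf-8', 'replace'))
    return data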
Example #2
from queue import Queue
from threading import Thread


def crawl_parallel(start_url, max_depth):
    fetch_queue = Queue()  # holds (crawl_depth, url) work items
    fetch_queue.put((0, canonicalize(start_url)))

    seen_urls, result = set(), []
    # consumer is the worker loop, defined elsewhere; sketched below
    func = lambda: consumer(fetch_queue, max_depth, seen_urls, result)
    for _ in range(3):
        Thread(target=func, daemon=True).start()

    fetch_queue.join()  # block until every queued URL has been processed
    return result
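The consumer worker isn't shown in Example #2. One plausible implementation follows; fetch() and extract() are hypothetical helpers (a blocking download and a link parser), and the real version may differ:

def consumer(fetch_queue, max_depth, seen_urls, result):
    while True:
        depth, url = fetch_queue.get()
        try:
            if url in seen_urls:
                continue
            seen_urls.add(url)  # set ops are atomic under the GIL
            data = fetch(url)                     # assumed blocking download
            result.append((depth, url, data))
            if depth < max_depth:
                for found in extract(url, data):  # assumed link parser
                    fetch_queue.put((depth + 1, found))
        finally:
            fetch_queue.task_done()  # lets fetch_queue.join() return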
Example #3
def crawl_async(start_url, max_depth):
    seen_urls = set()
    to_fetch = [canonicalize(start_url)]
    results = []
    for depth in range(max_depth + 1):
        # Fetch the whole frontier for this depth concurrently;
        # extract_multi_async is sketched below
        batch = yield from extract_multi_async(to_fetch, seen_urls)
        to_fetch = []
        for url, data, found_urls in batch:
            results.append((depth, url, data))
            to_fetch.extend(found_urls)

    return results
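extract_multi_async isn't shown either. A plausible version fans extract_async from Example #1 out over the unseen URLs and gathers the results concurrently; this is one possible shape, not the original:

@asyncio.coroutine  # assumed, matching the generator-coroutine style above
def extract_multi_async(to_fetch, seen_urls):
    futures = []
    for url in to_fetch:
        if url not in seen_urls:
            seen_urls.add(url)
            futures.append(extract_async(url))
    # gather runs the fetches concurrently and preserves input order
    batch = yield from asyncio.gather(*futures)
    return batch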
Example #4
import asyncio
from sys import argv


def main():
    url = canonicalize(argv[1])

    # Bridge the gap between sync and async (pre-Python-3.7 loop API)
    future = asyncio.Task(extract_async(url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(future)
    loop.close()

    _, data, found_urls = future.result()  # re-raises the task's exception, if any
    print('%s is %d bytes, %d urls:\n%s' %
          (url, len(data), len(found_urls), '\n'.join(found_urls)))
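For comparison, on Python 3.7+ and with extract_async rewritten as an async def coroutine, the same sync-to-async bridge reduces to a single asyncio.run call; a sketch under those assumptions, not the original code:

import asyncio
from sys import argv

def main():
    url = canonicalize(argv[1])
    # asyncio.run creates a fresh loop, runs the task, and closes the loop
    _, data, found_urls = asyncio.run(extract_async(url))
    print('%s is %d bytes, %d urls:\n%s' %
          (url, len(data), len(found_urls), '\n'.join(found_urls)))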
Example #5
from queue import Queue
from threading import Thread


def parallel_wordcount(start_url, max_depth, word_length):
    fetch_queue = Queue()  # holds (crawl_depth, url) work items
    fetch_queue.put((0, canonicalize(start_url)))
    count_queue = Queue()  # holds (url, data) pages awaiting counting

    seen_urls = set()
    # fetcher downloads pages and feeds count_queue; sketched below
    func = lambda: fetcher(fetch_queue, max_depth, seen_urls, count_queue)
    for _ in range(3):
        Thread(target=func, daemon=True).start()

    result = []
    # counter tallies words of the requested length; sketched below
    func = lambda: counter(count_queue, word_length, result)
    for _ in range(3):
        Thread(target=func, daemon=True).start()

    fetch_queue.join()   # wait until every page has been fetched...
    count_queue.join()   # ...and every fetched page has been counted
    return result
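The fetcher and counter workers are defined elsewhere. A hypothetical pair that matches the queue protocol above; fetch() and extract() are the same assumed helpers as in the consumer sketch:

def fetcher(fetch_queue, max_depth, seen_urls, count_queue):
    while True:
        depth, url = fetch_queue.get()
        try:
            if url in seen_urls:
                continue
            seen_urls.add(url)
            data = fetch(url)                     # assumed blocking download
            count_queue.put((url, data))          # hand the page to the counters
            if depth < max_depth:
                for found in extract(url, data):  # assumed link parser
                    fetch_queue.put((depth + 1, found))
        finally:
            fetch_queue.task_done()

def counter(count_queue, word_length, result):
    while True:
        url, data = count_queue.get()
        try:
            count = sum(1 for word in data.split()
                        if len(word) == word_length)
            result.append((url, count))
        finally:
            count_queue.task_done()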