Example #1
def extract_multi(to_fetch, seen_urls):
    # Fetch each URL in to_fetch that hasn't been seen before, recording it
    # in seen_urls so repeated calls skip already-downloaded pages.
    results = []
    for url in to_fetch:
        if url in seen_urls: continue
        seen_urls.add(url)
        try:
            results.append(extract(url))
        except Exception:
            continue  # Skip URLs that fail to download or parse
    return results
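A minimal usage sketch for the helper above. The extract() stub below is a hypothetical stand-in so the example runs on its own; judging by how later examples unpack it, the real extract() returns a 3-tuple whose last two items are the page data and the discovered links.

def extract(url):
    # Hypothetical stub; the real extract() downloads and parses the page.
    return ('GET', '<html>...</html>', [])

seen_urls = set()
first = extract_multi(['http://example.com/a', 'http://example.com/b'], seen_urls)
# Passing the same set again skips URLs that were already fetched.
second = extract_multi(['http://example.com/a', 'http://example.com/c'], seen_urls)
assert len(second) == 1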
Example #2
def fetcher(fetch_queue, output_queue):
    # Worker thread body: pull (state, depth, url) tuples off fetch_queue,
    # download each page with extract(), and push a FetchResult onto output_queue.
    logging.info('Starting fetcher thread')
    while True:
        state, depth, url = fetch_queue.get()
        logging.info('%s: Fetching in thread: %s', id(state), url)
        try:
            try:
                _, data, found_urls = extract(url)
            except Exception:
                data, found_urls = None, []  # Treat failed downloads as empty pages

            output_queue.put(FetchResult(state, depth, url, data, found_urls))
        finally:
            fetch_queue.task_done()  # Always acknowledge the item so join() can return
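A hedged sketch of how this worker might be driven: the thread count, seed URL, and placeholder state object are assumptions, while FetchResult and extract() come from the surrounding crawler code.

from queue import Queue
from threading import Thread

fetch_queue = Queue()
output_queue = Queue()

# Daemon threads exit with the main thread, so no explicit shutdown is needed.
for _ in range(4):
    Thread(target=fetcher, args=(fetch_queue, output_queue), daemon=True).start()

state = object()  # Placeholder for whatever crawl-state object the real code passes around
fetch_queue.put((state, 0, 'http://example.com'))
fetch_queue.join()            # Returns once every queued item has been task_done()
result = output_queue.get()   # FetchResult for the seed URL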
Example #3
def fetcher(fetch_queue, max_depth, seen_urls, output_queue):
    # Worker thread body: crawl breadth-first, feeding newly found links
    # back into fetch_queue until max_depth is reached.
    while True:
        depth, url = fetch_queue.get()
        try:
            if depth > max_depth: continue  # Ignore URLs that are too deep
            if url in seen_urls: continue   # Prevent infinite loops

            seen_urls.add(url)              # Check-then-add is only safe thanks to the GIL :/
            try:
                _, data, found_urls = extract(url)
            except Exception:
                continue

            output_queue.put((url, data))
            for found in found_urls:
                fetch_queue.put((depth + 1, found))
        finally:
            fetch_queue.task_done()
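A sketch of wiring up this version under the same assumptions (the depth limit, thread count, and seed URL are illustrative only). Here fetch_queue.join() doubles as a completion signal, because each worker re-queues the links it found before calling task_done().

from queue import Queue
from threading import Thread

fetch_queue = Queue()
output_queue = Queue()
seen_urls = set()

for _ in range(8):
    Thread(target=fetcher, args=(fetch_queue, 3, seen_urls, output_queue),
           daemon=True).start()

fetch_queue.put((0, 'http://example.com'))  # Seed the crawl at depth 0
fetch_queue.join()                          # Blocks until the whole crawl finishes

pages = []
while not output_queue.empty():  # Safe here because all workers are idle after join()
    pages.append(output_queue.get())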
Example #4
def consumer(fetch_queue, max_depth, seen_urls, result):
    # Like fetcher() above, but appends results to a shared list instead of
    # writing them to an output queue.
    while True:
        depth, url = fetch_queue.get()
        try:
            if depth > max_depth: continue
            if url in seen_urls: continue      # Membership check relies on the GIL :|

            seen_urls.add(url)                 # So does this add :/
            try:
                _, data, found_urls = extract(url)
            except Exception:
                continue

            result.append((depth, url, data))  # And this shared-list append :(
            for found in found_urls:
                fetch_queue.put((depth + 1, found))
        finally:
            fetch_queue.task_done()
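Driving this variant, again sketched with assumed names and limits, is slightly simpler because results land directly in the shared list; reading it after join() should be safe here since the idle workers no longer append to it.

from queue import Queue
from threading import Thread

fetch_queue = Queue()
seen_urls = set()
result = []

for _ in range(4):
    Thread(target=consumer, args=(fetch_queue, 3, seen_urls, result),
           daemon=True).start()

fetch_queue.put((0, 'http://example.com'))
fetch_queue.join()
result.sort()  # Tuples start with depth, so sorting orders pages by crawl depth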