def extract_multi(to_fetch, seen_urls):
    results = []
    for url in to_fetch:
        if url in seen_urls:
            continue          # already fetched on an earlier pass
        seen_urls.add(url)
        try:
            results.append(extract(url))
        except Exception:
            continue          # skip URLs that fail to fetch or parse
    return results
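A usage sketch, assuming `extract(url)` is the single-URL helper defined elsewhere (it is not shown in this excerpt); because `seen_urls` is passed in rather than created inside the function, duplicates are skipped across successive calls:

seen_urls = set()

batch_one = extract_multi(['https://example.com/a',
                           'https://example.com/b'], seen_urls)

# '/a' was already fetched above, so only '/c' is fetched here.
batch_two = extract_multi(['https://example.com/a',
                           'https://example.com/c'], seen_urls)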
def fetcher(fetch_queue, output_queue):
    logging.info('Starting fetcher thread')
    while True:
        state, depth, url = fetch_queue.get()
        logging.info('%s: Fetching in thread: %s', id(state), url)
        try:
            try:
                _, data, found_urls = extract(url)
            except Exception:
                data, found_urls = None, []   # report failures as empty results
            output_queue.put(FetchResult(state, depth, url, data, found_urls))
        finally:
            fetch_queue.task_done()           # always balance the get() above
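This version expects (state, depth, url) tuples on fetch_queue and emits FetchResult records; `state` is whatever per-crawl bookkeeping object the caller threads through, and it stays opaque here. A sketch of the plumbing it assumes, with the FetchResult field names inferred from the call above and the thread count picked arbitrarily:

import logging
from collections import namedtuple
from queue import Queue
from threading import Thread

logging.basicConfig(level=logging.INFO)

# Field order matches the positional call inside fetcher() above.
FetchResult = namedtuple('FetchResult',
                         ['state', 'depth', 'url', 'data', 'found_urls'])

fetch_queue = Queue()
output_queue = Queue()

# A small pool of daemon threads; they exit when the main thread does.
for _ in range(4):
    Thread(target=fetcher, args=(fetch_queue, output_queue),
           daemon=True).start()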
def fetcher(fetch_queue, max_depth, seen_urls, output_queue):
    while True:
        depth, url = fetch_queue.get()
        try:
            if depth > max_depth:
                continue  # Ignore URLs that are too deep
            if url in seen_urls:
                continue  # Prevent infinite loops
            seen_urls.add(url)  # GIL :/
            try:
                _, data, found_urls = extract(url)
            except Exception:
                continue
            output_queue.put((url, data))
            for found in found_urls:
                fetch_queue.put((depth + 1, found))
        finally:
            fetch_queue.task_done()
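Unlike the previous version, this fetcher re-enqueues every discovered link itself, so the main thread only has to seed the queue, wait for it to drain, and then empty output_queue. A sketch of that driver, with the thread count, depth limit, and root URL as placeholder values:

from queue import Queue
from threading import Thread

fetch_queue = Queue()
output_queue = Queue()
seen_urls = set()

for _ in range(4):
    Thread(target=fetcher,
           args=(fetch_queue, 3, seen_urls, output_queue),  # max_depth=3 is arbitrary
           daemon=True).start()

fetch_queue.put((0, 'https://example.com'))   # seed with the root URL at depth 0
fetch_queue.join()                            # returns once every enqueued URL is task_done()

crawled = []
while not output_queue.empty():               # all puts happen before their task_done()
    crawled.append(output_queue.get())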
def consumer(fetch_queue, max_depth, seen_urls, result):
    while True:
        depth, url = fetch_queue.get()
        try:
            if depth > max_depth:
                continue
            if url in seen_urls:
                continue  # GIL :|
            seen_urls.add(url)  # GIL :/
            try:
                _, data, found_urls = extract(url)
            except Exception:
                continue
            result.append((depth, url, data))  # GIL :(
            for found in found_urls:
                fetch_queue.put((depth + 1, found))
        finally:
            fetch_queue.task_done()
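A matching driver sketch for this variant, with the same placeholder thread count, depth limit, and root URL as above. The shared `result` list is what the ':(' comment flags: in CPython each individual `list.append` and `set.add` is atomic under the GIL, although the separate membership check and add on `seen_urls` can still let two threads fetch the same URL.

from queue import Queue
from threading import Thread

fetch_queue = Queue()
seen_urls = set()
result = []        # shared list; workers append to it directly

for _ in range(4):
    Thread(target=consumer,
           args=(fetch_queue, 3, seen_urls, result),   # max_depth=3 is arbitrary
           daemon=True).start()

fetch_queue.put((0, 'https://example.com'))   # placeholder root URL at depth 0
fetch_queue.join()                            # wait for the crawl to settle
print(len(result), 'pages crawled')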