import urllib2

import demo_helpers


def fetch_and_extract(url, data_queue):
    # Fetch one page, pull out its links, and hand the result back to the
    # caller through the queue so this function can run in a worker thread.
    r = urllib2.urlopen(url)
    html = r.read()
    hrefs = demo_helpers.extract_hrefs(html)
    # print('fetch ', url, html, hrefs)
    data_queue.put_nowait((url, hrefs))
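For context, one way fetch_and_extract might be driven (an assumption, not shown above) is one thread per URL, draining data_queue once every thread has finished. The run_fetchers name and the urls argument are hypothetical; Queue is the Python 2 module name, matching the urllib2 usage here, and there is no error handling.

import threading
import Queue  # the module is named queue in Python 3


def run_fetchers(urls):
    data_queue = Queue.Queue()
    threads = [threading.Thread(target=fetch_and_extract, args=(url, data_queue))
               for url in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    # One (url, hrefs) tuple per fetched page.
    return [data_queue.get_nowait() for _ in range(data_queue.qsize())]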
def url_worker(name, tq, dq):
    # Long-running worker: take URLs from the task queue (tq), fetch and
    # parse each one, and push an (url, hrefs) tuple onto the data queue (dq).
    print('worker start', name)
    while True:
        url = tq.get()
        print('worker {} get {}'.format(name, url))
        r = urllib2.urlopen(url)
        html = r.read()
        hrefs = demo_helpers.extract_hrefs(html)
        dq.put_nowait((url, hrefs))
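A sketch of how this two-queue worker might be wired up, again an assumption rather than code from the source: a fixed pool of daemon threads pulls from tq and pushes onto dq, and the caller collects exactly one result per submitted URL. The run_pool name and the worker count are hypothetical.

import threading
import Queue


def run_pool(urls, num_workers=4):
    tq, dq = Queue.Queue(), Queue.Queue()
    for i in range(num_workers):
        t = threading.Thread(target=url_worker, args=('worker-%d' % i, tq, dq))
        t.daemon = True  # workers loop forever; daemonize so the process can exit
        t.start()
    for url in urls:
        tq.put_nowait(url)
    # Each submitted URL produces exactly one (url, hrefs) result.
    return [dq.get() for _ in urls]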
def url_worker(name, processed_urls, add_to_all, q):
    # Crawling worker: fetch a URL from the queue, record every href it
    # finds, and feed links that have not been processed back onto the
    # same queue.
    print("worker start", name)
    while True:
        url = q.get()
        print("worker {} get {}".format(name, url))
        r = urllib2.urlopen(url)
        html = r.read()
        hrefs = demo_helpers.extract_hrefs(html)
        # print('fetch ', url, html, hrefs)
        for sub_url in hrefs:
            add_to_all(sub_url)
            if sub_url not in processed_urls:
                q.put_nowait(sub_url)
        if url in processed_urls:
            print("Duplicate processed url {}".format(url))
        else:
            processed_urls.add(url)
        # Mark this queue item finished so q.join() can eventually return.
        q.task_done()
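Finally, a minimal driver sketch for the crawling variant; the crawl name, worker count, and all_urls set are assumptions. The queue is seeded with one URL, workers share processed_urls and the add_to_all callback, and q.join() returns once task_done() has been called for every URL ever queued. The shared set is not locked: under CPython the GIL keeps individual set operations safe, and the worker's duplicate check above tolerates the occasional race.

import threading
import Queue


def crawl(seed_url, num_workers=4):
    q = Queue.Queue()
    processed_urls = set()  # URLs already fetched, shared across workers
    all_urls = set()        # every href ever seen, filled via add_to_all
    for i in range(num_workers):
        t = threading.Thread(target=url_worker,
                             args=('worker-%d' % i, processed_urls, all_urls.add, q))
        t.daemon = True  # workers never return; daemonize so the process can exit
        t.start()
    q.put_nowait(seed_url)
    q.join()  # blocks until every queued URL has been marked task_done()
    return all_urls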