class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        # Queue.put() is a coroutine, so use put_nowait() from the synchronous __init__.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            # Call task_done() only after the new links have been added to the queue.
            self.q.task_done()
async def scrape(client, url):
    tasks = []
    url_queue = Queue()
    archive = {
        'top': url,
        'seen': {},
        'items': []
    }

    await url_queue.put(url)

    def task_completed(future):
        exc = future.exception()
        if exc:
            log.error('Worker finished with error: {}'.format(exc), exc_info=True)

    for _ in range(CONCURRENCY):
        crawler_future = ensure_future(crawler(client, url_queue, archive))
        crawler_future.add_done_callback(task_completed)
        tasks.append(crawler_future)

    await wait_for(url_queue.join(), TIMEOUT)

    for task in tasks:
        task.cancel()

    client.close()

    webarchive = {
        'WebMainResource': archive['items'].pop(0),
        'WebSubresources': archive['items']
    }

    writePlist(webarchive, OUTPUT_FILENAME)
class Crawler:
    def __init__(self, root_url, max_redirect):
        self.max_tasks = 10
        self.max_redirect = max_redirect
        self.q = Queue()
        self.seen_urls = set()

        # aiohttp's ClientSession does connection pooling and
        # HTTP keep-alives for us.
        self.session = aiohttp.ClientSession(loop=loop)

        # Put (URL, max_redirect) in the queue.
        # Queue.put() is a coroutine, so use put_nowait() from the synchronous __init__.
        self.q.put_nowait((root_url, self.max_redirect))

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all work is done."""
        workers = [asyncio.Task(self.work())
                   for _ in range(self.max_tasks)]

        # When all work is done, exit.
        yield from self.q.join()
        for w in workers:
            w.cancel()

    @asyncio.coroutine
    def work(self):
        while True:
            url, max_redirect = yield from self.q.get()

            # Download page and add new links to self.q.
            yield from self.fetch(url, max_redirect)
            self.q.task_done()

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        # Handle redirects ourselves.
        response = yield from self.session.get(url, allow_redirects=False)

        try:
            if is_redirect(response):
                if max_redirect > 0:
                    next_url = response.headers['location']
                    if next_url in self.seen_urls:
                        # We have done this before.
                        return

                    # Remember we have seen this URL.
                    self.seen_urls.add(next_url)

                    # Follow the redirect. One less redirect remains.
                    self.q.put_nowait((next_url, max_redirect - 1))
            else:
                links = yield from self.parse_links(response)
                # Python set-logic:
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            # Return connection to pool.
            yield from response.release()
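# A minimal driver sketch for the Crawler above. It assumes the pieces this
# snippet omits (the parse_links() method and the module-level `loop`) live in
# the same module. The root URL is a hypothetical placeholder, and is_redirect()
# below is the conventional status-code check, written here as an assumption
# rather than taken from the snippet itself.
import asyncio

import aiohttp


def is_redirect(response):
    # Redirect statuses the fetch() coroutine handles by hand.
    return response.status in (300, 301, 302, 303, 307)


loop = asyncio.get_event_loop()
crawler = Crawler('http://example.com/', max_redirect=10)
try:
    loop.run_until_complete(crawler.crawl())
finally:
    crawler.session.close()
    loop.close()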
class AbstractQueue(AbstractDataset, ABC):
    def __init__(self, maxsize=3, *args):
        super().__init__(*args)
        # initialise queue
        self.queue = Queue(maxsize=maxsize)
        return

    # ********************************************** #
    def __iter__(self):
        # self.background_worker = Thread(target=self.enqueue, daemon=True)
        # self.background_worker.start()
        return self

    # ********************************************** #
    def __next__(self):
        if not self.queue.empty():
            batch = self.queue.get()
            self.queue.task_done()
            return batch
        else:
            raise StopIteration

    async def request_sample(self, idx):
        start = time.time()
        print("Fetching new sample...", end="\r")
        sample = await self.samples[idx]
        print("Fetched in %4f" % (time.time() - start))
        return sample

    def _producer(self):
        # this is async
        for frame in self.frames:
            # Thread args must be a sequence, so wrap the single argument in a tuple.
            t = Thread(target=self.request_sample, args=(frame,))
            self.queue.put(t)
        self.queue.join()
        return

    def print_random(self):
        idx = rn.randint(0, len(self))
        sample = self.request_sample(idx)
        print(f"Example batch {idx}")
        print(sample)
async def _await_job(queue: asyncio.Queue, event: asyncio.Event, /) -> None:
    job_done = False
    while not (event.is_set() or job_done):
        try:
            await asyncio.wait_for(queue.join(), timeout=1)
        except asyncio.TimeoutError:
            pass
        else:
            job_done = True
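# A usage sketch for _await_job, assuming nothing beyond asyncio itself: a
# hypothetical worker drains the queue and calls task_done(), while the event
# gives the caller a way to abandon the wait early. All names here are
# illustrative, not part of the original code.
import asyncio


async def _drain(queue: asyncio.Queue) -> None:
    while True:
        item = await queue.get()
        try:
            await asyncio.sleep(0.1)  # stand-in for real work on `item`
        finally:
            queue.task_done()


async def _demo() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    cancel_event = asyncio.Event()  # set() this from elsewhere to stop waiting early
    for i in range(5):
        queue.put_nowait(i)
    workers = [asyncio.create_task(_drain(queue)) for _ in range(2)]
    await _await_job(queue, cancel_event)  # returns once the queue drains or the event is set
    for w in workers:
        w.cancel()


asyncio.run(_demo())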
async def main(): customer_queue = Queue() all_products = [Product('beer', 2), Product('bananas', .5), Product('sausage', .2), Product('diapers', .2)] for i in range(10): #C products = [all_products[randrange(len(all_products))] for _ in range(randrange(10))] customer_queue.put_nowait(Customer(i, products)) cashiers = [asyncio.create_task(checkout_customer(customer_queue, i)) for i in range(3)] #D await asyncio.gather(customer_queue.join(), *cashiers)
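# main() above relies on Product, Customer and checkout_customer being defined
# elsewhere. The sketch below is a plausible minimal version of those pieces so
# the snippet can run end to end; the field names and the cashier's behaviour
# are assumptions, not the original definitions.
import asyncio
from asyncio import Queue
from dataclasses import dataclass, field
from random import randrange
from typing import List


@dataclass
class Product:
    name: str
    checkout_time: float


@dataclass
class Customer:
    customer_id: int
    products: List[Product] = field(default_factory=list)


async def checkout_customer(queue: Queue, cashier_number: int) -> None:
    # Work until the queue is empty so the gather() in main() can finish;
    # task_done() is what lets customer_queue.join() complete.
    while not queue.empty():
        customer: Customer = queue.get_nowait()
        print(f'Cashier {cashier_number} is serving customer {customer.customer_id}')
        for product in customer.products:
            await asyncio.sleep(product.checkout_time)
        print(f'Cashier {cashier_number} finished customer {customer.customer_id}')
        queue.task_done()


asyncio.run(main())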
class Crawler:
    def __init__(self, roots, max_task=5):
        self.roots = roots
        self.queue = Queue()
        self.max_task = max_task
        #self.session = aiohttp.ClientSession()
        self.seen_urls = set()
        for root in roots:
            self.add_url(root)

    def add_url(self, root):
        if root in self.seen_urls:
            return
        self.queue.put_nowait(root)

    @asyncio.coroutine
    def crawler(self):
        tasks = [asyncio.Task(self.work()) for _ in range(self.max_task)]
        #pdb.set_trace()
        yield from self.queue.join()
        for w in tasks:
            w.cancel()  # w is a Task object

    @asyncio.coroutine
    def work(self):
        '''Consumer coroutine.'''
        try:
            while True:
                # When the queue is empty, queue.get() blocks.
                url = yield from self.queue.get()
                print('{}'.format(threading.currentThread()))
                yield from self.fetch(url)
                self.queue.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def fetch(self, url):
        yield from asyncio.sleep(1)
class Crawler:
    def __init__(self, domain, max_redirects=10, max_retries=3, max_tasks=10):
        self.domain = domain
        self.max_redirects = max_redirects
        self.max_tasks = max_tasks
        self.max_retries = max_retries
        # self.loop = loop or asyncio.get_event_loop()
        self.q = Queue()
        self.urls_seen = set()
        self.session = aiohttp.ClientSession()
        self.add_url('/')

    @asyncio.coroutine
    def crawl(self):
        workers = [asyncio.Task(self.work()) for _ in range(self.max_tasks)]
        yield from self.q.join()
        for worker in workers:
            worker.cancel()

    def close(self):
        self.session.close()

    @asyncio.coroutine
    def work(self):
        try:
            while True:
                url, max_redirects = yield from self.q.get()
                # LOGGER.debug('fetching {}'.format(url))
                yield from self.fetch(url, max_redirects)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    @asyncio.coroutine
    def fetch(self, url, max_redirects):
        retry = 0
        while retry < self.max_retries:
            try:
                response = yield from self.session.get(self.domain + url,
                                                       allow_redirects=False)
                LOGGER.debug('fetched {}'.format(url))
                break
            except aiohttp.ClientError as client_err:
                retry += 1
                LOGGER.info('fetching {} failed {} times with error {}'.format(url, retry, client_err))
            except Exception as e:
                LOGGER.error('fetching {} with error: {}'.format(url, e))
                return
        else:
            LOGGER.error('fetching {} gave up after max retries'.format(url))
            return

        if self.is_redirect(response):
            location = response.headers['location']
            next_url = urllib.parse.urlparse(location).path
            next_url = urllib.parse.urljoin(url, next_url)
            if next_url in self.urls_seen:
                pass
            elif max_redirects > 0:
                self.add_url(next_url, max_redirects - 1)
                LOGGER.info('redirect from {} to {}'.format(url, next_url))
            else:
                LOGGER.error('redirect from {} to {} exceeded the redirect limit'.format(url, next_url))
        else:
            links = yield from self.parse_links(response)
            LOGGER.debug('parsed {} links from {}'.format(len(links), url))
            for link in links.difference(self.urls_seen):
                self.q.put_nowait((link, self.max_redirects))
            self.urls_seen.update(links)
        yield from response.release()

    def add_url(self, url, max_redirects=None):
        max_redi = max_redirects or self.max_redirects
        self.urls_seen.add(url)
        self.q.put_nowait((url, max_redi))

    def is_redirect(self, response):
        return response.status in (300, 301, 302, 303, 307)

    @asyncio.coroutine
    def parse_links(self, response):
        links = set()
        if response.status == 200:
            content_type = response.headers.get('content-type', '')
            if content_type and content_type.startswith('text/html'):
                text = yield from response.text()
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got {} distinct urls from {}'.format(
                        len(urls), response.url))
                for url in urls:
                    norm_url = urllib.parse.urljoin(response.url, url)
                    url_parts = urllib.parse.urlparse(norm_url)
                    if url_parts.scheme not in ('http', 'https', ''):
                        continue
                    host, port = urllib.parse.splitport(url_parts.netloc)
                    host = host.lower()
                    host = host[4:] if host.startswith('www.') else host
                    if host and host not in self.domain:
                        continue
                    defragmented, frag = urllib.parse.urldefrag(url_parts.path)
                    links.add(defragmented)
        return links
def workerTask(q):
    while not q.empty():
        processImage(q.get_nowait()[0])
        q.task_done()


if not os.path.exists("__working"):
    os.mkdir("__working")

convertPdfs(pdfList)

q = Queue(maxsize=0)
num_threads = 4

# put files in queue
for fileName in os.listdir("__working"):
    if fileName.endswith(".pbm"):
        q.put_nowait(("__working/" + fileName,))

threads = []
for i in range(num_threads):
    worker = Thread(target=workerTask, args=(q,))
    worker.start()
    threads.append(worker)

q.join()
for thread in threads:
    thread.join()

subprocess.run("rm -r __working", shell=True)
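# A variant of workerTask, sketched here because the empty()/get_nowait() pair
# above can race: another thread may drain the queue between the check and the
# get, raising queue.Empty. Catching Empty directly avoids that. processImage
# and the queue contents are taken from the script above; the function name is
# illustrative.
from queue import Empty, Queue


def workerTaskSafe(q: Queue) -> None:
    while True:
        try:
            item = q.get_nowait()
        except Empty:
            break  # nothing left to do
        try:
            processImage(item[0])
        finally:
            q.task_done()  # always mark the item done so q.join() can return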
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = BloomFilter(10000000, 0.01) self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) async def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = await response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = await response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text)) if urls: LOGGER.info('got %r distinct urls from %r', len(urls), response.url) for url in urls: LOGGER.info("response.url:%s,type:%s", response.url, type(response.url)) LOGGER.info("parse_links url:%s,type:%s", url, type(url)) normalized = urllib.parse.urljoin(str(response.url), url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links) - len(self.seen_urls)) return stat, links async def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = await self.session.get( url, allow_redirects=False) if tries > 1: LOGGER.info('try %r for %r success', tries, url) break except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error tries += 1 else: # We never broke out of the loop: all tries failed. LOGGER.error('%r failed after %r tries', url, self.max_tries) self.record_statistic(FetchStatistic(url=url, next_url=None, status=None, exception=exception, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) return try: if is_redirect(response): location = response.headers['location'] next_url = urllib.parse.urljoin(url, location) self.record_statistic(FetchStatistic(url=url, next_url=next_url, status=response.status, exception=None, size=0, content_type=None, encoding=None, num_urls=0, num_new_urls=0)) if next_url in self.seen_urls: return if max_redirect > 0: LOGGER.info('redirect to %r from %r', next_url, url) self.add_url(next_url, max_redirect - 1) else: LOGGER.error('redirect limit reached for %r from %r', next_url, url) else: stat, links = await self.parse_links(response) self.record_statistic(stat) for link in utils.difference(links, self.seen_urls): # for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) # self.seen_urls.update(links) self.seen_urls.update(links) finally: await response.release() async def work(self): """Process queue items forever.""" try: while True: url, max_redirect = await self.q.get() assert url in self.seen_urls LOGGER.info("url:%s", url) LOGGER.info("max_redirect:%s", max_redirect) await self.fetch(url, max_redirect) self.q.task_done() except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) async def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield self.q.join() self.t1 = time.time() for w in workers: w.cancel()
class Scraper:
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(self, roots,
                 exclude=None, strict=True,  # What to crawl.
                 max_redirect=10, max_tries=4,  # Per-url limits.
                 max_tasks=10, *, loop=None):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r'\A[\d\.]*\Z', host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_url(root)
        self.t0 = time.time()
        self.t1 = None

    def close(self):
        """Close resources."""
        self.session.close()

    def host_okay(self, host):
        """Check if a host should be crawled.

        A literal match (after lowercasing) is always good.  For hosts
        that don't look like IP addresses, some approximate matches
        are okay depending on the strict flag.
        """
        host = host.lower()
        if host in self.root_domains:
            return True
        if re.match(r'\A[\d\.]*\Z', host):
            return False
        if self.strict:
            return self._host_okay_strictish(host)
        else:
            return self._host_okay_lenient(host)

    def _host_okay_strictish(self, host):
        """Check if a host should be crawled, strict-ish version.

        This checks for equality modulo an initial 'www.' component.
        """
        host = host[4:] if host.startswith('www.') else 'www.' + host
        return host in self.root_domains

    def _host_okay_lenient(self, host):
        """Check if a host should be crawled, lenient version.

        This compares the last two components of the host.
        """
        return lenient_host(host) in self.root_domains

    def record_statistic(self, fetch_statistic):
        """Record the FetchStatistic for completed / failed URL."""
        self.done.append(fetch_statistic)

    @asyncio.coroutine
    def parse_links(self, response):
        """Return a FetchStatistic and list of links."""
        links = set()
        content_type = None
        encoding = None
        body = yield from response.read()

        if response.status == 200:
            content_type = response.headers.get('content-type')
            pdict = {}

            if content_type:
                content_type, pdict = cgi.parse_header(content_type)

            encoding = pdict.get('charset', 'utf-8')
            if content_type in ('text/html', 'application/xml'):
                text = yield from response.text()

                # Replace href with (?:href|src) to follow image links.
                urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''', text))
                if urls:
                    LOGGER.info('got %r distinct urls from %r',
                                len(urls), response.url)
                for url in urls:
                    normalized = urllib.parse.urljoin(response.url, url)
                    defragmented, frag = urllib.parse.urldefrag(normalized)
                    if self.url_allowed(defragmented):
                        links.add(defragmented)

        stat = FetchStatistic(url=response.url,
                              next_url=None,
                              status=response.status,
                              exception=None,
                              size=len(body),
                              content_type=content_type,
                              encoding=encoding,
                              num_urls=len(links),
                              num_new_urls=len(links - self.seen_urls))

        return stat, links

    @asyncio.coroutine
    def fetch(self, url, max_redirect):
        """Fetch one URL."""
        tries = 0
        exception = None
        while tries < self.max_tries:
            try:
                response = yield from self.session.get(url, allow_redirects=False)
                if tries > 1:
                    LOGGER.info('try %r for %r success', tries, url)
                break
            except aiohttp.ClientError as client_error:
                LOGGER.info('try %r for %r raised %r', tries, url, client_error)
                exception = client_error
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error('%r failed after %r tries', url, self.max_tries)
            self.record_statistic(
                FetchStatistic(url=url,
                               next_url=None,
                               status=None,
                               exception=exception,
                               size=0,
                               content_type=None,
                               encoding=None,
                               num_urls=0,
                               num_new_urls=0))
            return

        try:
            if is_redirect(response):
                location = response.headers['location']
                next_url = urllib.parse.urljoin(url, location)
                self.record_statistic(
                    FetchStatistic(url=url,
                                   next_url=next_url,
                                   status=response.status,
                                   exception=None,
                                   size=0,
                                   content_type=None,
                                   encoding=None,
                                   num_urls=0,
                                   num_new_urls=0))

                if next_url in self.seen_urls:
                    return
                if max_redirect > 0:
                    LOGGER.info('redirect to %r from %r', next_url, url)
                    self.add_url(next_url, max_redirect - 1)
                else:
                    LOGGER.error('redirect limit reached for %r from %r',
                                 next_url, url)
            else:
                stat, links = yield from self.parse_links(response)
                self.record_statistic(stat)
                for link in links.difference(self.seen_urls):
                    self.q.put_nowait((link, self.max_redirect))
                self.seen_urls.update(links)
        finally:
            yield from response.release()

    @asyncio.coroutine
    def work(self):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                assert url in self.seen_urls
                yield from self.fetch(url, max_redirect)
                self.q.task_done()
        except asyncio.CancelledError:
            pass

    def url_allowed(self, url):
        if self.exclude and re.search(self.exclude, url):
            return False
        parts = urllib.parse.urlparse(url)
        if parts.scheme not in ('http', 'https'):
            LOGGER.debug('skipping non-http scheme in %r', url)
            return False
        host, port = urllib.parse.splitport(parts.netloc)
        if not self.host_okay(host):
            LOGGER.debug('skipping non-root host in %r', url)
            return False
        return True

    def add_url(self, url, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        LOGGER.debug('adding %r %r', url, max_redirect)
        self.seen_urls.add(url)
        self.q.put_nowait((url, max_redirect))

    @asyncio.coroutine
    def scrape(self):
        """Run the crawler until all finished."""
        workers = [asyncio.Task(self.work(), loop=self.loop)
                   for _ in range(self.max_tasks)]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
class Crawler: """Crawl a set of URLs. This manages two sets of URLs: 'urls' and 'done'. 'urls' is a set of URLs seen, and 'done' is a list of FetchStatistics. """ def __init__(self, roots, exclude=None, strict=True, # What to crawl. max_redirect=10, max_tries=4, # Per-url limits. max_tasks=10, *, loop=None): self.loop = loop or asyncio.get_event_loop() self.roots = roots self.exclude = exclude self.strict = strict self.max_redirect = max_redirect self.max_tries = max_tries self.max_tasks = max_tasks self.q = Queue(loop=self.loop) self.seen_urls = set() self.done = [] self.session = aiohttp.ClientSession(loop=self.loop) self.root_domains = set() for root in roots: parts = urllib.parse.urlparse(root) host, port = urllib.parse.splitport(parts.netloc) if not host: continue if re.match(r'\A[\d\.]*\Z', host): self.root_domains.add(host) else: host = host.lower() if self.strict: self.root_domains.add(host) else: self.root_domains.add(lenient_host(host)) for root in roots: self.add_url(root) self.t0 = time.time() self.t1 = None def close(self): """Close resources.""" self.session.close() def host_okay(self, host): """Check if a host should be crawled. A literal match (after lowercasing) is always good. For hosts that don't look like IP addresses, some approximate matches are okay depending on the strict flag. """ host = host.lower() if host in self.root_domains: return True if re.match(r'\A[\d\.]*\Z', host): return False if self.strict: return self._host_okay_strictish(host) else: return self._host_okay_lenient(host) def _host_okay_strictish(self, host): """Check if a host should be crawled, strict-ish version. This checks for equality modulo an initial 'www.' component. """ host = host[4:] if host.startswith('www.') else 'www.' + host return host in self.root_domains def _host_okay_lenient(self, host): """Check if a host should be crawled, lenient version. This compares the last two components of the host. """ return lenient_host(host) in self.root_domains def record_statistic(self, fetch_statistic): """Record the FetchStatistic for completed / failed URL.""" self.done.append(fetch_statistic) @asyncio.coroutine def parse_links(self, response): """Return a FetchStatistic and list of links.""" links = set() content_type = None encoding = None body = yield from response.read() if response.status == 200: content_type = response.headers.get('content-type') pdict = {} if content_type: content_type, pdict = cgi.parse_header(content_type) encoding = pdict.get('charset', 'utf-8') if content_type in ('text/html', 'application/xml'): text = yield from response.text() # Replace href with (?:href|src) to follow image links. 
urls = set(re.findall(r'''(?i)href=["']([^\s"'<>]+)''',text)) if urls: LOGGER.info('got %r distinct urls from %r',len(urls), response.url) for url in urls: normalized = urllib.parse.urljoin(response.url, url) defragmented, frag = urllib.parse.urldefrag(normalized) if self.url_allowed(defragmented): links.add(defragmented) stat = FetchStatistic( url=response.url, next_url=None, status=response.status, exception=None, size=len(body), content_type=content_type, encoding=encoding, num_urls=len(links), num_new_urls=len(links - self.seen_urls)) return stat, links @asyncio.coroutine def fetch(self, url, max_redirect): """Fetch one URL.""" tries = 0 exception = None while tries < self.max_tries: try: response = yield from self.session.get(url, allow_redirects=False) #1 break #2 except aiohttp.ClientError as client_error: LOGGER.info('try %r for %r raised %r', tries, url, client_error) exception = client_error else: return try: if is_redirect(response): location = response.headers['location'] else: #4 stat, links = yield from self.parse_links(response) self.record_statistic(stat) for link in links.difference(self.seen_urls): self.q.put_nowait((link, self.max_redirect)) self.seen_urls.update(links) finally: yield from response.release() @asyncio.coroutine def work(self): """Process queue items forever.""" try: while True: url, max_redirect = yield from self.q.get() #q.get() Remove and return an item from the queue. If queue is empty, wait until an item is available. #print('url',url, 'max_redirect', max_redirect) assert url in self.seen_urls #assert 断言,异常会直接抛出 yield from self.fetch(url, max_redirect) self.q.task_done() #Indicate that a formerly enqueued task is complete.表明以前排队的任务完成 except asyncio.CancelledError: pass def url_allowed(self, url): if self.exclude and re.search(self.exclude, url): return False parts = urllib.parse.urlparse(url) if parts.scheme not in ('http', 'https'): LOGGER.debug('skipping non-http scheme in %r', url) return False host, port = urllib.parse.splitport(parts.netloc) if not self.host_okay(host): LOGGER.debug('skipping non-root host in %r', url) return False return True def add_url(self, url, max_redirect=None): """Add a URL to the queue if not seen before.""" if max_redirect is None: max_redirect = self.max_redirect LOGGER.debug('adding %r %r', url, max_redirect) self.seen_urls.add(url) self.q.put_nowait((url, max_redirect)) #put_nowait() Put an item into the queue without blocking.此句实际最先执行 @asyncio.coroutine def crawl(self): """Run the crawler until all finished.""" workers = [asyncio.Task(self.work(), loop=self.loop) for _ in range(self.max_tasks)] self.t0 = time.time() yield from self.q.join() #Block until all items in the queue have been gotten and processed.保持阻塞状态,直到处理了队列中的所有项目为止 self.t1 = time.time() for w in workers: w.cancel()
class Crawler(object):
    """Crawl a set of URLs.

    This manages two sets of URLs: 'urls' and 'done'.  'urls' is a set of
    URLs seen, and 'done' is a list of FetchStatistics.
    """

    def __init__(
        self,
        roots,
        scraper=None,
        data_handler=None,
        exclude=None,
        strict=True,  # What to crawl.
        max_redirect=5,
        max_tries=10,  # Per-url limits.
        max_tasks=10,
        max_connections_per_host=3,
        *,
        loop=None
    ):
        self.loop = loop or asyncio.get_event_loop()
        self.roots = roots
        self.max_connections_per_host = max_connections_per_host
        self.scraper = scraper
        self.data_handler = data_handler
        self.exclude = exclude
        self.strict = strict
        self.max_redirect = max_redirect
        self.max_tries = max_tries
        self.max_tasks = max_tasks
        self.q = Queue(loop=self.loop)
        self.seen_urls = set()
        self.done = []
        self.session = aiohttp.ClientSession(loop=self.loop)
        self.root_domains = set()
        for root in roots:
            parts = urllib.parse.urlparse(root)
            host, port = urllib.parse.splitport(parts.netloc)
            if not host:
                continue
            if re.match(r"\A[\d\.]*\Z", host):
                self.root_domains.add(host)
            else:
                host = host.lower()
                if self.strict:
                    self.root_domains.add(host)
                else:
                    self.root_domains.add(lenient_host(host))
        for root in roots:
            self.add_urls(root)
        self.t0 = time.time()
        self.t1 = None

    def record_statistic(
        self,
        url=None,
        next_url=None,
        status=None,
        exception=None,
        content_type=None,
        encoding=None,
        num_urls=0,
        num_new_urls=0,
    ):
        """Record the FetchStatistic for completed / failed URL."""
        fetch_statistic = FetchStatistic(
            url=url,
            next_url=next_url,
            status=status,
            size=0,
            exception=exception,
            content_type=content_type,
            encoding=encoding,
            num_urls=num_urls,
            num_new_urls=num_new_urls,
        )
        self.done.append(fetch_statistic)

    def extract_data(self, root_url, html):
        raise NotImplementedError("You need to define an extract_data method!")

    def close(self):
        """Close resources."""
        LOGGER.debug("closing resources")
        self.session.close()

    @asyncio.coroutine
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Select valid links, testing against regexp and root_domains.
                links.add(defragmented)
        if urls:
            LOGGER.info(
                "got %r urls from %r new links: %i visited: %i",
                len(urls),
                base_url,
                len(links - self.seen_urls),
                len(self.seen_urls),
            )
        new_links = [link for link in links.difference(self.seen_urls)]

        self.record_statistic(
            url=base_url,
            content_type=_content_type,
            encoding=_encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls),
        )
        return new_links

    def handle_redirect(self, response, url, max_redirect):
        location = response.headers["location"]
        next_url = urllib.parse.urljoin(url, location)
        self.record_statistic(url=url, next_url=next_url, status=response.status)
        if next_url in self.seen_urls:
            return
        if max_redirect > 0:
            LOGGER.info(
                "redirect to %r from %r max_redir: %i", next_url, url, max_redirect - 1
            )
            self.add_urls(next_url, max_redirect - 1)
        else:
            LOGGER.error("redirect limit reached for %r from %r", next_url, url)
        return

    @asyncio.coroutine
    def fetch(self, url, max_redirect, sem):
        """Fetch one URL."""
        tries = 0
        web_page = None
        exception = None
        _url = None
        _encoding = None
        _content_type = None
        sleep_time = 0
        while tries < self.max_tries:
            try:
                with (yield from sem):
                    response = yield from asyncio.wait_for(
                        self.session.get(url, allow_redirects=False), 10,
                        loop=self.loop
                    )
                if tries > 1:
                    LOGGER.debug("try %r for %r success", tries, url)
                break
            except Exception as client_error:
                sleep_time += 5
                yield from asyncio.sleep(sleep_time)
                LOGGER.error("try %r for %r raised %r", tries, url, client_error)
                exception = client_error
            tries += 1
        else:
            # We never broke out of the loop: all tries failed.
            LOGGER.error("%r failed after %r tries", url, self.max_tries)
            self.record_statistic(url=url, exception=exception)
            return (web_page, _url, _content_type, _encoding)

        try:
            _url, _content_type, _encoding = get_content_type_and_encoding(response)
            if is_redirect(response):
                self.handle_redirect(response, url, max_redirect)
                web_page = "redirect"
            elif response.status == 200 and _content_type in ("text/html", "application/xml"):
                web_page = yield from response.text()
            else:
                self.record_statistic(
                    url=response.url,
                    status=response.status,
                    content_type=_content_type,
                    encoding=_encoding
                )
        except Exception as e:
            print("*******error**********", e)
        finally:
            yield from response.release()
        return (web_page, _url, _content_type, _encoding)

    def add_urls(self, urls, max_redirect=None):
        """Add a URL to the queue if not seen before."""
        if max_redirect is None:
            max_redirect = self.max_redirect
        if not isinstance(urls, str):
            urls = set(urls)
            for link in urls.difference(self.seen_urls):
                self.q.put_nowait((link, max_redirect))
            self.seen_urls.update(urls)
        elif urls not in self.seen_urls:
            self.q.put_nowait((urls, max_redirect))
            self.seen_urls.add(urls)

    @asyncio.coroutine
    def work(self, sem):
        """Process queue items forever."""
        try:
            while True:
                url, max_redirect = yield from self.q.get()
                # assert url in self.seen_urls
                web_page, url, content_type, encoding = yield from self.fetch(
                    url, max_redirect, sem
                )
                if web_page and web_page != "redirect":
                    new_links = yield from self.parse_links(
                        web_page, url, content_type, encoding
                    )
                    if self.scraper:
                        data = self.scraper.scrape(url, web_page)
                        if self.data_handler:
                            self.data_handler.handle(data)
                    self.add_urls(new_links)
                self.q.task_done()
        except (asyncio.CancelledError,):
            print("error")

    @asyncio.coroutine
    def crawl(self):
        """Run the crawler until all finished."""
        sem = asyncio.Semaphore(value=self.max_connections_per_host, loop=self.loop)
        LOGGER.info("Starting crawl...")
        workers = [
            asyncio.Task(self.work(sem), loop=self.loop)
            for _ in range(self.max_tasks)
        ]
        self.t0 = time.time()
        yield from self.q.join()
        self.t1 = time.time()
        for w in workers:
            w.cancel()
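# Crawler.work() above assumes only two small interfaces: scraper.scrape(url, web_page)
# returning some data, and data_handler.handle(data) consuming it. The pair below is a
# hypothetical minimal sketch of those objects; the class names and behaviour are
# illustrative assumptions, not part of the original code.
import re


class TitleScraper(object):
    """Pull the <title> text out of a fetched page."""

    def scrape(self, url, web_page):
        match = re.search(r"<title[^>]*>(.*?)</title>", web_page, re.I | re.S)
        return {"url": url, "title": match.group(1).strip() if match else None}


class PrintDataHandler(object):
    """Hand scraped records to stdout."""

    def handle(self, data):
        print(data)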
class Crawl:
    def __init__(self, url, test_url, *, number=10, max_tasks=5):
        self.url = url
        self.test_url = test_url
        self.number = number
        self.max_tasks = max_tasks
        self.url_queue = Queue()
        self.raw_proxy_queue = Queue()
        self.session = aiohttp.ClientSession()  # tip: connection pool

    async def fetch_page(self, url):
        async with aiohttp.get(url) as response:
            try:
                assert response.status == 200
                print("OK!", response.url)
                return await response.text()
            except AssertionError:
                print('Error!', response.url, response.status)

    async def filter_page(self, url):
        page = await self.fetch_page(url)
        if page:
            pattern = re.compile(
                r'<tr>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>.*?</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?<td>(.*?)</td>.*?</tr>',
                re.S)
            data = pattern.findall(page)
            print(len(data))
            for raw in data:
                item = list(map(lambda word: word.lower(), raw))
                await self.raw_proxy_queue.put({
                    'ip': item[0],
                    'port': item[1],
                    'anonymous': item[2],
                    'protocol': item[3],
                    'speed': item[4],
                    'checking-time': item[5]
                })
            if not self.raw_proxy_queue.empty():
                print('OK! raw_proxy_queue size: ', self.raw_proxy_queue.qsize())

    async def verify_proxy(self, proxy):
        addr = proxy['protocol'] + '://' + proxy['ip'] + ':' + proxy['port']
        conn = aiohttp.ProxyConnector(proxy=addr)
        try:
            session = aiohttp.ClientSession(connector=conn)
            with aiohttp.Timeout(10):
                start = time.time()
                # Close connection and response explicitly, otherwise aiohttp warns
                # about "Unclosed connection" and "Unclosed response".
                async with session.get(self.test_url) as response:
                    end = time.time()
                    try:
                        assert response.status == 200
                        print('Good proxy: {} {}s'.format(proxy['ip'], end - start))
                    except Exception:  # ProxyConnectionError, HttpProxyError, etc.
                        print('Bad proxy: {}, {}, {}s'.format(
                            proxy['ip'], response.status, end - start))
        except Exception:
            print('timeout {}, q size: {}'.format(
                proxy['speed'], self.raw_proxy_queue.qsize()))
        finally:
            # Close the session even when the request timed out.
            session.close()

    async def fetch_worker(self):
        while True:
            url = await self.url_queue.get()
            try:
                await self.filter_page(url)
            finally:
                self.url_queue.task_done()

    async def verify_worker(self):
        while True:
            raw_proxy = await self.raw_proxy_queue.get()
            if raw_proxy['protocol'] == 'https':  # only http proxies can be used
                # Mark skipped items done too, otherwise join() never finishes.
                self.raw_proxy_queue.task_done()
                continue
            try:
                await self.verify_proxy(raw_proxy)
            finally:
                try:
                    self.raw_proxy_queue.task_done()
                except ValueError:
                    pass

    async def run(self):
        await asyncio.wait([
            self.url_queue.put(self.url + repr(i + 1))
            for i in range(self.number)
        ])
        fetch_tasks = [
            asyncio.ensure_future(self.fetch_worker())
            for _ in range(self.max_tasks)
        ]
        verify_tasks = [
            asyncio.ensure_future(self.verify_worker())
            for _ in range(10 * self.max_tasks)
        ]
        tasks = fetch_tasks + verify_tasks

        await self.url_queue.join()
        self.session.close()  # close the session, otherwise aiohttp reports an error
        print("url_queue done")

        # join() is a coroutine and must be awaited.
        await self.raw_proxy_queue.join()
        print("raw_proxy_queue done")

        for task in tasks:
            task.cancel()
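# A driver sketch for the Crawl class above. Both URLs are hypothetical
# placeholders: `url` stands in for the paginated proxy-list page the regex
# expects, and `test_url` is whatever endpoint should be reachable through
# each proxy.
import asyncio

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    crawl = Crawl('http://www.example-proxy-list.com/page/',
                  'http://httpbin.org/ip',
                  number=5, max_tasks=5)
    loop.run_until_complete(crawl.run())
    loop.close()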