import asyncio as aio
import html
import json
import sys
from collections import defaultdict
from time import perf_counter
from urllib.parse import urlparse

import aiohttp
from async_timeout import timeout  # assumed origin of timeout(); recent versions require "async with"

# USER_AGENT, robots_txt_url, robot_can_fetch, error2str and compute_timing_stats
# are defined elsewhere in the script.


async def check_all_urls(urls, checker_results):
    # group URLs per hostname so that each host gets its own rate-limited worker
    urls_per_host = defaultdict(list)
    for url in urls:
        urls_per_host[urlparse(url).hostname].append(url)
    #print(json.dumps({host: urls for host, urls in urls_per_host.items() if len(urls)>1}, indent=4), file=sys.stderr)
    progress_step = len(urls) // 10
    queue = aio.Queue()
    async with aiohttp.ClientSession(
            raise_for_status=True,
            connector=aiohttp.TCPConnector(verify_ssl=False, limit=100),  # verify_ssl= is deprecated in recent aiohttp in favour of ssl=False
            headers={'User-Agent': USER_AGENT}) as client:  # default UA: https://github.com/aio-libs/aiohttp/blob/master/aiohttp/http.py#L34
        # schedule one worker coroutine per host; the * is required, gather() takes awaitables as separate arguments
        aio.gather(*(check_one_host_urls(client, queue, one_host_urls)
                     for one_host_urls in urls_per_host.values()))
        start = perf_counter()
        with timeout(20 * 60):
            for _ in range(len(urls_per_host)):
                resps = await queue.get()
                checker_results.extend(resps)
                count = len(checker_results)
                if progress_step and count % progress_step == 0:  # those do not get printed progressively :(
                    print('#> {:.1f}% processed : count={} time={}'.format(
                        count * 100.0 / len(urls), count, perf_counter() - start),
                        file=sys.stderr)
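In the coordinator above, the worker coroutines are fired off with aio.gather() and their results come back through an asyncio.Queue, one batch per host. The following stand-alone sketch illustrates that fan-out pattern in isolation; the hostnames, URLs and worker body are made up for the example:

import asyncio

async def worker(queue, host, urls):
    # pretend to check each URL of one host, then report the whole batch at once
    await asyncio.sleep(0.1 * len(urls))
    await queue.put([(url, 200) for url in urls])

async def main():
    urls_per_host = {
        'example.com': ['http://example.com/a', 'http://example.com/b'],
        'example.org': ['http://example.org/'],
    }
    queue = asyncio.Queue()
    # the * is required: gather() expects awaitables as separate arguments
    asyncio.gather(*(worker(queue, host, urls)
                     for host, urls in urls_per_host.items()))
    results = []
    for _ in range(len(urls_per_host)):  # one queue entry per host
        results.extend(await queue.get())
    print(results)

asyncio.run(main())

Draining exactly one queue entry per host is what lets the coordinator know when all workers are done without awaiting the gather() future itself.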
# Synchronous variant: send() is a method whose self.session is a requests.Session
# and whose self.urls all belong to the same hostname
# (sleep and perf_counter come from the time module).
def send(self):
    try:
        resp = self.session.get(robots_txt_url(self.urls[0]), verify=False)
        resp.raise_for_status()
        robots_txt_content = resp.text
    except Exception:  # any failure to fetch robots.txt is treated as "no robots.txt"
        robots_txt_content = ''
    resps = []
    for url in self.urls:
        if robots_txt_content and not robot_can_fetch(robots_txt_content, url):
            resps.append((url, 'ROBOT FORBIDDEN', None, None))
            continue
        if resps:
            sleep(2)  # rate-limiting: 1 request every 2s per hostname
        start = perf_counter()
        try:
            response = self.session.get(
                url, verify=False,
                headers={'User-Agent': USER_AGENT})  # the default requests UA is often blacklisted: https://github.com/kennethreitz/requests/blob/master/requests/utils.py#L731
            resps.append((url, response.status_code, perf_counter() - start,
                          response.elapsed.total_seconds()))
        except Exception as error:
            resps.append((url, error2str(error), perf_counter() - start, None))
    return resps
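Both variants rely on two small helpers, robots_txt_url() and robot_can_fetch(), that are defined outside this excerpt. A plausible implementation based on the standard library could look like the sketch below; this is an assumption, not the author's actual code:

from urllib.parse import urlparse, urlunparse
from urllib.robotparser import RobotFileParser

def robots_txt_url(url):
    # http://example.com/some/page -> http://example.com/robots.txt
    scheme, netloc, *_ = urlparse(url)
    return urlunparse((scheme, netloc, '/robots.txt', '', '', ''))

def robot_can_fetch(robots_txt_content, url, user_agent='*'):
    # parse the already-downloaded robots.txt and ask whether `url` may be crawled
    parser = RobotFileParser()
    parser.parse(robots_txt_content.splitlines())
    return parser.can_fetch(user_agent, url)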
async def check_one_host_urls(client, queue, urls):
    # all `urls` belong to the same hostname, so robots.txt is fetched only once
    try:
        async with client.get(robots_txt_url(urls[0]), raise_for_status=True) as response:
            robots_txt_content = await response.text()
    except Exception:
        robots_txt_content = ''
    resps = []
    for url in urls:
        if robots_txt_content and not robot_can_fetch(robots_txt_content, url):
            resps.append((url, 'ROBOT FORBIDDEN', None))
            continue
        if resps:
            await aio.sleep(2)  # rate-limiting: 1 request every 2s per hostname (must be awaited, else it is a no-op)
        try:
            start = perf_counter()
            async with client.get(url, timeout=60) as response:
                resps.append((url, response.status, perf_counter() - start))
        except Exception as error:
            resps.append((url, error2str(error), perf_counter() - start))
    await queue.put(resps)
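error2str() is another helper defined elsewhere in the script; it only needs to turn the caught exception into a short label stored in place of the HTTP status. A minimal version could simply be (again an assumption, not the original implementation):

def error2str(error):
    # e.g. "ClientConnectorError: Cannot connect to host example.com:443 ..."
    return '{}: {}'.format(type(error).__name__, error)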
    # tail of url_checker(urls): run the event loop and gather results into checker_results
    loop = aio.get_event_loop()
    loop.set_debug(True)
    loop.slow_callback_duration = 1  # seconds
    try:
        loop.run_until_complete(check_all_urls(urls, checker_results))
    except aio.TimeoutError:
        unprocessed_urls = set(urls) - set(resp[0] for resp in checker_results)
        print('20min TIMEOUT', file=sys.stderr)
        print(unprocessed_urls, file=sys.stderr)
    return checker_results


if __name__ == '__main__':
    urls = set(html.unescape(url.strip()) for url in sys.stdin.readlines())
    timings = {}
    start = perf_counter()
    for url, status_or_error, exec_duration in url_checker(urls):
        if exec_duration:
            timings[url] = exec_duration
        if status_or_error != 200:
            # fall back to the exception type when the error message is empty
            print(str(status_or_error) or type(status_or_error), url)
    print('#= Done in', perf_counter() - start, file=sys.stderr)
    print('# perf timing stats:', file=sys.stderr)
    print(json.dumps(compute_timing_stats(timings.values()), indent=4),
          file=sys.stderr)
    print('## Top10 slow requests:', file=sys.stderr)
    top_slow_urls = sorted(timings.keys(), key=timings.get)[-10:]
    print('\n'.join('- {} : {:.2f}s'.format(url, timings[url])
                    for url in top_slow_urls),
          file=sys.stderr)
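compute_timing_stats() is also defined outside this excerpt; the main block only requires it to return something JSON-serializable from the collected per-URL durations. A possible sketch using the standard statistics module, where the exact fields are assumptions:

import statistics

def compute_timing_stats(durations):
    # summarize a collection of per-request durations (in seconds)
    durations = sorted(durations)
    if not durations:
        return {}
    return {
        'count': len(durations),
        'min': durations[0],
        'max': durations[-1],
        'mean': statistics.mean(durations),
        'median': statistics.median(durations),
    }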