def crawl_proxies():
    # Crawl the goubanjia listing page and store every parsed proxy in the crawl database.
    loginfo.info('crawl proxies in goubanjia')
    root = get_html_to_tree(etc.goubanjia_url)
    proxies_list = get_proxies_info(root)
    proxy_db = proxy_io.ProxiesIO(db=etc.crawl_db)
    for proxy in proxies_list:
        # verify_proxy_validity.verify_proxy(proxy)
        proxy_db.insert_proxy(proxy)
def crawl_proxies():
    # Page through the 89ip listing (pages 1-19), pausing between requests.
    loginfo.info('crawl proxies in n89ip')
    for i in range(1, 20):
        time.sleep(3)
        root = get_html_to_tree(etc.s_89ip_url.format(i))
        proxies_list = get_proxies_info(root)
        proxy_db = proxy_io.ProxiesIO(db=etc.crawl_db)
        for proxy in proxies_list:
            proxy_db.insert_proxy(proxy)
def crawl_proxies(a=1, b=50):
    # Crawl kuaidaili "inha" pages a .. b-1; the caller can override the page range.
    loginfo.info('crawl proxies in kuaidaili')
    for i in range(a, b):
        time.sleep(3)
        soup = get_html_to_soup(etc.s_kuaidaili_inha_url.format(i))
        proxies_list = get_proxies_info(soup)
        proxy_db = proxy_io.ProxiesIO(db=etc.crawl_db)
        for proxy in proxies_list:
            proxy_db.insert_proxy(proxy)
def crawl_proxies():
    # One-shot fetch of the mimvp API; returns False when the batch looks too small.
    loginfo.info('crawl proxies in mimvp not in loop')
    proxy_db = proxy_io.ProxiesIO(db=etc.crawl_db)
    j_dict = get_html_to_json(etc.mimvp_api_url)
    proxies_list = get_proxies_info(j_dict)
    for proxy in proxies_list:
        proxy_db.insert_proxy(proxy)
    loginfo.info('crawl proxies in mimvp not in loop end')
    if len(proxies_list) < 20:
        return False
    return True
def crawl_proxies_loop():
    proxy_db = proxy_io.ProxiesIO(db=etc.crawl_db)
    while True:
        # Poll the mimvp API roughly once a minute, forever.
        loginfo.info('crawl proxies in mimvp in loop')
        s_time = time.time()
        j_dict = get_html_to_json(etc.mimvp_api_url)
        proxies_list = get_proxies_info(j_dict)
        for proxy in proxies_list:
            proxy_db.insert_proxy(proxy)
        # Sleep for whatever is left of the 60-second window; never pass a negative value.
        elapsed = time.time() - s_time
        time.sleep(max(0.0, 60 - elapsed))
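# Hypothetical sketch (not part of the original code): one plausible way the
# get_html_to_tree / get_html_to_soup / get_html_to_json helpers used by the
# crawlers above could be implemented with requests, lxml and BeautifulSoup.
# The project's real helpers may differ in headers, proxies, retries and
# error handling.
import requests
from bs4 import BeautifulSoup
from lxml import etree


def get_html_to_tree(url):
    # Fetch the page and parse it into an lxml element tree for XPath queries.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return etree.HTML(resp.text)


def get_html_to_soup(url):
    # Fetch the page and parse it with BeautifulSoup for CSS-style selection.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def get_html_to_json(url):
    # Fetch a JSON API endpoint and return the decoded dict.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.json()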