def case_10():
    """Manual smoke test: insert one proxy row, pull a random proxy
    address back out, and bump its failure counter."""
    dao = ProxyDBManager()
    dao.create_proxy_table()

    # Build a sample proxy record.
    proxy = ProxyModel()
    ip = '125.115.141.6'
    port = 8118
    http_type = 'HTTPS'
    anonymity = '高匿'
    area = '浙江宁波'
    speed = '0.148秒'
    agent = 'agent'
    survival_time = '4小时'

    proxy.set_ip(ip)
    proxy.set_port(port)
    proxy.set_type(http_type)
    proxy.set_anonymity(anonymity)
    # Normalize a missing area to the empty string.
    proxy.set_area('' if area is None else area)
    proxy.set_speed(speed)
    proxy.set_agent(agent)
    proxy.set_survival_time(survival_time)

    dao.insert_proxy_table(proxy)

    proxy_address = dao.select_random_proxy()
    print(proxy_address)

    # Strip the URL scheme so only "ip:port" remains.
    scheme = 'http://' if 'http://' in proxy_address else 'https://'
    proxy_address = proxy_address.replace(scheme, '')

    old_ip = proxy_address.split(':')[0]
    print('old IP : ', old_ip)
    dao.plus_proxy_faild_time(old_ip)
class ProxyPoolWorker(object):
    """Singleton managing the proxy-pool lifecycle: crawl proxy sites,
    store working proxies in the database, hand out random proxy
    addresses, and track failures of dead proxies."""

    # Minimum number of proxies the pool should hold (currently unused
    # by the visible code — presumably consulted by a refill task).
    __MIN_PROXY_NUM = 15

    def __new__(cls, *args, **kwargs):
        """Implement ProxyPoolWorker as a singleton.

        Bug fix: the original checked ``hasattr(cls, '__instance')`` but
        assigned ``cls.__instance`` inside the class body, where the name
        is mangled to ``_ProxyPoolWorker__instance``.  The hasattr string
        is NOT mangled, so the check never succeeded and a fresh instance
        was created on every call.  Also stop forwarding *args to
        ``object.__new__``, which does not accept extra arguments.
        """
        if not hasattr(cls, '_instance'):
            cls._instance = super(ProxyPoolWorker, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        self.__first = True
        # Connect to the database.
        self.dbmanager = ProxyDBManager()
        # Recreate the proxy table from scratch on startup.
        self.dbmanager.drop_proxy_table()
        self.dbmanager.create_proxy_table()

    def start_work(self):
        """Start crawling proxies and schedule periodic availability checks."""
        self.crawl_proxy_web()
        scheduler = BackgroundScheduler()  # background scheduler
        # Run every 10 seconds (disabled):
        # scheduler.add_job(self.__timedTask, 'interval', seconds=10)
        # Check IP availability every 10 minutes.
        scheduler.add_job(self.__check_ip_availability_task, 'interval', minutes=10)
        scheduler.start()

    def __check_ip_availability_task(self):
        """Check whether stored IPs are still usable (not implemented yet)."""
        pass

    def crawl_proxy_web(self):
        """Crawl one randomly chosen proxy site and store the usable proxies."""
        spiders = [
            XiciSpider,
            Data5uSpider,
            KuaidailiSpider,
        ]
        # for spider in spiders:
        # Changed to crawl one randomly chosen proxy site per run.
        spider = random.choice(spiders)
        models = spider.get_proxies()
        filtered_models = requestEnginer.filter_unavailable_proxy(models)
        for each in filtered_models:
            self.dbmanager.insert_proxy_table(each)

    def select_proxy_data(self):
        """Return a random proxy address from the pool.

        Bug fix: the original used ``proxy is not ''`` — an identity
        comparison with a literal (SyntaxWarning on CPython >= 3.8) —
        and re-queried when the first result was NON-empty, throwing
        that result away.  The intent was clearly to retry only when
        nothing came back.
        """
        proxy = self.dbmanager.select_random_proxy()
        if not proxy:
            # Retry once if the pool returned nothing.
            proxy = self.dbmanager.select_random_proxy()
        return proxy

    def plus_proxy_faild_time(self, ip):
        """Proxy address went dead: increment its failure count in the DB."""
        self.dbmanager.plus_proxy_faild_time(ip)

    def stop_work(self):
        """Stop crawling: close the database connection."""
        self.dbmanager.close_connection()