def insert_one(self, item: ProxyDO):
    if isinstance(item, ProxyDO):
        session = self.session()
        try:
            m = session.query(ProxyDO).filter(
                ProxyDO.ip == item.ip).first()
            if m is None:
                # No record for this IP yet: insert it
                session.add(item)
                session.flush()
            else:
                # Record already exists: update it in place
                m.origin = item.origin
                m.update_time = item.update_time
                m.failed_count = item.failed_count
                m.response_speed = item.response_speed
                m.validity = item.validity
            session.commit()
        except Exception as error:
            session.rollback()
            utils.log(error)
            raise
        finally:
            session.close()
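# A hedged usage sketch for insert_one: it behaves as an upsert keyed on
# ProxyDO.ip, inserting a new row or refreshing the mutable fields of an
# existing one. ProxyDAO and the field values below are illustrative
# assumptions, not taken from the source.
dao = ProxyDAO()  # hypothetical DAO class exposing insert_one
proxy = ProxyDO(ip='1.2.3.4:8080',
                origin='example-source',
                update_time=utils.get_utc_time(),
                failed_count=0,
                response_speed=0.5,
                validity=1)
dao.insert_one(proxy)  # a second call with the same ip updates in place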
def check_ip_availability_task(self):
    last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
    now_time = datetime.utcnow().timestamp()
    if last_check_time is not None and (
            now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
        return
    self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)
    proxy_list = self.collection.find()
    for proxy in proxy_list:
        ip = proxy['ip']
        start_time = time.time()
        response = utils.http_request('http://lwons.com/wx', timeout=10)
        is_success = response.status_code == 200
        response.close()
        if not is_success:
            try:
                self.collection.delete_one({'ip': ip})
            except:
                pass
            utils.log('Check ip %s FAILED' % ip)
        else:
            elapsed = round(time.time() - start_time, 4)
            try:
                self.collection.update_one({'ip': ip}, {
                    "$set": {
                        'update_time': utils.get_utc_time(),
                        'response_speed': elapsed,
                        'validity': True
                    }
                })
            except:
                pass
            utils.log('Check ip %s SUCCESS' % ip)
def check_ip_availability_task(self):
    # Read the last self-check time from Redis; skip this run if the
    # configured interval has not elapsed yet
    last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
    now_time = datetime.utcnow().timestamp()
    if last_check_time is not None and (
            now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
        return
    self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)
    proxy_list = self.db.find_all()
    for proxy in proxy_list:
        ip = proxy.ip
        start_time = time.time()
        # The self-check pings a lightweight site; if the request fails,
        # the IP is removed immediately
        response = utils.http_request('http://www.baidu.com', timeout=10)
        is_success = response.status_code == 200
        response.close()
        if not is_success:
            # Request failed: delete the IP right away
            try:
                self.db.detele_one(ip)
            except:
                pass
            utils.log('Check ip %s FAILED' % ip)
        else:
            # Request succeeded: record the latest response time so this
            # IP is preferred the next time one is fetched
            elapsed = round(time.time() - start_time, 4)
            try:
                proxy.update_time = utils.get_utc_time()
                proxy.response_speed = elapsed
                proxy.validity = 1
                self.db.insert_one(proxy)
            except:
                pass
            utils.log('Check ip %s SUCCESS' % ip)
def count(self):
    session = self.session()
    try:
        count = session.query(ProxyDO).count()
        return count
    except Exception as error:
        utils.log(error)
        raise
    finally:
        session.close()
def detele_all(self):
    session = self.session()
    try:
        session.query(ProxyDO).delete()
        session.commit()
    except Exception as error:
        session.rollback()
        utils.log(error)
        raise
    finally:
        session.close()
def detele_one(self, ip):
    session = self.session()
    try:
        session.query(ProxyDO).filter(ProxyDO.ip == ip).delete()
        session.commit()
    except Exception as error:
        session.rollback()
        utils.log(error)
        raise
    finally:
        session.close()
def find_one(self, ip: str):
    session = self.session()
    try:
        m = session.query(ProxyDO).filter(ProxyDO.ip == ip).first()
        return m
    except Exception as error:
        session.rollback()
        utils.log(error)
        raise
    finally:
        session.close()
def db_connect_engine():
    utils.log('db_connect_engine')
    engine = create_engine(
        "%s://%s:%s@%s:%s/%s?charset=utf8mb4" %
        (DATABASES['DRIVER'], DATABASES['USER'], DATABASES['PASSWORD'],
         DATABASES['HOST'], DATABASES['PORT'], DATABASES['NAME']),
        echo=False)
    try:
        if not database_exists(engine.url):
            create_database(engine.url)  # create the database
        Base.metadata.create_all(engine)  # create the tables
    except Exception as e:
        log.error(e)
    return engine
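# A minimal sketch (not taken from the source) of how the engine returned by
# db_connect_engine() is assumed to be wired into the `self.session()` factory
# used by the DAO methods above. ProxyDAO and db_session_factory are
# hypothetical names; sessionmaker and create_engine are real SQLAlchemy APIs.
from sqlalchemy.orm import sessionmaker

def db_session_factory():
    engine = db_connect_engine()
    # sessionmaker returns a callable that produces new Session objects,
    # matching the `session = self.session()` pattern used above
    return sessionmaker(bind=engine)

class ProxyDAO:
    def __init__(self):
        # each DAO method opens a short-lived session and closes it in `finally`
        self.session = db_session_factory()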
def _thread_check_ip(self, proxy):
    with lock:
        ip = proxy.ip
        start_time = time.time()
        proxy.last_use_time = utils.get_utc_date()
        proxies = {
            "http": "http://" + ip,
            "https": "http://" + ip,
        }
    # External check: request a site outside the firewall through the proxy
    try:
        response = utils.http_request('https://google.com',
                                      timeout=5,
                                      proxies=proxies)
        with lock:
            proxy.external_validity = response.status_code == 200
            proxy.used_count = proxy.used_count + 1
            proxy.external_response_speed = round(time.time() - start_time, 4) * 1000
        response.close()
    except KeyboardInterrupt:
        exit()
    except:
        with lock:
            proxy.external_validity = False
            proxy.external_response_speed = -1
    # Internal check: request a domestic site through the same proxy
    start_time = time.time()
    try:
        response = utils.http_request('https://www.baidu.com',
                                      timeout=5,
                                      proxies=proxies)
        with lock:
            proxy.internal_validity = response.status_code == 200
            proxy.used_count = proxy.used_count + 1
            proxy.internal_response_speed = round(time.time() - start_time, 4) * 1000
        response.close()
    except KeyboardInterrupt:
        exit()
    except:
        with lock:
            proxy.internal_validity = False
            proxy.internal_response_speed = -1
    with lock:
        utils.log('Check IP:' + ip + ' finished i:' + str(proxy.internal_validity) +
                  ' e:' + str(proxy.external_validity))
        self.calc_proxy_weight(proxy)
        self.session.commit()
def find_all(self):
    session = self.session()
    try:
        m = session.query(ProxyDO).order_by(
            ProxyDO.failed_count.asc(),
            ProxyDO.validity.desc(),
            ProxyDO.response_speed.asc(),
            ProxyDO.update_time.desc()).all()
        return m
    except Exception as error:
        session.rollback()
        utils.log(error)
        raise
    finally:
        session.close()
def crawl_proxy_task(self, check_num: bool = True):
    if check_num:
        count = self.collection.count()
        if count > MIN_PROXY_COUNT:
            return
    utils.log("Start crawling proxies")
    proxy_list = proxy_strategy.crawl_proxy()
    utils.log("Start saving")
    for proxy in proxy_list:
        if not self.collection.find_one({'ip': proxy.ip}):
            self.collection.insert_one(proxy.__dict__)
            utils.log('Saved: ' + proxy.ip)
    utils.log("Saving finished")
def crawl_proxy_task(self, check_num: bool = True):
    if check_num:
        count = self.db.count()
        if count > MIN_PROXY_COUNT:
            return
    utils.log("Start crawling proxies")
    proxy_list = proxy_strategy.crawl_proxy()
    utils.log("Start saving")
    for proxy in proxy_list:
        if not self.db.find_one(proxy.ip):
            self.db.insert_one(self.db.convert(proxy))
            utils.log('Saved: ' + proxy.ip)
    utils.log("Saving finished")
def add_failed_time(self, ip):
    proxy = self.collection.find_one({'ip': ip})
    if proxy is not None:
        failed_count = proxy['failed_count'] + 1
        utils.log("ip: %s failure count +1, failed %s times so far" % (ip, failed_count))
        if failed_count <= FAILED_COUNT_BORDER:
            try:
                self.collection.update_one({'ip': ip}, {
                    "$set": {
                        'update_time': utils.get_utc_time(),
                        'failed_count': failed_count
                    }
                })
            except:
                pass
        else:
            try:
                self.collection.delete_one({'ip': ip})
            except:
                pass
            self.crawl_proxy_task()
def add_failed_time(self, ip):
    proxy = self.db.find_one(ip)
    if proxy is not None:
        failed_count = proxy.failed_count + 1
        utils.log("ip: %s failure count +1, failed %s times so far" % (ip, failed_count))
        if failed_count <= FAILED_COUNT_BORDER:
            # Below the failure limit: record one more failure in the database
            try:
                proxy.update_time = utils.get_utc_time()
                proxy.failed_count = failed_count
                self.db.insert_one(proxy)
            except:
                pass
        else:
            # Failure limit reached: remove the IP from the database
            try:
                self.db.detele_one(ip)
            except:
                pass
            # Check whether the database still holds enough IPs
            self.crawl_proxy_task()
def check_ip_availability_task(self, time):
    # Only re-check proxies that have not been used since the given offset
    need_update_date = utils.get_utc_date(-time)
    proxy_list = self.session.query(Proxy).filter(
        Proxy.last_use_time < need_update_date).all()
    utils.log('Start check count:' + str(len(proxy_list)))
    self._check_ip_availability_task(proxy_list)
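# A minimal sketch (an assumption, not the project's actual code) of how
# _check_ip_availability_task could fan the selected proxies out to worker
# threads that run _thread_check_ip. The use of ThreadPoolExecutor and the
# pool size are hypothetical; the shared `lock` guarding proxy rows and
# self.session mirrors the locking already shown in _thread_check_ip.
from concurrent.futures import ThreadPoolExecutor

def _check_ip_availability_task(self, proxy_list):
    # check proxies concurrently; each worker only touches shared state
    # while holding `lock`
    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(self._thread_check_ip, proxy_list)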