def check_ip_availability_task(self):
    """Re-check every stored proxy, throttled by ``TASK_INTERVAL``.

    The time of the previous run is read from redis under
    ``REDIS_KEY_LAST_CHECK_IP_TIME``. If less than ``TASK_INTERVAL * 60``
    seconds have passed since then, the call returns without doing anything.
    Otherwise the current time is stored and, for each proxy returned by
    ``self.db.find_all()``, one HTTP request is issued:

    * a status other than 200 deletes the proxy via ``self.db.delete_one``;
    * a 200 status refreshes ``update_time``, ``response_speed`` and
      ``validity`` on the proxy and writes it back via ``self.db.insert_one``.

    Database errors are logged and skipped so that one failing record does
    not stop the remaining checks.
    """
    # Read the last self-check time from redis; skip the run if the
    # configured interval has not elapsed yet.
    # NOTE(review): utcnow() is naive, so .timestamp() interprets it as local
    # time; that is harmless only while every writer of this key uses the
    # same expression - confirm.
    last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
    now_time = datetime.utcnow().timestamp()
    if last_check_time is not None and (
            now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
        return
    self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)

    proxy_list = self.db.find_all()
    for proxy in proxy_list:
        ip = proxy.ip
        start_time = time.time()
        # The check requests a very small page; if the request does not
        # succeed the IP is deleted right away.
        # NOTE(review): `ip` is not passed to utils.http_request in this
        # call, so whether the request is routed through the proxy cannot
        # be seen here - confirm.
        response = utils.http_request('http://www.baidu.com', timeout=10)
        is_success = response.status_code == 200
        response.close()
        if not is_success:
            # Request failed: remove the IP.
            try:
                self.db.delete_one(ip)
            except Exception as exc:
                # Best effort: report the problem instead of hiding it.
                utils.log('Delete ip %s raised: %s' % (ip, exc))
            utils.log('Check ip %s FAILED' % ip)
        else:
            # Request succeeded: record the latest response time so this
            # IP can be preferred the next time one is picked.
            elapsed = round(time.time() - start_time, 4)
            try:
                proxy.update_time = utils.get_utc_time()
                proxy.response_speed = elapsed
                proxy.validity = 1
                self.db.insert_one(proxy)
            except Exception as exc:
                utils.log('Update ip %s raised: %s' % (ip, exc))
            utils.log('Check ip %s SUCCESS' % ip)
def check_ip_availability_task(self):
    """Re-check every proxy document, throttled by ``TASK_INTERVAL``.

    The time of the previous run is read from redis under
    ``REDIS_KEY_LAST_CHECK_IP_TIME``. If less than ``TASK_INTERVAL * 60``
    seconds have passed since then, the call returns without doing anything.
    Otherwise the current time is stored and, for each document returned by
    ``self.collection.find()``, one HTTP request is issued:

    * a status other than 200 deletes the document with that ``ip``;
    * a 200 status sets ``update_time``, ``response_speed`` and ``validity``
      on the document with that ``ip``.

    Collection errors are logged and skipped so that one failing document
    does not stop the remaining checks.
    """
    # Throttle: nothing to do while the previous run is recent enough.
    # NOTE(review): utcnow() is naive, so .timestamp() interprets it as local
    # time; that is harmless only while every writer of this key uses the
    # same expression - confirm.
    last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
    now_time = datetime.utcnow().timestamp()
    if last_check_time is not None and (
            now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
        return
    self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)

    proxy_list = self.collection.find()
    for proxy in proxy_list:
        ip = proxy['ip']
        start_time = time.time()
        # NOTE(review): `ip` is not passed to utils.http_request in this
        # call, so whether the request is routed through the proxy cannot
        # be seen here - confirm.
        response = utils.http_request('http://lwons.com/wx', timeout=10)
        is_success = response.status_code == 200
        response.close()
        if not is_success:
            try:
                self.collection.delete_one({'ip': ip})
            except Exception as exc:
                # Best effort: report the problem instead of hiding it.
                utils.log('Delete ip %s raised: %s' % (ip, exc))
            utils.log('Check ip %s FAILED' % ip)
        else:
            elapsed = round(time.time() - start_time, 4)
            try:
                self.collection.update_one({'ip': ip}, {
                    "$set": {
                        'update_time': utils.get_utc_time(),
                        'response_speed': elapsed,
                        'validity': True
                    }
                })
            except Exception as exc:
                utils.log('Update ip %s raised: %s' % (ip, exc))
            utils.log('Check ip %s SUCCESS' % ip)
def get_model(self, design_topic_item: DesignTopicItem) -> DesignTopicModel:
    """Build a ``DesignTopicModel`` from a ``DesignTopicItem``.

    The model receives a fresh ``_id`` from ``utils.get_uuid()``, the
    ``title``, ``description``, ``html_url`` and ``article`` values of the
    item, and a ``create_time`` from ``utils.get_utc_time()``.

    :param design_topic_item: item supplying the copied fields.
    :return: the populated model.
    """
    model = DesignTopicModel()
    model._id = utils.get_uuid()
    # Fields that are copied unchanged, in the order they are assigned.
    for field in ('title', 'description', 'html_url', 'article'):
        setattr(model, field, design_topic_item[field])
    model.create_time = utils.get_utc_time()
    return model
def create(ip, origin):
    """Return a new ``Proxy`` for *ip* with its initial bookkeeping values.

    ``create_time`` and ``update_time`` are both set to the same
    ``utils.get_utc_time()`` value; ``failed_count`` starts at 0,
    ``response_speed`` at -1 and ``validity`` at ``False``.

    :param ip: value stored in ``Proxy.ip``.
    :param origin: value stored in ``Proxy.origin``.
    :return: the populated ``Proxy`` instance.
    """
    record = Proxy()
    record.ip, record.origin = ip, origin
    created_at = utils.get_utc_time()
    record.create_time = created_at
    record.update_time = created_at
    record.failed_count = 0
    record.response_speed = -1
    record.validity = False
    return record
def get_design_strategy_model(
        self,
        design_strategy_item: DesignStrategyItem) -> DesignStrategyModel:
    """Build a ``DesignStrategyModel`` from a ``DesignStrategyItem``.

    The model receives a fresh ``id`` from ``utils.get_uuid()``, the
    ``title``, ``html_url``, ``description``, ``content`` and ``category``
    values of the item, and a ``create_time`` from ``utils.get_utc_time()``.

    :param design_strategy_item: item supplying the copied fields.
    :return: the populated model.
    """
    model = DesignStrategyModel()
    model.id = utils.get_uuid()
    # Fields that are copied unchanged, in the order they are assigned.
    copied_fields = ('title', 'html_url', 'description', 'content',
                     'category')
    for field in copied_fields:
        setattr(model, field, design_strategy_item[field])
    model.create_time = utils.get_utc_time()
    return model
def create_design_picture_summary_model(
        self,
        design_picture_model: DesignPictureModel
) -> DesignPictureSummaryModel:
    """Create the summary model for the group a picture belongs to.

    The summary's ``id`` is the picture's ``fid`` and its ``cid`` list starts
    with the picture's ``id``. ``title``, ``description``, ``tags`` and
    ``html_url`` are copied, the picture's ``img_*`` attributes become the
    summary's ``cover_img_*`` attributes, and ``create_time`` and
    ``update_time`` share one ``utils.get_utc_time()`` value.

    :param design_picture_model: picture used to seed the summary.
    :return: the populated summary model.
    """
    picture = design_picture_model
    summary = DesignPictureSummaryModel()
    summary.id = picture.fid
    summary.cid = [picture.id]
    for attr in ('title', 'description', 'tags', 'html_url'):
        setattr(summary, attr, getattr(picture, attr))
    created_at = utils.get_utc_time()
    summary.create_time = created_at
    summary.update_time = created_at
    # The first picture of a group provides the cover image.
    for suffix in ('url', 'width', 'height', 'name'):
        setattr(summary, 'cover_img_' + suffix,
                getattr(picture, 'img_' + suffix))
    return summary
def get_design_picture_model(
        self,
        design_picture_item: DesignPictureItem) -> DesignPictureModel:
    """Build a ``DesignPictureModel`` from a ``DesignPictureItem``.

    The model receives a fresh ``id`` from ``utils.get_uuid()``, the item's
    ``fid``, ``title``, ``sub_title``, ``html_url``, ``tags``,
    ``description``, ``img_url``, ``img_width``, ``img_height`` and
    ``img_name`` values, and a ``create_time`` from ``utils.get_utc_time()``.

    :param design_picture_item: item supplying the copied fields.
    :return: the populated model.
    """
    model = DesignPictureModel()
    model.id = utils.get_uuid()
    # Fields that are copied unchanged, in the order they are assigned.
    copied_fields = (
        'fid', 'title', 'sub_title', 'html_url', 'tags', 'description',
        'img_url', 'img_width', 'img_height', 'img_name',
    )
    for field in copied_fields:
        setattr(model, field, design_picture_item[field])
    model.create_time = utils.get_utc_time()
    return model
def random_choice_proxy(self, bInternal) -> str:
    """Pick the fastest currently usable proxy and mark it as used.

    Candidates are ``Proxy`` rows whose ``last_use_time`` is earlier than
    ``utils.get_utc_time(-60)``, whose validity flag is true and whose
    response speed is greater than 0. They are ordered by response speed
    and the first row is taken. ``bInternal`` selects the ``internal_*``
    columns, otherwise the ``external_*`` columns are used.

    The chosen row gets ``last_use_time`` set, ``used_count`` incremented
    and its weight recalculated through ``self.calc_proxy_weight`` before
    the session is committed.

    :param bInternal: truthy to use the internal validity/speed columns.
    :return: the ``ip`` of the selected proxy.
    :raises LookupError: if no row matches the filter. Previously this
        surfaced as an ``AttributeError`` on ``None``.
    """
    now_time = utils.get_utc_date()
    # NOTE(review): presumably "60 units before now"; the unit depends on
    # utils.get_utc_time, which is not visible here - confirm.
    available_time = utils.get_utc_time(-60)

    # Both branches of the original query differ only in these two columns.
    if bInternal:
        validity_col = Proxy.internal_validity
        speed_col = Proxy.internal_response_speed
    else:
        validity_col = Proxy.external_validity
        speed_col = Proxy.external_response_speed

    proxy = self.session.query(Proxy).filter(
        Proxy.last_use_time < available_time,
        validity_col == True,  # noqa: E712 - column expression, not a bool
        speed_col > 0).order_by(speed_col).first()

    if proxy is None:
        # .first() yields None when nothing matches; fail with a clear
        # message instead of an attribute error on None.
        raise LookupError('no available proxy (bInternal=%r)' % (bInternal,))

    proxy.last_use_time = now_time
    proxy.used_count = proxy.used_count + 1
    self.calc_proxy_weight(proxy)
    self.session.commit()
    return proxy.ip
def add_failed_time(self, ip):
    """Record one more failure for *ip* and drop it once it fails too often.

    The document with the given ``ip`` is looked up in ``self.collection``.
    If it exists, its ``failed_count`` is incremented: while the new count
    is at most ``FAILED_COUNT_BORDER`` the document is updated with the new
    count and a fresh ``update_time``; otherwise the document is deleted.
    Afterwards ``self.crawl_proxy_task()`` is called.

    Collection errors are logged and do not propagate.

    :param ip: value of the ``ip`` field identifying the proxy document.
    """
    proxy = self.collection.find_one({'ip': ip})
    if proxy is not None:
        failed_count = proxy['failed_count'] + 1
        utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
        if failed_count <= FAILED_COUNT_BORDER:
            # Still below the limit: persist the new failure count.
            try:
                self.collection.update_one({'ip': ip}, {
                    "$set": {
                        'update_time': utils.get_utc_time(),
                        'failed_count': failed_count
                    }
                })
            except Exception as exc:
                # Best effort: report the problem instead of hiding it.
                utils.log('Update failed_count of ip %s raised: %s'
                          % (ip, exc))
        else:
            # Limit exceeded: remove the proxy.
            try:
                self.collection.delete_one({'ip': ip})
            except Exception as exc:
                utils.log('Delete ip %s raised: %s' % (ip, exc))
    # NOTE(review): the original nesting level of this call is not
    # recoverable from the collapsed source; it is placed at function level
    # here - confirm.
    self.crawl_proxy_task()
def add_failed_time(self, ip):
    """Record one more failure for *ip* and drop it once it fails too often.

    The proxy is looked up with ``self.db.find_one(ip)``. If it exists, its
    ``failed_count`` is incremented: while the new count is at most
    ``FAILED_COUNT_BORDER`` the proxy is written back with the new count and
    a fresh ``update_time``; otherwise it is deleted. Afterwards
    ``self.crawl_proxy_task()`` is called.

    Database errors are logged and do not propagate.

    :param ip: identifier passed to ``self.db.find_one`` / ``delete_one``.
    """
    proxy = self.db.find_one(ip)
    if proxy is not None:
        failed_count = proxy.failed_count + 1
        utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
        if failed_count <= FAILED_COUNT_BORDER:
            # Maximum failure count not reached yet: store one more failure.
            try:
                proxy.update_time = utils.get_utc_time()
                proxy.failed_count = failed_count
                self.db.insert_one(proxy)
            except Exception as exc:
                # Best effort: report the problem instead of hiding it.
                utils.log('Update failed_count of ip %s raised: %s'
                          % (ip, exc))
        else:
            # Maximum failure count reached: delete the proxy.
            # The call used to be spelled `detele_one`; the resulting
            # AttributeError was swallowed, so nothing was ever deleted.
            try:
                self.db.delete_one(ip)
            except Exception as exc:
                utils.log('Delete ip %s raised: %s' % (ip, exc))
    # Check whether the database still holds enough IPs.
    # NOTE(review): the original nesting level of this call is not
    # recoverable from the collapsed source; it is placed at function level
    # here - confirm.
    self.crawl_proxy_task()
def handle_item(self, design_picture_item: DesignPictureItem):
    """Persist a picture item and keep its group summary up to date.

    Items whose ``img_url`` is reported as duplicate by
    ``self.is_duplicate_url`` are ignored. Otherwise the item is converted
    to a picture model and saved to ``self.collection``. The summary whose
    ``id`` equals the picture's ``fid`` is then created if missing, or
    updated with the merged tag set, the extended ``cid`` list and a new
    ``update_time``. Finally the image URL is registered through
    ``self.insert_to_redis`` and the item's fields are logged.

    :param design_picture_item: item to store.
    """
    if self.is_duplicate_url(design_picture_item['img_url']):
        return

    design_picture_model = self.get_design_picture_model(
        design_picture_item)
    self.save_to_database(self.collection, design_picture_model)

    summary_model = self.find_one(self.summary_collection,
                                  {'id': design_picture_model.fid})
    if summary_model is None:
        summary_model = self.create_design_picture_summary_model(
            design_picture_model)
        self.save_to_database(self.summary_collection, summary_model)
    else:
        # Merge the picture's tags into the tags already on the summary.
        tags = list(
            set(summary_model['tags']).union(set(
                design_picture_model.tags)))
        summary_model['cid'].append(design_picture_model.id)
        self.update_one(self.summary_collection,
                        {'id': summary_model['id']}, {
                            'update_time': utils.get_utc_time(),
                            'tags': tags,
                            'cid': summary_model['cid']
                        })

    # NOTE(review): the original nesting level of the statements below is
    # not recoverable from the collapsed source; they are placed at function
    # level so every stored picture is registered and logged - confirm.
    self.insert_to_redis(design_picture_model.img_url)

    separator = "========================================================================================="
    log.info(separator)
    # %-formatting instead of "+" so that a non-string field (for example a
    # numeric width) cannot raise TypeError after the item has been stored.
    logged_fields = (
        ("title", 'title'),
        ("sub_title", 'sub_title'),
        ("original_width", 'img_width'),
        ("original_height", 'img_height'),
        ("html_url", 'html_url'),
        ("img_url", 'img_url'),
        ("description", 'description'),
    )
    for label, field in logged_fields:
        log.info("%s:%s" % (label, design_picture_item[field]))
    log.info("tags:%s" % ','.join(map(str, design_picture_item['tags'])))
    log.info(separator)
def generate_name(key):
    """Return an image name of the form ``/<PROJECT_NAME>/<prefix>/<md5>``.

    ``<prefix>`` is the first ten characters of ``utils.get_utc_time()``
    (presumably the date part - confirm against ``utils``) and ``<md5>`` is
    ``utils.get_md5`` of that same time string concatenated with *key*.

    :param key: string mixed into the hashed part of the name.
    :return: the generated name, starting with ``/``.
    """
    timestamp = utils.get_utc_time()
    prefix = timestamp[:10]
    digest = utils.get_md5(timestamp + key)
    # The leading empty element yields the leading "/".
    return "/".join(("", PROJECT_NAME, prefix, digest))