예제 #1
0
    def check_ip_availability_task(self):
        """Re-check every stored proxy and drop the ones whose probe fails.

        The run is throttled through Redis: the timestamp of the previous
        run is read from ``REDIS_KEY_LAST_CHECK_IP_TIME`` and the method
        returns immediately when fewer than ``TASK_INTERVAL * 60`` seconds
        have passed. Otherwise the timestamp is refreshed and each record
        returned by ``self.db.find_all()`` is probed with one HTTP request:

        * status code other than 200 -> the record is deleted;
        * status code 200 -> ``update_time``, ``response_speed`` and
          ``validity`` are refreshed and the record is written back.

        Database failures are logged and do not abort the loop.
        """
        # Read the previous self-check time from Redis; skip this run when
        # the configured interval has not elapsed yet.
        last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
        now_time = datetime.utcnow().timestamp()
        if last_check_time is not None and (
                now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
            return
        self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)

        proxy_list = self.db.find_all()
        for proxy in proxy_list:
            ip = proxy.ip
            start_time = time.time()
            # The self-check requests a small page and treats anything other
            # than HTTP 200 as a dead proxy.
            # NOTE(review): the proxy ip is not passed to utils.http_request
            # here - confirm the helper actually routes through the proxy.
            response = utils.http_request('http://www.baidu.com', timeout=10)
            is_success = response.status_code == 200
            response.close()
            if not is_success:
                # Probe failed: remove the ip. Deletion stays best-effort,
                # but the failure is logged instead of silently swallowed.
                try:
                    self.db.delete_one(ip)
                except Exception as error:
                    utils.log('Delete ip %s error: %s' % (ip, error))
                utils.log('Check ip %s FAILED' % ip)
            else:
                # Probe succeeded: record the latest response time so that
                # this ip can be preferred the next time one is picked.
                elapsed = round(time.time() - start_time, 4)
                try:
                    proxy.update_time = utils.get_utc_time()
                    proxy.response_speed = elapsed
                    proxy.validity = 1
                    self.db.insert_one(proxy)
                except Exception as error:
                    utils.log('Update ip %s error: %s' % (ip, error))
                utils.log('Check ip %s SUCCESS' % ip)
    def check_ip_availability_task(self):
        """Re-check every proxy document and drop the ones whose probe fails.

        The run is throttled through Redis: the timestamp of the previous
        run is read from ``REDIS_KEY_LAST_CHECK_IP_TIME`` and the method
        returns immediately when fewer than ``TASK_INTERVAL * 60`` seconds
        have passed. Otherwise the timestamp is refreshed and each document
        returned by ``self.collection.find()`` is probed with one HTTP
        request:

        * status code other than 200 -> the document is deleted;
        * status code 200 -> ``update_time``, ``response_speed`` and
          ``validity`` are updated in place.

        Database failures are logged and do not abort the loop.
        """
        last_check_time = self.redis_client.get(REDIS_KEY_LAST_CHECK_IP_TIME)
        now_time = datetime.utcnow().timestamp()
        if last_check_time is not None and (
                now_time - float(last_check_time)) < (TASK_INTERVAL * 60):
            # The previous check is still recent enough; nothing to do.
            return
        self.redis_client.set(REDIS_KEY_LAST_CHECK_IP_TIME, now_time)

        proxy_list = self.collection.find()
        for proxy in proxy_list:
            ip = proxy['ip']
            start_time = time.time()
            # NOTE(review): the proxy ip is not passed to utils.http_request
            # here - confirm the helper actually routes through the proxy.
            response = utils.http_request('http://lwons.com/wx', timeout=10)
            is_success = response.status_code == 200
            response.close()
            if not is_success:
                # Deletion stays best-effort, but the failure is logged
                # instead of being silently swallowed.
                try:
                    self.collection.delete_one({'ip': ip})
                except Exception as error:
                    utils.log('Delete ip %s error: %s' % (ip, error))
                utils.log('Check ip %s FAILED' % ip)
            else:
                elapsed = round(time.time() - start_time, 4)
                try:
                    self.collection.update_one({'ip': ip}, {
                        "$set": {
                            'update_time': utils.get_utc_time(),
                            'response_speed': elapsed,
                            'validity': True
                        }
                    })
                except Exception as error:
                    utils.log('Update ip %s error: %s' % (ip, error))
                utils.log('Check ip %s SUCCESS' % ip)
예제 #3
0
 def get_model(self,
               design_topic_item: DesignTopicItem) -> DesignTopicModel:
     """Build a DesignTopicModel from the given design-topic item.

     ``_id`` is set from ``utils.get_uuid()``, ``create_time`` from
     ``utils.get_utc_time()``; the remaining fields are copied from the
     item under the same key.
     """
     model = DesignTopicModel()
     model._id = utils.get_uuid()
     # Fields taken over one-to-one from the item.
     for field in ('title', 'description', 'html_url', 'article'):
         setattr(model, field, design_topic_item[field])
     model.create_time = utils.get_utc_time()
     return model
예제 #4
0
 def create(ip, origin):
     """Return a new Proxy for ``ip`` / ``origin`` with initial bookkeeping.

     ``create_time`` and ``update_time`` start out equal; the record
     begins with no failures, a response speed of -1 and validity False.
     """
     record = Proxy()
     record.ip, record.origin = ip, origin
     record.create_time = utils.get_utc_time()
     record.update_time = record.create_time
     # Initial values for a proxy that has not been checked yet.
     initial_state = (
         ('failed_count', 0),
         ('response_speed', -1),
         ('validity', False),
     )
     for attribute, value in initial_state:
         setattr(record, attribute, value)
     return record
 def get_design_strategy_model(
         self,
         design_strategy_item: DesignStrategyItem) -> DesignStrategyModel:
     """Build a DesignStrategyModel from the given design-strategy item.

     ``id`` is set from ``utils.get_uuid()``, ``create_time`` from
     ``utils.get_utc_time()``; the remaining fields are copied from the
     item under the same key.
     """
     model = DesignStrategyModel()
     model.id = utils.get_uuid()
     # Fields taken over one-to-one from the item.
     copied_fields = ('title', 'html_url', 'description', 'content',
                      'category')
     for field in copied_fields:
         setattr(model, field, design_strategy_item[field])
     model.create_time = utils.get_utc_time()
     return model
예제 #6
0
 def create_design_picture_summary_model(
         self, design_picture_model: DesignPictureModel
 ) -> DesignPictureSummaryModel:
     """Create the summary record for the group a picture belongs to.

     The summary takes the picture's ``fid`` as its own ``id``, starts its
     ``cid`` list with the picture's ``id``, and uses the picture's image
     attributes as its cover image. ``create_time`` and ``update_time``
     start out equal.
     """
     picture = design_picture_model
     summary = DesignPictureSummaryModel()
     summary.id = picture.fid
     summary.cid = [picture.id]
     # Attributes that keep the same name on the summary.
     for field in ('title', 'description', 'tags', 'html_url'):
         setattr(summary, field, getattr(picture, field))
     summary.create_time = utils.get_utc_time()
     summary.update_time = summary.create_time
     # img_* on the picture becomes cover_img_* on the summary.
     for suffix in ('url', 'width', 'height', 'name'):
         setattr(summary, 'cover_img_' + suffix,
                 getattr(picture, 'img_' + suffix))
     return summary
예제 #7
0
 def get_design_picture_model(
         self,
         design_picture_item: DesignPictureItem) -> DesignPictureModel:
     """Build a DesignPictureModel from the given design-picture item.

     ``id`` is set from ``utils.get_uuid()``, ``create_time`` from
     ``utils.get_utc_time()``; the remaining fields are copied from the
     item under the same key.
     """
     model = DesignPictureModel()
     model.id = utils.get_uuid()
     # Fields taken over one-to-one from the item.
     copied_fields = (
         'fid',
         'title',
         'sub_title',
         'html_url',
         'tags',
         'description',
         'img_url',
         'img_width',
         'img_height',
         'img_name',
     )
     for field in copied_fields:
         setattr(model, field, design_picture_item[field])
     model.create_time = utils.get_utc_time()
     return model
예제 #8
0
 def random_choice_proxy(self, bInternal) -> str:
     """Pick the fastest eligible proxy, mark it as used and return its ip.

     A proxy is eligible when its ``last_use_time`` is older than
     ``utils.get_utc_time(-60)``, it is flagged valid and its response
     speed is positive. ``bInternal`` selects whether the internal or the
     external validity / response-speed columns are used. The chosen
     proxy gets its ``last_use_time`` and ``used_count`` updated, its
     weight recalculated, and the session is committed.
     """
     used_at = utils.get_utc_date()
     available_time = utils.get_utc_time(-60)

     # Both variants run the same query; only the column pair differs.
     if bInternal:
         validity_column = Proxy.internal_validity
         speed_column = Proxy.internal_response_speed
     else:
         validity_column = Proxy.external_validity
         speed_column = Proxy.external_response_speed

     query = self.session.query(Proxy).filter(
         Proxy.last_use_time < available_time,
         validity_column == True,
         speed_column > 0)
     chosen = query.order_by(speed_column).first()

     # NOTE(review): first() yields None when no row matches, in which
     # case the attribute access below raises - confirm callers expect it.
     chosen.last_use_time = used_at
     chosen.used_count += 1
     self.calc_proxy_weight(chosen)
     self.session.commit()
     return chosen.ip
 def add_failed_time(self, ip):
     """Register one more failure for ``ip`` and prune it when exhausted.

     When a document for ``ip`` exists, its failure count is increased by
     one. While the new count is at most ``FAILED_COUNT_BORDER`` the
     document is updated; beyond that it is deleted. Database errors are
     logged and otherwise ignored. ``self.crawl_proxy_task()`` is always
     called afterwards, whether or not the ip was found.
     """
     proxy = self.collection.find_one({'ip': ip})
     if proxy is not None:
         failed_count = proxy['failed_count'] + 1
         utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
         if failed_count <= FAILED_COUNT_BORDER:
             # Still within the allowed number of failures: persist it.
             try:
                 self.collection.update_one({'ip': ip}, {
                     "$set": {
                         'update_time': utils.get_utc_time(),
                         'failed_count': failed_count
                     }
                 })
             except Exception as error:
                 # Best-effort write; log instead of silently swallowing.
                 utils.log('Update ip %s error: %s' % (ip, error))
         else:
             # Too many failures: drop the proxy.
             try:
                 self.collection.delete_one({'ip': ip})
             except Exception as error:
                 utils.log('Delete ip %s error: %s' % (ip, error))
     self.crawl_proxy_task()
예제 #10
0
 def add_failed_time(self, ip):
     """Register one more failure for ``ip`` and prune it when exhausted.

     When a record for ``ip`` exists, its failure count is increased by
     one. While the new count is at most ``FAILED_COUNT_BORDER`` the
     record is written back; beyond that it is deleted. Database errors
     are logged and otherwise ignored. ``self.crawl_proxy_task()`` is
     always called afterwards, whether or not the ip was found.
     """
     proxy = self.db.find_one(ip)
     if proxy is not None:
         failed_count = proxy.failed_count + 1
         utils.log("ip: %s 失败次数+1 已失败次数%s次" % (ip, failed_count))
         if failed_count <= FAILED_COUNT_BORDER:
             # Maximum failure count not reached yet: record one more
             # failure in the database.
             try:
                 proxy.update_time = utils.get_utc_time()
                 proxy.failed_count = failed_count
                 self.db.insert_one(proxy)
             except Exception as error:
                 # Best-effort write; log instead of silently swallowing.
                 utils.log('Update ip %s error: %s' % (ip, error))
         else:
             # Maximum failure count reached: delete the ip from the
             # database. The call used to be misspelled "detele_one"; the
             # resulting AttributeError was swallowed, so nothing was
             # ever deleted.
             try:
                 self.db.delete_one(ip)
             except Exception as error:
                 utils.log('Delete ip %s error: %s' % (ip, error))
     # Check whether the database still holds enough ips.
     self.crawl_proxy_task()
예제 #11
0
    def handle_item(self, design_picture_item: DesignPictureItem):
        """Store one design-picture item and keep its summary in sync.

        Items whose ``img_url`` is reported as a duplicate are ignored.
        Otherwise the picture is saved, and the summary whose ``id`` equals
        the picture's ``fid`` is either created or updated with the merged
        tags and the extended ``cid`` list. Finally the image url is handed
        to ``insert_to_redis`` and the item's fields are logged.
        """
        if self.is_duplicate_url(design_picture_item['img_url']):
            return

        picture = self.get_design_picture_model(design_picture_item)
        self.save_to_database(self.collection, picture)

        existing = self.find_one(self.summary_collection,
                                 {'id': picture.fid})
        if existing is not None:
            # Merge the new picture into the summary that already exists.
            known_tags = set(existing['tags'])
            merged_tags = list(known_tags.union(set(picture.tags)))
            existing['cid'].append(picture.id)
            selector = {'id': existing['id']}
            changes = {
                'update_time': utils.get_utc_time(),
                'tags': merged_tags,
                'cid': existing['cid']
            }
            self.update_one(self.summary_collection, selector, changes)
        else:
            # First picture of its group: create the summary from it.
            created = self.create_design_picture_summary_model(picture)
            self.save_to_database(self.summary_collection, created)
        self.insert_to_redis(picture.img_url)

        separator = "========================================================================================="
        # (log prefix, item key) pairs, in output order.
        # NOTE(review): plain concatenation assumes every value is a str.
        logged_fields = (
            ("title:", 'title'),
            ("sub_title:", 'sub_title'),
            ("original_width:", 'img_width'),
            ("original_height:", 'img_height'),
            ("html_url:", 'html_url'),
            ("img_url:", 'img_url'),
            ("description:", 'description'),
        )
        log.info(separator)
        for prefix, key in logged_fields:
            log.info(prefix + design_picture_item[key])
        log.info("tags:%s" % ','.join(map(str, design_picture_item['tags'])))
        log.info(separator)
예제 #12
0
 def generate_name(key):
     """Return an image name of the form ``/<project>/<prefix>/<md5>``.

     ``<prefix>`` is the first ten characters of ``utils.get_utc_time()``
     and ``<md5>`` is ``utils.get_md5`` of that time string followed by
     ``key``.
     """
     timestamp = utils.get_utc_time()
     prefix = timestamp[:10]
     digest = utils.get_md5(timestamp + key)
     # The leading empty element produces the initial "/".
     return "/".join(("", PROJECT_NAME, prefix, digest))