Exemplo n.º 1
0
    def __init__(self, redis_connection, response_buffer, account_manager,
                 is_proxy_service_enable, session_pool_size,
                 download_thread_num, network_retry_times, connect_timeout,
                 download_interval):
        """Store the collaborators and build the (not yet started) workers.

        Creates a ``SessionManager`` from ``session_pool_size``,
        ``account_manager`` and ``is_proxy_service_enable``, then creates
        ``download_thread_num`` ``DownloadThread`` objects named ``thread0``,
        ``thread1``, ... that share the session manager, the Redis connection
        and the response buffer. The threads are only constructed here.
        """
        # Collaborators handed in by the caller.
        self.redis_connection = redis_connection
        self.response_buffer = response_buffer
        self.account_manager = account_manager
        self.download_thread_num = download_thread_num

        # Network tuning values, forwarded to every download thread.
        self.NETWORK_RETRY_TIMES = network_retry_times
        self.CONNECT_TIMEOUT = connect_timeout
        self.DOWNLOAD_INTERVAL = download_interval

        # Session manager shared by all download threads.
        self.session_manager = SessionManager(session_pool_size,
                                              account_manager,
                                              is_proxy_service_enable)

        # One worker per configured download thread.
        self.download_thread_list = [
            DownloadThread('thread' + str(index), self.session_manager,
                           self.redis_connection, self.response_buffer,
                           self.NETWORK_RETRY_TIMES, self.CONNECT_TIMEOUT,
                           self.DOWNLOAD_INTERVAL)
            for index in range(self.download_thread_num)
        ]

        if log.isEnabledFor(logging.INFO):
            log.info("Downloader 模块初始化完毕")
Exemplo n.º 2
0
    def run(self):
        """Pull responses from the buffer and hand them to the right parser.

        Loops forever. A response whose first element is ``'info'`` goes to
        ``parse_user_info``; ``'list'`` goes to ``parse_follow_info``. Any
        exception escaping the loop sets ``thread_status`` to ``'error'``,
        is logged, and ends the method.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('数据处理线程' + self.thread_id + '启动')

        try:
            while True:
                response = self.response_buffer.get_response_from_buffer()

                # Only entries with at least a type and a payload are handled;
                # anything else is dropped and the next entry is fetched
                # immediately.
                if response is not None and len(response) >= 2:
                    kind = response[0]
                    if kind == 'info':
                        self.parse_user_info(response)
                    elif kind == 'list':
                        self.parse_follow_info(response)
                    time.sleep(0.1)

        except Exception as e:
            self.thread_status = 'error'
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
0
    def start_processor(self):
        """Start every thread in ``self.processor_list`` and log success."""
        for worker in self.processor_list:
            worker.start()

        if log.isEnabledFor(logging.INFO):
            log.info('Processor 模块启动成功')
Exemplo n.º 4
0
    def __init__(self, process_thread_num, is_parser_following_list,
                 is_parser_follower_list, is_parser_follow_relation,
                 redis_connection, response_buffer):
        """Store the settings and build the (not yet started) workers.

        Creates a ``TokenFilter`` on the Redis connection and
        ``process_thread_num`` ``ProcessThread`` objects named ``thread0``,
        ``thread1``, ... that share the Redis connection, the token filter,
        the response buffer and the three parsing switches.
        """
        self.process_thread_num = process_thread_num

        # Shared resources; the token filter is built on the Redis connection.
        self.redis_connection = redis_connection
        self.response_buffer = response_buffer
        self.token_filter = TokenFilter(self.redis_connection)

        # Switches for parsing the following list, the follower list and the
        # follow relations.
        self.is_parser_following_list = is_parser_following_list
        self.is_parser_follower_list = is_parser_follower_list
        self.is_parser_follow_relation = is_parser_follow_relation

        # One worker per configured processing thread.
        self.processor_list = [
            ProcessThread('thread' + str(index), self.redis_connection,
                          self.token_filter, self.response_buffer,
                          self.is_parser_following_list,
                          self.is_parser_follower_list,
                          self.is_parser_follow_relation)
            for index in range(process_thread_num)
        ]

        if log.isEnabledFor(logging.INFO):
            log.info('Processor 模块初始化完毕')
Exemplo n.º 5
0
    def run(self):
        """Move URL records from the two source queues to the download queue.

        Loops forever. Each round waits while the download queue holds more
        than 500 entries, sleeps 20 seconds when both source queues are
        empty, and otherwise moves up to ``url_rate`` entries from the
        user-info queue followed by up to ``10 - url_rate`` entries from the
        follow-info queue.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Scheduler 模块启动成功')

        while True:
            # Back off while the download queue is already well stocked.
            while self.redis_connection.llen(self.url_queue_name) > 500:
                time.sleep(180)

            follow_backlog = self.redis_connection.llen(
                self.follow_info_url_queue)
            user_backlog = self.redis_connection.llen(
                self.user_info_url_queue)

            if follow_backlog != 0 or user_backlog != 0:
                # User-info URLs first, then follow-info URLs, each limited
                # to its share of a batch of ten.
                quotas = (
                    (self.user_info_url_queue, self.url_rate),
                    (self.follow_info_url_queue, 10 - self.url_rate),
                )
                for source_queue, quota in quotas:
                    for _ in range(quota):
                        item = self.redis_connection.lpop(source_queue)
                        if item is not None:
                            self.redis_connection.rpush(self.url_queue_name,
                                                        item)
            else:
                # Nothing to schedule in either source queue; pause.
                time.sleep(20)
Exemplo n.º 6
0
    def start_downloader(self):
        """Start every thread in ``self.download_thread_list`` and log it."""
        for worker in self.download_thread_list:
            worker.start()

        if log.isEnabledFor(logging.INFO):
            log.info('Downloader 模块启动成功')
Exemplo n.º 7
0
    def __init__(self, session_pool_size, account_manager,
                 is_proxy_service_enable):
        """Set up the session pool bookkeeping and, if enabled, the proxy.

        The pool is a ``queue.Queue`` bounded by ``session_pool_size``. When
        ``is_proxy_service_enable`` is exactly ``True`` a
        ``proxyCore.ProxyService`` is created and started.
        """
        # Configuration handed in by the caller.
        self.account_manager = account_manager
        self.is_proxy_service_enable = is_proxy_service_enable
        self.session_pool_size = session_pool_size

        # Counters: sessions created so far and sessions currently pooled.
        self.created_session_num = 0
        self.available_session_num = 0

        # One lock per counter.
        self.created_session_lock = threading.Lock()
        self.available_session_lock = threading.Lock()

        # The bounded pool itself.
        self.session_pool = queue.Queue(maxsize=session_pool_size)

        # Create and start the proxy service only when explicitly enabled.
        if self.is_proxy_service_enable is True:
            self.proxy_service = proxyCore.ProxyService()
            self.proxy_service.start_proxy_service()

        if log.isEnabledFor(logging.INFO):
            log.info("Session Manager 启动成功")
Exemplo n.º 8
0
 def check_and_restart(self):
     """Replace the persistence thread when its status is ``'error'``.

     Does nothing for any other status. Otherwise a new
     ``PersistentThread`` is built from the stored settings, stored on the
     instance and started.
     """
     if self.persistent_thread.thread_status != 'error':
         return

     self.persistent_thread = PersistentThread(
         self.db_connection,
         self.redis_connection,
         self.persistent_cache_size,
         self.follow_relation_persistent_cache_size,
     )
     self.persistent_thread.start()

     if log.isEnabledFor(logging.INFO):
         log.info('DataPersistent模块持久化线程中重新启动')
Exemplo n.º 9
0
 def login(self):
     """Log in with the configured method and return that method's result.

     Uses ``cookie_login`` when ``is_login_by_cookie`` is exactly ``True``
     and ``common_login`` otherwise.
     """
     by_cookie = self.is_login_by_cookie is True

     if log.isEnabledFor(logging.INFO):
         if by_cookie:
             log.info('使用Cookie登陆方式登陆')
         else:
             log.info('使用邮箱或手机号码登陆方式登陆')

     if by_cookie:
         return self.cookie_login()
     return self.common_login()
Exemplo n.º 10
0
 def check_and_restart(self):
     """Replace the e-mail thread when its status is ``'error'``.

     Does nothing for any other status. Otherwise a new
     ``EmailServiceThread`` is built from the stored SMTP settings, stored
     on the instance and started.
     """
     if self.email_service_thread.thread_status != 'error':
         return

     self.email_service_thread = EmailServiceThread(
         self.smtp_server_host,
         self.smtp_server_port,
         self.smtp_server_password,
         self.smtp_from_addr,
         self.smtp_to_addr,
         self.smtp_email_header,
         self.smtp_send_interval,
         self.data_persistent,
     )
     self.email_service_thread.start()

     if log.isEnabledFor(logging.INFO):
         log.info('EmailService线程重新启动')
Exemplo n.º 11
0
    def load_init_data(self, token_list):
        """Queue one user-info URL record per seed token.

        Each record is ``['info', <profile URL>, <token>]`` and is pushed to
        the ``'userInfoURLQueue'`` Redis list. Returns immediately, without
        logging, when ``token_list`` is ``None``.
        """
        if token_list is None:
            return

        for seed_token in token_list:
            self.redis_connection.rpush(
                'userInfoURLQueue',
                ['info', URL_PUBLIC + seed_token + URL_PINS, seed_token])

        if log.isEnabledFor(logging.INFO):
            log.info('初始用户Token载入完毕')
Exemplo n.º 12
0
 def check_and_restart(self):
     """Replace every download thread whose status is ``'error'``.

     A failed thread is swapped for a new ``DownloadThread`` with the same
     ``thread_id`` and the same shared resources, and the new thread is
     started.

     The replacement is written into the failed thread's own list slot.
     Removing and appending while iterating the same list made the loop
     skip the element that followed each replaced thread, so a second
     failed thread could go unnoticed until the next check.
     """
     for index, download_thread in enumerate(self.download_thread_list):
         if download_thread.thread_status != 'error':
             continue

         thread_id = download_thread.thread_id
         replacement = DownloadThread(
             thread_id, self.session_manager, self.redis_connection,
             self.response_buffer, self.NETWORK_RETRY_TIMES,
             self.CONNECT_TIMEOUT, self.DOWNLOAD_INTERVAL)
         # In-place assignment keeps the list length and the iteration
         # position stable.
         self.download_thread_list[index] = replacement
         replacement.start()
         if log.isEnabledFor(logging.INFO):
             log.info('数据下载线程' + thread_id + '重新启动')
Exemplo n.º 13
0
    def __init__(self, redis_connection, url_rate):
        """Initialise the scheduler thread and the queue names it works on.

        ``url_rate`` is the number of user-info URLs scheduled per batch;
        the Redis list names are fixed here.
        """
        threading.Thread.__init__(self)

        # Names of the Redis lists: the download queue and its two sources
        # (user-info URLs and following/follower URLs).
        self.url_queue_name = 'urlQueue'
        self.user_info_url_queue = 'userInfoURLQueue'
        self.follow_info_url_queue = 'followInfoURLQueue'

        # Caller-supplied settings.
        self.url_rate = url_rate
        self.redis_connection = redis_connection

        if log.isEnabledFor(logging.INFO):
            log.info('Scheduler 模块初始化完毕')
Exemplo n.º 14
0
    def common_login(self):
        """Log in with the account's login token and password.

        Fetches the main page to read the ``_xsrf`` form value, posts the
        credentials to ``loginURL`` and then requests ``authTestURL`` to
        confirm the session is authenticated. On success the session cookies
        are stored in ``self.auth_token``. The session is always closed
        before returning.

        Returns:
            bool: ``True`` when both the login POST and the follow-up check
            answer with HTTP 200; ``False`` otherwise, including when the
            ``_xsrf`` input is missing or any exception is raised.
        """
        # Create the session
        session = requests.session()
        session.headers = requestHeader

        try:
            # The login form needs the ``_xsrf`` value embedded in the main
            # page.
            response = session.get(mainPageURL)
            input_tag = BeautifulSoup(response.text, 'html.parser').find(
                'input', attrs={'name': '_xsrf'})
            if input_tag is None:
                return False
            _xsrf = input_tag['value']

            # Submit the credentials
            form_data = {
                '_xsrf': _xsrf,
                'email': self.login_token,
                'password': self.password
            }
            # NOTE(review): this updates ``requestHeader`` in place, so the
            # two extra headers remain on that object after this call;
            # confirm other users of ``requestHeader`` expect that.
            requestHeader.update({
                'X-Requested-With': 'XMLHttpRequest',
                'X-Xsrftoken': _xsrf
            })
            session.headers = requestHeader
            response = session.post(url=loginURL, data=form_data)
            if response.status_code == 200:
                # Confirm that the session is really authenticated
                response = session.get(authTestURL)
                if response.status_code == 200:
                    # Keep the authenticated cookies
                    self.auth_token = session.cookies.get_dict()
                    if log.isEnabledFor(logging.INFO):
                        log.info('知乎账户登陆成功')
                    return True

            # Login failed
            if log.isEnabledFor(logging.INFO):
                log.info('知乎账户登陆失败')
            return False
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error(e)
            # Report failure explicitly; this path used to fall through and
            # return None.
            return False
        finally:
            session.close()
Exemplo n.º 15
0
    def __init__(self, persistent_cache_size,
                 follow_relation_persistent_cache_size, db_connection,
                 redis_connection):
        """Store the settings and build the (not yet started) writer thread.

        The two cache sizes and both connections are kept on the instance
        and passed on to a new ``PersistentThread``.
        """
        # Database and Redis connections.
        self.db_connection = db_connection
        self.redis_connection = redis_connection

        # Cache sizes for user info and for follow relations.
        self.persistent_cache_size = persistent_cache_size
        self.follow_relation_persistent_cache_size = (
            follow_relation_persistent_cache_size)

        # The persistence thread; it is only constructed here.
        self.persistent_thread = PersistentThread(
            self.db_connection,
            self.redis_connection,
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size,
        )

        if log.isEnabledFor(logging.INFO):
            log.info('DataPersistent 模块初始化完毕')
Exemplo n.º 16
0
    def __init__(self, smtp_server_host, smtp_server_port,
                 smtp_server_password, smtp_from_addr, smtp_to_addr,
                 smtp_email_header, smtp_send_interval, data_persistent):
        """Store the SMTP settings and build the (not yet started) thread.

        All arguments are kept on the instance and passed on, in the same
        order, to a new ``EmailServiceThread``.
        """
        # SMTP settings.
        self.smtp_server_host = smtp_server_host
        self.smtp_server_port = smtp_server_port
        self.smtp_server_password = smtp_server_password
        self.smtp_from_addr = smtp_from_addr
        self.smtp_to_addr = smtp_to_addr
        self.smtp_email_header = smtp_email_header
        self.smtp_send_interval = smtp_send_interval

        self.data_persistent = data_persistent

        # The periodic mail thread; it is only constructed here.
        self.email_service_thread = EmailServiceThread(
            self.smtp_server_host,
            self.smtp_server_port,
            self.smtp_server_password,
            self.smtp_from_addr,
            self.smtp_to_addr,
            self.smtp_email_header,
            self.smtp_send_interval,
            self.data_persistent,
        )

        if log.isEnabledFor(logging.INFO):
            log.info('EmailService 模块初始化完毕')
Exemplo n.º 17
0
    def cookie_login(self):
        """Log in by adding the configured ``z_c0`` cookie to a session.

        Requests the main page first, adds the ``z_c0`` cookie, then requests
        ``authTestURL``. On HTTP 200 the session cookies are stored in
        ``self.auth_token``.

        The session is now closed on every path, matching ``common_login``;
        it was previously never closed. Exceptions raised by the requests
        still propagate to the caller.

        Returns:
            bool: ``True`` when the check request answers with HTTP 200,
            ``False`` otherwise.
        """
        # Create the session
        session = requests.session()
        session.headers = requestHeader

        try:
            # Request the main page before adding the user's cookie
            session.get(mainPageURL)

            # Add the authentication cookie supplied by the user
            cookie = {'z_c0': self.z_c0}
            requests.utils.add_dict_to_cookiejar(session.cookies, cookie)

            # Check whether the session is authenticated
            response = session.get(authTestURL)
            if response.status_code == 200:
                # Copy the cookies out before the session is closed
                self.auth_token = session.cookies.get_dict()
                if log.isEnabledFor(logging.INFO):
                    log.info('知乎账户登陆成功')
                return True

            if log.isEnabledFor(logging.INFO):
                log.info('知乎账户登陆失败')
            return False
        finally:
            session.close()
Exemplo n.º 18
0
    def parse_user_info(self, response_info):
        """Parse one user-profile response and queue what was extracted.

        The page body is searched for ``<div id="data">``; its ``data-state``
        attribute is unescaped, stripped of HTML tags and decoded as JSON.
        The record under ``entities -> users -> <token>`` is copied into a
        flat dict keyed by the ``USER_*`` constants.

        If that succeeds and the token filter has not seen the token yet, the
        token is marked, following/follower list URLs (20 entries per page)
        are pushed to ``self.follow_info_url_queue`` when the corresponding
        switch is ``True``, and the dict is pushed to
        ``self.persistent_cache``.

        Any exception raised while parsing is logged and the response is
        dropped.

        Args:
            response_info: indexable record; element 1 is the page body and
                element 2 the user's URL token.
        """
        # Unpack the response record
        # NOTE(review): index 2 is read outside the try block below, so a
        # record with fewer than three elements raises here — confirm callers
        # always pass at least three.
        data = response_info[1]
        token = response_info[2]

        # Extract the embedded JSON
        user_info_entities = None
        try:
            bs_obj = BeautifulSoup(data, 'html.parser')
            data_json = bs_obj.find('div', attrs={'id': 'data'})
            if data_json is None:
                return
            else:
                data_json = data_json['data-state']

            # String clean-up
            # Resolve HTML character references
            data_json = html.unescape(data_json)
            # Strip HTML tags
            data_json = BeautifulSoup(data_json, 'html.parser').text

            # Decode into a JSON object
            data_json = json.loads(data_json)

            # Locate the entities section
            if 'entities' not in data_json:
                return
            entities = data_json['entities']

            # Locate the users section
            if 'users' not in entities:
                return
            users = entities['users']

            # Locate the target user's record
            if token not in users:
                return
            user_info = users[token]

            # Defaults for the target user's profile fields; each one is
            # overwritten below only if the key is present.
            avatar_url_template = None
            name = None
            headline = None
            locations = []
            business = None
            employments = []
            educations = []
            description = None
            gender = None
            following_count = None
            follower_count = None
            answer_count = None
            question_count = None
            voteup_count = None
            if USER_AVATAR_URL_TEMPLATE in user_info:
                avatar_url_template = user_info[USER_AVATAR_URL_TEMPLATE]

            if USER_NAME in user_info:
                name = user_info[USER_NAME]

            if USER_HEADLINE in user_info:
                headline = user_info[USER_HEADLINE]

            if USER_LOCATIONS in user_info:
                for location in user_info[USER_LOCATIONS]:
                    locations.append(location['name'])

            if USER_BUSINESS in user_info:
                business = user_info[USER_BUSINESS]['name']

            if USER_EMPLOYMENTS in user_info:
                for employment in user_info[USER_EMPLOYMENTS]:
                    elem = {}
                    if 'job' in employment:
                        job = employment['job']['name']
                        elem.update({'job': job})
                    if 'company' in employment:
                        company = employment['company']['name']
                        elem.update({'company': company})
                    employments.append(elem)

            if USER_EDUCATIONS in user_info:
                for education in user_info[USER_EDUCATIONS]:
                    if 'school' in education:
                        school = education['school']['name']
                        educations.append(school)

            if USER_DESCRIPTION in user_info:
                description = user_info[USER_DESCRIPTION]

            if USER_GENDER in user_info:
                gender = user_info[USER_GENDER]

            if USER_FOLLOWING_COUNT in user_info:
                following_count = user_info[USER_FOLLOWING_COUNT]

            if USER_FOLLOWER_COUNT in user_info:
                follower_count = user_info[USER_FOLLOWER_COUNT]

            if USER_ANSWER_COUNT in user_info:
                answer_count = user_info[USER_ANSWER_COUNT]

            if USER_QUESTION_COUNT in user_info:
                question_count = user_info[USER_QUESTION_COUNT]

            if USER_VOTE_UP_COUNT in user_info:
                voteup_count = user_info[USER_VOTE_UP_COUNT]

            # Assemble the user-info entity
            user_info_entities = {
                USER_AVATAR_URL_TEMPLATE: avatar_url_template,
                USER_URL_TOKEN: token,
                USER_NAME: name,
                USER_HEADLINE: headline,
                USER_LOCATIONS: locations,
                USER_BUSINESS: business,
                USER_EMPLOYMENTS: employments,
                USER_EDUCATIONS: educations,
                USER_DESCRIPTION: description,
                USER_GENDER: gender,
                USER_FOLLOWING_COUNT: following_count,
                USER_FOLLOWER_COUNT: follower_count,
                USER_ANSWER_COUNT: answer_count,
                USER_QUESTION_COUNT: question_count,
                USER_VOTE_UP_COUNT: voteup_count
            }

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('User info 数据解析错误')
                log.exception(e)

        # Nothing to do when parsing failed (the entity is only assigned at
        # the very end of the try block).
        if user_info_entities is None:
            return

        # Check again whether this user was already added; if so, stop here
        if self.token_filter.check_token(token) is True:
            return

        # Mark the user as extracted
        self.token_filter.mark_token(token)

        # Generate the following-list URLs
        if self.is_parser_following_list is True:
            pipe = self.redis_connection.pipeline()
            following_count = user_info_entities[USER_FOLLOWING_COUNT]
            if following_count is not None:
                offset = 0
                limit = 20
                while offset < following_count:
                    url_info = [
                        'list',
                        self.generate_following_info_url(token, offset, limit),
                        token, 'followingList'
                    ]
                    offset += limit
                    pipe.rpush(self.follow_info_url_queue, url_info)
                pipe.execute()

        # Generate the follower-list URLs
        if self.is_parser_follower_list is True:
            pipe = self.redis_connection.pipeline()
            follower_count = user_info_entities[USER_FOLLOWER_COUNT]
            if follower_count is not None:
                offset = 0
                limit = 20
                while offset < follower_count:
                    url_info = [
                        'list',
                        self.generate_follower_info_url(token, offset, limit),
                        token, 'followerList'
                    ]
                    offset += limit
                    pipe.rpush(self.follow_info_url_queue, url_info)
                pipe.execute()

        # Store the extracted user info
        # NOTE(review): the guard tests DEBUG but the message is emitted at
        # INFO level — confirm which level is intended.
        if log.isEnabledFor(logging.DEBUG):
            log.info('成功获取一个用户的详细信息')
        self.redis_connection.rpush(self.persistent_cache, user_info_entities)
0
    def start_spider_core(self):
        """Connect to Redis and MySQL, start every module, then supervise.

        Steps, in order: connect to Redis (up to three attempts, five seconds
        apart), connect to MySQL, create the response buffer, log in, start
        the Downloader, Scheduler, DataPersistent and Processor modules, load
        the initial tokens, optionally start the e-mail service, and finally
        loop forever calling each module's ``check_and_restart`` every 180
        seconds.

        Returns early (after logging) when Redis or MySQL cannot be reached
        or when the login fails. Otherwise this method does not return.

        Two fixes over the earlier version:

        * A failing ``ping()`` raises instead of returning ``False``, so the
          first Redis error used to abort start-up without any retry. Redis
          errors are now caught per attempt and counted as a failed try.
        * MySQL connection failures used to be logged as a Redis failure;
          they now have their own handler and message.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('Spider 开始启动')

        try:
            # Create the Redis connection, retrying on failure
            redis_connect_retry_times = 3
            while redis_connect_retry_times > 0:
                try:
                    self.redis_connection = redis.StrictRedis(
                        host=self.redis_host,
                        port=self.redis_port,
                        db=self.redis_db,
                        password=self.redis_password)
                    ping = self.redis_connection.ping()
                except redis.RedisError as e:
                    # Treat a raised Redis error as a failed attempt
                    ping = False
                    if log.isEnabledFor(logging.ERROR):
                        log.exception(e)

                if ping is True:
                    if log.isEnabledFor(logging.INFO):
                        log.info('Redis 服务器连接成功')
                    break

                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接失败')
                redis_connect_retry_times -= 1
                time.sleep(5)

            # Give up when every attempt failed
            if redis_connect_retry_times <= 0:
                raise RuntimeError('Redis connection attempts exhausted')

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('Redis 启动失败')
                log.exception(e)
            return

        try:
            # Create the MySQL connection
            self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                    user=self.mysql_username,
                                                    passwd=self.mysql_password,
                                                    db=self.mysql_database,
                                                    charset=self.mysql_charset)
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error('MySQL 连接失败')
                log.exception(e)
            return

        # Create the response buffer queue
        self.response_buffer = ResponseBuffer()

        # Create the account manager and log in
        self.account_manager = AccountManager(self.login_token, self.password,
                                              self.is_login_by_cookie,
                                              self.z_c0)
        is_login = self.account_manager.login()
        if not is_login:
            return

        # Start the Downloader
        self.downloader = Downloader(
            self.redis_connection, self.response_buffer, self.account_manager,
            self.is_proxy_service_enable, self.session_pool_size,
            self.download_thread_num, self.network_retry_times,
            self.connect_timeout, self.download_interval)
        self.downloader.start_downloader()

        # Start the Scheduler
        self.schedule = Scheduler(self.redis_connection, self.url_rate)
        self.schedule.start()

        # Start DataPersistent
        self.dataPersistent = DataPersistent(
            self.persistent_cache_size,
            self.follow_relation_persistent_cache_size, self.mysql_connection,
            self.redis_connection)
        self.dataPersistent.start_data_persistent()

        # Start the Processor and seed it with the initial tokens
        self.processor = Processor(self.process_thread_num,
                                   self.is_parser_following_list,
                                   self.is_parser_follower_list,
                                   self.is_parser_follow_relation,
                                   self.redis_connection, self.response_buffer)
        self.processor.start_processor()
        self.processor.load_init_data(self.init_token)

        # Start the e-mail service
        if self.is_email_service_enable is True:
            self.email_service = EmailService(
                self.smtp_server_host, self.smtp_server_port,
                self.smtp_server_password, self.smtp_from_addr,
                self.smtp_to_addr, self.smtp_email_header,
                self.smtp_send_interval, self.dataPersistent)
            self.email_service.start_email_service()
            self.email_service.send_message('Spider 启动完毕')

        if log.isEnabledFor(logging.INFO):
            log.info('Spider 启动完毕')

        # Supervise the modules
        while True:
            # Downloader health check
            self.downloader.check_and_restart()
            # EmailService health check
            if self.is_email_service_enable is True:
                self.email_service.check_and_restart()
            # DataPersistent health check
            self.dataPersistent.check_and_restart()
            # Scheduler health check (no check is performed here)
            # Processor health check
            self.processor.check_and_restart()
            # Interval between checks
            time.sleep(180)
            gc.collect()
Exemplo n.º 20
0
    def load_config(self):
        """Read ``Core/Config/SpiderCoreConfig.conf`` into instance attributes.

        Every option comes from the ``spider_core`` section. Numeric options
        are converted with ``int``; switch options are ``True`` only when
        their integer value is 1. The comma-separated ``initToken`` option is
        split, stripped and appended to ``self.init_token``.
        """
        section = "spider_core"
        config = configparser.ConfigParser()
        config.read("Core/Config/SpiderCoreConfig.conf", encoding="utf8")

        def text(option):
            # Raw string value of an option.
            return config.get(section, option)

        def number(option):
            # Option value converted to int.
            return int(config.get(section, option))

        def flag(option):
            # True only when the option's integer value is 1.
            return int(config.get(section, option)) == 1

        # Downloader settings
        self.is_proxy_service_enable = flag('isProxyServiceEnable')
        self.session_pool_size = number('sessionPoolSize')
        self.download_thread_num = number('downloadThreadNum')
        self.network_retry_times = number('networkRetryTimes')
        self.connect_timeout = number('connectTimeout')
        self.download_interval = number('downloadInterval')

        # Processor settings
        self.process_thread_num = number('processThreadNum')
        self.is_parser_following_list = flag('isParserFollowingList')
        self.is_parser_follower_list = flag('isParserFollowerList')
        self.is_parser_follow_relation = flag('isParserFollowRelation')

        # Scheduler settings
        self.url_rate = number('urlRate')

        # DataPersistent settings
        self.persistent_cache_size = number('persistentCacheSize')
        self.follow_relation_persistent_cache_size = number(
            'followRelationPersistentCacheSize')

        # E-mail service settings
        self.is_email_service_enable = flag('isEmailServiceEnable')
        self.smtp_server_host = text('smtpServerHost')
        self.smtp_server_port = number('smtpServerPort')
        self.smtp_server_password = text('smtpServerPassword')
        self.smtp_from_addr = text('smtpFromAddr')
        self.smtp_to_addr = text('smtpToAddr')
        self.smtp_email_header = text('smtpEmailHeader')
        self.smtp_send_interval = number('smtpSendInterval')

        # Redis settings
        self.redis_host = text('redisHost')
        self.redis_port = number('redisPort')
        self.redis_db = number('redisDB')
        self.redis_password = text('redisPassword')

        # MySQL settings
        self.mysql_host = text('mysqlHost')
        self.mysql_username = text('mysqlUsername')
        self.mysql_password = text('mysqlPassword')
        self.mysql_database = text('mysqlDatabase')
        self.mysql_charset = text('mysqlCharset')

        # Zhihu account settings
        self.is_login_by_cookie = flag('isLoginByCookie')
        self.z_c0 = text('z_c0')
        self.login_token = text('loginToken')
        self.password = text('password')

        # Seed tokens
        for raw_token in text('initToken').split(','):
            self.init_token.append(str(raw_token).strip())

        if log.isEnabledFor(logging.INFO):
            log.info('配置文件读取并配置完毕')
Exemplo n.º 21
0
    def start_email_service(self):
        """Start the e-mail service thread and log that the module is up."""
        mail_thread = self.email_service_thread
        mail_thread.start()

        if log.isEnabledFor(logging.INFO):
            log.info('EmailService 模块启动成功')
Exemplo n.º 22
0
    def start_data_persistent(self):
        """Start the persistence thread and log that the module is up."""
        writer_thread = self.persistent_thread
        writer_thread.start()

        if log.isEnabledFor(logging.INFO):
            log.info('DataPersistent 模块启动成功')
Exemplo n.º 23
0
    def run(self):
        """Download queued URLs forever, one session per round.

        Each round takes a session from the session manager and tries to
        download one URL record, allowing up to ``NETWORK_RETRY_TIMES``
        failed attempts. A record that was not finished is kept and tried
        again in the next round. After a pause of ``DOWNLOAD_INTERVAL`` the
        session is returned, or returned with a proxy switch when the
        attempts were used up.

        Status handling: 200 queues the response text and clears the record;
        404/410 drop the record; 403 and 429 end the round and keep the
        record; any other status counts as a failed attempt.
        """
        if log.isEnabledFor(logging.INFO):
            log.info('数据下载线程' + self.thread_id + '启动')

        # First start-up; per the original author's note this blocks until
        # enough proxies are available.
        self.session_manager.init_get()

        # URL record carried over from a round that did not finish it.
        pending_url_info = None
        while True:
            session = self.session_manager.get_session_connection()

            failures = 0
            while failures < self.NETWORK_RETRY_TIMES:
                try:
                    # Fetch a new record only when nothing is carried over.
                    if pending_url_info is None:
                        pending_url_info = self.get_url_info_from_queue()
                    url_info = pending_url_info

                    response = session.get(url_info[1],
                                           timeout=self.CONNECT_TIMEOUT)
                    status = response.status_code

                    if log.isEnabledFor(logging.DEBUG):
                        log.debug(status)

                    if status == 200:
                        # Reuse the record, replacing the URL with the
                        # downloaded text.
                        url_info[1] = response.text
                        self.put_response_info_to_queue(url_info)
                        pending_url_info = None
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug('下载成功')
                        break
                    elif status == 403:
                        if log.isEnabledFor(logging.ERROR):
                            log.error('账号认证失败')
                        break
                    elif status == 429:
                        # Rate limited: the record stays pending.
                        if log.isEnabledFor(logging.DEBUG):
                            log.debug('[' + str(self.thread_id) + ']' +
                                      '访问太频繁,稍候重新访问,响应码为:' +
                                      str(status))
                        break
                    elif status in (404, 410):
                        # Missing or gone: drop the record.
                        pending_url_info = None
                        break
                    else:
                        if log.isEnabledFor(logging.ERROR):
                            log.error(status)
                        failures += 1
                except Exception as e:
                    failures += 1
                    time.sleep(self.DOWNLOAD_INTERVAL)
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug('[' + str(self.thread_id) + ']' +
                                  '下载异常,正在重新连接...(第' +
                                  str(failures) + '次重试)')
                        log.error(e)

            # Pause between rounds.
            time.sleep(self.DOWNLOAD_INTERVAL)

            # Give the session back; switch proxy when every attempt failed.
            if failures < self.NETWORK_RETRY_TIMES:
                self.session_manager.return_session_connection(session)
            else:
                self.session_manager.return_and_switch_proxy(session)