def run(self):
    """Periodically build and send the status e-mail until an error occurs.

    Sleeps for ``smtp_send_interval`` seconds between sends; on any failure
    the thread marks itself ``'error'`` and exits so a supervisor can
    restart it.
    """
    if log.isEnabledFor(logging.DEBUG):
        log.debug("邮件服务线程启动")
    try:
        while True:
            time.sleep(self.smtp_send_interval)
            # Compose the message body and headers.
            message = MIMEText(self.get_email_content(), 'plain', 'utf-8')
            message['from'] = self.smtp_from_addr
            message['to'] = self.smtp_to_addr
            message['Subject'] = Header(self.get_email_header(),
                                        'utf-8').encode()
            # Deliver via SMTP.
            server = smtplib.SMTP(self.smtp_server_host,
                                  self.smtp_server_port)
            server.login(self.smtp_from_addr, self.smtp_server_password)
            server.sendmail(self.smtp_from_addr, [self.smtp_to_addr],
                            message.as_string())
            server.quit()
            # Record when the last mail went out.
            self.lastSendTime = datetime.datetime.now()
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("邮件发送失败")
            log.exception(e)
        self.thread_status = 'error'
def run(self):
    """Consume responses from the buffer and dispatch them to parsers.

    Fix: the original ``continue`` on an empty/short response skipped the
    ``time.sleep`` at the bottom of the loop, so an empty buffer made the
    thread busy-spin at full CPU. The sleep now also runs on that path.
    """
    if log.isEnabledFor(logging.INFO):
        log.info('数据处理线程' + self.thread_id + '启动')
    try:
        while True:
            # Pull the next response (expected shape: [type, data, ...]).
            response = self.response_buffer.get_response_from_buffer()
            if response is None or len(response) < 2:
                # Nothing usable yet — back off instead of busy-waiting.
                time.sleep(0.1)
                continue
            # Dispatch on the response type tag.
            response_type = response[0]
            if response_type == 'info':
                self.parse_user_info(response)
            elif response_type == 'list':
                self.parse_follow_info(response)
            time.sleep(0.1)
    except Exception as e:
        self.thread_status = 'error'
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
def login(self):
    """Dispatch to cookie-based or credential-based login per configuration."""
    use_cookie = self.is_login_by_cookie is True
    if log.isEnabledFor(logging.INFO):
        log.info('使用Cookie登陆方式登陆' if use_cookie
                 else '使用邮箱或手机号码登陆方式登陆')
    return self.cookie_login() if use_cookie else self.common_login()
def parse_follow_info(self, response_info):
    """Parse a follower/following list response and enqueue new user tokens.

    ``response_info`` layout: ``[type, data, token, followingList/followerList]``
    (the last two elements are only read when follow-relation parsing is on).
    """
    # Raw JSON payload of the list response.
    data = response_info[1]
    # Tokens found in this response that are not yet known.
    follow_list_token = []
    try:
        # Parse the payload as JSON.
        json_data = json.loads(data)
        # The user list lives under the 'data' key.
        if 'data' not in json_data:
            return
        data = json_data['data']
        # Collect each user's url_token, skipping tokens already seen.
        for follow_info in data:
            if 'url_token' in follow_info:
                token = follow_info['url_token']
                if self.token_filter.check_token(token) is False:
                    follow_list_token.append(token)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('Follower & Following List 数据解析错误')
        if log.isEnabledFor(logging.DEBUG):
            log.exception(e)
        return
    # Queue a user-info URL for every newly discovered token.
    for token in follow_list_token:
        url_info = ['info', self.generate_user_info_url(token), token]
        self.redis_connection.rpush(self.user_info_url_queue, url_info)
    # Optionally record who-follows-whom edges.
    # (Only 'followingList' responses produce edges: source user -> followee.)
    if self.is_parser_follow_relation is True:
        follow_list_type = response_info[3]
        token = response_info[2]
        if follow_list_type == 'followingList':
            pipe = self.redis_connection.pipeline()
            for following_token in follow_list_token:
                # NOTE(review): a dict is pushed to Redis here; downstream
                # code eval()s the stored string form — confirm the Redis
                # client serializes dicts as expected.
                follow_relation = {
                    FOLLOW_FROM: token,
                    FOLLOW_TO: following_token
                }
                pipe.rpush(self.follow_relation_persistent_cache,
                           follow_relation)
            pipe.execute()
def start_downloader(self):
    """Start every pre-created download worker thread."""
    for worker in self.download_thread_list:
        worker.start()
    if log.isEnabledFor(logging.INFO):
        log.info('Downloader 模块启动成功')
def __init__(self, redis_connection, response_buffer, account_manager,
             is_proxy_service_enable, session_pool_size, download_thread_num,
             network_retry_times, connect_timeout, download_interval):
    """Wire up the download subsystem: session manager, tuning knobs and
    the (not yet started) pool of download worker threads."""
    self.download_thread_num = download_thread_num
    self.redis_connection = redis_connection
    self.response_buffer = response_buffer
    self.account_manager = account_manager
    # The session manager owns authenticated sessions (and, optionally,
    # the proxy service).
    self.session_manager = SessionManager(session_pool_size, account_manager,
                                          is_proxy_service_enable)
    # Network tuning parameters shared by every worker thread.
    self.NETWORK_RETRY_TIMES = network_retry_times
    self.CONNECT_TIMEOUT = connect_timeout
    self.DOWNLOAD_INTERVAL = download_interval
    # Pre-create the workers; start_downloader() launches them later.
    self.download_thread_list = [
        DownloadThread(f'thread{idx}', self.session_manager,
                       self.redis_connection, self.response_buffer,
                       self.NETWORK_RETRY_TIMES, self.CONNECT_TIMEOUT,
                       self.DOWNLOAD_INTERVAL)
        for idx in range(self.download_thread_num)
    ]
    if log.isEnabledFor(logging.INFO):
        log.info("Downloader 模块初始化完毕")
def start_processor(self):
    """Start every pre-created processing worker thread."""
    for worker in self.processor_list:
        worker.start()
    if log.isEnabledFor(logging.INFO):
        log.info('Processor 模块启动成功')
def __init__(self, process_thread_num, is_parser_following_list,
             is_parser_follower_list, is_parser_follow_relation,
             redis_connection, response_buffer):
    """Build the response-processing subsystem and its worker threads."""
    self.process_thread_num = process_thread_num
    self.redis_connection = redis_connection
    # Deduplicates user tokens across the whole crawl.
    self.token_filter = TokenFilter(self.redis_connection)
    self.response_buffer = response_buffer
    # Feature switches controlling what the parsers extract.
    self.is_parser_following_list = is_parser_following_list
    self.is_parser_follower_list = is_parser_follower_list
    self.is_parser_follow_relation = is_parser_follow_relation
    # Pre-create the workers; start_processor() launches them later.
    self.processor_list = [
        ProcessThread(f'thread{idx}', self.redis_connection,
                      self.token_filter, self.response_buffer,
                      self.is_parser_following_list,
                      self.is_parser_follower_list,
                      self.is_parser_follow_relation)
        for idx in range(process_thread_num)
    ]
    if log.isEnabledFor(logging.INFO):
        log.info('Processor 模块初始化完毕')
def run(self):
    """Feed the shared download queue from the two source URL queues.

    Per cycle it moves ``url_rate`` user-info URLs and ``10 - url_rate``
    follow-list URLs, throttling when the download queue is saturated or
    both sources are empty.
    """
    if log.isEnabledFor(logging.INFO):
        log.info('Scheduler 模块启动成功')
    while True:
        # Back off while the download queue is saturated.
        while self.redis_connection.llen(self.url_queue_name) > 500:
            time.sleep(180)
        # Pause when there is nothing to schedule from either source.
        follow_len = self.redis_connection.llen(self.follow_info_url_queue)
        user_len = self.redis_connection.llen(self.user_info_url_queue)
        if follow_len == 0 and user_len == 0:
            time.sleep(20)
            continue
        # Move items from each source queue according to the ratio.
        for _ in range(self.url_rate):
            item = self.redis_connection.lpop(self.user_info_url_queue)
            if item is not None:
                self.redis_connection.rpush(self.url_queue_name, item)
        for _ in range(10 - self.url_rate):
            item = self.redis_connection.lpop(self.follow_info_url_queue)
            if item is not None:
                self.redis_connection.rpush(self.url_queue_name, item)
def validate_proxy_ip(self, proxy_ip_info):
    """Return True when the candidate proxy actually routes our traffic.

    Connects through the proxy to an IP-echo endpoint and checks that the
    echoed address equals the proxy's address.

    Fix: in the original, a response whose echoed IP did NOT match the
    proxy IP fell through without incrementing ``retry_time`` or sleeping,
    producing a tight infinite loop; a mismatch now counts as a failed
    attempt like any other.
    """
    from Proxy import proxyCore
    if proxy_ip_info is None:
        return False
    # Build the requests-style proxy mapping, e.g. {'http': '1.2.3.4:80'}.
    proxy_ip = proxy_ip_info[proxyCore.PROXY_IP]
    proxy_port = proxy_ip_info[proxyCore.PROXY_PORT]
    proxy_protocol = proxy_ip_info[proxyCore.PROXY_PROTOCOL].lower()
    proxy = {proxy_protocol: proxy_ip + ':' + proxy_port}
    retry_time = 0
    while retry_time < NETWORK_RECONNECT_TIMES:
        try:
            response = requests.get(url, timeout=CONNECT_TIMEOUT,
                                    headers=header, proxies=proxy)
            # Extract the first dotted-quad from the echo response.
            match_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}', response.text)
            if len(match_list) > 0:
                current_ip = match_list.pop()
                if current_ip == proxy_ip:
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug("获取到一个可用的代理IP")
                    return True
            # No IP found, or a different IP echoed back: failed attempt.
            retry_time += 1
            time.sleep(1)
        except Exception:
            retry_time += 1
    return False
def __init__(self, session_pool_size, account_manager,
             is_proxy_service_enable):
    """Initialize the session pool bookkeeping and, optionally, the proxy
    service."""
    self.session_pool_size = session_pool_size
    self.account_manager = account_manager
    self.is_proxy_service_enable = is_proxy_service_enable
    # Counters: sessions ever created vs. sessions currently idle in pool.
    self.created_session_num = 0
    self.available_session_num = 0
    # Each counter is guarded by its own lock.
    self.available_session_lock = threading.Lock()
    self.created_session_lock = threading.Lock()
    # Bounded FIFO holding idle sessions.
    self.session_pool = queue.Queue(session_pool_size)
    # Bring up the proxy-rotation service when enabled.
    if self.is_proxy_service_enable is True:
        self.proxy_service = proxyCore.ProxyService()
        self.proxy_service.start_proxy_service()
    if log.isEnabledFor(logging.INFO):
        log.info("Session Manager 启动成功")
def common_login(self):
    """Log in with e-mail/phone credentials via the web login form.

    Returns True on success (auth cookies saved to ``self.auth_token``),
    False on a definite failure, and None when an exception occurred.
    """
    # Dedicated session for the login handshake.
    session = requests.session()
    session.headers = requestHeader
    # Fetch the main page to obtain the _xsrf CSRF token.
    try:
        response = session.get(mainPageURL)
        input_tag = BeautifulSoup(response.text, 'html.parser').find(
            'input', attrs={'name': '_xsrf'})
        if input_tag is None:
            return False
        _xsrf = input_tag['value']
        # Submit the login form with the CSRF token attached.
        form_data = {
            '_xsrf': _xsrf,
            'email': self.login_token,
            'password': self.password
        }
        requestHeader.update({
            'X-Requested-With': 'XMLHttpRequest',
            'X-Xsrftoken': _xsrf
        })
        session.headers = requestHeader
        response = session.post(url=loginURL, data=form_data)
        if response.status_code == 200:
            # Hit an authenticated endpoint to confirm login really worked.
            response = session.get(authTestURL)
            if response.status_code == 200:
                # Persist the authenticated cookies for later sessions.
                self.auth_token = session.cookies.get_dict()
                if log.isEnabledFor(logging.INFO):
                    log.info('知乎账户登陆成功')
                return True
        # Either the POST or the auth check failed.
        if log.isEnabledFor(logging.INFO):
            log.info('知乎账户登陆失败')
        return False
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error(e)
    finally:
        session.close()
def run(self):
    """Keep the proxy pool topped up: fetch new proxies while under
    capacity, rescan the pool when it is full, otherwise idle."""
    try:
        while True:
            pool_size = proxy_pool.qsize()
            pending = unchecked_proxy_list.qsize()
            if pool_size < PROXY_POOL_SIZE and pending < PROXY_POOL_SIZE:
                # Room in the pool and few candidates queued: fetch more.
                self.fetch_and_parse_proxy()
            elif pool_size == PROXY_POOL_SIZE:
                # Pool full: re-validate its contents periodically.
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('代理池更新')
                self.scan_proxy_pool()
                time.sleep(PROXY_POOL_SCAN_INTERVAL)
            else:
                time.sleep(60)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
        self.status = 'error'
def check_and_restart(self):
    """Recreate and restart the persistence thread if it has died."""
    if self.persistent_thread.thread_status != 'error':
        return
    self.persistent_thread = PersistentThread(
        self.db_connection, self.redis_connection,
        self.persistent_cache_size,
        self.follow_relation_persistent_cache_size)
    self.persistent_thread.start()
    if log.isEnabledFor(logging.INFO):
        log.info('DataPersistent模块持久化线程中重新启动')
def check_and_restart(self):
    """Recreate and restart the e-mail service thread if it has died."""
    if self.email_service_thread.thread_status != 'error':
        return
    self.email_service_thread = EmailServiceThread(
        self.smtp_server_host, self.smtp_server_port,
        self.smtp_server_password, self.smtp_from_addr, self.smtp_to_addr,
        self.smtp_email_header, self.smtp_send_interval,
        self.data_persistent)
    self.email_service_thread.start()
    if log.isEnabledFor(logging.INFO):
        log.info('EmailService线程重新启动')
def load_init_data(self, token_list):
    """Seed the user-info URL queue with the configured initial tokens."""
    if token_list is None:
        return
    for seed_token in token_list:
        # Each queue entry is [type, url, token].
        self.redis_connection.rpush(
            'userInfoURLQueue',
            ['info', URL_PUBLIC + seed_token + URL_PINS, seed_token])
    if log.isEnabledFor(logging.INFO):
        log.info('初始用户Token载入完毕')
def run(self):
    """Start the proxy validation/scan threads and supervise them forever.

    Fixes:
    - ``if log.error(logging.ERROR):`` was a typo for
      ``log.isEnabledFor(logging.ERROR)`` — ``log.error`` logs and returns
      None, so the restart message was never emitted (and a spurious log
      record was written on every check).
    - the supervisor removed/appended threads while iterating the same
      list, which skips elements; it now iterates a snapshot.
    """
    # Load configuration.
    self.init()
    # Launch the proxy validation workers.
    validate_thread_list = []
    for _ in range(PROXY_VALIDATE_THREAD_NUM):
        validate_thread = ProxyValidateThread()
        validate_thread_list.append(validate_thread)
        validate_thread.start()
        if log.isEnabledFor(logging.DEBUG):
            log.debug("代理验证线程启动")
    # Launch the proxy pool scan thread.
    scan_thread = ProxyPoolScanThread()
    scan_thread.start()
    if log.isEnabledFor(logging.DEBUG):
        log.debug("代理池扫描线程启动")
    # Supervisor loop: restart any thread that reported an error.
    while True:
        for thread in list(validate_thread_list):
            if thread.status == 'error':
                validate_thread_list.remove(thread)
                replacement = ProxyValidateThread()
                validate_thread_list.append(replacement)
                replacement.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error('代理验证线程重新启动')
        if scan_thread.status == 'error':
            scan_thread = ProxyPoolScanThread()
            scan_thread.start()
            if log.isEnabledFor(logging.ERROR):
                log.error("代理池扫描线程重新启动")
        time.sleep(180)
def check_and_restart(self):
    """Replace any crashed download thread with a fresh one.

    Fix: the original removed/appended entries of ``download_thread_list``
    while iterating it directly, which makes Python's list iterator skip
    the element after each removal (and re-visit freshly appended ones);
    it now iterates a snapshot of the list.
    """
    for download_thread in list(self.download_thread_list):
        if download_thread.thread_status != 'error':
            continue
        thread_id = download_thread.thread_id
        self.download_thread_list.remove(download_thread)
        # Recreate the worker under the same thread id.
        replacement = DownloadThread(
            thread_id, self.session_manager, self.redis_connection,
            self.response_buffer, self.NETWORK_RETRY_TIMES,
            self.CONNECT_TIMEOUT, self.DOWNLOAD_INTERVAL)
        self.download_thread_list.append(replacement)
        replacement.start()
        if log.isEnabledFor(logging.INFO):
            log.info('数据下载线程' + thread_id + '重新启动')
def check_and_restart(self):
    """Replace any crashed processing thread with a fresh one.

    Fixes:
    - iterates a snapshot of ``processor_list`` because the list is
      mutated (remove/append) during the scan.
    - the restart notice is logged at INFO, matching the Downloader /
      DataPersistent / EmailService restart messages (it was ERROR).
    """
    for process_thread in list(self.processor_list):
        if process_thread.thread_status != 'error':
            continue
        thread_id = process_thread.thread_id
        self.processor_list.remove(process_thread)
        # Recreate the worker under the same thread id.
        replacement = ProcessThread(thread_id, self.redis_connection,
                                    self.token_filter, self.response_buffer,
                                    self.is_parser_following_list,
                                    self.is_parser_follower_list,
                                    self.is_parser_follow_relation)
        self.processor_list.append(replacement)
        replacement.start()
        if log.isEnabledFor(logging.INFO):
            log.info('数据处理器线程[' + thread_id + ']重新启动')
def cookie_login(self):
    """Log in by injecting the configured ``z_c0`` auth cookie.

    Returns True and stores the authenticated cookies in
    ``self.auth_token`` when the auth-test endpoint accepts the cookie.
    """
    session = requests.session()
    session.headers = requestHeader
    # Obtain baseline cookies first.
    session.get(mainPageURL)
    # Inject the user-supplied authentication cookie.
    requests.utils.add_dict_to_cookiejar(session.cookies,
                                         {'z_c0': self.z_c0})
    # Probe an authenticated endpoint to verify the cookie works.
    response = session.get(authTestURL)
    logged_in = response.status_code == 200
    if logged_in:
        self.auth_token = session.cookies.get_dict()
    if log.isEnabledFor(logging.INFO):
        log.info('知乎账户登陆成功' if logged_in else '知乎账户登陆失败')
    return logged_in
def __init__(self, redis_connection, url_rate):
    """Configure the scheduler thread and the Redis queue names it uses."""
    threading.Thread.__init__(self)
    self.redis_connection = redis_connection
    # Out of every 10 scheduled URLs, url_rate come from the user-info
    # queue and the rest from the follow-info queue.
    self.url_rate = url_rate
    # Redis list names.
    self.follow_info_url_queue = 'followInfoURLQueue'
    self.user_info_url_queue = 'userInfoURLQueue'
    self.url_queue_name = 'urlQueue'
    if log.isEnabledFor(logging.INFO):
        log.info('Scheduler 模块初始化完毕')
def __init__(self, persistent_cache_size,
             follow_relation_persistent_cache_size, db_connection,
             redis_connection):
    """Set up the persistence module and pre-create its worker thread."""
    # Flush thresholds for the two Redis-backed caches.
    self.persistent_cache_size = persistent_cache_size
    self.follow_relation_persistent_cache_size = follow_relation_persistent_cache_size
    # Storage connections.
    self.db_connection = db_connection
    self.redis_connection = redis_connection
    # Worker thread; start_data_persistent() launches it.
    self.persistent_thread = PersistentThread(
        self.db_connection, self.redis_connection,
        self.persistent_cache_size,
        self.follow_relation_persistent_cache_size)
    if log.isEnabledFor(logging.INFO):
        log.info('DataPersistent 模块初始化完毕')
def __init__(self, smtp_server_host, smtp_server_port, smtp_server_password,
             smtp_from_addr, smtp_to_addr, smtp_email_header,
             smtp_send_interval, data_persistent):
    """Store SMTP settings and pre-create the periodic mail thread."""
    self.data_persistent = data_persistent
    # SMTP connection and message parameters.
    self.smtp_server_host = smtp_server_host
    self.smtp_server_port = smtp_server_port
    self.smtp_server_password = smtp_server_password
    self.smtp_from_addr = smtp_from_addr
    self.smtp_to_addr = smtp_to_addr
    self.smtp_email_header = smtp_email_header
    self.smtp_send_interval = smtp_send_interval
    # Worker thread; start_email_service() launches it.
    self.email_service_thread = EmailServiceThread(
        self.smtp_server_host, self.smtp_server_port,
        self.smtp_server_password, self.smtp_from_addr, self.smtp_to_addr,
        self.smtp_email_header, self.smtp_send_interval,
        self.data_persistent)
    if log.isEnabledFor(logging.INFO):
        log.info('EmailService 模块初始化完毕')
def send_message(self, email_content):
    """Send *email_content* as a plain-text mail with a timestamped subject.

    Fix: ``smtp_server.quit()`` was skipped whenever ``login`` or
    ``sendmail`` raised, leaking the SMTP connection; the connection is now
    managed by a ``with`` block (``smtplib.SMTP`` closes the session on
    exit). Failures are logged, never raised to the caller, as before.
    """
    # Subject: configured header plus a [m-d h:m:s] timestamp.
    now = datetime.datetime.now()
    header = self.smtp_email_header + '[' + str(now.month) + '-' + str(now.day) + ' ' + \
        str(now.hour) + ':' + str(now.minute) + ':' + str(now.second) + ']'
    msg = MIMEText(email_content, 'plain', 'utf-8')
    msg['from'] = self.smtp_from_addr
    msg['to'] = self.smtp_to_addr
    msg['Subject'] = Header(header, 'utf-8').encode()
    try:
        with smtplib.SMTP(self.smtp_server_host,
                          self.smtp_server_port) as smtp_server:
            smtp_server.login(self.smtp_from_addr,
                              self.smtp_server_password)
            smtp_server.sendmail(self.smtp_from_addr, [self.smtp_to_addr],
                                 msg.as_string())
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("邮件发送失败")
            log.exception(e)
def run(self):
    """Validate queued candidate proxies and admit working ones to the pool."""
    try:
        while True:
            # Wait out any in-progress pool scan.
            while is_scanning:
                time.sleep(3)
            has_capacity = proxy_pool.qsize() < PROXY_POOL_SIZE
            has_pending = unchecked_proxy_list.qsize() > 0
            if has_capacity and has_pending:
                candidate = unchecked_proxy_list.get()
                verdict = self.dataValidateModule.validate_proxy_ip(candidate)
                if verdict is True:
                    proxy_pool.put(candidate)
                    time.sleep(1)
            else:
                time.sleep(5)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
        self.status = 'error'
def parse_user_info(self, response_info):
    """Parse a user profile page, cache the user entity and enqueue that
    user's paginated follower/following list URLs.

    ``response_info`` layout: ``[type, html_data, token]``.

    Fix: the final success message was ``log.info`` guarded by
    ``isEnabledFor(logging.DEBUG)`` — guard and call now agree (DEBUG).
    """
    data = response_info[1]
    token = response_info[2]
    user_info_entities = None
    try:
        # The profile JSON is embedded in <div id="data" data-state="...">.
        bs_obj = BeautifulSoup(data, 'html.parser')
        data_json = bs_obj.find('div', attrs={'id': 'data'})
        if data_json is None:
            return
        else:
            data_json = data_json['data-state']
        # Undo HTML escaping, strip stray tags, then parse as JSON.
        data_json = html.unescape(data_json)
        data_json = BeautifulSoup(data_json, 'html.parser').text
        data_json = json.loads(data_json)
        # Drill down: entities -> users -> <token>.
        if 'entities' not in data_json:
            return
        entities = data_json['entities']
        if 'users' not in entities:
            return
        users = entities['users']
        if token not in users:
            return
        user_info = users[token]
        # Profile fields default to None / empty when absent.
        avatar_url_template = None
        name = None
        headline = None
        locations = []
        business = None
        employments = []
        educations = []
        description = None
        gender = None
        following_count = None
        follower_count = None
        answer_count = None
        question_count = None
        voteup_count = None
        if USER_AVATAR_URL_TEMPLATE in user_info:
            avatar_url_template = user_info[USER_AVATAR_URL_TEMPLATE]
        if USER_NAME in user_info:
            name = user_info[USER_NAME]
        if USER_HEADLINE in user_info:
            headline = user_info[USER_HEADLINE]
        if USER_LOCATIONS in user_info:
            for location in user_info[USER_LOCATIONS]:
                locations.append(location['name'])
        if USER_BUSINESS in user_info:
            business = user_info[USER_BUSINESS]['name']
        if USER_EMPLOYMENTS in user_info:
            # Each employment may carry a job title and/or a company name.
            for employment in user_info[USER_EMPLOYMENTS]:
                elem = {}
                if 'job' in employment:
                    elem.update({'job': employment['job']['name']})
                if 'company' in employment:
                    elem.update({'company': employment['company']['name']})
                employments.append(elem)
        if USER_EDUCATIONS in user_info:
            for education in user_info[USER_EDUCATIONS]:
                if 'school' in education:
                    educations.append(education['school']['name'])
        if USER_DESCRIPTION in user_info:
            description = user_info[USER_DESCRIPTION]
        if USER_GENDER in user_info:
            gender = user_info[USER_GENDER]
        if USER_FOLLOWING_COUNT in user_info:
            following_count = user_info[USER_FOLLOWING_COUNT]
        if USER_FOLLOWER_COUNT in user_info:
            follower_count = user_info[USER_FOLLOWER_COUNT]
        if USER_ANSWER_COUNT in user_info:
            answer_count = user_info[USER_ANSWER_COUNT]
        if USER_QUESTION_COUNT in user_info:
            question_count = user_info[USER_QUESTION_COUNT]
        if USER_VOTE_UP_COUNT in user_info:
            voteup_count = user_info[USER_VOTE_UP_COUNT]
        # Assemble the user entity destined for the persistence cache.
        user_info_entities = {
            USER_AVATAR_URL_TEMPLATE: avatar_url_template,
            USER_URL_TOKEN: token,
            USER_NAME: name,
            USER_HEADLINE: headline,
            USER_LOCATIONS: locations,
            USER_BUSINESS: business,
            USER_EMPLOYMENTS: employments,
            USER_EDUCATIONS: educations,
            USER_DESCRIPTION: description,
            USER_GENDER: gender,
            USER_FOLLOWING_COUNT: following_count,
            USER_FOLLOWER_COUNT: follower_count,
            USER_ANSWER_COUNT: answer_count,
            USER_QUESTION_COUNT: question_count,
            USER_VOTE_UP_COUNT: voteup_count
        }
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('User info 数据解析错误')
            log.exception(e)
    if user_info_entities is None:
        return
    # Skip users that were marked while this response was in flight.
    if self.token_filter.check_token(token) is True:
        return
    self.token_filter.mark_token(token)
    # Enqueue paginated following-list URLs (20 users per page).
    if self.is_parser_following_list is True:
        pipe = self.redis_connection.pipeline()
        following_count = user_info_entities[USER_FOLLOWING_COUNT]
        if following_count is not None:
            offset = 0
            limit = 20
            while offset < following_count:
                url_info = [
                    'list',
                    self.generate_following_info_url(token, offset, limit),
                    token, 'followingList'
                ]
                offset += limit
                pipe.rpush(self.follow_info_url_queue, url_info)
            pipe.execute()
    # Enqueue paginated follower-list URLs (20 users per page).
    if self.is_parser_follower_list is True:
        pipe = self.redis_connection.pipeline()
        follower_count = user_info_entities[USER_FOLLOWER_COUNT]
        if follower_count is not None:
            offset = 0
            limit = 20
            while offset < follower_count:
                url_info = [
                    'list',
                    self.generate_follower_info_url(token, offset, limit),
                    token, 'followerList'
                ]
                offset += limit
                pipe.rpush(self.follow_info_url_queue, url_info)
            pipe.execute()
    # Hand the entity to the persistence cache.
    if log.isEnabledFor(logging.DEBUG):
        log.debug('成功获取一个用户的详细信息')
    self.redis_connection.rpush(self.persistent_cache, user_info_entities)
def start_spider_core(self):
    """Bring up the whole spider: storage connections, login, then every
    subsystem (Downloader, Scheduler, DataPersistent, Processor, optional
    EmailService), and finally supervise the subsystems forever.
    """
    if log.isEnabledFor(logging.INFO):
        log.info('Spider 开始启动')
    try:
        # Connect to Redis, retrying up to 3 times.
        redis_connect_retry_times = 3
        while redis_connect_retry_times > 0:
            self.redis_connection = redis.StrictRedis(
                host=self.redis_host,
                port=self.redis_port,
                db=self.redis_db,
                password=self.redis_password)
            ping = self.redis_connection.ping()
            if ping is True:
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接成功')
                break
            else:
                if log.isEnabledFor(logging.INFO):
                    log.info('Redis 服务器连接失败')
                redis_connect_retry_times -= 1
                time.sleep(5)
        # Give up when every retry failed.
        if redis_connect_retry_times <= 0:
            raise Exception()
        # Connect to MySQL.
        # NOTE(review): a MySQL failure is also reported by the
        # 'Redis 启动失败' message below — consider a clearer message.
        self.mysql_connection = pymysql.connect(host=self.mysql_host,
                                                user=self.mysql_username,
                                                passwd=self.mysql_password,
                                                db=self.mysql_database,
                                                charset=self.mysql_charset)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('Redis 启动失败')
            log.exception(e)
        return
    # Shared buffer between downloader and processor threads.
    self.response_buffer = ResponseBuffer()
    # Authenticate; abort startup when login fails.
    self.account_manager = AccountManager(self.login_token, self.password,
                                          self.is_login_by_cookie,
                                          self.z_c0)
    is_login = self.account_manager.login()
    if not is_login:
        return
    # Start the Downloader.
    self.downloader = Downloader(
        self.redis_connection, self.response_buffer, self.account_manager,
        self.is_proxy_service_enable, self.session_pool_size,
        self.download_thread_num, self.network_retry_times,
        self.connect_timeout, self.download_interval)
    self.downloader.start_downloader()
    # Start the Scheduler.
    self.schedule = Scheduler(self.redis_connection, self.url_rate)
    self.schedule.start()
    # Start DataPersistent.
    self.dataPersistent = DataPersistent(
        self.persistent_cache_size,
        self.follow_relation_persistent_cache_size, self.mysql_connection,
        self.redis_connection)
    self.dataPersistent.start_data_persistent()
    # Start the Processor and seed it with the initial tokens.
    self.processor = Processor(self.process_thread_num,
                               self.is_parser_following_list,
                               self.is_parser_follower_list,
                               self.is_parser_follow_relation,
                               self.redis_connection, self.response_buffer)
    self.processor.start_processor()
    self.processor.load_init_data(self.init_token)
    # Optionally start the e-mail status service.
    if self.is_email_service_enable is True:
        self.email_service = EmailService(
            self.smtp_server_host, self.smtp_server_port,
            self.smtp_server_password, self.smtp_from_addr,
            self.smtp_to_addr, self.smtp_email_header,
            self.smtp_send_interval, self.dataPersistent)
        self.email_service.start_email_service()
        self.email_service.send_message('Spider 启动完毕')
    if log.isEnabledFor(logging.INFO):
        log.info('Spider 启动完毕')
    # Supervision loop: restart crashed module threads every 3 minutes.
    while True:
        # Downloader health check.
        self.downloader.check_and_restart()
        # EmailService health check.
        if self.is_email_service_enable is True:
            self.email_service.check_and_restart()
        # DataPersistent health check.
        self.dataPersistent.check_and_restart()
        # Processor health check. (Scheduler has no check here.)
        self.processor.check_and_restart()
        time.sleep(180)
        gc.collect()
def load_config(self):
    """Read ``Core/Config/SpiderCoreConfig.conf`` and populate every
    runtime setting on ``self``.

    Cleanup (behavior-preserving): the ``True if int(...) == 1 else False``
    ternaries are replaced by direct ``== 1`` comparisons.
    """
    section = "spider_core"
    config = configparser.ConfigParser()
    config.read("Core/Config/SpiderCoreConfig.conf", encoding="utf8")
    # Downloader module settings.
    self.is_proxy_service_enable = int(
        config.get(section, 'isProxyServiceEnable')) == 1
    self.session_pool_size = int(config.get(section, 'sessionPoolSize'))
    self.download_thread_num = int(config.get(section, 'downloadThreadNum'))
    self.network_retry_times = int(config.get(section, 'networkRetryTimes'))
    self.connect_timeout = int(config.get(section, 'connectTimeout'))
    self.download_interval = int(config.get(section, 'downloadInterval'))
    # Processor module settings.
    self.process_thread_num = int(config.get(section, 'processThreadNum'))
    self.is_parser_following_list = int(
        config.get(section, 'isParserFollowingList')) == 1
    self.is_parser_follower_list = int(
        config.get(section, 'isParserFollowerList')) == 1
    self.is_parser_follow_relation = int(
        config.get(section, 'isParserFollowRelation')) == 1
    # Scheduler module settings.
    self.url_rate = int(config.get(section, 'urlRate'))
    # DataPersistent module settings.
    self.persistent_cache_size = int(
        config.get(section, 'persistentCacheSize'))
    self.follow_relation_persistent_cache_size = int(
        config.get(section, 'followRelationPersistentCacheSize'))
    # E-mail service settings.
    self.is_email_service_enable = int(
        config.get(section, 'isEmailServiceEnable')) == 1
    self.smtp_server_host = config.get(section, 'smtpServerHost')
    self.smtp_server_port = int(config.get(section, 'smtpServerPort'))
    self.smtp_server_password = config.get(section, 'smtpServerPassword')
    self.smtp_from_addr = config.get(section, 'smtpFromAddr')
    self.smtp_to_addr = config.get(section, 'smtpToAddr')
    self.smtp_email_header = config.get(section, 'smtpEmailHeader')
    self.smtp_send_interval = int(config.get(section, 'smtpSendInterval'))
    # Redis settings.
    self.redis_host = config.get(section, 'redisHost')
    self.redis_port = int(config.get(section, 'redisPort'))
    self.redis_db = int(config.get(section, 'redisDB'))
    self.redis_password = config.get(section, 'redisPassword')
    # MySQL settings.
    self.mysql_host = config.get(section, 'mysqlHost')
    self.mysql_username = config.get(section, 'mysqlUsername')
    self.mysql_password = config.get(section, 'mysqlPassword')
    self.mysql_database = config.get(section, 'mysqlDatabase')
    self.mysql_charset = config.get(section, 'mysqlCharset')
    # Zhihu account settings.
    self.is_login_by_cookie = int(
        config.get(section, 'isLoginByCookie')) == 1
    self.z_c0 = config.get(section, 'z_c0')
    self.login_token = config.get(section, 'loginToken')
    self.password = config.get(section, 'password')
    # Initial seed tokens (comma-separated).
    token_list = config.get(section, 'initToken')
    for token in token_list.split(','):
        self.init_token.append(token.strip())
    if log.isEnabledFor(logging.INFO):
        log.info('配置文件读取并配置完毕')
def run(self):
    """Persist cached user entities and follow relations from Redis to MySQL.

    Runs forever, flushing each Redis cache to the database once it reaches
    its configured threshold; on any error the thread records 'error'
    status and exits so the supervisor can restart it.
    """
    # Last raw item popped from Redis; logged when a failure occurs.
    debug_info = None
    try:
        while True:
            # --- Flush the user-info cache when it is full enough ---
            current_user_info_cache_size = self.redis_connection.llen(
                self.persistent_cache)
            if current_user_info_cache_size >= self.persistent_cache_size:
                # NOTE(review): if anything below raises, release() is
                # skipped and the lock stays held — confirm intended.
                self.lock.acquire()
                cursor = self.db_connection.cursor()
                for i in range(current_user_info_cache_size):
                    user_info = self.redis_connection.lpop(
                        self.persistent_cache)
                    debug_info = user_info
                    if user_info is not None:
                        # NOTE(review): eval() on data read back from Redis
                        # is a code-execution risk if the cache can be
                        # written by anything untrusted; prefer json.
                        user_info = self.convert_user_info(
                            eval(user_info.decode('utf-8')))
                        cursor.execute(INSERT_USER_INFO, [
                            user_info[USER_AVATAR_URL_TEMPLATE],
                            user_info[USER_URL_TOKEN],
                            user_info[USER_NAME],
                            user_info[USER_HEADLINE],
                            user_info[USER_LOCATIONS],
                            user_info[USER_BUSINESS],
                            user_info[USER_EMPLOYMENTS],
                            user_info[USER_EDUCATIONS],
                            user_info[USER_DESCRIPTION],
                            user_info[USER_GENDER],
                            user_info[USER_FOLLOWING_COUNT],
                            user_info[USER_FOLLOWER_COUNT],
                            user_info[USER_ANSWER_COUNT],
                            user_info[USER_QUESTION_COUNT],
                            user_info[USER_VOTE_UP_COUNT]
                        ])
                self.db_connection.commit()
                cursor.close()
                self.lock.release()
            # --- Flush the follow-relation cache when it is full enough ---
            current_follow_relation_cache_size = self.redis_connection.llen(
                self.follow_relation_persistent_cache)
            if current_follow_relation_cache_size >= self.follow_relation_persistent_cache_size:
                self.lock.acquire()
                cursor = self.db_connection.cursor()
                for i in range(current_follow_relation_cache_size):
                    follow_relation = self.redis_connection.lpop(
                        self.follow_relation_persistent_cache)
                    debug_info = follow_relation
                    if follow_relation is not None:
                        # NOTE(review): same eval() concern as above.
                        follow_relation = eval(
                            follow_relation.decode('utf-8'))
                        cursor.execute(INSERT_FOLLOW_RELATION, [
                            follow_relation[FOLLOW_FROM],
                            follow_relation[FOLLOW_TO]
                        ])
                self.db_connection.commit()
                cursor.close()
                self.lock.release()
            # Wait between persistence sweeps.
            time.sleep(180)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error('用户数据持久化线程异常退出')
            log.exception(e)
            log.debug(debug_info)
        self.thread_status = 'error'
def start_data_persistent(self):
    """Launch the pre-created persistence worker thread."""
    self.persistent_thread.start()
    if log.isEnabledFor(logging.INFO):
        log.info('DataPersistent 模块启动成功')