def run(self): if log.isEnabledFor(logging.DEBUG): log.debug("邮件服务线程启动") try: while True: time.sleep(SMTP_SEND_INTERVAL) # 准备发送的内容 msg = MIMEText(self.get_email_content(), 'plain', 'utf-8') msg['from'] = SMTP_FROM_ADDR msg['to'] = SMTP_TO_ADDR msg['Subject'] = Header(self.get_email_header(), 'utf-8').encode() # 发送 smtp_server = smtplib.SMTP(SMTP_SERVER_HOST, SMTP_SERVER_PORT) smtp_server.login(SMTP_FROM_ADDR, SMTP_SERVER_PASSWORD) smtp_server.sendmail(SMTP_FROM_ADDR, [SMTP_TO_ADDR], msg.as_string()) smtp_server.quit() # 更新最后一次发送时间 self.lastSendTime = datetime.datetime.now() except Exception as e: if log.isEnabledFor(logging.ERROR): log.error("邮件发送失败") log.exception(e) self.status = 'error'
def run(self):
    try:
        while True:
            raw_data = None
            token = None
            thread_name = None
            data = self.cache_queue.get_data_from_user_list_cache_queue()
            if QUEUE_ELEM_HTML in data:
                raw_data = data[QUEUE_ELEM_HTML]
            if QUEUE_ELEM_TOKEN in data:
                token = data[QUEUE_ELEM_TOKEN]
            if QUEUE_ELEM_THREAD_NAME in data:
                thread_name = data[QUEUE_ELEM_THREAD_NAME]
            if raw_data is not None and token is not None:
                token_list = self.parse_user_list(raw_data, token)
                if token_list is not None:
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug('[' + thread_name + '] parsing the follow list of user "' + token + '"')
                    self.user_token_cache_queue.add_token_into_cache_queue(
                        token_list)
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
        self.status = 'error'
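# parse_user_list() is referenced above but not shown in this section. The
# sketch below is an assumption modeled on parse_user_information(): the
# follow-list pages embed the same data-state JSON, so the url tokens of
# the listed users can be pulled from entities.users. Treat it as
# illustrative, not as the original implementation.
def parse_user_list(self, html_string, user_token):
    if html_string is None:
        return None
    bs_object = BeautifulSoup(html_string, 'html.parser')
    data_string = bs_object.find('div', attrs={'id': 'data'})
    if data_string is None:
        return None
    data_string = html.unescape(data_string['data-state'])
    data_string = BeautifulSoup(data_string, 'html.parser').text
    try:
        json_data = json.loads(data_string)
    except ValueError:
        return None
    users = json_data.get(JSON_ENTITIES, {}).get(JSON_USERS, {})
    # Every user on the page except the profile owner is a candidate token
    return [t for t in users if t != user_token]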
def run(self):
    if log.isEnabledFor(logging.DEBUG):
        log.debug('User info scrape thread ' + self.thread_name + ' started')
    try:
        self.user_info_scrape()
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
        self.status = 'error'
def validate_proxy_ip(self, proxy_ip_info):
    if proxy_ip_info is None:
        return False
    # Build the proxy configuration
    proxy_ip = proxy_ip_info[proxyCore.PROXY_IP]
    proxy_port = proxy_ip_info[proxyCore.PROXY_PORT]
    proxy_protocol = proxy_ip_info[proxyCore.PROXY_PROTOCOL].lower()
    proxy = {proxy_protocol: proxy_ip + ':' + proxy_port}
    # Connect through the proxy
    self.session.headers = header
    self.session.proxies = proxy
    retry_time = 0
    while retry_time < NETWORK_RECONNECT_TIMES:
        try:
            response = self.session.get(url, timeout=CONNECT_TIMEOUT)
            # Parse the IP echoed back by the server; the proxy only counts
            # as usable if the server saw the proxy's IP, not ours
            match_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}', response.text)
            if len(match_list) > 0 and match_list.pop() == proxy_ip:
                if log.isEnabledFor(logging.DEBUG):
                    log.debug("Found a usable proxy IP")
                return True
            # No IP in the response, or a different IP came back:
            # count it as a failed attempt so the loop terminates
            retry_time += 1
        except Exception:
            retry_time += 1
    return False
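# `url` and `header` are module-level constants used above but defined
# elsewhere in this file. A minimal sketch of what they could look like,
# assuming any plain IP-echo endpoint is acceptable; the echo service and
# User-Agent string here are placeholders, not the project's actual values.
url = 'http://httpbin.org/ip'  # responds with the caller's IP in the body
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}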
def run(self):
    try:
        while True:
            raw_data = None
            token = None
            thread_name = None
            data = self.cache_queue.get_data_from_user_info_cache_queue()
            if QUEUE_ELEM_HTML in data:
                raw_data = data[QUEUE_ELEM_HTML]
            if QUEUE_ELEM_TOKEN in data:
                token = data[QUEUE_ELEM_TOKEN]
            if QUEUE_ELEM_THREAD_NAME in data:
                thread_name = data[QUEUE_ELEM_THREAD_NAME]
            if raw_data is not None and token is not None:
                user_info = self.parse_user_information(raw_data, token)
                if user_info is not None:
                    if log.isEnabledFor(logging.DEBUG):
                        log.debug('[' + thread_name + '] found a user: ' + user_info[USER_NAME])
                    # Mark the token as seen in the Bloom filter
                    self.bloom_filter.mark_value(user_info[USER_URL_TOKEN])
                    self.db_connection.add_user_info(
                        self.convert_user_info(user_info))
                    # Package the token info whose follow lists still
                    # need to be analysed
                    token_info = {
                        USER_URL_TOKEN: user_info[USER_URL_TOKEN],
                        USER_FOLLOWING_COUNT: user_info[USER_FOLLOWING_COUNT],
                        USER_FOLLOWER_COUNT: user_info[USER_FOLLOWER_COUNT]
                    }
                    self.user_token_cache_queue.add_token_into_analysed_cache_queue(
                        [token_info])
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.exception(e)
        self.status = 'error'
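# convert_user_info() is called above but not shown in this section. A
# plausible sketch, assuming the database wants flat string columns: list
# fields are joined and the employment dicts are serialized as JSON. The
# exact column layout is an assumption.
def convert_user_info(self, user_info):
    converted = dict(user_info)
    converted[USER_LOCATIONS] = ','.join(user_info[USER_LOCATIONS])
    converted[USER_EDUCATIONS] = ','.join(user_info[USER_EDUCATIONS])
    converted[USER_EMPLOYMENTS] = json.dumps(
        user_info[USER_EMPLOYMENTS], ensure_ascii=False)
    return converted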
def __init__(self):
    # Load configuration parameters
    self.config_init()
    # Initialize the database module
    self.DBConnectModule = DBConnector.DBConnectModule()
    # Initialize the BloomFilter module
    self.bloomFilterModule = BloomFilter.BloomFilter()
    # Initialize the user token cache
    self.userTokenCacheQueue = UserList.UserTokenCacheQueue(
        self.DBConnectModule)
    # Initialize the cache of pages awaiting analysis
    self.cacheQueue = DataParser.CacheQueue()
    # Initialize the data fetch module
    self.dataFetchModule = DataFetch.DataFetchModule(IS_PROXY_ENABLE)
    # Initialize the data parse module
    self.dataParseModule = DataParser.DataParseModule(
        self.DBConnectModule, self.userTokenCacheQueue, self.cacheQueue,
        self.bloomFilterModule)
    # Initialize the email service module
    if IS_EMAIL_NOTIFICATION_ENABLE is True:
        self.emailService = EmailService.EmailService(self.DBConnectModule)
    # Initialize the user info scrape threads
    self.user_info_scrape_thread_list = []
    for thread_count in range(USER_INFO_SCRAPE_THREAD_NUM):
        thread_name = 'Info-Thread' + str(thread_count)
        user_info_scrape_thread = UserInfoScrapeThread(
            thread_name, self.dataFetchModule, self.userTokenCacheQueue,
            self.cacheQueue, self.bloomFilterModule)
        self.user_info_scrape_thread_list.append(user_info_scrape_thread)
    # Initialize the user list scrape threads
    self.user_list_scrape_thread_list = []
    for thread_count in range(USER_LIST_SCRAPE_THREAD_NUM):
        thread_name = 'List-Thread' + str(thread_count)
        user_list_scrape_thread = UserListScrapeThread(
            thread_name, self.DBConnectModule, self.dataFetchModule,
            self.userTokenCacheQueue, self.cacheQueue)
        self.user_list_scrape_thread_list.append(user_list_scrape_thread)
    # Seed the queue with the start token, if one is configured
    if start_token != '':
        self.userTokenCacheQueue.add_token_into_cache_queue([start_token])
    if log.isEnabledFor(logging.DEBUG):
        log.debug("Spider core module initialized")
def user_info_scrape(self):
    # Bind a session to this thread
    self.data_fetch_module.thread_bind_session(self.thread_name)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('User info scrape thread [' + self.thread_name + '] running')
    while True:
        # Take a usable token from the unanalysed token cache queue
        while True:
            token = self.user_token_cache_queue.get_token_from_cache_queue()
            if token is not None:
                # Proceed once we hold a token that is not yet marked
                if self.is_token_available(token) is False:
                    break
            else:
                time.sleep(0.5)
        # Fetch the profile page of the user behind this token
        response = self.data_fetch_module.fetch_data_of_url(
            self.generate_user_info_url(token), self.thread_name)
        # Only hand the data on for analysis if the response is valid
        if response is not None:
            if response == 'reuse':
                # Put the token back into the queue
                self.user_token_cache_queue.add_token_into_cache_queue(
                    [token])
            else:
                # Enqueue the page for analysis
                self.cache_queue.add_data_into_user_info_cache_queue({
                    DataParser.QUEUE_ELEM_HTML: response.text,
                    DataParser.QUEUE_ELEM_TOKEN: token,
                    DataParser.QUEUE_ELEM_THREAD_NAME: self.thread_name
                })
        # Pause between scrapes
        time.sleep(SCRAPE_TIME_INTERVAL)
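# generate_user_info_url() is used above but defined elsewhere. A minimal
# sketch, assuming Zhihu's people-page URL scheme; the exact path is an
# assumption, not confirmed by this section.
def generate_user_info_url(self, token):
    return 'https://www.zhihu.com/people/' + token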
def __init__(self, db_connection):
    self.db_connection = db_connection
    self.analysed_cache_queue = queue.Queue(MAX_ANALYSED_CACHE_QUEUE_SIZE)
    self.cache_queue = queue.Queue(MAX_CACHE_QUEUE_SIZE)
    # Preload the queue of unanalysed user tokens
    token_total = self.db_connection.get_user_token_num()
    if token_total > 0:
        temp_list = self.get_token_from_db(REMAIN_CACHE_QUEUE_SIZE)
        for token in temp_list:
            self.cache_queue.put(token)
    # Preload the queue of analysed user tokens
    token_total = self.db_connection.get_analysed_token_num()
    if token_total > 0:
        temp_list = self.get_analysed_token_from_db(
            REMAIN_ANALYSED_CACHE_QUEUE_SIZE)
        for token in temp_list:
            self.analysed_cache_queue.put(token)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('User token cache queues initialized')
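# get_token_from_db() and get_analysed_token_from_db() are used above but
# not shown here. A sketch under the assumption that DBConnectModule
# exposes batch getters returning up to `num` tokens from persistent
# storage; both accessor names on db_connection are hypothetical.
def get_token_from_db(self, num):
    return self.db_connection.get_user_token(num) or []

def get_analysed_token_from_db(self, num):
    return self.db_connection.get_analysed_token(num) or []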
def send_message(email_content):
    # Build the message to send; strftime keeps the timestamp zero-padded
    now = datetime.datetime.now()
    header = SMTP_EMAIL_HEADER + now.strftime('[%m-%d %H:%M:%S]')
    msg = MIMEText(email_content, 'plain', 'utf-8')
    msg['from'] = SMTP_FROM_ADDR
    msg['to'] = SMTP_TO_ADDR
    msg['Subject'] = Header(header, 'utf-8').encode()
    # Send it
    try:
        smtp_server = smtplib.SMTP(SMTP_SERVER_HOST, SMTP_SERVER_PORT)
        smtp_server.login(SMTP_FROM_ADDR, SMTP_SERVER_PASSWORD)
        smtp_server.sendmail(SMTP_FROM_ADDR, [SMTP_TO_ADDR], msg.as_string())
        smtp_server.quit()
    except Exception as e:
        if log.isEnabledFor(logging.ERROR):
            log.error("Failed to send email")
            log.exception(e)
def start_user_info_data_parse_thread(self):
    self.user_info_data_parse_thread.start()
    if log.isEnabledFor(logging.DEBUG):
        log.debug("User info data parse thread started")
def parse_user_information(html_string, user_token):
    if html_string is None:
        return None
    # Extract the embedded json data
    bs_object = BeautifulSoup(html_string, 'html.parser')
    data_string = bs_object.find('div', attrs={'id': 'data'})
    if data_string is None:
        return None
    data_string = data_string['data-state']
    # String cleanup:
    # unescape html entities
    data_string = html.unescape(data_string)
    # strip any html tags mixed into the payload
    data_string = BeautifulSoup(data_string, 'html.parser').text
    # Convert to a json object
    try:
        # Guard against malformed JSON
        json_data = json.loads(data_string)
    except ValueError:
        if log.isEnabledFor(logging.DEBUG):
            log.debug('[error] parsed malformed json data')
        return None
    # Extract the entities
    if JSON_ENTITIES not in json_data:
        return None
    entities = json_data[JSON_ENTITIES]
    # Extract the users
    if JSON_USERS not in entities:
        return None
    users = entities[JSON_USERS]
    # Extract the target user
    if user_token not in users:
        return None
    user = users[user_token]
    # Extract the target user's profile fields
    avatar_url_template = None
    url_token = None
    name = None
    headline = None
    locations = []
    business = None
    employments = []
    educations = []
    description = None
    sina_weibo_url = None
    gender = None
    following_count = None
    follower_count = None
    answer_count = None
    question_count = None
    voteup_count = None
    if USER_AVATAR_URL_TEMPLATE in user:
        avatar_url_template = user[USER_AVATAR_URL_TEMPLATE]
    if USER_URL_TOKEN in user:
        url_token = user[USER_URL_TOKEN]
    if USER_NAME in user:
        name = user[USER_NAME]
    if USER_HEADLINE in user:
        headline = user[USER_HEADLINE]
    if USER_LOCATIONS in user:
        for location in user[USER_LOCATIONS]:
            locations.append(location['name'])
    if USER_BUSINESS in user:
        business = user[USER_BUSINESS]['name']
    if USER_EMPLOYMENTS in user:
        for employment in user[USER_EMPLOYMENTS]:
            elem = {}
            if 'job' in employment:
                elem.update({'job': employment['job']['name']})
            if 'company' in employment:
                elem.update({'company': employment['company']['name']})
            employments.append(elem)
    if USER_EDUCATIONS in user:
        for education in user[USER_EDUCATIONS]:
            if 'school' in education:
                educations.append(education['school']['name'])
    if USER_DESCRIPTION in user:
        description = user[USER_DESCRIPTION]
    if USER_SINAWEIBO_URL in user:
        sina_weibo_url = user[USER_SINAWEIBO_URL]
    if USER_GENDER in user:
        gender = user[USER_GENDER]
    if USER_FOLLOWING_COUNT in user:
        following_count = user[USER_FOLLOWING_COUNT]
    if USER_FOLLOWER_COUNT in user:
        follower_count = user[USER_FOLLOWER_COUNT]
    if USER_ANSWER_COUNT in user:
        answer_count = user[USER_ANSWER_COUNT]
    if USER_QUESTION_COUNT in user:
        question_count = user[USER_QUESTION_COUNT]
    if USER_VOTE_UP_COUNT in user:
        voteup_count = user[USER_VOTE_UP_COUNT]
    # Assemble the user info entity
    user_info = {
        USER_AVATAR_URL_TEMPLATE: avatar_url_template,
        USER_URL_TOKEN: url_token,
        USER_NAME: name,
        USER_HEADLINE: headline,
        USER_LOCATIONS: locations,
        USER_BUSINESS: business,
        USER_EMPLOYMENTS: employments,
        USER_EDUCATIONS: educations,
        USER_DESCRIPTION: description,
        USER_SINAWEIBO_URL: sina_weibo_url,
        USER_GENDER: gender,
        USER_FOLLOWING_COUNT: following_count,
        USER_FOLLOWER_COUNT: follower_count,
        USER_ANSWER_COUNT: answer_count,
        USER_QUESTION_COUNT: question_count,
        USER_VOTE_UP_COUNT: voteup_count
    }
    return user_info
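# For reference, the data-state payload that parse_user_information() walks
# is assumed to look roughly like the sketch below. Field names follow the
# JSON_*/USER_* constants; concrete key spellings and values are
# illustrative only, not taken from a captured page.
#
# {
#   "entities": {
#     "users": {
#       "<url_token>": {
#         "avatarUrlTemplate": "...",
#         "urlToken": "<url_token>",
#         "name": "...",
#         "headline": "...",
#         "locations": [{"name": "..."}],
#         "business": {"name": "..."},
#         "employments": [{"job": {"name": "..."}, "company": {"name": "..."}}],
#         "educations": [{"school": {"name": "..."}}],
#         "followingCount": 0,
#         "followerCount": 0
#       }
#     }
#   }
# }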
def start_user_list_data_parse_thread(self):
    self.user_list_data_parse_thread.start()
    if log.isEnabledFor(logging.DEBUG):
        log.debug('User list data parse thread started')
def start_spider(self):
    # Start the periodic email thread
    if IS_EMAIL_NOTIFICATION_ENABLE is True:
        self.emailService.start_email_notification_service()
    # Start the data parse threads
    self.dataParseModule.start_user_info_data_parse_thread()
    self.dataParseModule.start_user_list_data_parse_thread()
    # Start the user info scrape threads
    for user_info_scrape_thread in self.user_info_scrape_thread_list:
        user_info_scrape_thread.start()
    # Start the user list scrape threads
    for user_list_scrape_thread in self.user_list_scrape_thread_list:
        user_list_scrape_thread.start()
    if IS_EMAIL_NOTIFICATION_ENABLE is True:
        self.emailService.send_message("Spider started successfully")
    # Monitor the worker threads and restart any that failed
    while True:
        # Check the email service thread
        if IS_EMAIL_NOTIFICATION_ENABLE is True:
            if self.emailService.get_email_notification_service_status() == 'error':
                self.emailService.restart_email_notification_service()
                if log.isEnabledFor(logging.ERROR):
                    log.error('Email service thread restarted')
        # Check the user info parse thread
        if self.dataParseModule.get_user_info_data_parse_thread_status() == 'error':
            self.dataParseModule.restart_user_info_data_parse_thread()
            if log.isEnabledFor(logging.ERROR):
                log.error('User info parse thread restarted')
        # Check the user list parse thread
        if self.dataParseModule.get_user_list_data_parse_thread_status() == 'error':
            self.dataParseModule.restart_user_list_data_parse_thread()
            if log.isEnabledFor(logging.ERROR):
                log.error('User list parse thread restarted')
        # Check the user info scrape threads; iterate over a copy, since
        # failed threads are replaced in the list while we walk it
        for thread in list(self.user_info_scrape_thread_list):
            if thread.status == 'error':
                thread_name = thread.thread_name
                self.user_info_scrape_thread_list.remove(thread)
                new_thread = UserInfoScrapeThread(
                    thread_name, self.dataFetchModule,
                    self.userTokenCacheQueue, self.cacheQueue,
                    self.bloomFilterModule)
                self.user_info_scrape_thread_list.append(new_thread)
                new_thread.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error('User info scrape thread [' + thread_name + '] restarted')
        # Check the user list scrape threads
        for thread in list(self.user_list_scrape_thread_list):
            if thread.status == 'error':
                thread_name = thread.thread_name
                self.user_list_scrape_thread_list.remove(thread)
                new_thread = UserListScrapeThread(
                    thread_name, self.DBConnectModule, self.dataFetchModule,
                    self.userTokenCacheQueue, self.cacheQueue)
                self.user_list_scrape_thread_list.append(new_thread)
                new_thread.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error('User list scrape thread [' + thread_name + '] restarted')
        # Interval between checks
        time.sleep(180)
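# A minimal entry-point sketch. The class name SpiderCore is an assumption;
# substitute whatever this module actually names the class that owns
# __init__() and start_spider() above.
if __name__ == '__main__':
    spider = SpiderCore()  # hypothetical class name
    spider.start_spider()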
def user_list_scrape(self):
    # Bind a session to this thread
    self.data_fetch_module.thread_bind_session(self.thread_name)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('User list scrape thread [' + self.thread_name + '] running')
    while True:
        # Take a usable token from the analysed token cache queue
        while True:
            token_info = self.user_token_cache_queue.get_token_form_analysed_cache_queue()
            if token_info is not None:
                break
            time.sleep(0.5)
        # Analyse the following list
        if ANALYSE_FOLLOWING_LIST is True:
            # Work out the page range
            following_page_size = 1
            if DataParser.USER_FOLLOWING_COUNT in token_info:
                following_page_size = self.calculate_max_page(
                    token_info[DataParser.USER_FOLLOWING_COUNT])
            if 0 < FOLLOWING_PAGE_MAX < following_page_size:
                following_page_size = FOLLOWING_PAGE_MAX
            # Scrape page by page
            cur_page = 1
            while cur_page <= following_page_size:
                # Fetch the data
                following_list_response = self.data_fetch_module.fetch_data_of_url(
                    self.generate_following_list_url(
                        token_info[DataParser.USER_URL_TOKEN], cur_page),
                    self.thread_name)
                # Only hand the data on for analysis if the response is valid
                if following_list_response is not None:
                    if following_list_response == 'reuse':
                        # Retry the same page
                        continue
                    # Enqueue the page for analysis
                    self.cache_queue.add_data_into_user_list_cache_queue({
                        DataParser.QUEUE_ELEM_HTML:
                            following_list_response.text,
                        DataParser.QUEUE_ELEM_TOKEN:
                            token_info[DataParser.USER_URL_TOKEN],
                        DataParser.QUEUE_ELEM_THREAD_NAME:
                            self.thread_name
                    })
                cur_page += 1
                time.sleep(SCRAPE_TIME_INTERVAL)
        # Analyse the follower list
        if ANALYSE_FOLLOWER_LIST is True:
            # Work out the page range
            follower_page_size = 1
            if DataParser.USER_FOLLOWER_COUNT in token_info:
                follower_page_size = self.calculate_max_page(
                    token_info[DataParser.USER_FOLLOWER_COUNT])
            if follower_page_size > FOLLOWER_PAGE_MAX > 0:
                follower_page_size = FOLLOWER_PAGE_MAX
            # Scrape page by page
            cur_page = 1
            while cur_page <= follower_page_size:
                # Fetch the data
                follower_list_response = self.data_fetch_module.fetch_data_of_url(
                    self.generate_follower_list_url(
                        token_info[DataParser.USER_URL_TOKEN], cur_page),
                    self.thread_name)
                # Only hand the data on for analysis if the response is valid
                if follower_list_response is not None:
                    if follower_list_response == 'reuse':
                        # Retry the same page
                        continue
                    # Enqueue the page for analysis
                    self.cache_queue.add_data_into_user_list_cache_queue({
                        DataParser.QUEUE_ELEM_HTML:
                            follower_list_response.text,
                        DataParser.QUEUE_ELEM_TOKEN:
                            token_info[DataParser.USER_URL_TOKEN],
                        DataParser.QUEUE_ELEM_THREAD_NAME:
                            self.thread_name
                    })
                cur_page += 1
                time.sleep(SCRAPE_TIME_INTERVAL)
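# calculate_max_page() and the two URL generators used above are defined
# elsewhere. Sketches under the assumption that follow lists paginate 20
# users per page and use Zhihu's /following and /followers paths; the page
# size and paths are assumptions, not confirmed by this section.
def calculate_max_page(self, user_count):
    # Ceiling division by the assumed page size of 20
    return max(1, (user_count + 19) // 20)

def generate_following_list_url(self, token, page):
    return ('https://www.zhihu.com/people/' + token +
            '/following?page=' + str(page))

def generate_follower_list_url(self, token, page):
    return ('https://www.zhihu.com/people/' + token +
            '/followers?page=' + str(page))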