def delete_item(self):
    """Delete the cache entry currently selected in the Listbox.

    Asks the user to confirm twice before deleting the key, and shows
    an error dialog when Redis is not connected.

    :return: None
    """
    selection = self.cache_info_lb.curselection()
    if not selection:
        # Nothing selected: Listbox.get(()) would raise TclError, so bail out.
        return
    # Row layout: "<key> <type> ..." -- the key id is the first field.
    key_info_list = self.cache_info_lb.get(selection).split(' ')
    key_id = key_info_list[0]
    result = tk.messagebox.askquestion(
        '删除', '确定删除key为' + key_id + '的缓存?', icon='warning')
    if result != 'yes':
        return
    re_result = tk.messagebox.askquestion(
        '删除', '你刚才点了删除,真的要删除吗?', icon='warning')
    if re_result != 'yes':
        return
    redis = RedisUtil(int(self.db_index.get()))
    try:
        redis.del_key(key_id)
    except ConnectionError:
        tk.messagebox.showerror('错误', '没有连接Redis')
def print_key_value(self, event):
    """Show the data for the cache entry clicked in the Listbox.

    hash/set keys list their fields in the Listbox; scalar types
    (string/list/hash_field/set_field) print the value into the Text
    widget.

    :param event: Tkinter click event that triggered this callback (unused).
    :return: None
    """
    self.cache_content_text.delete('1.0', tk.END)
    selection = self.cache_info_lb.curselection()
    if not selection:
        # No row selected (e.g. click on empty area): nothing to show.
        return
    # Row layout: "<key> <type> <ttl or field name>".
    key_info_list = self.cache_info_lb.get(selection).split(' ')
    if len(key_info_list) < 3:
        # Malformed row -- indexing [2] below would raise IndexError.
        return
    key_id = key_info_list[0]
    key_type = key_info_list[1]
    third_value = key_info_list[2]  # TTL, or field name for hash/set rows
    redis = RedisUtil(int(self.db_index.get()))
    # hash/set: display every field of the key in the Listbox.
    if key_type in ('hash', 'set'):
        try:
            cache_info = redis.get_key_value(key_type, key_id, None)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        self.fillin_listbox(key_type, key_id, cache_info)
    elif key_type in ('string', 'list', 'hash_field', 'set_field'):
        try:
            key_value = redis.get_key_value(key_type, key_id, third_value)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        self.cache_content_text.insert(tk.INSERT, str(key_value))
def print_key_value(self, event):
    """Display the value of the cache entry clicked in the Listbox.

    :param event: Tkinter event for the click (unused).
    :return: None
    """
    self.cache_content_text.delete('1.0', tk.END)
    # Selected row layout: "<key> <type> <ttl or field name>".
    row = self.cache_info_lb.get(self.cache_info_lb.curselection())
    parts = row.split(' ')
    key_id = parts[0]
    key_type = parts[1]
    third_value = parts[2]  # TTL, or field name for hash/set entries
    redis = RedisUtil(int(self.db_index.get()))
    is_container = key_type in ('hash', 'set')
    is_scalar = key_type in ('string', 'list', 'hash_field', 'set_field')
    if is_container:
        # hash/set: list every field of the key in the Listbox.
        try:
            cache_info = redis.get_key_value(key_type, key_id, None)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        self.fillin_listbox(key_type, key_id, cache_info)
    elif is_scalar:
        # scalar: print the value into the Text widget.
        try:
            key_value = redis.get_key_value(key_type, key_id, third_value)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        self.cache_content_text.insert(tk.INSERT, str(key_value))
def __init__(self):
    """Wire up the crawler: HTTP session, account pool, storage and logging."""
    self.base_url = 'https://www.zhihu.com'
    # Used by is_login(): a 200 here means the session is authenticated.
    self.settings = 'https://www.zhihu.com/settings/profile'
    self.headers = {
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
        "Referer": 'http://www.zhihu.com/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
    }
    # Crawl starting point (user id); filled in lazily when the queue is empty.
    self.start_user = None
    # Redis set key: IDs of users already crawled successfully.
    self.pass_key = 'zhihu:pass'
    # Redis set key: IDs of users that failed to crawl.
    self.fail_key = 'zhihu:fail'
    # Redis list key: IDs of users waiting to be crawled.
    self.queue_key = 'user'
    # Pool of Zhihu account cookies to rotate through.
    self.pool = AccountPool()
    # requests session keeps cookies between requests.
    self.session = requests.session()
    # MongoDB stores the crawled user profiles.
    self.mongo = Mongo(database='zhihu')
    # Redis stores crawl-state bookkeeping (queue / pass / fail sets).
    self.redis = RedisUtil(host='localhost', port=6379, namespace='zhihu')
    # Logger configuration.
    logging.config.fileConfig("./Log/zhihu.conf")
    self.logger = logging.getLogger('zhihu')
    self.use_account()
def query_cache_info(self):
    """Query cache entries according to the current UI selections.

    Two modes:
    1. No data type chosen ('选择数据类型'): list every key in the
       selected DB (name, type, TTL) in the Listbox.
    2. A data type and key name given: fetch that key's value --
       hash/set fields go to the Listbox, scalar values to the Text
       widget together with a summary row.

    :return: None
    """
    if self.db_index.get() in self.all_db:
        redis = RedisUtil(int(self.db_index.get()))
    else:
        tk.messagebox.showerror('错误', '参数不正确:没有选择数据库')
        return
    type_variable_value = self.type_variable.get()
    query_condition_key = self.input_value.get()
    self.cache_content_text.delete('1.0', tk.END)
    self.cache_info_lb.delete(0, tk.END)
    # 2. Query a single key by explicit type and name.
    if type_variable_value in ('string', 'list', 'set', 'hash') and \
            self.check_none(query_condition_key):
        try:
            cache_info = redis.get_key_value(type_variable_value,
                                             query_condition_key, None)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        # hash/set: show all fields of the key in the Listbox.
        if type_variable_value in ('hash', 'set'):
            self.fillin_listbox(type_variable_value,
                                query_condition_key, cache_info)
            return
        # string/list: summary row (key, type, TTL) in the Listbox,
        # value in the Text widget.
        try:
            key_info_tuple = redis.get_key_info(query_condition_key)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        result = (query_condition_key + ' ' + type_variable_value +
                  ' 超时时间:' + key_info_tuple[1])
        self.cache_info_lb.insert(tk.END, result)
        self.cache_content_text.insert(tk.INSERT, str(cache_info))
    # 1. No type chosen: list every key (name, type, TTL) in the Listbox.
    elif type_variable_value == '选择数据类型':
        try:
            cache_info = redis.get_all_keys()
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        # Reset the snapshot once; clearing inside the loop would keep
        # only the last row.
        self.tmp_list.clear()
        for item in cache_info:
            self.tmp_list.append(item)  # temporary copy of the listed rows
            self.cache_info_lb.insert(tk.END, item)
def query_cache_info(self):
    """Run the cache query described by the query-condition widgets.

    :return: None
    """
    db_choice = self.db_index.get()
    if db_choice not in self.all_db:
        tk.messagebox.showerror('错误', '参数不正确:没有选择数据库')
        return
    redis = RedisUtil(int(db_choice))
    query_type = self.type_variable.get()
    query_key = self.input_value.get()
    self.cache_content_text.delete('1.0', tk.END)
    self.cache_info_lb.delete(0, tk.END)
    # 2. Look up one key by explicit type and name.
    if query_type in ('string', 'list', 'set', 'hash') and \
            self.check_none(query_key):
        try:
            cache_info = redis.get_key_value(query_type, query_key, None)
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        # hash/set values are field collections -> show them in the Listbox.
        if query_type in ('hash', 'set'):
            self.fillin_listbox(query_type, query_key, cache_info)
            return
        # string/list: summary row in the Listbox, value in the Text widget.
        self.cache_info_lb.delete(0, tk.END)
        try:
            key_info_tuple = redis.get_key_info(query_key)
            result = (query_key + ' ' + query_type +
                      ' 超时时间:' + key_info_tuple[1])
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        self.cache_info_lb.insert(tk.END, result)
        self.cache_content_text.insert(tk.INSERT, str(cache_info))
    # 1. No type chosen: list every key (name, type, TTL) in the Listbox.
    elif query_type == '选择数据类型':
        try:
            cache_info = redis.get_all_keys()
        except ConnectionError:
            tk.messagebox.showerror('错误', '没有连接Redis')
            return
        for item in cache_info:
            self.tmp_list.clear()
            self.tmp_list.append(item)  # temporary copy of the current row
            self.cache_info_lb.insert(tk.END, item)
def _handler_for_recent_games(self) -> str:
    """Return the games recently searched by this user.

    Returns:
        str: game names joined with newline separators.
    """
    store = RedisUtil(self.message.author)
    return store.get_all_games()
def _insert_game_in_db_if_exists(self):
    """Record a game search in Redis when the query mentions a game.

    Triggers on queries such as "!google apple games" or
    "!google game of thrones"; everything after the leading command
    word is stored as the game name. Does nothing otherwise.
    """
    if "game" not in self.user_message_lower:
        return
    # Drop the command token (e.g. "!google"); the remainder is the name.
    words = self.user_message_lower.split(" ")
    game_name = " ".join(words[1:])
    RedisUtil(self.message.author).insert_game(game_name)
def __init__(self):
    """Open a MySQL connection (host chosen by local IP) and a Redis client."""
    # From inside the lab network use the internal DB host; otherwise
    # fall back to the public address.
    if utils.get_host_ip() == '10.1.13.49':
        self.HOST = '10.1.13.29'
    else:
        self.HOST = '202.107.204.50'
    # NOTE(review): database credentials are hard-coded here; consider
    # loading them from configuration or the environment instead.
    self.conn = MySQLdb.connect(host=self.HOST, user='******',
                                passwd='tdlabDatabase', db='techpooldata',
                                port=3306, charset='utf8')
    # Join table per entity type.
    self.tables = {'paper': 'expert_paper_join',
                   'patent': 'expert_patent_join',
                   'project': 'expert_project_join'}
    # Primary-key column per entity type.
    self.columns = {'paper': 'PAPER_ID',
                    'patent': 'PATENT_ID',
                    'project': 'PROJECT_ID'}
    self.redis = RedisUtil()
def delete_item(self):
    """Delete the cache entry selected in the Listbox, with double confirmation.

    :return: None
    """
    # Row layout: "<key> <type> ..." -- the key id is the first field.
    selected_row = self.cache_info_lb.get(self.cache_info_lb.curselection())
    key_id = selected_row.split(' ')[0]
    first_answer = tk.messagebox.askquestion(
        '删除', '确定删除key为' + key_id + '的缓存?', icon='warning')
    if first_answer == 'yes':
        # Ask a second time before actually deleting.
        second_answer = tk.messagebox.askquestion(
            '删除', '你刚才点了删除,真的要删除吗?', icon='warning')
        if second_answer == 'yes':
            redis = RedisUtil(int(self.db_index.get()))
            try:
                redis.del_key(key_id)
            except ConnectionError:
                tk.messagebox.showerror('错误', '没有连接Redis')
def conn_redis(self):
    """Test the Redis connection using the values entered in the dialog.

    On success: close the Toplevel dialog, persist the settings to
    conf/redis_conf.cfg and show a success dialog. On failure: show a
    warning dialog.

    :return: None
    """
    env = self.env_value.get()
    host = self.host_value.get()
    port = self.port_value.get()
    password = self.password_value.get()
    connected = RedisUtil(None).testConnection(
        host=host, port=port, password=password)
    if not connected:
        tk.messagebox.showwarning('连接失败', '连接Redis失败!')
        return
    self.top_level.destroy()  # close the connection dialog
    # Build the config path portably instead of hard-coding '\\' separators,
    # which only works on Windows.
    conf_file_path = os.path.join(os.getcwd(), 'conf', 'redis_conf.cfg')
    RedisConf().write_cfg(file_path=conf_file_path, env=env,
                          host=host, port=port, password=password)
    tk.messagebox.showinfo('连接成功', '连接Redis成功!')
class ZhihuCrawler:
    """BFS crawler for Zhihu user profiles along the "following" graph.

    State is split across three backends: a Redis list is the BFS queue,
    two Redis sets record crawled / failed user IDs, and MongoDB stores
    the scraped profiles.
    """

    def __init__(self):
        """Wire up the crawler: HTTP session, account pool, storage and logging."""
        self.base_url = 'https://www.zhihu.com'
        # Used by is_login(): a 200 here means the session is authenticated.
        self.settings = 'https://www.zhihu.com/settings/profile'
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/41.0',
            "Referer": 'http://www.zhihu.com/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Host': 'www.zhihu.com',
        }
        # Crawl starting point (user id); filled in lazily when the queue is empty.
        self.start_user = None
        # Redis set key: IDs of users already crawled successfully.
        self.pass_key = 'zhihu:pass'
        # Redis set key: IDs of users that failed to crawl.
        self.fail_key = 'zhihu:fail'
        # Redis list key: IDs of users waiting to be crawled.
        self.queue_key = 'user'
        # Pool of Zhihu account cookies to rotate through.
        self.pool = AccountPool()
        # requests session keeps cookies between requests.
        self.session = requests.session()
        # MongoDB stores the crawled user profiles.
        self.mongo = Mongo(database='zhihu')
        # Redis stores crawl-state bookkeeping (queue / pass / fail sets).
        self.redis = RedisUtil(host='localhost', port=6379, namespace='zhihu')
        # Logger configuration.
        logging.config.fileConfig("./Log/zhihu.conf")
        self.logger = logging.getLogger('zhihu')
        self.use_account()

    ''' Switch to another account from the pool. '''

    def use_account(self):
        """Load the next account's cookie into the session.

        :return: True when an account was loaded and is logged in,
            False when the pool is empty or the cookie is stale.
        """
        cookie = self.pool.get()
        if cookie is None:
            self.logger.error('NO ACCOUNT')
            return False
        self.session.cookies.update(cookie)
        return self.is_login()

    ''' Check whether the session is currently logged in. '''

    def is_login(self):
        """Return True when the settings page answers 200 without a redirect."""
        # allow_redirects=False: an anonymous session gets a redirect, not 200.
        login_code = self.session.get(self.settings,
                                      headers=self.headers,
                                      allow_redirects=False).status_code
        return True if login_code == 200 else False

    ''' Fetch a user's basic profile, including the list of users they follow. '''

    def get_user_basic(self, username):
        """Scrape *username*'s profile page and parse the embedded JSON state.

        :param username: Zhihu url token of the user.
        :return: (user_info dict, list of followed usernames).
        :raises: propagates network/parse errors to the caller, which
            records the user in the fail set.
        """
        home_url = self.base_url + '/people/' + username + '/following'
        req = self.session.get(url=home_url, headers=self.headers, verify=True)
        soup = BeautifulSoup(req.text, 'lxml')
        user_info = dict()
        # Zhihu embeds the page state as JSON in <div id="data" data-state=...>.
        data = soup.find('div', id='data')['data-state']
        data = json.loads(data, encoding='utf-8')
        user = data['entities']['users'][username]
        # The entities dict also contains the followed users; drop self.
        followings = list(data['entities']['users'])
        followings.remove(username)
        img = soup.find('img', class_='Avatar Avatar--large UserAvatar-inner')
        user_info['avatar'] = img['src'] if img is not None else ''
        user_info['name'] = user['name']
        user_info['headline'] = user['headline']
        # NOTE(review): assumes gender is encoded as truthy=male -- confirm
        # against the Zhihu payload.
        user_info['gender'] = 'Male' if user['gender'] else 'Female'
        user_info['description'] = user['description']
        user_info['business'] = user['business'][
            'name'] if 'business' in user.keys() else ''
        user_info['answerCount'] = int(user['answerCount'])
        user_info['favoriteCount'] = int(user['favoriteCount'])
        user_info['thankedCount'] = int(user['thankedCount'])
        user_info['followerCount'] = int(user['followerCount'])
        user_info['followingCount'] = int(user['followingCount'])
        user_info['educations'] = list()
        user_info['employments'] = list()
        user_info['locations'] = list()
        for edu in user['educations']:
            info = dict()
            info['school'] = edu['school']['name'] if 'school' in edu.keys(
            ) else ''
            info['major'] = edu['major']['name'] if 'major' in edu.keys(
            ) else ''
            user_info['educations'].append(info)
        for loc in user['locations']:
            info = dict()
            info['name'] = loc['name']
            user_info['locations'].append(info)
        for em in user['employments']:
            info = dict()
            info['name'] = em['company']['name'] if 'name' in em.keys() else ''
            info['job'] = em['job']['name'] if 'job' in em.keys() else ''
            user_info['employments'].append(info)
        user_info['create_time'] = datetime.datetime.now()
        user_info['following'] = followings
        return user_info, followings

    '''
    BFS crawl of users along the "following" chain.
    depth: current level
    max_depth: maximum level
    '''

    def following_crawler(self, depth, max_depth=5):
        """Crawl users breadth-first up to *max_depth* levels.

        Depth is tracked by pushing sentinel markers '#0'..'#N' into the
        Redis queue between levels; hitting a marker advances the level.

        :param depth: current level; crawling stops when it exceeds max_depth.
        :param max_depth: deepest level to crawl.
        """
        if depth > max_depth:
            return
        # Sentinel markers that separate BFS levels inside the queue.
        # NOTE(review): only max_depth markers exist, yet index can reach
        # max_depth below -- depths[index + 1] may raise IndexError on the
        # last level; confirm intended bounds.
        depths = ['#{}'.format(i) for i in range(max_depth)]
        index = 0
        s_cnt = self.redis.ssize(self.pass_key)
        f_cnt = self.redis.ssize(self.fail_key)
        # Empty queue: seed it with a user asked for interactively.
        # NOTE(review): raw_input and e.message below are Python 2 only.
        if self.redis.get(self.queue_key) is None:
            self.start_user = raw_input('从谁开始爬? \n').strip()
            # NOTE(review): these two put() calls pass no queue key, unlike
            # every other call in this method -- verify RedisUtil.put's default.
            self.redis.put(self.start_user)
            self.redis.put('#0')
        while index <= max_depth:
            while not self.redis.empty(self.queue_key):
                username = self.redis.get(self.queue_key)
                # A depth marker means the current level is exhausted.
                try:
                    index = depths.index(username)
                    break
                except Exception as e:
                    pass
                # Skip users already crawled or already failed.
                if self.redis.sismem(self.pass_key, username) or self.redis.sismem(
                        self.fail_key, username):
                    continue
                self.logger.info('[{}]'.format(username))
                try:
                    basic, followings = self.get_user_basic(username)
                    self.redis.sadd_items(self.pass_key, username)
                    # Enqueue everyone this user follows for the next level.
                    self.redis.put(self.queue_key, *tuple(followings))
                    self.mongo.save_user(basic)
                    s_cnt += 1
                except Exception as e:
                    self.logger.info(e.message)
                    self.logger.info(
                        '--------{}--------failed'.format(username))
                    self.redis.sadd_items(self.fail_key, username)
                    f_cnt += 1
                # Zhihu's anti-crawler measures are aggressive and only two
                # accounts are available, so slow down periodically.
                if (f_cnt + s_cnt + 1) % 5 == 0:
                    self.logger.info(
                        '---------\nsleep at {}\n---------'.format(
                            datetime.datetime.now()))
                    time.sleep(5)
                if (f_cnt + s_cnt + 1) % 50 == 0:
                    self.logger.info(
                        '---------\nsleep at {}\n---------'.format(
                            datetime.datetime.now()))
                    time.sleep(15)
                # Rotate accounts every 25 users; abort when none works.
                if (f_cnt + s_cnt + 1) % 25 == 0:
                    if not self.use_account():
                        self.logger.error('Account Error')
                        raise Exception('Account Error')
                    else:
                        self.logger.info('--------\nchange account\n--------')
            # Level finished: push the next depth marker and advance.
            self.redis.put(self.queue_key, depths[index + 1])
            self.logger.info(
                '---------\nDepth {} crawled.\t Fail/Success: {}/{} got\n----------'
                .format(index, f_cnt, s_cnt))
            index = index + 1