class RedisManager():
    """Thin wrapper around a RedisBloom client used to de-duplicate URLs.

    Connection parameters are read from *settings* (keys ``REDIS_HOST``,
    ``REDIS_PORT``, ``REDIS_PASSWORD``); counters are reported through the
    *stats* collector (scrapy-style ``inc_value`` interface — assumed from
    usage, TODO confirm against caller).
    """

    def __init__(self, settings, stats):
        """Connect to the RedisBloom server described by *settings*.

        Connection failure is logged, not raised (best-effort, as before),
        but ``self.rb`` is now explicitly set to None instead of being left
        unbound, which previously surfaced later as a confusing
        AttributeError.
        """
        self.logger = logging.getLogger(__name__)
        self.settings = settings
        self.stats = stats
        redis_host = self.settings.get('REDIS_HOST')
        redis_port = self.settings.get('REDIS_PORT')
        redis_password = self.settings.get('REDIS_PASSWORD')
        try:
            self.rb = Client(host=redis_host, port=redis_port,
                             password=redis_password)
            self.logger.info("Successfully connected to redis server")
        except Exception as e:
            self.rb = None  # make the failure explicit for later calls
            self.logger.error("Unable to connect to redis server: %s", e)

    def _bf_add_url_(self, url):
        """Add *url* to the 'bf_urls' bloom filter and bump the stats counter.

        Errors are logged and swallowed (best-effort).
        """
        try:
            if self.rb.bfAdd('bf_urls', url):
                self.stats.inc_value('redis/bloomfilter/added_urls')
                self.logger.info("Added '%s' to bloomfilter.", url)
            else:
                # bfAdd returns 0 when the item was already present.
                self.logger.error("Couldn't add '%s' to bloomfilter", url)
        except Exception as e:
            self.logger.error(e)

    def _bf_check_url_pres_(self, url):
        """Return True if *url* is (probably) in the 'bf_urls' bloom filter.

        Bloom filters can report false positives but never false negatives.
        Wrapped in try/except for consistency with _bf_add_url_; a redis
        error is logged and treated as "not present".
        """
        try:
            if self.rb.bfExists('bf_urls', url):
                self.logger.debug("Found '%s' in bloomfilter", url)
                self.stats.inc_value('redis/bloomfilter/existing_urls')
                return True
            self.logger.debug("Couldn't find '%s' in bloomfilter", url)
            self.stats.inc_value('redis/bloomfilter/not_existing_urls')
            return False
        except Exception as e:
            self.logger.error(e)
            return False
from redisbloom.client import Client

# Demo of RedisBloom's bloom-filter commands. The redis server runs inside
# Docker on a VM, so we connect with the VM's hostname and exposed port.
rb = Client(host='node01', port=6379)

for site in ('baidu', 'google'):
    rb.bfAdd('urls', site)

print(rb.bfExists('urls', 'baidu'))     # out: 1
print(rb.bfExists('urls', 'tencent2'))  # out: 0

rb.bfMAdd('urls', 'a', 'b')
print(rb.bfMExists('urls', 'google', 'baidu', 'tencent'))  # out: [1, 1, 0]
def get_item(key, item):
    """Return whether *item* is present in the bloom filter stored at *key*.

    A short-lived client is drawn from the module-level connection pool.
    """
    return Client(connection_pool=pool).bfExists(key, item)
class Follow(object):
    """Crawler that collects the follow and fans lists of weibo.cn users.

    Collected (uri, nickname) pairs are de-duplicated through a RedisBloom
    bloom filter (key 'uidfilter') and appended to a timestamped txt file.
    """

    def __init__(self, config):
        """Initialize the Follow crawler from a config dict."""
        # Bloom-filter client used to de-duplicate user ids across runs.
        self.rb = Client()
        self.filter_redis_key = 'uidfilter'
        self.validate_config(config)
        self.cookie = {'Cookie': config['cookie']}
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            # A relative txt path is resolved against this file's directory.
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = self.get_user_list(user_id_list)
        self.user_id_list = user_id_list  # user_ids of the weibo users to crawl
        self.user_id = ''
        self.follow_list = []  # uri and nickname of every followed account found
        self.fans_list = []  # uri and nickname of every fan account found
        self.file_name = 'user_id_list' + str(time()) + '.txt'

    def validate_config(self, config):
        """Validate the configuration; exits the process when it is invalid."""
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list, list)) and (
                not user_id_list.endswith('.txt')):
            sys.exit(u'user_id_list值应为list类型或txt文件路径')
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                sys.exit(u'不存在%s文件' % user_id_list)

    def deal_html(self, url):
        """Fetch *url* and return an lxml selector (None on request error)."""
        try:
            html = requests.get(url, cookies=self.cookie,
                                verify=False).content
            selector = etree.HTML(html)
            return selector
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_page_num(self):
        """Return the number of pages in the follow list."""
        url = "https://weibo.cn/%s/follow" % self.user_id
        selector = self.deal_html(url)
        # The hidden 'mp' input is only rendered when there are multiple pages.
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = (int)(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_one_page(self, page):
        """Collect the user ids on page *page* of the follow list."""
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/follow?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if (page == 1 and len(table_list) == 0):
            print(u'cookie无效或提供的user_id无效')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # if {'uri': uri, 'nickname': nickname} not in self.follow_list:
                # De-duplicate via the bloom filter instead of a list scan.
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.follow_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_follow_list(self):
        """Crawl every page of the current user's follow list."""
        page_num = self.get_page_num()
        print(u'用户关注页数:' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            self.get_one_page(page)
            # Sleep every 1-5 pages (randomized) to avoid being rate limited.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        print(u'用户关注列表爬取完毕')

    def get_fans_page_num(self):
        """Return the number of pages in the fans list."""
        url = "https://weibo.cn/%s/fans" % self.user_id
        selector = self.deal_html(url)
        if selector.xpath("//input[@name='mp']") == []:
            page_num = 1
        else:
            page_num = (int)(
                selector.xpath("//input[@name='mp']")[0].attrib['value'])
        return page_num

    def get_fans_one_page(self, page):
        """Collect the user ids on page *page* of the fans list."""
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/fans?page=%d' % (self.user_id, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if (page == 1 and len(table_list) == 0):
            print(u'cookie无效或提供的user_id无效')
        else:
            for t in table_list:
                im = t.xpath('.//a/@href')[-1]
                uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
                nickname = t.xpath('.//a/text()')[0]
                # if {'uri': uri, 'nickname': nickname} not in self.fans_list:
                # De-duplicate via the bloom filter instead of a list scan.
                if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                    self.rb.bfAdd(self.filter_redis_key, uri)
                    self.fans_list.append({'uri': uri, 'nickname': nickname})
                    print(u'%s %s' % (nickname, uri))

    def get_fans_list(self):
        """Crawl every page of the current user's fans list."""
        page_num = self.get_fans_page_num()
        print(u'用户关注页数:' + str(page_num))
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            self.get_fans_one_page(page)
            # Sleep every 1-5 pages (randomized) to avoid being rate limited.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        print(u'用户粉丝列表爬取完毕')

    def write_to_txt(self):
        """Append the collected follow/fans entries to the output file."""
        with open(self.file_name, 'ab') as f:
            for user in self.follow_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))
            for user in self.fans_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))

    def get_user_list(self, file_name):
        """Read weibo user ids (first space-separated token of each line,
        digits only, de-duplicated) from *file_name*."""
        with open(file_name, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序' % file_name)
        user_id_list = []
        for line in lines:
            info = line.split(' ')
            if len(info) > 0 and info[0].isdigit():
                user_id = info[0]
                if user_id not in user_id_list:
                    user_id_list.append(user_id)
        return user_id_list

    def initialize_info(self, user_id):
        """Reset per-user crawl state before processing *user_id*."""
        self.follow_list = []
        self.fans_list = []
        self.user_id = user_id

    def check_unique(self, user_id):
        """Check whether user_id has already been saved (not implemented)."""

    def start(self):
        """Run the crawler over every configured user id."""
        for user_id in self.user_id_list:
            self.initialize_info(user_id)
            print(u'开始抓取:' + user_id)
            print('*' * 100)
            try:
                self.get_follow_list()  # crawl the follow list
                self.get_fans_list()  # crawl the fans list
            except Exception as e:
                print('Error: ', e)
                traceback.print_exc()
                sleep(10)  # on error, skip this user instead of exiting
            self.write_to_txt()
            print(u'信息抓取完毕')
            print('*' * 100)
class Redis(BaseDb):
    """Proxy storage backed by redis.

    Each proxy is stored as a redis hash under key ``Proxy:IP:port`` with
    fields proxy_type, protocol and score. A RedisBloom bloom filter (named
    through :attr:`filter_name`) provides fast duplicate detection.
    """

    # Tuple form. The original string spelling ('_filter_name') happens to
    # declare the same single slot, but it is an error-prone idiom.
    __slots__ = ('_filter_name',)

    @property
    def filter_name(self):
        """Name of the bloom filter used by the bf_* helpers."""
        return self._filter_name

    @filter_name.setter
    def filter_name(self, value):
        self._filter_name = value

    def __init__(self, host, pwd=None, port=6379, db=0):
        """Store connection parameters; call connect_to_redis() to connect."""
        super().__init__()
        self.host = host
        self.pwd = pwd
        self.port = port
        self.db = db
        self._filter_name = ''

    def connect_to_redis(self):
        """Open the RedisBloom connection.

        :return: True on success, False on failure (error is printed).
        """
        try:
            self.conn = Client(host=self.host,
                               port=self.port,
                               db=self.db,
                               password=self.pwd)
        except Exception as e:
            print(e)
            return False
        return True

    def gen_key_name(self, record):
        """Build the hash key 'Proxy:ip:port' for *record*.

        :return: the key string, or None if 'ip' or 'port' is missing.
        """
        if 'ip' in record and 'port' in record:
            return 'Proxy:%s:%s' % (record['ip'], record['port'])
        return None

    def exists(self, key_name):
        """Plain key-existence check, kept only for comparison with the
        bloom filter; not used in practice.

        :return: 0 (absent) / 1 (present)
        """
        return self.conn.exists(key_name)

    def delete(self, key_name):
        """Delete a single key."""
        return self.conn.delete(key_name)

    def delete_all(self):
        """Flush the entire current db."""
        return self.conn.flushdb()

    def hmset(self, record, validate_time):
        """Store *record* as a hash and set its TTL to *validate_time* seconds.

        :raises InvalidFieldException: when a required field is missing.
        """
        valid_fields = ['ip', 'port', 'proxy_type', 'protocol', 'score']
        for single_valid_field in valid_fields:
            if single_valid_field not in record:
                raise InvalidFieldException(single_valid_field)
        key_name = self.gen_key_name(record)
        field_value = {
            'proxy_type': record['proxy_type'],
            'protocol': record['protocol'],
            'score': record['score'],
        }
        self.conn.hmset(key_name, field_value)
        self.conn.expire(key_name, validate_time)

    def multi_hmet(self, records, validate_time):
        """hmset() every record in *records*.

        NOTE: the name 'multi_hmet' is a historical typo, kept for
        backward compatibility with existing callers.
        """
        for single_record in records:
            self.hmset(single_record, validate_time)

    def time_interval_in_seconds(self, old_date_time, new_date_time):
        """Return new_date_time - old_date_time in whole seconds.

        Both arguments may be datetime objects or '%Y-%m-%d %H:%M:%S'
        strings (strings are parsed first).

        :raises ValueError: when an argument is neither type.
        """
        if not helper.match_expect_type(old_date_time, 'datetime.datetime'):
            if helper.match_expect_type(old_date_time, 'str'):
                old_date_time = datetime.datetime.strptime(
                    old_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('old_date_time的格式不正确')
        if not helper.match_expect_type(new_date_time, 'datetime.datetime'):
            if helper.match_expect_type(new_date_time, 'str'):
                new_date_time = datetime.datetime.strptime(
                    new_date_time, '%Y-%m-%d %H:%M:%S')
            else:
                raise ValueError('new_date_time的格式不正确')
        return int((new_date_time - old_date_time).total_seconds())

    def expire(self, key_name, ttl):
        """Set a TTL (seconds) on *key_name*."""
        return self.conn.expire(key_name, ttl)

    def bf_create(self, fpp=0.001, capacity=1000, expansion=1):
        """Create the bloom filter named by filter_name.

        :param fpp: target false-positive probability
        :param capacity: number of elements the filter is sized for
        :param expansion: growth factor of each sub-filter created once the
            current one fills up (1 = same size)
        :return: 1 on success, 0 if the filter already exists
        """
        try:
            self.conn.bfCreate(key=self._filter_name,
                               errorRate=fpp,
                               capacity=capacity,
                               expansion=expansion)
        except redis.exceptions.ResponseError:
            # filter already exists
            return 0
        return 1

    def bf_madd(self, records):
        """Add the key name of every record in *records* to the bloom filter.

        Bug fix: the previous implementation concatenated all key names into
        one string and inserted a single bogus item; now one item is added
        per record via bfMAdd's varargs.
        """
        items = [self.gen_key_name(single_record) for single_record in records]
        self.conn.bfMAdd(self._filter_name, *items)

    def bf_add(self, record):
        """Add one record's key name to the bloom filter.

        Uses bfAdd (single item) instead of the previous bfMAdd call; the
        inserted item is identical either way.
        """
        item = self.gen_key_name(record)
        self.conn.bfAdd(self._filter_name, item)

    def bf_exists(self, item):
        """Return 0/1 for membership of *item* in the bloom filter."""
        return self.conn.bfExists(self._filter_name, item)

    def bf_mexists(self, items):
        """Return a list of 0/1 flags, one per element of the *items* list
        (unpacked into bfMExists as varargs)."""
        return self.conn.bfMExists(self._filter_name, *items)
class FullLayeredCache(LayeredCache):
    """
    Multi-Layered key value store with bloom filter and dgraph.

    Layer 1: In Memory LRU Key Value Map
    Layer 2: Redis Key Value Store
    Layer 3: Bloom filter
    Layer 4: DGraph

    The primary difference between this class and the LayeredCache class is
    that this one includes the bloom filter and DGraph.
    """

    def __init__(self, node_name: str, lru_size: int, p=1.0e-6, n=1000000):
        """
        Initialize last two layers of cache
        :param node_name: dgraph predicate name, also used as the bloom
            filter name
        :param lru_size: size of the layer-1 LRU map
        :param p: bloom filter false-positive probability
        :param n: bloom filter capacity
        """
        super(FullLayeredCache, self).__init__(node_name, lru_size)
        # Set to true so we add a timeout to layer 2 redis key value stores
        self.set_timeout = True
        # Create the bloom filter client object
        self.bloom = RedisBloom(port=6378)
        # Create a dgraph client, stub, and transaction
        self.dgraph, self.stub = get_client()
        self.txn = self.dgraph.txn()
        # Initialize the bloom filter (if it doesnt already exist)
        try:
            self.bloom.bfInfo(node_name)
        except exceptions.ResponseError:
            self.bloom.bfCreate(node_name, p, n)

    def _dgraph_uid(self, key: str) -> Union[str, None]:
        """
        Layer 4: query dgraph for *key* and return its uid, or None when no
        node matches. This is the (very slow) path shared by __contains__
        and __getitem__.
        """
        query = """query all($a: string) {
            all(func: eq(%s, $a)) {
                uid
            }
        }""" % self.node_name
        dgraph_result = self.txn.query(query, variables={"$a": str(key)})
        matches = json.loads(dgraph_result.json)["all"]
        if matches:
            return matches[0]["uid"]
        return None

    def __contains__(self, key: str) -> bool:
        """
        Check each layer in order for *key*, backfilling earlier layers on a
        dgraph hit. Returns True when found at any layer, False otherwise.
        """
        # Check layer 1 and 2
        if super(FullLayeredCache, self).__contains__(key):
            return True
        # Check the layer 3 bloom filter. It stores membership only, not
        # values, so earlier layers cannot be backfilled from here.
        if self.bloom.bfExists(self.node_name, self._get_key(key)) == 1:
            return True
        # All else has failed, we must now check dgraph. This is super slow.
        uid = self._dgraph_uid(key)
        if uid is not None:
            # Update previous layers
            self[key] = uid
            return True
        # Cache miss, return False
        return False

    def __getitem__(self, key: str) -> Union[str, None]:
        """
        Return the value for *key*, checking each layer in turn, or None on
        a miss. Earlier layers are updated when the value is found in
        dgraph.
        """
        # Check layer 1 and 2
        item = super(FullLayeredCache, self).__getitem__(key)
        if item is not None:
            return item
        # Bug fix: a layer-3 bloom-filter hit previously returned the bool
        # True, violating the declared str-or-None contract (and bloom
        # filters can report false positives). The filter cannot yield a
        # value, so go straight to dgraph for the real uid.
        uid = self._dgraph_uid(key)
        if uid is not None:
            # Update previous layers
            self[key] = uid
            return uid
        # Cache miss, return None
        return None

    def close(self):
        """
        Close all outstanding connections
        :return:
        """
        # Close the layer 2 redis connection
        super(FullLayeredCache, self).close()
        # Close layer 3 bloom filter connection
        self.bloom.close()
        # Close layer 4 dgraph connections
        self.stub.close()