def load_data():
    """Bootstrap the Redis stack: connect, create timeseries and compaction
    rules, register RedisGears scripts, and optionally seed a bloom filter.

    Connection settings come from the REDIS_SERVER / REDIS_PORT /
    REDIS_PASSWORD environment variables, defaulting to localhost:6379
    with an empty password.
    """
    # environ.get's default argument replaces the original if/else ladders.
    redis_server = environ.get('REDIS_SERVER', 'localhost')
    redis_port = int(environ.get('REDIS_PORT', 6379))
    redis_password = environ.get('REDIS_PASSWORD', '')

    rdb = redis.Redis(host=redis_server, port=redis_port, password=redis_password)
    rb = RedisBloom(host=redis_server, port=redis_port, password=redis_password)
    rts = RedisTimeseries(host=redis_server, port=redis_port, password=redis_password)

    rdb.set("CONFIG", "YES")

    # Short-retention (60 s) staging series, compacted into the 24 h 'Final'
    # series once per second by the 'last' aggregation rules below.
    rts.create('s-unfiltered', retention_ms=60000)
    rts.create('s-filtered', retention_ms=60000)
    rts.create('unfiltered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.create('filtered', labels={'Type': 'Final'}, retention_ms=86400000)
    rts.createrule('s-unfiltered', 'unfiltered', 'last', 1000)
    rts.createrule('s-filtered', 'filtered', 'last', 1000)

    # Register RedisGears scripts. `with` fixes the original's file-handle
    # leak when RG.PYEXECUTE raised before close().
    for gear in ['./dedup.py']:
        with open(gear, mode='r') as file:
            rdb.execute_command('RG.PYEXECUTE', file.read())

    # Optionally seed a bloom filter with the Scrabble word list.
    if environ.get('REDIS_SCRABBLE') is not None:
        for line in fileinput.input("2019_Collins_Scrabble_Words.txt"):
            rb.bfAdd("Scrabble-Bloom", line.rstrip())
class Follow(object):
    def __init__(self, config):
        """Initialize the Follow crawler from a config dict.

        `config` must supply 'cookie' (weibo.cn session cookie) and
        'user_id_list' (a list of user ids, or the path of a txt file
        holding one id per line).
        """
        self.rb = Client()  # RedisBloom client used to dedupe uids
        self.filter_redis_key = 'uidfilter'
        self.validate_config(config)
        self.cookie = {'Cookie': config['cookie']}
        user_id_list = config['user_id_list']
        if not isinstance(user_id_list, list):
            # Resolve a relative txt path against this script's directory.
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            user_id_list = self.get_user_list(user_id_list)
        self.user_id_list = user_id_list  # weibo user_ids to crawl
        self.user_id = ''  # id of the user currently being crawled
        self.follow_list = []  # uri/nickname of every followed account found
        self.fans_list = []  # uri/nickname of every fan found
        self.file_name = 'user_id_list' + str(time()) + '.txt'

    def validate_config(self, config):
        """Exit with a message if the config is invalid."""
        user_id_list = config['user_id_list']
        if (not isinstance(user_id_list, list)) and (
                not user_id_list.endswith('.txt')):
            sys.exit(u'user_id_list值应为list类型或txt文件路径')
        if not isinstance(user_id_list, list):
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            if not os.path.isfile(user_id_list):
                sys.exit(u'不存在%s文件' % user_id_list)

    def deal_html(self, url):
        """Fetch `url` with the session cookie and return an lxml selector.

        Returns None if the request fails (callers will then raise
        AttributeError on .xpath — behavior kept from the original).
        """
        try:
            html = requests.get(url, cookies=self.cookie, verify=False).content
            return etree.HTML(html)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def _page_count(self, kind):
        """Return the page count of the `kind` ('follow' or 'fans') listing.

        The page count is read from the pagination <input name='mp'> field;
        a missing field means a single page.
        """
        url = "https://weibo.cn/%s/%s" % (self.user_id, kind)
        selector = self.deal_html(url)
        mp = selector.xpath("//input[@name='mp']")
        if mp == []:
            return 1
        return int(mp[0].attrib['value'])

    def _scrape_page(self, kind, page, bucket):
        """Scrape one page of the `kind` listing, appending new users to `bucket`.

        Uids already present in the shared bloom filter are skipped, so a
        user appearing in both follow and fans lists is recorded once.
        """
        print(u'%s第%d页%s' % ('-' * 30, page, '-' * 30))
        url = 'https://weibo.cn/%s/%s?page=%d' % (self.user_id, kind, page)
        selector = self.deal_html(url)
        table_list = selector.xpath('//table')
        if page == 1 and len(table_list) == 0:
            # No tables on the first page: the cookie or user_id is bad.
            print(u'cookie无效或提供的user_id无效')
            return
        for t in table_list:
            im = t.xpath('.//a/@href')[-1]
            uri = im.split('uid=')[-1].split('&')[0].split('/')[-1]
            nickname = t.xpath('.//a/text()')[0]
            if self.rb.bfExists(self.filter_redis_key, uri) == 0:
                self.rb.bfAdd(self.filter_redis_key, uri)
                bucket.append({'uri': uri, 'nickname': nickname})
                print(u'%s %s' % (nickname, uri))

    def _crawl(self, page_num, page_fn, done_msg):
        """Crawl pages 1..page_num with `page_fn`, sleeping every few pages.

        A 6-10 s pause is inserted after a random run of 1-5 pages to avoid
        rate limiting.
        """
        print(u'用户关注页数:' + str(page_num))
        last_pause = 0
        pages_until_pause = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc=u'关注列表爬取进度'):
            page_fn(page)
            if page - last_pause == pages_until_pause and page < page_num:
                sleep(random.randint(6, 10))
                last_pause = page
                pages_until_pause = random.randint(1, 5)
        print(done_msg)

    def get_page_num(self):
        """Return the number of pages in the follow list."""
        return self._page_count('follow')

    def get_one_page(self, page):
        """Collect user_ids from page `page` of the follow list."""
        self._scrape_page('follow', page, self.follow_list)

    def get_follow_list(self):
        """Crawl every page of the followed-accounts list."""
        self._crawl(self.get_page_num(), self.get_one_page,
                    u'用户关注列表爬取完毕')

    def get_fans_page_num(self):
        """Return the number of pages in the fans list."""
        return self._page_count('fans')

    def get_fans_one_page(self, page):
        """Collect user_ids from page `page` of the fans list."""
        self._scrape_page('fans', page, self.fans_list)

    def get_fans_list(self):
        """Crawl every page of the fans list."""
        self._crawl(self.get_fans_page_num(), self.get_fans_one_page,
                    u'用户粉丝列表爬取完毕')

    def write_to_txt(self):
        """Append all collected follow/fans entries to the output txt file."""
        # NOTE(review): sys.stdout.encoding can be None when stdout is
        # redirected — kept from the original; confirm before relying on it.
        with open(self.file_name, 'ab') as f:
            for user in self.follow_list + self.fans_list:
                f.write((user['uri'] + ' ' + user['nickname'] + '\n').encode(
                    sys.stdout.encoding))

    def get_user_list(self, file_name):
        """Read weibo user ids from `file_name` (first token of each line).

        Lines whose first space-separated token is not all digits are
        skipped; duplicates are dropped while preserving order.
        """
        with open(file_name, 'rb') as f:
            try:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8-sig') for line in lines]
            except UnicodeDecodeError:
                sys.exit(u'%s文件应为utf-8编码,请先将文件编码转为utf-8再运行程序'
                         % file_name)
        user_id_list = []
        for line in lines:
            info = line.split(' ')
            if len(info) > 0 and info[0].isdigit():
                user_id = info[0]
                if user_id not in user_id_list:
                    user_id_list.append(user_id)
        return user_id_list

    def initialize_info(self, user_id):
        """Reset per-user state before crawling `user_id`."""
        self.follow_list = []
        self.fans_list = []
        self.user_id = user_id

    def check_unique(self, user_id):
        """Check whether user_id has already been saved (not implemented)."""

    def start(self):
        """Run the crawler over every configured user id."""
        for user_id in self.user_id_list:
            self.initialize_info(user_id)
            print(u'开始抓取:' + user_id)
            print('*' * 100)
            try:
                self.get_follow_list()  # crawl the follow list
                self.get_fans_list()  # crawl the fans list
            except Exception as e:
                print('Error: ', e)
                traceback.print_exc()
                sleep(10)  # skip this user on error instead of exiting
            self.write_to_txt()
            print(u'信息抓取完毕')
            print('*' * 100)