# Entry point for the Weibo scheduler further down; Scheduler and the MongoClient
# wrapper are provided by the project's own modules.
def main():
    s = Scheduler()
    print('Program started running...')
    # s.run('1266321801')
    db = MongoClient()
    while True:
        user = db.find_one_flag()
        if user:
            s.run(user['user_id'])
        else:
            print('All users have been crawled.')
            break
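main() and the Scheduler class further down both go through the project's MongoClient wrapper, which is not shown in this section. A minimal sketch of what it might look like, assuming a thin layer over pymongo with a flag field marking users that still need crawling (the collection name and method bodies are assumptions, not the project's actual implementation):

import pymongo


class MongoClient(object):
    """Assumed pymongo-based wrapper; database, collection and field names are guesses."""

    def __init__(self, host='localhost', port=27017, db='weibo'):
        self._users = pymongo.MongoClient(host, port)[db]['users']

    def find_one_flag(self):
        # next user that has been discovered but not yet crawled
        return self._users.find_one({'flag': False})

    def find(self, user_id):
        return self._users.find_one({'user_id': user_id})

    def save(self, data):
        # upsert the crawled profile data
        self._users.update_one({'user_id': data['user_id']}, {'$set': data}, upsert=True)

    def save_first(self, data):
        # only insert newly discovered users, never overwrite an existing record
        self._users.update_one({'user_id': data['user_id']}, {'$setOnInsert': data}, upsert=True)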
import requests
import chardet

# HEADERS, DATA and the MongoClient / RedisClient wrappers are assumed to be
# provided by the project's own config and storage modules.


class WeiBoSpider(object):
    def __init__(self):
        self.url = 'https://passport.weibo.cn/sso/login'
        self.s = requests.Session()
        self.headers = HEADERS
        self.data = DATA
        self.followers_url = 'https://m.weibo.cn/api/container/getSecond?containerid={containerid}_-_FOLLOWERS&page={page}'
        self.weibo_url = 'https://m.weibo.cn/api/container/getIndex?containerid={containerid}_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page={page}'
        self.mongoclient = MongoClient()
        self.redisclient = RedisClient()

    def get_uid(self):
        # Log in via the SSO endpoint and derive the two containerids from the uid.
        response = self.s.post(self.url, headers=self.headers, data=self.data)
        if response.status_code != 200:
            print('Request failed:', response.status_code)
            return
        response.encoding = chardet.detect(response.content)['encoding']
        uid = response.json()['data']['uid']
        self.followers_containerid = '100505' + str(uid)
        self.weibo_containerid = '230413' + str(uid)

    def make_followers_kw(self, followers_page=1):
        followers_kw = (self.followers_containerid, followers_page)
        self.get_followers_page(followers_kw)

    def make_weibo_kw(self, weibo_page=1):
        weibo_kw = (self.weibo_containerid, weibo_page)
        self.get_followers_page(weibo_kw)

    def get_followers_page(self, kw):
        # Dispatch on the containerid prefix: 100505... is the followers list,
        # anything else is the weibo (status) list.
        if kw[0].startswith('100505'):
            url = self.followers_url.format(containerid=kw[0], page=kw[1])
            info_list = self.s.get(url).json()
            self.parse_followers_info(info_list)
        else:
            url = self.weibo_url.format(containerid=kw[0], page=kw[1])
            info_list = self.s.get(url).json()
            self.parse_weibo_info(info_list)

    def parse_followers_info(self, followers_list):
        # cardlistInfo.page already points at the next page to request.
        current_page = followers_list['cardlistInfo']['page']
        flag = followers_list['title']
        maxpage = followers_list['maxPage'] + 1
        followers = followers_list['cards']
        for follower in followers:
            user = follower['user']
            res = {}  # fresh dict per follower so earlier records are not overwritten
            res['flag'] = flag
            res['name'] = user['screen_name']
            res['id'] = user['id']
            res['profile_url'] = user['profile_url']
            res['weibo_count'] = user['statuses_count']
            res['verify_info'] = user.get('verified_reason')
            res['description'] = user['description']
            res['gender'] = '男' if user['gender'] == 'm' else '女'  # '男' = male, '女' = female
            res['followers_count'] = user['followers_count']
            res['follow_count'] = user['follow_count']
            print(res)
            self.mongoclient.save_to_mongo(res)
        if current_page < maxpage:
            self.make_followers_kw(followers_page=current_page)

    def parse_weibo_info(self, weibo_list):
        current_page = weibo_list['cardlistInfo']['page']
        blog_total = weibo_list['cardlistInfo']['total']
        print(current_page)
        blog_list = weibo_list['cards']
        if current_page == 2:
            blogs = blog_list[1:]
        else:
            blogs = blog_list
        for blog in blogs:
            blog_info = blog['mblog']
            res = {}
            res['blog_content_url'] = blog['scheme']
            # Handles both "retweet with comment" and "plain retweet" posts.
            res['blog_title'] = blog_info['retweeted_status']['page_info'][
                'content1'] if blog_info.get('raw_text') and blog_info[
                    'retweeted_status'].get('page_info') else blog_info['text']
            res['is_privacy'] = blog_info['title']['text']
            res['attitudes_count'] = blog_info['attitudes_count']
            res['create_time'] = blog_info['created_at']
            res['comments_count'] = blog_info['comments_count']
            res['reads_count'] = blog_info['reads_count']
            res['source'] = blog_info['source']
            res['location'] = blog_info.get('page_info').get(
                'page_title') if blog_info.get('page_info') else None
            print(res)
            self.mongoclient.save_to_mongo(res)
        if current_page and (current_page - 1) * 10 < blog_total:
            self.make_weibo_kw(weibo_page=current_page)
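A minimal usage sketch for the spider above, assuming HEADERS and DATA hold valid SSO login parameters; it logs in once and then kicks off both crawls from page 1:

if __name__ == '__main__':
    spider = WeiBoSpider()
    spider.get_uid()            # log in and derive the two containerids
    spider.make_followers_kw()  # crawl the followers list starting from page 1
    spider.make_weibo_kw()      # crawl the user's weibo list starting from page 1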
import base64
import json
import time

import requests
from lxml import etree
from Crypto.Cipher import AES

# headers, proxies, g, base_list_url, song_info_url and the MongoClient wrapper
# are assumed to be defined at module level elsewhere in the project.


class wangyimusic_comment(object):
    def __init__(self):
        self.s = requests.Session()
        self.s.headers.update(headers)
        self.s.proxies = proxies
        self.offset = 0
        self._db = MongoClient()
        # The first parameter: the payload posted to the comment API.
        self.d = {"rid": None, "offset": 0, "total": "true", "limit": "20", "csrf_token": ""}
        # Stores the url of every playlist; Redis would be a better place for this.
        self.seen = set()

    def get_params(self):
        iv = '0102030405060708'
        # first_encText = generatestring()
        # g is a module-level AES key constant defined elsewhere in the project.
        h_encText = self.aes_encrypt(str(self.d), g, iv)
        h_encText = self.aes_encrypt(h_encText, 'FFFFFFFFFFFFFFFF', iv)
        return h_encText

    # The only variable in the encSecKey computation is the 16-character random
    # string; since we replace it with sixteen 'F's, every call yields the same
    # value. The constant below was captured by reproducing the call in the
    # browser console.
    def get_encSecKey(self):
        encSecKey = '257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c'
        return encSecKey

    # The AES key must be 16 (AES-128), 24 (AES-192) or 32 (AES-256) bytes long;
    # AES-128 is enough here.
    def aes_encrypt(self, text, key, iv):
        encryptor = AES.new(key.encode(), AES.MODE_CBC, iv.encode())
        # The plaintext must be a multiple of 16 bytes, so pad it PKCS#7-style.
        miss_length = 16 - len(text) % 16
        text = text + miss_length * chr(miss_length)
        encrypt_text = encryptor.encrypt(text.encode())
        # base64-encode the raw ciphertext so it can be sent as a plain form field.
        encrypt_text = base64.b64encode(encrypt_text)
        return encrypt_text.decode('utf-8')

    def get_list_page(self, url):
        response = self.s.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print('Error requesting url:', response.status_code)

    def parse_list(self, response):
        page = etree.HTML(response)
        links = page.xpath(".//*[@id='m-pl-container']/li/p[@class='dec']/a/@href")
        for link in links:
            base_songsheet_url = base_list_url + link
            self.add_songsheet_url(base_songsheet_url)
            print(base_songsheet_url)
        # '下一页' is the "next page" link text.
        next_page = base_list_url + page.xpath(".//*/div[@class='u-page']/a[text()='下一页']/@href")[0]
        if not next_page.endswith('javascript:void(0)'):
            # print("*********************************")
            # print(next_page)
            self.parse_list(self.get_list_page(next_page))

    def get_song(self, url):
        response = self.s.get(url)
        if response.status_code == 200:
            page = etree.HTML(response.text)
            song_sheet = page.xpath(".//*/ul[@class='f-hide']/li/a/@href")
            # self.song_sheet_name = page.xpath('.//*/title/text()')[0]
            for song_url in song_sheet:
                self.item = {}
                self.offset = 0  # reset the paging offset for every song
                self.comment_url = base_list_url + song_url
                self.song_name = self.get_song_name(self.comment_url)
                song_id = self.comment_url.split('=')[1]
                self.d["rid"] = "R_SO_4_" + str(song_id)
                # get_comment() already parses and stores the first page of comments.
                comment_dict = self.get_comment(song_id)
                total = comment_dict['total']
                print('first page fetched, total comments:', total)
                while self.offset + 20 < total:
                    self.offset += 20
                    self.d['offset'] = self.offset
                    self.get_comment(song_id)
        else:
            print('Error requesting {url}: {code}'.format(url=url, code=response.status_code))

    def get_comment(self, id):
        response = self.s.post(
            song_info_url.format(id=id),
            data={
                'params': self.get_params(),
                'encSecKey': self.get_encSecKey()
            },
            cookies={'Cookie': 'appver=1.5.0.75771;'})
        res_json = json.loads(response.text)
        self.parse_comment(res_json, self.item)
        return res_json

    def parse_comment(self, comment_dict, item):
        for comment in comment_dict['comments']:
            item['commentId'] = comment['commentId']
            item['nickname'] = comment['user']['nickname']
            item['content'] = comment['content']
            item['time'] = self.timestamp_to_time(comment['time'])
            item['likedCount'] = comment['likedCount']
            item['song_url'] = self.comment_url
            self._db.save_to_mongo(self.song_name, self.item)
            print(item)
            # item = {}
        # self.parse_comment(comment_dict, self.item)
        # self._db.save_to_mongo(self.song_sheet_name, self.item)

    def timestamp_to_time(self, timestamp):
        # 13-digit timestamps are in milliseconds.
        time_length = len(str(timestamp))
        if time_length == 13:
            timearr = time.localtime(timestamp // 1000)
        else:
            timearr = time.localtime(timestamp)
        return time.strftime('%Y-%m-%d %H:%M', timearr)

    def add_songsheet_url(self, url):
        if url not in self.seen:
            self.seen.add(url)

    def get_song_name(self, song_url):
        response = self.s.get(song_url)
        page = etree.HTML(response.text)
        song_name = page.xpath('.//*/title/text()')[0].split('-')[0]
        return song_name
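The class above depends on several module-level names that are not shown in this section. The sketch below fills them in with hypothetical values (the URLs, headers and the key g are assumptions based on how such NetEase crawlers are usually wired, not the project's actual constants) and adds a small driver; the MongoClient wrapper is assumed to be analogous to the pymongo sketch after main(), but with a two-argument save_to_mongo(collection, item):

# Hypothetical module-level values; the real ones live elsewhere in the project.
headers = {'Referer': 'https://music.163.com/', 'User-Agent': 'Mozilla/5.0'}  # assumption
proxies = {}                                                                  # assumption
g = '0CoJUm6Qyw8W8jud'   # assumption: the commonly documented fixed AES key of the web client
base_list_url = 'https://music.163.com'                                       # assumption
song_info_url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{id}?csrf_token='  # assumption

if __name__ == '__main__':
    crawler = wangyimusic_comment()
    # Walk the playlist listing pages, collecting playlist urls into crawler.seen,
    # then fetch the comments of every song in every collected playlist.
    start_page = crawler.get_list_page(base_list_url + '/discover/playlist')
    crawler.parse_list(start_page)
    for playlist_url in crawler.seen:
        crawler.get_song(playlist_url)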
import datetime
import json
import re
from time import sleep

# config, the Download helper and the MongoClient wrapper are assumed to come
# from the project's own modules.


class Scheduler(object):
    def __init__(self):
        self.download = Download()
        self.db = MongoClient()
        self.user_url_list = []
        self.threads = []

    def run(self, user_id=config.START_ID):
        self.user_start(user_id)

    def user_start(self, user_id):
        user_id = int(user_id)
        results = self.db.find(user_id)
        if (results and results['flag'] == False) or not results:
            index_data = self.get_user_index(user_id)
            if index_data:
                self.get_user_info(user_id)
                self.get_fans(user_id, index_data['user'])
                self.get_followers(user_id, index_data['user'])
            else:
                data = {'user_id': user_id, 'flag': 'Error'}
                self.db.save(data)
        else:
            print(results['user'], 'has already been crawled')

    def get_user_index(self, user_id):
        user_index = 'https://m.weibo.cn/api/container/getIndex?containerid=100505{user_id}'
        url = user_index.format(user_id=user_id)
        response = self.download.get_html(url)
        if response:
            try:
                res_json = json.loads(response)
                if 'userInfo' in res_json.keys():
                    user = res_json['userInfo']['screen_name']
                    user_id = res_json['userInfo']['id']
                    user_url = res_json['userInfo']['profile_url']
                    fans = res_json['userInfo']['followers_count']
                    followers = res_json['userInfo']['follow_count']
                    time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    data = {
                        'user': user,
                        'user_id': user_id,
                        'user_url': user_url,
                        'fans': fans,
                        'followers': followers,
                        'time': time,
                        'flag': True
                    }
                    print('Crawling ' + user + ', ID: ' + str(user_id))
                    self.db.save(data)
                    return data
            except Exception:
                print('JSON parse error')
        return None

    def get_user_info(self, user_id):
        user_info = 'https://m.weibo.cn/api/container/getIndex?containerid=230283{user_id}_-_INFO'
        url = user_info.format(user_id=user_id)
        response = self.download.get_html(url)
        if response:
            # A single combined regex was tried first and then replaced by the
            # per-field patterns below:
            # pattern = re.compile(r'{"card_type":41,"item_name":"\u6027\u522b","item_content":"(.*?)"}.*?{"card_type":41,"item_name":"\u6240\u5728\u5730","item_content":"(.*?)"}.*?{"card_type":41,"item_name":"\u7b80\u4ecb","item_content":"(.*?)"}.*?{"card_type":41,"item_name":"\u7b49\u7ea7".*?"item_content":"(.*?)".*?{"card_type":41,"item_name":"\u9633\u5149\u4fe1\u7528","item_content":"(.*?)".*?{"card_type":41,"item_name":"\u6ce8\u518c\u65f6\u95f4","item_content":"(.*?)"}', re.S)
            # results = re.search(pattern, response)
            # if results:
            #     sex = results.group(1)
            #     location = results.group(2)
            #     jianjie = results.group(3)
            #     level = results.group(4)
            #     credit = results.group(5)
            #     reg_time = results.group(6)
            sex = ''
            location = ''
            jianjie = ''
            level = ''
            credit = ''
            reg_time = ''
            # Escaped field names in the JSON: 性别 (sex), 所在地 (location),
            # 简介 (intro), 等级 (level), 阳光信用 (credit), 注册时间 (registration time).
            sex_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u6027\\u522b","item_content":"(.*?)"}', re.S)
            location_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u6240\\u5728\\u5730","item_content":"(.*?)"}', re.S)
            # jianjie_pattern = re.compile(r'{"card_type":41,"item_name":"\\u7b80\\u4ecb","item_content":"(.*?)"}', re.S)
            level_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u7b49\\u7ea7".*?"item_content":"(.*?)"', re.S)
            credit_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u9633\\u5149\\u4fe1\\u7528","item_content":"(.*?)"', re.S)
            reg_time_pattern = re.compile(
                r'{"card_type":41,"item_name":"\\u6ce8\\u518c\\u65f6\\u95f4","item_content":"(.*?)"}', re.S)
            sex_res = re.search(sex_pattern, response)
            if sex_res:
                sex = sex_res.group(1).encode('utf8').decode('unicode_escape')
            location_res = re.search(location_pattern, response)
            if location_res:
                location = location_res.group(1).encode('utf8').decode('unicode_escape')
            # jianjie_res = re.search(jianjie_pattern, response)
            # if jianjie_res:
            #     jianjie = jianjie_res.group(1).encode('utf8').decode('unicode_escape')
            level_res = re.search(level_pattern, response)
            if level_res:
                level = level_res.group(1).encode('utf8').decode('unicode_escape')
            credit_res = re.search(credit_pattern, response)
            if credit_res:
                credit = credit_res.group(1).encode('utf8').decode('unicode_escape')
            reg_time_res = re.search(reg_time_pattern, response)
            if reg_time_res:
                reg_time = reg_time_res.group(1).encode('utf8').decode('unicode_escape')
            data = {
                'user_id': user_id,
                'sex': sex,
                'location': location,
                # 'jianjie': jianjie,
                'level': level,
                'credit': credit,
                'reg_time': reg_time
            }
            self.db.save(data)

    def get_fans(self, user_id, user_name):
        fans = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{user_id}&since_id={since_id}'
        for sid in range(1, 251):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('Crawling page ' + str(sid) + ' of ' + user_name + "'s fans")
            sleep(0.5)
            url = fans.format(user_id=user_id, since_id=sid)
            print(url)
            response = self.download.get_html(url)
            if response:
                try:
                    res_json = json.loads(response)
                    if 'cards' in res_json.keys():
                        if res_json['cards']:
                            results = res_json['cards'][0]
                            if 'card_group' in results.keys():
                                for res in results['card_group']:
                                    if 'user' in res.keys():
                                        user = res['user']['screen_name']
                                        fans_user_id = res['user']['id']
                                        data = {
                                            'user': user,
                                            'user_id': fans_user_id,
                                            'flag': False
                                        }
                                        self.db.save_first(data)
                        else:
                            print('Crawled ' + str(sid) + ' pages of ' + user_name + "'s fans")
                            break
                except Exception:
                    print('JSON parse error')

    def get_followers(self, user_id, user_name):
        followers = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{user_id}&page={page}'
        for page in range(1, 11):
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            print('Crawling page ' + str(page) + ' of ' + user_name + "'s followed accounts")
            sleep(0.5)
            url = followers.format(user_id=user_id, page=page)
            response = self.download.get_html(url)
            if response:
                try:
                    res_json = json.loads(response)
                    if 'cards' in res_json.keys():
                        if res_json['cards']:
                            results = res_json['cards'][0]
                            if 'card_group' in results.keys():
                                for res in results['card_group']:
                                    if 'user' in res.keys():
                                        user = res['user']['screen_name']
                                        follower_user_id = res['user']['id']
                                        data = {
                                            'user': user,
                                            'user_id': follower_user_id,
                                            'flag': False
                                        }
                                        self.db.save_first(data)
                        else:
                            print('Crawled ' + str(page) + ' pages of ' + user_name + "'s followed accounts")
                            break
                except Exception:
                    print('JSON parse error')
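Scheduler relies on a Download helper whose get_html(url) returns the page body as text, or a falsy value on failure. A minimal sketch of what that helper might look like, assuming a plain requests-based implementation with a retry loop (the class layout, retry count and headers are assumptions, not the project's actual code):

import requests


class Download(object):
    """Assumed requests-based downloader with simple retries."""

    def __init__(self, retries=3):
        self.retries = retries
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})

    def get_html(self, url):
        for _ in range(self.retries):
            try:
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    return response.text
            except requests.RequestException:
                pass  # retry on network errors
        return None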
import logging
import re
import traceback
from time import sleep

from bs4 import BeautifulSoup

# RedisClient, MongoClient, get_page, Pool and the CRAWL_* / IS_SERVER settings
# are assumed to come from the project's own modules (Pool is presumably a
# thread or gevent pool).


class GroupCrawler(object):
    def __init__(self):
        self.redisdb = RedisClient()
        self.mongodb = MongoClient()
        self.crawl_urls = []
        self.use_proxy = False
        if CRAWL_MODE == 'proxy':
            self.use_proxy = True
        self.start_urls = [
            'https://www.douban.com/group/explore',
            'https://www.douban.com/group/explore/culture',
            'https://www.douban.com/group/explore/travel',
            'https://www.douban.com/group/explore/ent',
            'https://www.douban.com/group/explore/fashion',
            'https://www.douban.com/group/explore/life',
            'https://www.douban.com/group/explore/tech'
        ]

    def start_requests(self):
        for url in self.start_urls:
            self.redisdb.add_new_url(url)

    def parse_content(self, html, url):
        if not html:
            logging.info('html is empty, from %s' % url)
            self.redisdb.add_new_url(url)
            return None
        bs = BeautifulSoup(html, 'lxml')
        # Collect the group links to crawl next.
        links = bs.find_all(
            'a', href=re.compile(r'https://www.douban.com/group/\d*-*\w*/$'))
        for l in links:
            new_url = l.attrs.get('href', '')
            if new_url and not self.redisdb.is_old_url(new_url):
                self.redisdb.add_new_url(new_url)
        # Extract the group details.
        info = {
            'name': '',
            'gid': '',
            'members': 0,
            'created_at': '',
            'owner_name': '',
            'owner_id': ''
        }
        try:
            info['name'] = self.parse_content_name(bs)
            info['gid'] = self.parse_content_gid(bs)
            info['members'] = self.parse_content_members(bs)
            info['created_at'] = self.parse_content_createdat(bs)
            info['owner_id'], info['owner_name'] = self.parse_content_owner(bs)
            logging.info(info)
            try:
                if info['gid']:
                    self.mongodb.save_group(info)
                else:
                    self.redisdb.add_new_url(url)
            except:
                self.redisdb.add_new_url(url)
                logging.info('insert into mongodb error: %s' % info)
                traceback.print_exc()
        except:
            if url not in self.start_urls:
                logging.info('parse url %s error' % url)
                self.redisdb.add_new_url(url)
            # traceback.print_exc()

    def parse_content_name(self, bs):
        try:
            return bs.select_one('#group-info > h1').string.strip().encode('utf8')
        except:
            pass
        return ''

    def parse_content_gid(self, bs):
        try:
            group_members = bs.select_one(
                '#content > div.grid-16-8.clearfix > div.aside > div.mod.side-nav > p > a'
            )
            return group_members.attrs.get('href').split('/group/')[1].split('/')[0]
        except:
            pass
        return ''

    def parse_content_members(self, bs):
        try:
            group_members = bs.select_one(
                '#content > div.grid-16-8.clearfix > div.aside > div.mod.side-nav > p > a'
            )
            pattern = re.compile(r'.*?\((\d+)\)', re.S)
            match = pattern.match(group_members.string.strip())
            return match.groups()[0].encode('utf8')
        except:
            pass
        return 0

    def parse_content_createdat(self, bs):
        try:
            created_at = ''
            for s in bs.select_one(
                    '#content > div.grid-16-8.clearfix > div.article > div.group-board p'
            ).strings:
                created_at += s.strip()
            pattern = re.compile(r'.*?(\d{4}-\d{2}-\d{2})', re.S)
            match = pattern.match(created_at)
            return match.groups()[0].encode('utf8')
        except:
            pass
        return ''

    def parse_content_owner(self, bs):
        try:
            owner = bs.select_one(
                '#content > div.grid-16-8.clearfix > div.article > div.group-board > p > a'
            )
            owner_id = owner.attrs.get('href').split('/people/')[1].split('/')[0]
            owner_name = owner.string.strip().encode('utf8')
            return owner_id, owner_name
        except:
            pass
        return '', ''

    def crawler(self, iurl):
        url, proxies = iurl
        headers = {'Referer': 'https://www.douban.com/group/explore'}
        content = get_page(url, headers=headers, proxies=proxies)
        self.parse_content(content, url)

    def run(self):
        print 'group crawler starts running'
        if IS_SERVER:
            self.start_requests()
        pools = Pool(CRAWL_WORKER_THREAD_NUM)
        count = 0
        while True:
            while self.redisdb.url_len:
                if self.redisdb.is_url_lock():
                    logging.info('url pool is locked')
                    continue
                urls = self.redisdb.get_new_urls(CRAWL_WORKER_THREAD_NUM)
                self.redisdb.add_old_urls(urls)
                self.redisdb.url_unlock()
                proxies = {}  # default: crawl directly, without a proxy
                if CRAWL_MODE in ['proxy', 'mix']:
                    if CRAWL_MODE == 'mix' and count % 5 == 0:
                        proxies = {}
                    else:
                        proxy = self.redisdb.rand_proxy().replace('https', 'http')
                        proxies = {'https': proxy, 'http': proxy}
                pools.map(self.crawler, [(x, proxies) for x in urls])
                logging.info('waiting for next round')
                count += 1
                if count >= 1000:
                    count = 0
                sleep(CRAWL_WORKER_SLEEP)
            else:
                print 'url queue len is: %s' % self.redisdb.url_len
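GroupCrawler in turn leans on a RedisClient wrapper for the new/old URL sets, the queue lock and the proxy pool. A minimal sketch of such a wrapper over redis-py, assuming set-based keys and a simple NX lock (the key names and method bodies are assumptions, not the project's actual implementation):

import redis


class RedisClient(object):
    """Assumed redis-py wrapper used by GroupCrawler; key names are guesses."""

    def __init__(self, host='localhost', port=6379):
        self._r = redis.StrictRedis(host=host, port=port, decode_responses=True)

    def add_new_url(self, url):
        if not self.is_old_url(url):
            self._r.sadd('douban:new_urls', url)

    def is_old_url(self, url):
        return self._r.sismember('douban:old_urls', url)

    def get_new_urls(self, n):
        # pop up to n urls from the pending set
        return [u for u in (self._r.spop('douban:new_urls') for _ in range(n)) if u]

    def add_old_urls(self, urls):
        if urls:
            self._r.sadd('douban:old_urls', *urls)

    @property
    def url_len(self):
        return self._r.scard('douban:new_urls')

    def is_url_lock(self):
        # NX lock: returns True when someone else already holds the lock
        return not self._r.set('douban:url_lock', 1, nx=True, ex=10)

    def url_unlock(self):
        self._r.delete('douban:url_lock')

    def rand_proxy(self):
        return self._r.srandmember('douban:proxies') or ''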