def get(self):
    """Locate the blog's friend-link markup and hand it to ``self.parser``.

    Reads ``self.bsp_info`` (unpacked as ``username, homepage, sp``) and
    ``self.page``.  Depending on ``sp`` the markup given to the parser is
    either ``self.page`` itself or a document fetched from a
    provider-specific location:

    * ``sohu``   - a links page whose id is scraped from ``self.page``.
    * ``163``    - the response of the ``UserBean.getFriends`` DWR call.
    * ``baidu``  - ``urljoin(homepage, 'friends')``.
    * ``hexun``  - anchors synthesised from the paginated friend listing.
    * ``blogcn`` - the page at ``BSP().normalize(homepage)[1]``.
    * every other supported provider - ``self.page`` unchanged.

    Returns:
        ``None`` when ``self.bsp_info`` or ``self.page`` is empty, or when
        ``sp`` is not a supported provider; otherwise the result of
        ``self.parser(html, sp, homepage)``.
    """
    if not self.bsp_info:
        logger.debug('Not a valid bsp')
        return None
    if not self.page:
        logger.debug("Can't fetch content.")
        return None

    html = self.page
    username, homepage, sp = self.bsp_info
    supported = ('tianya', 'ycool', 'blogcn', '163', 'cnblogs', 'sina',
                 'live', 'blogbus', 'baidu', 'hexun', 'sohu')
    if sp not in supported:
        return None

    # Providers that expose their links on a separate, directly fetchable
    # page only set links_url; the download happens once, below.
    links_url = None
    if sp == 'sohu':
        ebi = re.compile("var _ebi = '([^^].*?)'").findall(html)
        if ebi:
            links_url = 'http://blog.sohu.com/sff/links/%s.html' % ebi[0]
    elif sp == '163':
        html = self._fetch_163_friends(html)
    elif sp == 'baidu':
        links_url = urljoin(homepage, 'friends')
    elif sp == 'hexun':
        html = self._collect_hexun_friends(username)
    elif sp == 'blogcn':
        html = self._fetch_blogcn_page(homepage, html)

    if links_url:
        # A failed download yields an empty document, not the original page.
        html = self._fetch_body(links_url, default='')

    return self.parser(html, sp, homepage)

def _fetch_body(self, url, default=None, body=None):
    """Return the response body for ``url``, or ``default`` if it is empty.

    A fresh ``Mario`` instance is used for every call.  ``body`` is
    forwarded to ``Mario.get`` as the ``body`` keyword only when given.
    """
    mario = Mario()
    if body is None:
        response = mario.get(url)
    else:
        response = mario.get(url, body=body)
    if response and response.body:
        return response.body
    return default

def _fetch_163_friends(self, page_html):
    """Return the 163 ``getFriends`` DWR response, or ``page_html``.

    The call is only attempted when both ``hostName`` and ``dataDigest``
    can be scraped from ``page_html``; the digest value itself is not sent.
    ``page_html`` is returned unchanged when either value is missing or
    the response has no body.
    """
    host_names = re.compile("hostName : '([^^].*?)'").findall(page_html)
    digests = re.compile("dataDigest : '([^^].*?)'").findall(page_html)
    if not (host_names and digests):
        return page_html

    link = ('http://%s.blog.163.com/friends/dwr/call/plaincall/'
            'UserBean.getFriends.dwr' % host_names[0])
    body = [('callCount', '1'),
            ('scriptSessionId', '${scriptSessionId}561'),
            ('c0-scriptName', 'UserBean'),
            ('c0-methodName', 'getFriends'),
            ('c0-id', 0),
            ('c0-param0', 'boolean:false'),
            ('c0-param1', 'number:0'),
            ('c0-param2', 'number:0'),
            ('c0-param3', 'number:20'),
            ('batchId', 0)]
    return self._fetch_body(link, default=page_html, body=urlencode(body))

def _collect_hexun_friends(self, username):
    """Build comma-joined ``<a>`` tags for the hexun friends of ``username``.

    Walks ``friends.html`` pages starting at page 1.  For each page the
    friend ids are scraped, every friend's ``default.html`` is requested
    through ``MarioBatch``, and the ``blogname`` found in each returned
    body becomes one anchor.  The walk stops at the first page that cannot
    be fetched, has no friend list or ids, or yields no blog name.

    Returns:
        The anchors joined with ``','``; an empty string if none were found.
    """
    # Compiled once: the patterns do not change between pages.
    list_re = re.compile('<!-- 朋友列表:开始 -->[^^]*?<!-- 朋友列表:结束 -->')
    id_re = re.compile(
        '<div class="FriendTableList_2_1_1"><a href="/([^^].*?)/default.html"',
        re.I)
    name_re = re.compile('blogname=([^^].*?)&preview=', re.I)

    friend_links = []
    bodies = []

    def callback(response):
        if response and response.body:
            bodies.append(response.body)

    page = 1
    # NOTE(review): there is no upper bound on the page number; termination
    # relies on the site eventually returning a page without usable friends.
    while True:
        listing = self._fetch_body(
            'http://hexun.com/%s/%d/t0/friends.html' % (username, page))
        if not listing:
            break
        sections = list_re.findall(listing)
        if not sections:
            break
        ids = id_re.findall(sections[0])
        if not ids:
            break

        # Empty the shared list in place so the callback always appends to
        # the list that is read after the batch call below.
        del bodies[:]
        batch = MarioBatch(callback=callback)
        for friend_id in ids:
            batch.add_job('http://hexun.com/%s/default.html' % friend_id)
        batch(5)

        has_friend_link = False
        for friend_page in bodies:
            names = name_re.findall(friend_page)
            if not names:
                continue
            friend_links.append(
                '<a href="http://%s.blog.hexun.com/">link</a>' % names[0])
            has_friend_link = True
        if not has_friend_link:
            break
        page += 1

    return ','.join(friend_links)

def _fetch_blogcn_page(self, homepage, page_html):
    """Return the page at the normalized blogcn URL, or ``page_html``.

    ``page_html`` is returned when ``BSP().normalize(homepage)`` compares
    equal to ``homepage`` or when the fetched response has no body.
    """
    nu = BSP().normalize(homepage)
    # NOTE(review): the whole normalize() result is compared with homepage
    # but then indexed with [1]; presumably it is a tuple, in which case
    # this test is always true - confirm against BSP.normalize.
    if nu != homepage:
        return self._fetch_body(nu[1], default=page_html)
    return page_html