Пример #1
0
 def get(self):
     """Collect the friend-link HTML for this blog and hand it to ``self.parser``.

     Reads ``self.bsp_info`` (unpacked as ``(username, homepage, sp)``) and
     ``self.page`` (the already-fetched page content). Depending on the
     provider code ``sp`` the content passed to the parser is:

     * ``sohu`` / ``baidu`` -- the body of a dedicated links page (or ``''``
       when that request yields nothing);
     * ``163`` -- the body of the ``UserBean.getFriends`` DWR call, when the
       host name and data digest can both be found in the page;
     * ``hexun`` -- a comma-joined list of synthetic ``<a>`` tags, one per
       friend blog discovered by walking the paginated friends list;
     * ``blogcn`` -- the body of ``nu[1]`` when ``BSP().normalize(homepage)``
       differs from ``homepage``;
     * any other supported provider -- ``self.page`` unchanged.

     Returns:
         ``None`` when ``self.bsp_info`` or ``self.page`` is falsy or the
         provider is not in the supported list; otherwise whatever
         ``self.parser(html, sp, homepage)`` returns.

     NOTE(review): ``Mario``, ``MarioBatch`` and ``BSP`` are defined outside
     this block; they are presumably HTTP fetch / URL helpers -- confirm.
     """
     if not self.bsp_info:
         logger.debug('Not a valid bsp')
         return None
     if not self.page:
         logger.debug("Can't fetch content.")
         return None

     html = self.page
     username, homepage, sp = self.bsp_info
     if sp not in ('tianya', 'ycool', 'blogcn', '163', 'cnblogs', 'sina', 'live', 'blogbus', 'baidu', 'hexun', 'sohu'):
         return None

     # When set, the page at this URL replaces ``html`` before parsing.
     links_url = None

     if sp == 'sohu':
         # The links page is keyed by the ``_ebi`` value embedded in the page.
         ebi = re.findall("var _ebi = '([^^].*?)'", html)
         if ebi:
             links_url = 'http://blog.sohu.com/sff/links/%s.html' % ebi[0]

     elif sp == '163':
         host_names = re.findall("hostName     : '([^^].*?)'", html)
         # The separator after ``dataDigest`` is a TAB followed by two spaces.
         digests = re.findall("dataDigest\t  : '([^^].*?)'", html)
         # The digest is only checked for presence; it is not sent along.
         if host_names and digests:
             link = 'http://%s.blog.163.com/friends/dwr/call/plaincall/UserBean.getFriends.dwr' % host_names[0]
             body = [
                 ('callCount', '1'),
                 ('scriptSessionId', '${scriptSessionId}561'),
                 ('c0-scriptName', 'UserBean'),
                 ('c0-methodName', 'getFriends'),
                 ('c0-id', 0),
                 ('c0-param0', 'boolean:false'),
                 ('c0-param1', 'number:0'),
                 ('c0-param2', 'number:0'),
                 ('c0-param3', 'number:20'),
                 ('batchId', 0),
             ]
             response = Mario().get(link, body=urlencode(body))
             if response and response.body:
                 html = response.body

     elif sp == 'baidu':
         links_url = urljoin(homepage, 'friends')

     elif sp == 'hexun':
         friend_links = []
         results = []

         def callback(response):
             # Reads ``results`` from the enclosing scope at call time, so it
             # always appends to the list bound for the current page.
             if response and response.body:
                 results.append(response.body)

         # Loop-invariant patterns, compiled once for all pages.
         list_pattern = re.compile('<!--  朋友列表:开始  -->[^^]*?<!--  朋友列表:结束  -->')
         id_pattern = re.compile('<div class="FriendTableList_2_1_1"><a href="/([^^].*?)/default.html"', re.I)
         name_pattern = re.compile('blogname=([^^].*?)&preview=', re.I)

         page = 1
         # Walk the friends list page by page; stop at the first page that
         # cannot be fetched, has no friend block, or yields no blog name.
         while True:
             response = Mario().get('http://hexun.com/%s/%d/t0/friends.html' % (username, page))
             if not response or not response.body:
                 break
             dom = list_pattern.findall(response.body)
             if not dom:
                 break
             ids = id_pattern.findall(dom[0])
             if not ids:
                 break

             # Fresh result list for this page's batch of friend home pages.
             results = []
             batch = MarioBatch(callback=callback)
             for friend_id in ids:
                 batch.add_job('http://hexun.com/%s/default.html' % friend_id)
             batch(5)
             if not results:
                 break

             page_links = []
             for friend_page in results:
                 if not friend_page:
                     continue
                 names = name_pattern.findall(friend_page)
                 if not names:
                     continue
                 page_links.append('<a href="http://%s.blog.hexun.com/">link</a>' % names[0])
             if not page_links:
                 break
             friend_links.extend(page_links)
             page += 1

         html = ','.join(friend_links)

     elif sp == 'blogcn':
         nu = BSP().normalize(homepage)
         if nu != homepage:
             response = Mario().get(nu[1])
             if response and response.body:
                 html = response.body

     if links_url:
         # A failed or empty fetch deliberately yields '' rather than
         # falling back to the original page.
         response = Mario().get(links_url)
         html = response.body if response and response.body else ''

     return self.parser(html, sp, homepage)