def get_followings(self, userid): url = "http://weibo.cn/%s" % (userid) s = grasp.get_content(url) soup = bs(s) tip = soup.find('div', class_='tip2') if(tip == None): return [] a = tip.findChildren() num = int(re.search("\d+", a[1].string).group()) pages = int(num/10) if( num%10 > 0): pages += 1 furl = urljoin('http://weibo.cn/', a[1].attrs['href']) for i in range(1, pages+1): url = furl + '?page=%d' % i self.db.hset(self.pages, userid, url) print url # retry 10 times if fails for j in range(10): if self.get_users_by_url(userid, url):break print url time.sleep(5) time.sleep(3) self.db.save()
def get_users_by_url(self, userid, url):
    """Parse one following-list page and record every user found on it.

    Each user row is a <table> whose first <a> links to the user's
    profile; the username is the URL path with its leading '/' removed.
    Returns True on success, False when the page could not be fetched
    (so the caller can retry).
    """
    try:
        s = grasp.get_content(url)
    except Exception:  # fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit
        return False
    soup = bs(s)
    for t in soup('table'):
        link = t.find('a')
        if link is None:
            # fix: a table without an anchor previously raised
            # AttributeError on link.attrs and aborted the whole page
            continue
        u = link.attrs['href']
        sp = urlsplit(u)
        # sp.path is '/<username>'; drop the leading slash
        self.add_following(userid, sp.path[1:].strip())
    return True