browser.reload() browser.click_link_by_text('查找') while not browser.is_element_present_by_css('.talent_rankinglist .tab .cur a', wait_time=10): print 'prov and city page reload' browser.reload() baseurl = browser.url for i in range(11)[1:]: url = baseurl + '&page=' + str(i) + '&' print url while True: try: browser.visit(url) break except Exception: continue soup = BeautifulSoup(browser.html) ol = soup.find('ol') if ol is None: print 'no this page' break soup = BeautifulSoup(str(ol)) for child in soup.findAll('a',{'class':'name'}): try: print re.search(r'sinaimg.cn/([0-9]*)/',soup.find('img',{'title':child.string}).get('src')).group(1) daren_uids.intappend(re.search(r'sinaimg.cn/([0-9]*)/',soup.find('img',{'title':child.string}).get('src')).group(1)) except Exception: pass print city[1].encode('utf-8') + ' is Done'
# NOTE(review): whitespace-mangled paste -- left byte-identical below because
# the leading `continue` belongs to an enclosing loop that is not visible in
# this chunk, so the real indentation cannot be reconstructed safely.
# Intended flow, as read from the tokens:
#   * the `continue` resumes the (unseen) enclosing loop;
#   * wait in 0.5s steps while browser.title is exactly 18 chars long
#     (presumably an interstitial/loading page -- TODO confirm);
#   * for every initial letter a-z (chr(97)..chr(122)), every province in
#     SPROV, and pages 1..10, visit the verified.weibo.com fame-list URL,
#     reloading until '.categories_list .titlebar' is present;
#   * collect every <input value=...> inside the "detail" div and record it
#     via famous_uids.intappend;
#   * stop paging early when the 'W_pages W_pages_comment' pager no longer
#     contains the next-page link text.
continue while len(browser.title) == 18: print "wait for homepage" time.sleep(0.5) for letter in range(97, 123): for prov in SPROV: for pages in range(1, 11): url = "http://verified.weibo.com/fame/%s/?rt=4&srt=3&province=%s&page=%s" % ( chr(letter), prov[1], str(pages), ) print url browser.visit(url) while not browser.is_element_present_by_css(".categories_list .titlebar", wait_time=10): print "page reload" browser.reload() soup = BeautifulSoup(browser.html) soup = BeautifulSoup(str(soup.find("div", {"class": "detail"}))) for i in soup.find_all("input"): print i["value"] famous_uids.intappend(i["value"]) soup = BeautifulSoup(browser.html) ifnext = soup.find("div", {"class": "W_pages W_pages_comment"}) if str(ifnext).find("下一页") == -1: print "no more page" break