def generate_user_seeds(self, request_times=1, user_accessed_set=None):
    """Collect user ids proposed by the ``lookup/suggest_member`` endpoint.

    The endpoint is POSTed to ``request_times`` times.  Every entry of the
    ``"msg"`` list in each JSON response is parsed as an HTML fragment, and
    the last path segment of its ``a.image-link`` href is kept as a seed.

    :param request_times: number of POST requests to issue.
    :param user_accessed_set: optional collection of ids to remove from the
        result; ignored when empty or ``None``.
    :return: ``0`` when ``self._url`` is ``None`` (anonymous user), otherwise
        a ``set`` of id strings.
    """
    if self._url is None:
        # Single-argument parenthesised form prints the same text on
        # Python 2 and Python 3.
        print("I'm anonymous user.")
        # Integer sentinel kept as-is so existing callers keep working.
        return 0

    if self.soup is None:
        # presumably parser() populates self.soup -- confirm in the class
        self.parser()
    soup = self.soup

    seeds = set()
    rounds = range(request_times)
    # Only touch the page when at least one request will be sent, so a
    # zero-request call behaves exactly as before.
    if rounds:
        # The URL, token and encoded payload are identical for every
        # request, so they are built once instead of once per iteration.
        post_url = "https://www.zhihu.com/lookup/suggest_member"
        xsrf_token = soup.find("input", attrs={'name': '_xsrf'})["value"]
        post_data = urlencode({'ids': ",,", '_xsrf': xsrf_token})

        for _ in rounds:
            r_post = zhihu_util.post(post_url, post_data)
            for fragment in json.loads(r_post)["msg"]:
                fragment_soup = zhihu_util.get_soup(fragment)
                href = fragment_soup.find("a", class_="image-link").get("href")
                seeds.add(href.split("/")[-1])

    if user_accessed_set:
        seeds.difference_update(user_accessed_set)
    return seeds
def test_post_repeatedly(self):
    """POST the same topic-plaza request three times in a row.

    Each response returned by ``zhihu_util.post`` must differ from the
    string ``"FAIL"``; the attempt index is included in the failure message.
    """
    post_url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    # The literal is split right after the ampersand on purpose: the previous
    # one-piece literal had its ampersand-plus-"para" prefix replaced by a
    # pilcrow character (apparently by an HTML entity decode), which broke
    # the form body.  The params value URL-decodes to
    # {"topic_id":686,"offset":80,"hash_id":"dced..."}.
    # NOTE(review): the _xsrf token is hard-coded -- confirm it is still valid.
    post_data = (
        'method=next&'
        'params=%7B%22topic_id%22%3A686%2C%22offset%22%3A80%2C'
        '%22hash_id%22%3A%22dced108689287057f5cc3b5e85cb8289%22%7D'
        '&_xsrf=c6946d5914172133e875956a711be3ad'
    )
    for count in range(3):
        response = zhihu_util.post(post_url, post_data)
        # Parenthesised single-argument print works on Python 2 and 3.
        print("...post count:%s" % count)
        self.assertNotEqual(response, "FAIL",
                            "post count is {0}".format(count))
def get_followees(self):
    """Yield a ``User`` for each account listed on this user's followees page.

    The first (up to) 20 entries are read from the ``<url>/followees`` page
    itself; later pages of 20 are requested by POSTing to the
    ``ProfileFolloweesListV2`` node with an increasing offset.

    Nothing is yielded when ``self._url`` is ``None`` (a notice is printed)
    or when ``self.get_followees_num()`` returns ``0``.

    Despite the wording of the printed message, the first entry that cannot
    be turned into a ``User`` ends the iteration (behaviour kept as-is).

    :return: generator of ``User(href, utf-8 encoded link text)`` objects.
    """
    if self._url is None:
        print("I'm anonymous user.")
        return

    followees_num = self.get_followees_num()
    if followees_num == 0:
        return

    page_size = 20
    r = zhihu_util.get_content(self._url + "/followees")
    soup = zhihu_util.get_soup(r)
    # Floor division keeps the page count an integer on Python 3 as well.
    page_count = (followees_num - 1) // page_size + 1

    # Page 0: the entries are already embedded in the profile page.
    title_nodes = soup.find_all("h2", class_="zm-list-content-title")
    for j in range(min(followees_num, page_size)):
        try:
            link = title_nodes[j].a
            user = User(link["href"], link.string.encode("utf-8"))
        except Exception:
            # `Exception` rather than a bare except, and the yield is kept
            # outside the try, so closing the generator or interrupting the
            # program is no longer reported as a followee error.
            print("...get followee error ,just skip...")
            return
        yield user

    if page_count < 2:
        return

    # The token and hash id come from the first page and do not change
    # between requests, so they are extracted once.
    xsrf_token = soup.find("input", attrs={'name': '_xsrf'})["value"]
    # The pattern literal in this file was corrupted: its HTML-escaped quotes
    # had become real quote characters, ending the string early (a syntax
    # error).  The entity is assembled from two pieces so it cannot be
    # decoded again, and a raw quote is accepted too.
    # NOTE(review): assumes `r` is the page text and that the page embeds the
    # value HTML-escaped -- confirm against a live response.
    quote = r'(?:&' r'quot;|")'
    hash_id_pattern = 'hash_id' + quote + ': ' + quote + '(.*)' + quote + r'\},'
    hash_id = re.findall(hash_id_pattern, r)[0]
    post_url = "http://www.zhihu.com/node/ProfileFolloweesListV2"

    for page in range(1, page_count):
        offset = page * page_size
        params = json.dumps(
            {"offset": offset, "order_by": "created", "hash_id": hash_id})
        post_data = urlencode({
            '_xsrf': xsrf_token,
            'method': "next",
            'params': params,
        })
        r_post = zhihu_util.post(post_url, post_data)
        followee_list = json.loads(r_post)["msg"]
        for j in range(min(followees_num - offset, page_size)):
            try:
                fragment_soup = zhihu_util.get_soup(followee_list[j])
                link = fragment_soup.find(
                    "h2", class_="zm-list-content-title").a
                user = User(link["href"], link.string.encode("utf-8"))
            except Exception:
                print("...get followee error ,just skip...")
                return
            yield user
def test_post(self):
    """Send one topic-plaza POST through ``zhihu_util.post``.

    The response is not inspected; the test only fails if the call raises.
    """
    post_url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    # The literal is split right after the ampersand on purpose: the previous
    # one-piece literal had its ampersand-plus-"para" prefix replaced by a
    # pilcrow character (apparently by an HTML entity decode), which broke
    # the form body.  The params value URL-decodes to
    # {"topic_id":253,"offset":60,"hash_id":"dced..."}.
    # NOTE(review): the _xsrf token is hard-coded -- confirm it is still valid.
    post_data = (
        'method=next&'
        'params=%7B%22topic_id%22%3A253%2C%22offset%22%3A60%2C'
        '%22hash_id%22%3A%22dced108689287057f5cc3b5e85cb8289%22%7D'
        '&_xsrf=c6946d5914172133e875956a711be3ad'
    )
    zhihu_util.post(post_url, post_data)