def save_all_followers_profile(self):
    """Save the url and id of every follower of the collection to a text file.

    Output file: "<title> followers.txt(collection)".

    Raises:
        ValueError: when the collection has no url, or a followers page
            request does not return HTTP 200.
    """
    if self.url is None:
        raise ValueError("Did not found url for the collection")
    else:
        self.parser()
        new_session = login.log_in()
        cookie = login.get_cookie()
        xsrf = cookie["_xsrf"]
        header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
            'Host': "www.zhihu.com",
            'Origin': "http://www.zhihu.com",
            'Referer': self.url + "/followers",
        }
        title = self.get_title()
        text_file = open(title.replace("/", "") + " followers.txt(collection)", "w")
        follower_num = self.get_follower_num()
        # BUG FIX: page count is (n - 1) / 10 + 1 (10 followers per page).
        # The old (n - 1) / 10 skipped the last page entirely and wrote
        # nothing at all whenever follower_num <= 10.
        for i in xrange((follower_num - 1) / 10 + 1):
            data = {'offset': 10 * i, '_xsrf': xsrf}
            r = new_session.post(self.url + "/followers", headers=header, data=data, cookies=cookie)
            if r.status_code != 200:
                # Close the handle before bailing out so the file isn't leaked.
                text_file.close()
                raise ValueError("Error in retrieving collection's follower")
            soup = BeautifulSoup(r.text.decode('string_escape'), "lxml")
            for j in soup.find_all("a", class_="zg-link"):
                follower_id = j["title"].decode('unicode-escape')
                follower_url = Zhihu + "/people/" + j["href"][32:]
                text_file.write("Url: " + follower_url + " ID: " + follower_id + "\n")
        text_file.close()
        return
def get_topic_followed(self): # get the list of urls of the topics that the user is following if self.url is None: print "Anonymous user, cannot get topic followed" return else: if self.user_session is None: self.user_session = login.log_in() topics_followed_url = self.url + "/topics" topic_followed_num = self.get_topic_followed_num() if topic_followed_num == 0: return [] r = self.user_session.get(topics_followed_url) soup = BeautifulSoup(r.content, "lxml") cookie = login.get_cookie() _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] header = { 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 'Host': "www.zhihu.com", 'Referer': topics_followed_url } topic_followed_list = [] for i in xrange((topic_followed_num - 1) / 20 + 1): data = {'_xsrf': _xsrf, 'start': 0, 'offset': 20 * i} response = self.user_session.post(topics_followed_url, data=data, headers=header, cookies=cookie) topic_followed_raw = response.json()["msg"][1] main_soup = BeautifulSoup(topic_followed_raw, "lxml") topic_followed_raw = main_soup.find_all("div", class_="zm-profile-section-main") for topic in topic_followed_raw: topic = Zhihu + topic.a.next_sibling.next_sibling["href"] topic_followed_list.append(topic) return topic_followed_list
def save_all_followers_profile(self):
    """Save the url and id of every follower of the collection to a text file.

    Output file: "<title> followers.txt(collection)".

    Raises:
        ValueError: when the collection has no url, or a followers page
            request does not return HTTP 200.
    """
    if self.url is None:
        raise ValueError("Did not found url for the collection")
    else:
        self.parser()
        new_session = login.log_in()
        cookie = login.get_cookie()
        xsrf = cookie["_xsrf"]
        header = {
            'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
            'Host': "www.zhihu.com",
            'Origin': "http://www.zhihu.com",
            'Referer': self.url + "/followers",
        }
        title = self.get_title()
        text_file = open(
            title.replace("/", "") + " followers.txt(collection)", "w")
        follower_num = self.get_follower_num()
        # BUG FIX: page count is (n - 1) / 10 + 1 (10 followers per page).
        # The old (n - 1) / 10 skipped the last page entirely and wrote
        # nothing at all whenever follower_num <= 10.
        for i in xrange((follower_num - 1) / 10 + 1):
            data = {'offset': 10 * i, '_xsrf': xsrf}
            r = new_session.post(self.url + "/followers", headers=header, data=data, cookies=cookie)
            if r.status_code != 200:
                # Close the handle before bailing out so the file isn't leaked.
                text_file.close()
                raise ValueError(
                    "Error in retrieving collection's follower")
            soup = BeautifulSoup(r.text.decode('string_escape'), "lxml")
            for j in soup.find_all("a", class_="zg-link"):
                follower_id = j["title"].decode('unicode-escape')
                follower_url = Zhihu + "/people/" + j["href"][32:]
                text_file.write("Url: " + follower_url + " ID: " + follower_id + "\n")
        text_file.close()
        return
def get_column_followed(self): # get the list of urls of the columns that the user is following if self.url is None: print "Anonymous user, cannot get column followed" return else: if self.user_session is None: self.user_session = login.log_in() column_followed_url = self.url + "/columns/followed" column_followed_num = self.get_column_followed_num() if column_followed_num == 0: return [] r = self.user_session.get(column_followed_url) soup = BeautifulSoup(r.content, "lxml") # print soup cookie = login.get_cookie() _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] soup1 = soup.find("div", class_="zh-general-list clearfix") string = soup1['data-init'] params = literal_eval(string)['params'] post_url = "http://www.zhihu.com/node/ProfileFollowedColumnsListV2" header = { 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 'Host': "www.zhihu.com", 'Referer': column_followed_url } column_followed_list = [] for i in xrange((column_followed_num - 1) / 20 + 1): params['offset'] = i * 20 data = {'_xsrf': _xsrf, 'method': "next", 'params': json.dumps(params)} response = self.user_session.post(post_url, data=data, headers=header, cookies=cookie) column_followed_list_raw = response.json()["msg"] for column_followed_raw in column_followed_list_raw: main_soup = BeautifulSoup(column_followed_raw, "lxml") column_followed = main_soup.find("div", class_="zm-profile-section-main").a["href"] column_followed_list.append(column_followed) return column_followed_list
def save_all_activity(self): # save all activities of the user if self.url is None: print "Anonymous user, cannot save all activity" return else: if self.soup is None: self.parser() usr_id = self.get_id() text_file = open(usr_id.replace("/", "") + " all activities.txt", "w") temp_soup = self.soup.find("div", class_="zm-profile-section-list profile-feed-wrap") activities = temp_soup.find_all("div", class_="zm-profile-section-main zm-profile-section-" "activity-main zm-profile-activity-page-item-main") times = temp_soup.find_all("span", class_="zm-profile-setion-time zg-gray zg-right") if len(times) != len(activities): raise ValueError("Bug in save_all_activities") for i in xrange(len(activities)): activity = activities[i] text_file.write(activity.text[:-1]) text_file.write(times[i].text + "\n\n") try: text_file.write("url is " + Zhihu + activity.a.next_sibling.next_sibling["href"] + "\n") except: text_file.write( "url is " + Zhihu + activity.a.next_sibling.next_sibling.next_sibling["href"] + "\n") if self.user_session is None: self.user_session = login.log_in() start_raw = self.soup.find_all("div", class_="zm-profile-section-item zm-item clearfix") try: start_raw[-1] except IndexError: print "No activity found" return start = start_raw[-1]["data-time"] _xsrf = self.soup.find("input", attrs={'name': '_xsrf'})["value"] data = {"start": start, "_xsrf": _xsrf} cookie = login.get_cookie() activities_url = self.url + "/activities" header = { 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 'Host': "www.zhihu.com", 'Referer': self.url } info = 1 while True: if info % 10 == 0 and info != 0: print "Saved {0} pieces of activities".format(info * 10) r = self.user_session.post(activities_url, headers=header, data=data, cookies=cookie) if r.status_code != 200: break new_soup = BeautifulSoup(r.json()["msg"][1], "lxml") activities = new_soup.find_all("div", class_="zm-profile-section-main zm-profile-section" "-activity-main 
zm-profile-activity-page-item-main") times = new_soup.find_all("span", class_="zm-profile-setion-time zg-gray zg-right") if len(times) != len(activities): raise ValueError("Bug in save_all_activities") for i in xrange(len(activities)): activity = activities[i] text_file.write(activity.text[:-1]) text_file.write(times[i].text + "\n\n") try: if activity.a.next_sibling.next_sibling["href"][0:3] != "http": text_file.write("url is " + Zhihu + activity.a.next_sibling.next_sibling["href"] + "\n") else: text_file.write("url is " + activity.a.next_sibling.next_sibling["href"] + "\n") except: if activity.a.next_sibling.next_sibling.next_sibling["href"][0:3] != "http": text_file.write( "url is " + Zhihu + activity.a.next_sibling.next_sibling.next_sibling["href"] + "\n") else: text_file.write( "url is " + activity.a.next_sibling.next_sibling.next_sibling["href"] + "\n") try: start = new_soup.find_all("div", class_="zm-profile-section-item zm-item clearfix")[-1]["data-time"] except: break data["start"] = start info += 1 text_file.write("Approximately {0} pieces of activities".format(info * 10)) text_file.close() return
def save_followees_profile(self): # save the profile of all the user's followees if self.url is None: print "Anonymous user, cannot save followees profile" return else: if self.user_session is None: self.user_session = login.log_in() followee_num = self.get_followee_num() if followee_num == 0: print "No followee" return followee_url = self.url + "/followees" cookie = login.get_cookie() r = self.user_session.get(followee_url) soup = BeautifulSoup(r.content, "lxml") _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"] soup1 = soup.find("div", class_="zh-general-list clearfix") string = soup1['data-init'] params = literal_eval(string)['params'] post_url = "http://www.zhihu.com/node/ProfileFolloweesListV2" header = { 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 'Host': "www.zhihu.com", 'Referer': followee_url, } book = xlwt.Workbook(encoding="utf-8") new_sheet = book.add_sheet("FolloweeList") new_sheet.write(0, 0, "url") new_sheet.write(0, 1, "id") new_sheet.write(0, 2, "follower_num") new_sheet.write(0, 3, "asks_num") new_sheet.write(0, 4, "answers_num") new_sheet.write(0, 5, "agree_num") new_sheet.write(0, 6, "is_robot") row = 1 for i in xrange((followee_num - 1) / 20 + 1): if i % 100 == 0 and i != 0: print "Have recorded", i * 20, "followees" params['offset'] = i * 20 data = {'_xsrf': _xsrf, 'method': "next", 'params': json.dumps(params)} response = self.user_session.post(post_url, data=data, headers=header, cookies=cookie) followee_list = response.json()["msg"] for j in followee_list: main_soup = BeautifulSoup(j, "lxml") followees_url = main_soup.find("h2", class_="zm-list-content-title").a["href"] new_sheet.write(row, 0, followees_url) followees_id = main_soup.find("h2", class_="zm-list-content-title").a["title"] new_sheet.write(row, 1, followees_id) info_list = main_soup.find_all("a", class_="zg-link-gray-normal") follower_num = int(info_list[0].text.split()[0]) new_sheet.write(row, 2, follower_num) 
asks_num = int(info_list[1].text.split()[0]) new_sheet.write(row, 3, asks_num) answers_num = int(info_list[2].text.split()[0]) new_sheet.write(row, 4, answers_num) agree_num = int(info_list[3].text.split()[0]) new_sheet.write(row, 5, agree_num) if followee_num < 5 and asks_num < 2 and answers_num < 2 and agree_num < 2: is_robot = 1 else: is_robot = 0 new_sheet.write(row, 6, is_robot) row += 1 book.save(self.get_id() + " followee list.xls") return
def save_all_voters_profile(self): # save the profile of all voters of the answer if self.url is None: raise ValueError("Did not found url for the answer") else: if self.session is None: self.session = login.log_in() if self.soup is None: self.parser() answer_id = self.soup.find("div", class_="zm-item-answer ")["data-aid"] voters_profile_url = Zhihu + "/answer/" + answer_id + "/voters_profile" cookie = login.get_cookie() header = { 'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0", 'Host': "www.zhihu.com", 'Referer': self.url } book = xlwt.Workbook(encoding="utf-8") new_sheet = book.add_sheet("Voter_profile") new_sheet.write(0, 0, "url") new_sheet.write(0, 1, "id") new_sheet.write(0, 2, "agree_num") new_sheet.write(0, 3, "thanks_num") new_sheet.write(0, 4, "ask_num") new_sheet.write(0, 5, "answer_num") new_sheet.write(0, 6, "is_robot") total_vote = self.get_vote_num() row = 1 robot_vote = 0 for i in xrange((total_vote - 1) / 10 + 1): data = {"total": total_vote, "offset": 10 * i} r = self.session.get(voters_profile_url, headers=header, data=data, cookies=cookie) for j in r.json()["payload"]: if row % 100 == 0 and row != 0: print "Have saved {0} voter profiles".format(row) soup = BeautifulSoup(j, "lxml") try: voter_url = soup.find("div", class_="author ellipsis").a["href"] except: voter_url = "Anonymous user" new_sheet.write(row, 0, voter_url) if voter_url != "Anonymous user": voter_id = soup.find("div", class_="author ellipsis").a["title"] new_sheet.write(row, 1, voter_id) try: voter_agree_num = int(soup.find("ul", class_="status").li.span.text.split()[0]) except ValueError: voter_agree_num = soup.find("ul", class_="status").li.span.text.split()[0] new_sheet.write(row, 2, voter_agree_num) try: voter_thanks_num = int( soup.find("ul", class_="status").li.next_sibling.next_sibling.span.text.split()[0]) except ValueError: voter_thanks_num = soup.find("ul", class_="status").li.next_sibling.next_sibling.span.text.split()[0] 
new_sheet.write(row, 3, voter_thanks_num) voter_ask_num = int(soup.find_all("li", class_="hidden-phone")[0].a.text.split()[0]) new_sheet.write(row, 4, voter_ask_num) voter_answer_num = int(soup.find_all("li", class_="hidden-phone")[1].a.text.split()[0]) new_sheet.write(row, 5, voter_answer_num) if voter_agree_num < 1 and voter_thanks_num < 1 and voter_ask_num < 1 and voter_answer_num < 2: voter_is_robot = 1 robot_vote += 1 else: voter_is_robot = 0 new_sheet.write(row, 6, voter_is_robot) row += 1 book.save(self.url[20:].replace("/", " ") + " voter profile(answer).xls") return robot_vote / (total_vote * 1.0)
def save_all_followers_profile(self):
    """Save the profile of every follower of the question into an .xls file.

    One row per follower: url, id, follower/ask/answer/agree counts and a
    crude is_robot flag. Output file:
    "<title> followers profile(question).xls".

    Raises:
        ValueError: when the question has no url.
    """
    if self.url is None:
        raise ValueError("Did not found url for the question")
    if self.session is None:
        self.session = login.log_in()
    followers_url = self.url + "/followers"
    total = self.get_follower_num()
    landing = self.session.get(followers_url)
    landing_soup = BeautifulSoup(landing.content, "lxml")
    _xsrf = landing_soup.find("input", attrs={'name': '_xsrf'})["value"]
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': followers_url
    }
    book = xlwt.Workbook(encoding="utf-8")
    sheet = book.add_sheet("Follower_profile")
    captions = ("url", "id", "follower_num", "ask_num", "answer_num", "agree_num", "is_robot")
    for col, caption in enumerate(captions):
        sheet.write(0, col, caption)
    cookie = login.get_cookie()
    row = 1
    # Followers are served 20 per ajax page.
    for page in xrange((total - 1) / 20 + 1):
        payload = {"offset": 20 * page, "start": 0, "_xsrf": _xsrf}
        reply = self.session.post(followers_url, headers=header, data=payload, cookies=cookie)
        page_soup = BeautifulSoup(reply.json()["msg"][1], "lxml")
        cards = page_soup.find_all("div", class_="zm-profile-card zm-profile-section-item zg-clear no-hovercard")
        for card in cards:
            try:
                sheet.write(row, 0, card.h2.a["href"])
                sheet.write(row, 1, card.find("a", class_="zm-item-link-avatar")["title"])
                stats = card.find_all("a", class_="zg-link-gray-normal")

                def count_of(tag):
                    # First token of the stat link; int when it parses,
                    # otherwise the raw string (e.g. "1.2K").
                    word = tag.text.split()[0]
                    try:
                        return int(word)
                    except:
                        return word

                user_follower = count_of(stats[0])
                sheet.write(row, 2, user_follower)
                user_asks = int(stats[1].text.split()[0])
                sheet.write(row, 3, user_asks)
                user_answers = count_of(stats[2])
                sheet.write(row, 4, user_answers)
                user_agrees = count_of(stats[3])
                sheet.write(row, 5, user_agrees)
                if user_follower < 2 and user_asks < 1 and user_answers < 2 and user_agrees < 3:
                    is_robot = 1
                else:
                    is_robot = 0
                sheet.write(row, 6, is_robot)
            except:
                # Anonymous followers expose no profile card details.
                sheet.write(row, 0, "Anonymous user")
            row += 1
    book.save(self.get_title().replace("/", "") + " followers profile(question).xls")
    return