Exemplo n.º 1
0
 def save_all_followers_profile(self):
     """Save the profile URL and name of every follower of the collection.

     One ``Url: ...    ID: ...`` line per follower is written to a text file
     named ``<collection title> followers.txt(collection)``, with any "/"
     removed from the title.

     Raises:
         ValueError: if ``self.url`` is None, or if a follower page request
             does not come back with HTTP status 200.
     """
     if self.url is None:
         raise ValueError("Did not found url for the collection")
     self.parser()
     new_session = login.log_in()
     cookie = login.get_cookie()
     xsrf = cookie["_xsrf"]
     followers_url = self.url + "/followers"
     header = {
         "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         "Host": "www.zhihu.com",
         "Origin": "http://www.zhihu.com",
         "Referer": followers_url,
     }
     title = self.get_title()
     # The with-block closes the file even when a request fails mid-way.
     with open(title.replace("/", "") + " followers.txt(collection)", "w") as text_file:
         follower_num = self.get_follower_num()
         # The offset advances by 10 per request; the "+ 1" makes sure the
         # last (possibly partial) page is requested too.
         for i in xrange((follower_num - 1) // 10 + 1):
             data = {"offset": 10 * i, "_xsrf": xsrf}
             r = new_session.post(followers_url, headers=header, data=data, cookies=cookie)
             if r.status_code != 200:
                 raise ValueError("Error in retrieving collection's follower")
             soup = BeautifulSoup(r.text.decode("string_escape"), "lxml")
             for j in soup.find_all("a", class_="zg-link"):
                 follower_id = j["title"].decode("unicode-escape")
                 # NOTE(review): drops the first 32 characters of the href,
                 # presumably a fixed URL prefix -- confirm against real data.
                 follower_url = Zhihu + "/people/" + j["href"][32:]
                 text_file.write("Url: " + follower_url + "    ID: " + follower_id + "\n")
     return
Exemplo n.º 2
0
 def get_topic_followed(self):
     """Return the URLs of the topics the user is following.

     Prints a notice and returns None when the user has no URL; returns an
     empty list when the followed-topic count is 0.
     """
     if self.url is None:
         print("Anonymous user, cannot get topic followed")
         return
     if self.user_session is None:
         self.user_session = login.log_in()
     topics_url = self.url + "/topics"
     total = self.get_topic_followed_num()
     if total == 0:
         return []
     landing_page = self.user_session.get(topics_url)
     landing_soup = BeautifulSoup(landing_page.content, "lxml")
     cookie = login.get_cookie()
     # The anti-CSRF token is read from the hidden form input on the page.
     token = landing_soup.find("input", attrs={'name': '_xsrf'})["value"]
     request_headers = {
         'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         'Host': "www.zhihu.com",
         'Referer': topics_url
     }
     topic_urls = []
     page_count = (total - 1) / 20 + 1
     for page_index in xrange(page_count):
         payload = {'_xsrf': token, 'start': 0, 'offset': 20 * page_index}
         reply = self.user_session.post(topics_url, data=payload, headers=request_headers, cookies=cookie)
         # Element 1 of "msg" holds the HTML fragment that is parsed below.
         fragment = BeautifulSoup(reply.json()["msg"][1], "lxml")
         for section in fragment.find_all("div", class_="zm-profile-section-main"):
             topic_link = section.a.next_sibling.next_sibling
             topic_urls.append(Zhihu + topic_link["href"])
     return topic_urls
Exemplo n.º 3
0
 def save_all_followers_profile(self):
     """Write every collection follower's profile URL and name to a text file.

     The file is named ``<collection title> followers.txt(collection)`` (any
     "/" is stripped from the title) and receives one
     ``Url: ...    ID: ...`` line per follower.

     Raises:
         ValueError: if ``self.url`` is None, or if a follower page request
             returns a status code other than 200.
     """
     if self.url is None:
         raise ValueError("Did not found url for the collection")
     self.parser()
     new_session = login.log_in()
     cookie = login.get_cookie()
     xsrf = cookie["_xsrf"]
     followers_url = self.url + "/followers"
     header = {
         'User-Agent':
         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         'Host': "www.zhihu.com",
         'Origin': "http://www.zhihu.com",
         'Referer': followers_url,
     }
     title = self.get_title()
     file_name = title.replace("/", "") + " followers.txt(collection)"
     # Using a context manager so the file is closed even if a request
     # fails and ValueError is raised inside the loop.
     with open(file_name, "w") as text_file:
         follower_num = self.get_follower_num()
         # Each request moves the offset forward by 10. Without the "+ 1"
         # the final (possibly partial) page would never be requested.
         for i in xrange((follower_num - 1) // 10 + 1):
             data = {'offset': 10 * i, '_xsrf': xsrf}
             r = new_session.post(followers_url,
                                  headers=header,
                                  data=data,
                                  cookies=cookie)
             if r.status_code != 200:
                 raise ValueError(
                     "Error in retrieving collection's follower")
             soup = BeautifulSoup(r.text.decode('string_escape'), "lxml")
             for j in soup.find_all("a", class_="zg-link"):
                 follower_id = j["title"].decode('unicode-escape')
                 # NOTE(review): the [32:] slice presumably strips a fixed
                 # URL prefix from the href -- confirm against real data.
                 follower_url = Zhihu + "/people/" + j["href"][32:]
                 text_file.write("Url: " + follower_url + "    ID: " +
                                 follower_id + "\n")
     return
Exemplo n.º 4
0
 def get_column_followed(self):
     """Return the URLs of the columns the user is following.

     Prints a notice and returns None when the user has no URL; returns an
     empty list when the followed-column count is 0.
     """
     if self.url is None:
         print("Anonymous user, cannot get column followed")
         return
     if self.user_session is None:
         self.user_session = login.log_in()
     followed_url = self.url + "/columns/followed"
     total = self.get_column_followed_num()
     if total == 0:
         return []
     landing_page = self.user_session.get(followed_url)
     landing_soup = BeautifulSoup(landing_page.content, "lxml")
     cookie = login.get_cookie()
     token = landing_soup.find("input", attrs={'name': '_xsrf'})["value"]
     # The list container carries the request parameters in its
     # "data-init" attribute; they are reused for every page request.
     list_node = landing_soup.find("div", class_="zh-general-list clearfix")
     params = literal_eval(list_node['data-init'])['params']
     post_url = "http://www.zhihu.com/node/ProfileFollowedColumnsListV2"
     request_headers = {
         'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         'Host': "www.zhihu.com",
         'Referer': followed_url
     }
     column_urls = []
     page_count = (total - 1) / 20 + 1
     for page_index in xrange(page_count):
         params['offset'] = page_index * 20
         payload = {'_xsrf': token, 'method': "next", 'params': json.dumps(params)}
         reply = self.user_session.post(post_url, data=payload, headers=request_headers, cookies=cookie)
         # Each element of "msg" is parsed as a separate HTML fragment.
         for raw_item in reply.json()["msg"]:
             item_soup = BeautifulSoup(raw_item, "lxml")
             section = item_soup.find("div", class_="zm-profile-section-main")
             column_urls.append(section.a["href"])
     return column_urls
Exemplo n.º 5
0
 def save_all_activity(self):
     """Save all activities of the user to ``<user id> all activities.txt``.

     The activities already present on the parsed profile page are written
     first; further batches are then requested from ``<url>/activities``
     until a request does not return HTTP 200 or a batch carries no
     "data-time" marker to continue from. A closing line with the
     approximate number of activities is appended.

     Prints a notice and returns without writing a file when ``self.url`` is
     None. Prints "No activity found" and returns when the profile page has
     no activity items.

     Raises:
         ValueError: if the number of activity entries and the number of
             timestamps found in a batch differ.
     """
     if self.url is None:
         print("Anonymous user, cannot save all activity")
         return
     if self.soup is None:
         self.parser()
     usr_id = self.get_id()
     # The with-block closes the file on every exit path, including the
     # early "No activity found" return and any exception.
     with open(usr_id.replace("/", "") + " all activities.txt", "w") as text_file:
         first_page = self.soup.find("div", class_="zm-profile-section-list profile-feed-wrap")
         self._write_activity_entries(text_file, first_page)
         if self.user_session is None:
             self.user_session = login.log_in()
         start_raw = self.soup.find_all("div", class_="zm-profile-section-item zm-item clearfix")
         if not start_raw:
             print("No activity found")
             return
         # The "data-time" of the last item is sent as "start" to request
         # the next batch.
         start = start_raw[-1]["data-time"]
         _xsrf = self.soup.find("input", attrs={'name': '_xsrf'})["value"]
         data = {"start": start, "_xsrf": _xsrf}
         cookie = login.get_cookie()
         activities_url = self.url + "/activities"
         header = {
             'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
             'Host': "www.zhihu.com",
             'Referer': self.url
         }
         info = 1
         while True:
             if info % 10 == 0:
                 print("Saved {0} pieces of activities".format(info * 10))
             r = self.user_session.post(activities_url, headers=header, data=data, cookies=cookie)
             if r.status_code != 200:
                 break
             new_soup = BeautifulSoup(r.json()["msg"][1], "lxml")
             self._write_activity_entries(text_file, new_soup)
             items = new_soup.find_all("div", class_="zm-profile-section-item zm-item clearfix")
             try:
                 next_start = items[-1]["data-time"]
             except (IndexError, KeyError):
                 # No item to continue from: this was the last batch.
                 break
             data["start"] = next_start
             info += 1
         text_file.write("Approximately {0} pieces of activities".format(info * 10))
     return

 def _write_activity_entries(self, text_file, container):
     """Write text, time and URL of every activity found under ``container``.

     Raises:
         ValueError: if the numbers of activity entries and timestamps differ.
     """
     activities = container.find_all("div", class_="zm-profile-section-main zm-profile-section-"
                                                   "activity-main zm-profile-activity-page-item-main")
     times = container.find_all("span", class_="zm-profile-setion-time zg-gray zg-right")
     if len(times) != len(activities):
         raise ValueError("Bug in save_all_activities")
     for activity, time_tag in zip(activities, times):
         text_file.write(activity.text[:-1])
         text_file.write(time_tag.text + "\n\n")
         link = activity.a.next_sibling.next_sibling
         try:
             href = link["href"]
         except (AttributeError, KeyError, TypeError):
             # That node has no usable href; fall back to the next sibling.
             href = link.next_sibling["href"]
         # Only relative links get the site prefix. The previous check
         # compared a 3-character slice with "http" and so never matched.
         if not href.startswith("http"):
             href = Zhihu + href
         text_file.write("url is " + href + "\n")
Exemplo n.º 6
0
 def save_followees_profile(self):
     """Save the profile of every followee of the user to an .xls workbook.

     The workbook is saved as ``<user id> followee list.xls`` with one row
     per followee: url, id, follower_num, asks_num, answers_num, agree_num
     and an is_robot flag (1 when all of the followee's counts are below the
     thresholds used in the loop, otherwise 0).

     Prints a notice and returns without writing anything when ``self.url``
     is None or when the user has no followees.
     """
     if self.url is None:
         print("Anonymous user, cannot save followees profile")
         return
     if self.user_session is None:
         self.user_session = login.log_in()
     followee_num = self.get_followee_num()
     if followee_num == 0:
         print("No followee")
         return
     followee_url = self.url + "/followees"
     cookie = login.get_cookie()
     r = self.user_session.get(followee_url)
     soup = BeautifulSoup(r.content, "lxml")
     _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
     # The list container carries the request parameters in its
     # "data-init" attribute; they are reused for every page request.
     soup1 = soup.find("div", class_="zh-general-list clearfix")
     params = literal_eval(soup1['data-init'])['params']
     post_url = "http://www.zhihu.com/node/ProfileFolloweesListV2"
     header = {
         'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         'Host': "www.zhihu.com",
         'Referer': followee_url,
     }
     book = xlwt.Workbook(encoding="utf-8")
     new_sheet = book.add_sheet("FolloweeList")
     column_titles = ("url", "id", "follower_num", "asks_num",
                      "answers_num", "agree_num", "is_robot")
     for column, column_title in enumerate(column_titles):
         new_sheet.write(0, column, column_title)
     row = 1
     for i in xrange((followee_num - 1) // 20 + 1):
         if i % 100 == 0 and i != 0:
             print("Have recorded {0} followees".format(i * 20))
         params['offset'] = i * 20
         data = {'_xsrf': _xsrf, 'method': "next", 'params': json.dumps(params)}
         response = self.user_session.post(post_url, data=data, headers=header, cookies=cookie)
         for j in response.json()["msg"]:
             main_soup = BeautifulSoup(j, "lxml")
             title_link = main_soup.find("h2", class_="zm-list-content-title").a
             info_list = main_soup.find_all("a", class_="zg-link-gray-normal")
             follower_num = int(info_list[0].text.split()[0])
             asks_num = int(info_list[1].text.split()[0])
             answers_num = int(info_list[2].text.split()[0])
             agree_num = int(info_list[3].text.split()[0])
             # The heuristic must look at this followee's own follower
             # count. The previous code tested followee_num, which is the
             # profile owner's total and never changes inside the loop.
             if follower_num < 5 and asks_num < 2 and answers_num < 2 and agree_num < 2:
                 is_robot = 1
             else:
                 is_robot = 0
             new_sheet.write(row, 0, title_link["href"])
             new_sheet.write(row, 1, title_link["title"])
             new_sheet.write(row, 2, follower_num)
             new_sheet.write(row, 3, asks_num)
             new_sheet.write(row, 4, answers_num)
             new_sheet.write(row, 5, agree_num)
             new_sheet.write(row, 6, is_robot)
             row += 1
     book.save(self.get_id() + " followee list.xls")
     return
 def save_all_voters_profile(self):
     """Save the profile of every voter of the answer to an .xls workbook.

     One row is written per voter: url, id, agree_num, thanks_num, ask_num,
     answer_num and an is_robot flag. For a voter without a profile link
     only "Anonymous user" is written in the url column.

     Returns:
         The fraction of voters flagged as robots, as a float; 0.0 when the
         answer has no votes.

     Raises:
         ValueError: if ``self.url`` is None.
     """
     if self.url is None:
         raise ValueError("Did not found url for the answer")
     if self.session is None:
         self.session = login.log_in()
     if self.soup is None:
         self.parser()
     answer_id = self.soup.find("div", class_="zm-item-answer ")["data-aid"]
     voters_profile_url = Zhihu + "/answer/" + answer_id + "/voters_profile"
     cookie = login.get_cookie()
     header = {
         'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         'Host': "www.zhihu.com",
         'Referer': self.url
     }
     book = xlwt.Workbook(encoding="utf-8")
     new_sheet = book.add_sheet("Voter_profile")
     column_titles = ("url", "id", "agree_num", "thanks_num",
                      "ask_num", "answer_num", "is_robot")
     for column, column_title in enumerate(column_titles):
         new_sheet.write(0, column, column_title)
     total_vote = self.get_vote_num()
     row = 1
     robot_vote = 0
     for i in xrange((total_vote - 1) // 10 + 1):
         data = {"total": total_vote, "offset": 10 * i}
         # NOTE(review): the paging values are passed as a request body on
         # a GET; confirm the endpoint reads them from there.
         r = self.session.get(voters_profile_url, headers=header, data=data, cookies=cookie)
         for j in r.json()["payload"]:
             if row % 100 == 0:
                 print("Have saved {0} voter profiles".format(row))
             soup = BeautifulSoup(j, "lxml")
             try:
                 author_link = soup.find("div", class_="author ellipsis").a
                 voter_url = author_link["href"]
             except (AttributeError, TypeError, KeyError):
                 # No author link in this entry: record it as anonymous.
                 new_sheet.write(row, 0, "Anonymous user")
                 row += 1
                 continue
             new_sheet.write(row, 0, voter_url)
             new_sheet.write(row, 1, author_link["title"])
             status = soup.find("ul", class_="status")
             agree_text = status.li.span.text.split()[0]
             thanks_text = status.li.next_sibling.next_sibling.span.text.split()[0]
             # Counts that are not plain integers are stored as text. Such a
             # voter is never flagged as a robot, which is also what the
             # Python 2 str-vs-int comparison produced before.
             counts_are_numeric = True
             try:
                 voter_agree_num = int(agree_text)
             except ValueError:
                 voter_agree_num = agree_text
                 counts_are_numeric = False
             try:
                 voter_thanks_num = int(thanks_text)
             except ValueError:
                 voter_thanks_num = thanks_text
                 counts_are_numeric = False
             phone_hidden_items = soup.find_all("li", class_="hidden-phone")
             voter_ask_num = int(phone_hidden_items[0].a.text.split()[0])
             voter_answer_num = int(phone_hidden_items[1].a.text.split()[0])
             new_sheet.write(row, 2, voter_agree_num)
             new_sheet.write(row, 3, voter_thanks_num)
             new_sheet.write(row, 4, voter_ask_num)
             new_sheet.write(row, 5, voter_answer_num)
             if (counts_are_numeric and voter_agree_num < 1 and voter_thanks_num < 1
                     and voter_ask_num < 1 and voter_answer_num < 2):
                 voter_is_robot = 1
                 robot_vote += 1
             else:
                 voter_is_robot = 0
             new_sheet.write(row, 6, voter_is_robot)
             row += 1
     # NOTE(review): self.url[20:] presumably strips a fixed URL prefix to
     # build the file name -- confirm.
     book.save(self.url[20:].replace("/", " ") + " voter profile(answer).xls")
     if total_vote == 0:
         # Avoid dividing by zero for an answer without votes.
         return 0.0
     return float(robot_vote) / total_vote
Exemplo n.º 8
0
 def save_all_followers_profile(self):
     """Save the profile of every follower of the question to an .xls workbook.

     The workbook is saved as ``<question title> followers profile(question).xls``
     (any "/" removed from the title) with one row per follower: url, id,
     follower_num, ask_num, answer_num, agree_num and an is_robot flag. An
     entry whose profile fields cannot be located gets only
     "Anonymous user" in the url column.

     Raises:
         ValueError: if ``self.url`` is None.
     """
     if self.url is None:
         raise ValueError("Did not found url for the question")
     if self.session is None:
         self.session = login.log_in()
     url = self.url + "/followers"
     follower_num = self.get_follower_num()
     r = self.session.get(url)
     soup = BeautifulSoup(r.content, "lxml")
     _xsrf = soup.find("input", attrs={'name': '_xsrf'})["value"]
     header = {
         'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
         'Host': "www.zhihu.com",
         'Referer': url
     }
     book = xlwt.Workbook(encoding="utf-8")
     new_sheet = book.add_sheet("Follower_profile")
     column_titles = ("url", "id", "follower_num", "ask_num",
                      "answer_num", "agree_num", "is_robot")
     for column, column_title in enumerate(column_titles):
         new_sheet.write(0, column, column_title)
     cookie = login.get_cookie()
     row = 1
     for i in xrange((follower_num - 1) // 20 + 1):
         data = {"offset": 20 * i, "start": 0, "_xsrf": _xsrf}
         r1 = self.session.post(url, headers=header, data=data, cookies=cookie)
         temp_soup = BeautifulSoup(r1.json()["msg"][1], "lxml")
         user_list_raw = temp_soup.find_all("div", class_="zm-profile-card zm-profile-section-item zg-clear no-hovercard")
         for j in user_list_raw:
             # Extract everything before writing any cell. xlwt refuses to
             # write the same cell twice on a sheet created without
             # cell_overwrite_ok, so a fallback write after a partly
             # written row would itself raise.
             try:
                 user_url = j.h2.a["href"]
                 user_id = j.find("a", class_="zm-item-link-avatar")["title"]
                 sub_soup = j.find_all("a", class_="zg-link-gray-normal")
                 count_texts = [sub_soup[k].text.split()[0] for k in range(4)]
             except (AttributeError, TypeError, KeyError, IndexError):
                 new_sheet.write(row, 0, "Anonymous user")
                 row += 1
                 continue
             # Counts that are not plain integers are stored as text, and
             # such a follower is never flagged as a robot.
             counts = []
             counts_are_numeric = True
             for count_text in count_texts:
                 try:
                     counts.append(int(count_text))
                 except ValueError:
                     counts.append(count_text)
                     counts_are_numeric = False
             user_follower, user_asks, user_answers, user_agrees = counts
             if (counts_are_numeric and user_follower < 2 and user_asks < 1
                     and user_answers < 2 and user_agrees < 3):
                 is_robot = 1
             else:
                 is_robot = 0
             new_sheet.write(row, 0, user_url)
             new_sheet.write(row, 1, user_id)
             new_sheet.write(row, 2, user_follower)
             new_sheet.write(row, 3, user_asks)
             new_sheet.write(row, 4, user_answers)
             new_sheet.write(row, 5, user_agrees)
             new_sheet.write(row, 6, is_robot)
             row += 1
     book.save(self.get_title().replace("/", "") + " followers profile(question).xls")
     return