def get_user_following(self, headers, user_id):
    """Collect the following URLs of ``user_id`` from weibo.cn.

    Requests ``http://weibo.cn/<user_id>/follow`` through ``self.opener``
    and hands the first page to ``parse.get_following_url_list``, which is
    also given the opener, headers and logger. The number of URLs it
    returns is printed.

    :param headers: HTTP headers sent with the request.
    :param user_id: id of the user whose followings are listed.
    :returns: ``False`` when the fetched page is the "pub" page (requests
        came too fast and the crawler was detected and banned by sina);
        otherwise ``None``. ``URLError`` is logged, not raised.
    """
    to_visit_url = 'http://weibo.cn/' + str(user_id) + '/follow'
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html = response.read()
        finally:
            # Release the connection on every path, including the banned
            # early return below and any exception raised while reading.
            response.close()

        # if is the pub page, means too fast.. have been detected and
        # banned by sina
        if parse.is_pub_page(html):
            self.logger.error("BANNED by sina, coz is_pub_page..")
            # NOTE(review): the message announces a 40 minute sleep but no
            # sleep happens here -- presumably the caller does the waiting.
            print("BANNED by sina, coz is_pub_page..sleep for 40 mins..")
            return False

        user_following_url_list = parse.get_following_url_list(
            user_id, html, page_num=1, total_page_num=111, headers=headers,
            opener=self.opener, logger=self.logger)
        print("In total %s has %s followings." % (
            str(user_id), str(len(user_following_url_list))))
    except URLError as e:
        # HTTP-level failures carry ``code``; lower-level ones ``reason``.
        if hasattr(e, 'code'):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, 'reason'):
            self.logger.error("http url error reason: %s" % e.reason)
def get_user_following(self, headers, user_id):
    """Fetch the first "follow" page of a user and list who they follow.

    The page ``http://weibo.cn/<user_id>/follow`` is requested with
    ``self.opener``; its HTML is passed to
    ``parse.get_following_url_list`` together with the opener, headers and
    logger. The size of the resulting list is printed.

    :param headers: HTTP headers sent with the request.
    :param user_id: id of the user whose followings are listed.
    :returns: ``False`` if the "pub" page came back (too many requests;
        detected and banned by sina), ``None`` in every other case.
        ``URLError`` is caught and logged.
    """
    to_visit_url = "http://weibo.cn/" + str(user_id) + "/follow"
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html = response.read()
        finally:
            # Always close the response; the original skipped this on the
            # banned early return and leaked the connection.
            response.close()

        # if is the pub page, means too fast.. have been detected and
        # banned by sina
        if parse.is_pub_page(html):
            self.logger.error("BANNED by sina, coz is_pub_page..")
            # NOTE(review): nothing sleeps here despite the message --
            # presumably the caller is expected to back off.
            print("BANNED by sina, coz is_pub_page..sleep for 40 mins..")
            return False

        user_following_url_list = parse.get_following_url_list(
            user_id,
            html,
            page_num=1,
            total_page_num=111,
            headers=headers,
            opener=self.opener,
            logger=self.logger,
        )
        print(
            "In total %s has %s followings."
            % (str(user_id), str(len(user_following_url_list)))
        )
    except URLError as e:
        # HTTP-level failures carry ``code``; lower-level ones ``reason``.
        if hasattr(e, "code"):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, "reason"):
            self.logger.error("http url error reason: %s" % e.reason)
def get_user_info(self, headers, user_url):
    """Crawl one weibo.cn user and store the profile when it is new.

    Steps:

    1. If ``user_url`` contains ``u/`` its first two characters are dropped
       and the remainder is used as the user id. The (possibly stripped)
       ``user_url`` is always used as the username.
    2. The database is consulted (``is_stored_user`` when an id is known,
       ``is_stored_username`` otherwise). A stored user is not fetched
       again; its id is looked up with ``get_userid_by_username``.
    3. Otherwise the user's home page is fetched and parsed with
       ``parse.parse_user_home`` (user id plus following, follower and
       status counts), then ``/<user_id>/info`` is fetched and parsed with
       ``parse.parse_user_info``.
    4. When the parsed ``screen_name`` is non-empty both dicts are stored
       with ``store_user_into_db``; the method then sleeps 12-23 seconds.

    :param headers: HTTP headers sent with each request.
    :param user_url: path part of the user's weibo.cn URL, either
        ``u/<id>`` or a user name.
    :returns: ``(is_banned, user_id)``. ``is_banned`` is ``True`` when a
        "pub" page was returned or the home page request raised
        ``URLError``; in the latter case ``user_id`` is ``0``.
    """
    is_banned = False
    user_id = 0
    if 'u/' in user_url:
        # means that u/***, *** is a number namely user_id
        user_url = user_url[2:]
        user_id = user_url
    # if user_url is the user_id, then username will also be user_id
    # else, the username would be the user_url
    username = user_url

    # Skip users that are already in the database.
    if user_id == 0:
        # if still not get the user_id
        is_stored = self.is_stored_username(username)
    else:
        # already have the user_id
        is_stored = self.is_stored_user(user_id)
    if is_stored:
        user_id = self.get_userid_by_username(username)
        print('%s has been stored already' % user_id)
        return is_banned, user_id

    # A new user: fetch the home page first.
    to_visit_url = 'http://weibo.cn/' + str(user_url)
    req = urllib2.Request(url=to_visit_url, headers=headers)
    # user_home contains the user_id, following, follower, and status count
    try:
        response = self.opener.open(req)
        try:
            html = response.read()
        finally:
            # Close on every path; the banned early return used to leak it.
            response.close()
        if parse.is_pub_page(html):
            is_banned = True
            return is_banned, user_id
        user_home = parse.parse_user_home(html)
        print(user_home)
        user_id = user_home['user_id']
    except URLError as e:
        user_id = 0
        is_banned = True
        if hasattr(e, 'code'):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, 'reason'):
            self.logger.error("http url error reason: %s" % e.reason)
        return is_banned, user_id

    # Then fetch the detailed info page.
    to_visit_url = 'http://weibo.cn/' + str(user_id) + "/info"
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html_str = response.read()
        finally:
            response.close()
        if parse.is_pub_page(html_str):
            is_banned = True
            return is_banned, user_id
        user_info = parse.parse_user_info(
            str(html_str), user_id, headers, self.opener, self.logger)
        # store the user_home(u know, those numbers) and user_info into
        # database
        if user_info['screen_name'] != '':
            self.store_user_into_db(user_home, user_info, username)
        # Pause before the next user; presumably this keeps the request
        # rate low enough to avoid the ban detected above.
        time_sleep = random.randint(12, 23)
        print("after requesting the user info, sleep for %s secs" % str(
            time_sleep))
        time.sleep(time_sleep)
    except URLError as e:
        if hasattr(e, 'code'):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, 'reason'):
            self.logger.error("http url error reason: %s" % e.reason)

    # Return the same tuple as every early exit instead of falling off the
    # end with None, which breaks callers that unpack the result.
    return is_banned, user_id
def get_user_info(self, headers, user_url):
    """Fetch, parse and store the profile of a single weibo.cn user.

    ``user_url`` is the path part of the user's page. When it contains
    ``u/`` the first two characters are removed and the rest is taken as
    the user id; the (possibly stripped) value also serves as the username.

    Users already present in the database (checked with ``is_stored_user``
    when an id is known, ``is_stored_username`` otherwise) are not fetched
    again. For a new user the home page is parsed with
    ``parse.parse_user_home`` and the ``/<user_id>/info`` page with
    ``parse.parse_user_info``; if the resulting ``screen_name`` is not
    empty both results go to ``store_user_into_db``. The method then sleeps
    for a random 12-23 seconds.

    :param headers: HTTP headers sent with each request.
    :param user_url: ``u/<id>`` or a user name.
    :returns: ``(is_banned, user_id)``. ``is_banned`` is ``True`` when a
        "pub" page was returned or the home page request raised
        ``URLError`` (``user_id`` is then ``0``).
    """
    is_banned = False
    user_id = 0
    if "u/" in user_url:
        # means that u/***, *** is a number namely user_id
        user_url = user_url[2:]
        user_id = user_url
    # if user_url is the user_id, then username will also be user_id
    # else, the username would be the user_url
    username = user_url

    # Do not proceed for users that exist in the database already.
    if user_id == 0:
        # if still not get the user_id
        is_stored = self.is_stored_username(username)
    else:
        # already have the user_id
        is_stored = self.is_stored_user(user_id)
    if is_stored:
        user_id = self.get_userid_by_username(username)
        print("%s has been stored already" % user_id)
        return is_banned, user_id

    # A new user: request the home page.
    to_visit_url = "http://weibo.cn/" + str(user_url)
    req = urllib2.Request(url=to_visit_url, headers=headers)
    # user_home contains the user_id, following, follower, and status count
    try:
        response = self.opener.open(req)
        try:
            html = response.read()
        finally:
            # The response is closed on every path; previously the banned
            # early return skipped the close and leaked the connection.
            response.close()
        if parse.is_pub_page(html):
            is_banned = True
            return is_banned, user_id
        user_home = parse.parse_user_home(html)
        print(user_home)
        user_id = user_home["user_id"]
    except URLError as e:
        user_id = 0
        is_banned = True
        if hasattr(e, "code"):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, "reason"):
            self.logger.error("http url error reason: %s" % e.reason)
        return is_banned, user_id

    # Request the detailed info page.
    to_visit_url = "http://weibo.cn/" + str(user_id) + "/info"
    req = urllib2.Request(url=to_visit_url, headers=headers)
    try:
        response = self.opener.open(req)
        try:
            html_str = response.read()
        finally:
            response.close()
        if parse.is_pub_page(html_str):
            is_banned = True
            return is_banned, user_id
        user_info = parse.parse_user_info(
            str(html_str), user_id, headers, self.opener, self.logger
        )
        # store the user_home(u know, those numbers) and user_info into
        # database
        if user_info["screen_name"] != "":
            self.store_user_into_db(user_home, user_info, username)
        # Pause before the next user; presumably this keeps the request
        # rate low enough to avoid the ban detected above.
        time_sleep = random.randint(12, 23)
        print("after requesting the user info, sleep for %s secs" % str(time_sleep))
        time.sleep(time_sleep)
    except URLError as e:
        if hasattr(e, "code"):
            self.logger.error("http url error code: %s" % e.code)
        if hasattr(e, "reason"):
            self.logger.error("http url error reason: %s" % e.reason)

    # Every exit now yields the (is_banned, user_id) tuple; the original
    # fell through here and returned None.
    return is_banned, user_id