Python is_pub_page 예제들, parse.is_pub_page Python 예제들

예제 #1

0

파일 보기

파일: controller.py 프로젝트: iswangheng/Website_Crawler

 def get_user_following(self, headers, user_id):
     """Fetch a user's weibo.cn "follow" page and collect the following URLs.

     Requests ``http://weibo.cn/<user_id>/follow`` through ``self.opener``
     and passes the HTML to ``parse.get_following_url_list`` (called with
     ``page_num=1`` and ``total_page_num=111``). The number of collected
     URLs is printed.

     Args:
         headers: HTTP headers handed to ``urllib2.Request``.
         user_id: Identifier of the user; converted with ``str()`` to build
             the URL.

     Returns:
         ``False`` when ``parse.is_pub_page`` flags the response (treated
         as having been banned for crawling too fast). Otherwise nothing is
         returned explicitly (``None``), including after a ``URLError`` has
         been logged.
     """
     to_visit_url = 'http://weibo.cn/' + str(user_id) + '/follow'
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html = response.read()
         finally:
             # The body is fully read here; closing in ``finally`` also
             # releases the response on the early "banned" return below.
             response.close()
         # Being served the pub page means we crawled too fast and have
         # been detected and banned by sina.
         if parse.is_pub_page(html):
             self.logger.error("BANNED by sina, coz is_pub_page..")
             print("BANNED by sina, coz is_pub_page..sleep for 40 mins..")
             return False
         user_following_url_list = parse.get_following_url_list(
             user_id,
             html,
             page_num=1,
             total_page_num=111,
             headers=headers,
             opener=self.opener,
             logger=self.logger)
         print("In total %s has %s followings." % (
             str(user_id), str(len(user_following_url_list))))
     except URLError as e:
         # An HTTPError carries ``code``; a plain URLError only carries
         # ``reason``. Check them independently so that connection-level
         # failures are logged instead of being silently dropped.
         if hasattr(e, 'code'):
             self.logger.error("http url error code: %s" % e.code)
         if hasattr(e, 'reason'):
             self.logger.error("http url error reason: %s" % e.reason)

예제 #2

0

파일 보기

파일: controller.py 프로젝트: iswangheng/Website_Crawler

 def get_user_following(self, headers, user_id):
     """Fetch a user's weibo.cn "follow" page and collect the following URLs.

     Requests ``http://weibo.cn/<user_id>/follow`` through ``self.opener``
     and passes the HTML to ``parse.get_following_url_list`` (called with
     ``page_num=1`` and ``total_page_num=111``). The number of collected
     URLs is printed.

     Args:
         headers: HTTP headers handed to ``urllib2.Request``.
         user_id: Identifier of the user; converted with ``str()`` to build
             the URL.

     Returns:
         ``False`` when ``parse.is_pub_page`` flags the response (treated
         as having been banned for crawling too fast). Otherwise nothing is
         returned explicitly (``None``), including after a ``URLError`` has
         been logged.
     """
     to_visit_url = "http://weibo.cn/" + str(user_id) + "/follow"
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html = response.read()
         finally:
             # The body is fully read here; closing in ``finally`` also
             # releases the response on the early "banned" return below.
             response.close()
         # Being served the pub page means we crawled too fast and have
         # been detected and banned by sina.
         if parse.is_pub_page(html):
             self.logger.error("BANNED by sina, coz is_pub_page..")
             print("BANNED by sina, coz is_pub_page..sleep for 40 mins..")
             return False
         user_following_url_list = parse.get_following_url_list(
             user_id, html, page_num=1, total_page_num=111, headers=headers, opener=self.opener, logger=self.logger
         )
         print("In total %s has %s followings." % (str(user_id), str(len(user_following_url_list))))
     except URLError as e:
         # An HTTPError carries ``code``; a plain URLError only carries
         # ``reason``. Check them independently so that connection-level
         # failures are logged instead of being silently dropped.
         if hasattr(e, "code"):
             self.logger.error("http url error code: %s" % e.code)
         if hasattr(e, "reason"):
             self.logger.error("http url error reason: %s" % e.reason)

예제 #3

0

파일 보기

파일: controller.py 프로젝트: iswangheng/Website_Crawler

 def get_user_info(self, headers, user_url):
     """Fetch, parse and store one weibo.cn user unless already stored.

     ``user_url`` is either ``"u/<id>"`` or a user name. If the user is
     already stored (``self.is_stored_user`` / ``self.is_stored_username``)
     nothing is requested. Otherwise the user's home page and then the
     ``/info`` page are requested through ``self.opener``, parsed with
     ``parse.parse_user_home`` / ``parse.parse_user_info`` and, when the
     parsed ``screen_name`` is non-empty, stored through
     ``self.store_user_into_db``. After a successful info request the
     method sleeps for a random 12-23 seconds.

     Args:
         headers: HTTP headers handed to ``urllib2.Request``.
         user_url: ``"u/<id>"`` or a user name, relative to
             ``http://weibo.cn/``.

     Returns:
         A tuple ``(is_banned, user_id)``. ``is_banned`` is ``True`` when
         ``parse.is_pub_page`` flags a response or when the home-page
         request raises ``URLError`` (``user_id`` is then ``0``).
     """
     is_banned = False
     user_id = 0
     # Test the prefix, not a substring: the slice below strips exactly the
     # leading "u/", so a name that merely contains "u/" must stay intact.
     if user_url.startswith('u/'):
         # "u/***": *** is a number, namely the user_id
         user_url = user_url[2:]
         user_id = user_url
     # If user_url was the user_id, the username is that id as well;
     # otherwise the username is user_url itself.
     username = user_url
     # Do not proceed for users that already exist in the database.
     if user_id == 0:  # user_id still unknown, look up by name
         is_stored = self.is_stored_username(username)
     else:  # already have the user_id
         is_stored = self.is_stored_user(user_id)
     if is_stored:
         user_id = self.get_userid_by_username(username)
         print('%s has been stored already' % user_id)
         return is_banned, user_id

     # New user: fetch the home page first. user_home contains the
     # user_id plus the following, follower and status counts.
     to_visit_url = 'http://weibo.cn/' + str(user_url)
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html = response.read()
         finally:
             # Close on every path, including the "banned" return below.
             response.close()
         if parse.is_pub_page(html):
             is_banned = True
             return is_banned, user_id
         user_home = parse.parse_user_home(html)
         print(user_home)
         user_id = user_home['user_id']
     except URLError as e:
         user_id = 0
         is_banned = True
         # ``code`` exists on HTTPError only, ``reason`` on any URLError;
         # check them independently so both kinds of failure are logged.
         if hasattr(e, 'code'):
             self.logger.error("http url error code: %s" % e.code)
         if hasattr(e, 'reason'):
             self.logger.error("http url error reason: %s" % e.reason)
         return is_banned, user_id

     # Then fetch the user info page.
     to_visit_url = 'http://weibo.cn/' + str(user_id) + "/info"
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html_str = response.read()
         finally:
             response.close()
         if parse.is_pub_page(html_str):
             is_banned = True
             return is_banned, user_id
         user_info = parse.parse_user_info(str(html_str), user_id,
                                           headers, self.opener,
                                           self.logger)
         # Store user_home (the counters) and user_info into the database.
         if user_info['screen_name'] != '':
             self.store_user_into_db(user_home, user_info, username)
         time_sleep = random.randint(12, 23)
         print("after requesting the user info, sleep for %s secs" % str(
             time_sleep))
         time.sleep(time_sleep)
     except URLError as e:
         if hasattr(e, 'code'):
             self.logger.error("http url error code: %s" % e.code)
         if hasattr(e, 'reason'):
             self.logger.error("http url error reason: %s" % e.reason)
     # Return the same tuple shape as every other exit path.
     return is_banned, user_id

예제 #4

0

파일 보기

파일: controller.py 프로젝트: iswangheng/Website_Crawler

 def get_user_info(self, headers, user_url):
     """Fetch, parse and store one weibo.cn user unless already stored.

     ``user_url`` is either ``"u/<id>"`` or a user name. If the user is
     already stored (``self.is_stored_user`` / ``self.is_stored_username``)
     nothing is requested. Otherwise the user's home page and then the
     ``/info`` page are requested through ``self.opener``, parsed with
     ``parse.parse_user_home`` / ``parse.parse_user_info`` and, when the
     parsed ``screen_name`` is non-empty, stored through
     ``self.store_user_into_db``. After a successful info request the
     method sleeps for a random 12-23 seconds.

     Args:
         headers: HTTP headers handed to ``urllib2.Request``.
         user_url: ``"u/<id>"`` or a user name, relative to
             ``http://weibo.cn/``.

     Returns:
         A tuple ``(is_banned, user_id)``. ``is_banned`` is ``True`` when
         ``parse.is_pub_page`` flags a response or when the home-page
         request raises ``URLError`` (``user_id`` is then ``0``).
     """
     is_banned = False
     user_id = 0
     # Test the prefix, not a substring: the slice below strips exactly the
     # leading "u/", so a name that merely contains "u/" must stay intact.
     if user_url.startswith("u/"):
         # "u/***": *** is a number, namely the user_id
         user_url = user_url[2:]
         user_id = user_url
     # If user_url was the user_id, the username is that id as well;
     # otherwise the username is user_url itself.
     username = user_url
     # Do not proceed for users that already exist in the database.
     if user_id == 0:  # user_id still unknown, look up by name
         is_stored = self.is_stored_username(username)
     else:  # already have the user_id
         is_stored = self.is_stored_user(user_id)
     if is_stored:
         user_id = self.get_userid_by_username(username)
         print("%s has been stored already" % user_id)
         return is_banned, user_id

     # New user: fetch the home page first. user_home contains the
     # user_id plus the following, follower and status counts.
     to_visit_url = "http://weibo.cn/" + str(user_url)
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html = response.read()
         finally:
             # Close on every path, including the "banned" return below.
             response.close()
         if parse.is_pub_page(html):
             is_banned = True
             return is_banned, user_id
         user_home = parse.parse_user_home(html)
         print(user_home)
         user_id = user_home["user_id"]
     except URLError as e:
         user_id = 0
         is_banned = True
         # ``code`` exists on HTTPError only, ``reason`` on any URLError;
         # check them independently so both kinds of failure are logged.
         if hasattr(e, "code"):
             self.logger.error("http url error code: %s" % e.code)
         if hasattr(e, "reason"):
             self.logger.error("http url error reason: %s" % e.reason)
         return is_banned, user_id

     # Then fetch the user info page.
     to_visit_url = "http://weibo.cn/" + str(user_id) + "/info"
     req = urllib2.Request(url=to_visit_url, headers=headers)
     try:
         response = self.opener.open(req)
         try:
             html_str = response.read()
         finally:
             response.close()
         if parse.is_pub_page(html_str):
             is_banned = True
             return is_banned, user_id
         user_info = parse.parse_user_info(str(html_str), user_id, headers, self.opener, self.logger)
         # Store user_home (the counters) and user_info into the database.
         if user_info["screen_name"] != "":
             self.store_user_into_db(user_home, user_info, username)
         time_sleep = random.randint(12, 23)
         print("after requesting the user info, sleep for %s secs" % str(time_sleep))
         time.sleep(time_sleep)
     except URLError as e:
         if hasattr(e, "code"):
             self.logger.error("http url error code: %s" % e.code)
         if hasattr(e, "reason"):
             self.logger.error("http url error reason: %s" % e.reason)
     # Return the same tuple shape as every other exit path.
     return is_banned, user_id