示例#1
0
    def crawl(self, url, is_again=True):
        """Fetch ``url`` through the current proxy and parse it into weibo items.

        A fetch/parse exception, or an empty result that
        ``zero_aviable_check_validity`` does not confirm as genuinely empty,
        counts as a failure: the proxy is dropped (only if it is still the
        one this call started with) and the crawl is retried once. When the
        retry fails too, ``url`` is put on ``self.second_url_queue``.

        Args:
            url: Page URL. When items are found, the text after its last
                ``=`` is parsed with ``int`` (presumably the page number).
            is_again: Whether one more attempt may be made after a failure.

        Returns:
            The parsed list on success, otherwise the empty result list.
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " start to crawl ! " + url)

        weibo_list = []
        try:
            page = craw_object.get_page()
            weibo_list = page_parser_from_search(page)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # can still stop the crawler.
            print(traceback.format_exc())
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                # Only drop the proxy if nobody else has replaced it already.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Always release, even if del_proxy()/logging raises.
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            self.second_url_queue.put(url)
            return weibo_list

        if len(weibo_list) == 0:
            if zero_aviable_check_validity(page):
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " get nothing, sina does not have ! " + url)
                return weibo_list
            if weibo_guangchang_forbidden(page):
                # Logged only; the proxy is still rotated below.
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " get nothing, forbidden ! ! " + url)

            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " get nothing, change proxy ! " + url)
            finally:
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            self.second_url_queue.put(url)
            return weibo_list

        # Only when the trailing number is 1 are the follow-up URLs queued,
        # using the total count carried by the first parsed item.
        if int(url[url.rfind('=') + 1:]) == 1:
            total_num = weibo_list[0].all_weibo_num
            self.put_second_and_more_url_queue(total_num, url)
        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " crawl success! " + url)
        return weibo_list
示例#2
0
 def setUpClass():
     """Start Firefox, log in, and store the page and driver on ``TestCase``.

     After logging in, the page returned by ``continue_quest()`` on the
     bubble page is saved as ``TestCase.achives_page`` and its
     ``skip_explaining()`` is called; the driver is saved as
     ``TestCase.driver``.
     """
     browser = webdriver.Firefox()
     browser.implicitly_wait(60)
     Loginer(LOGIN, PASSWORD, SOCIAL_NETWORK, browser).log_in()
     landing_page = MainPage(browser, TESTED_URL)
     # The return value of login_with() is not used here.
     landing_page.login_with(SOCIAL_NETWORK)
     bubble_page = landing_page.go_bubble_page()
     TestCase.achives_page = bubble_page.continue_quest()
     TestCase.achives_page.skip_explaining()
     TestCase.driver = browser
    def crawl(self, url, is_again=True, two_again=True):
        """Fetch ``url`` through the current proxy and parse it into comments.

        A fetch/parse exception, or an empty result on a page that
        ``no_one_commented`` does not recognise, counts as a failure: the
        proxy is dropped (only if it is still the one this call started
        with) and the crawl is retried. With the default flags this gives
        up to three attempts in total.

        Args:
            url: Page URL to crawl.
            is_again: Whether the first retry is still available.
            two_again: Whether the second retry is still available.

        Returns:
            The parsed comment list, or the empty result when every attempt
            failed or nobody has commented.
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " start to crawl ! " + url)

        comment_list = []
        page = ""
        try:
            page = craw_object.get_page()
            # Parse the page into individual comment records.
            comment_list = page_parser_from_search_for_comment(page)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # can still stop the crawler.
            print(traceback.format_exc())
            crawl_comment.del_proxy_lock.acquire()
            try:
                # Only drop the proxy if nobody else has replaced it already.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Always release, even if del_proxy()/logging raises.
                crawl_comment.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list

        if len(comment_list) != 0:
            return comment_list

        # Nobody has commented on this weibo yet: a genuinely empty result.
        if no_one_commented(page):
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " nobody commented !")
            return comment_list

        crawl_comment.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " comment_list is 0 , change proxy !")
        finally:
            crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        if two_again:
            return self.crawl(url, is_again=False, two_again=False)
        return comment_list
示例#4
0
 def setUpClass():
     """Start Firefox, log in, and store page, driver and config on ``TestCase``.

     After logging in, the page returned by ``continue_quest()`` on the
     panda page is saved as ``TestCase.achives_page`` and its
     ``skip_explaining()`` is called. The driver is saved as
     ``TestCase.driver`` and a ``SafeConfigParser`` loaded from
     ``CONFIG_PATH`` as ``TestCase.config``.
     """
     browser = webdriver.Firefox()
     browser.implicitly_wait(60)
     Loginer(LOGIN, PASSWORD, SOCIAL_NETWORK, browser).log_in()
     landing_page = MainPage(browser, TESTED_URL)
     # Navigation continues from whatever login_with() returns.
     logged_in_page = landing_page.login_with(SOCIAL_NETWORK)
     panda_page = logged_in_page.go_panda_page()
     TestCase.achives_page = panda_page.continue_quest()
     TestCase.achives_page.skip_explaining()
     TestCase.driver = browser
     TestCase.config = SafeConfigParser()
     TestCase.config.read(CONFIG_PATH)
示例#5
0
    def crawl(self, uid_or_nickname, is_again=False):
        """Fetch and parse the ``/info`` page of one weibo.cn user.

        Args:
            uid_or_nickname: Identifier placed directly in the URL path
                ``http://weibo.cn/<uid_or_nickname>/info``.
            is_again: When true, one more attempt is made if fetching or
                parsing raises.

        Returns:
            Whatever ``page_parser_from_search_for_UserInfo`` returns, or
            ``""`` when the page could not be fetched or parsed.
        """
        # The quoted form is not used for the URL below (a nickname-based
        # "http://weibo.cn/n/<quoted>" scheme was disabled); the attempt is
        # kept because it reports identifiers that cannot be quoted.
        try:
            quote_plus(str(uid_or_nickname.strip()))
        except Exception:
            print(traceback.format_exc())
            print(uid_or_nickname)

        url = "http://weibo.cn/" + uid_or_nickname + "/info"

        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()

        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " start to crawl ! " + url)

        user_info = ""
        try:
            page = craw_object.get_page()
            user_info = page_parser_from_search_for_UserInfo(page, url)
        except Exception:
            # Report the failure instead of swallowing it silently.
            print(traceback.format_exc())
            if is_again:
                # Retry with the original identifier. Passing the already
                # built URL here would nest it inside a second URL.
                return self.crawl(uid_or_nickname, is_again=False)

        return user_info
示例#6
0
    def crawl(self, uid_or_nickname, is_again=False):
        """Fetch and parse the weibo.cn page of a user not yet stored.

        Args:
            uid_or_nickname: User identifier. If URL-quoting leaves it
                unchanged it is used as ``http://weibo.cn/<id>?f=search_0``;
                otherwise the quoted form is used as
                ``http://weibo.cn/n/<quoted>``.
            is_again: When true, one more attempt is made if fetching or
                parsing raises.

        Returns:
            ``"nothing"`` when a matching record already exists in
            ``UserInfo_store`` or ``Bie_Ming_store``; otherwise whatever
            ``page_parser_from_search_for_UserInfo`` returns, or ``""`` when
            the page could not be fetched or parsed.
        """
        # Skip users that are already stored under either lookup field.
        already_stored = (
            len(UserInfo_store.objects(
                Q(uid_or_uname=str(uid_or_nickname)) |
                Q(nickname=str(uid_or_nickname)))) != 0 or
            len(Bie_Ming_store.objects(
                Q(uid_or_uname=str(uid_or_nickname)) |
                Q(bie_ming=str(uid_or_nickname)))) != 0)
        if already_stored:
            WeiboSearchLog().get_scheduler_logger().info(
                "already in the database : " + uid_or_nickname)
            return "nothing"

        quote_uid_or_nickname = ""
        try:
            quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
        except Exception:
            print(traceback.format_exc())
            print(uid_or_nickname)

        if quote_uid_or_nickname == uid_or_nickname:
            url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
        else:
            url = "http://weibo.cn/n/" + quote_uid_or_nickname

        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()

        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " start to crawl ! " + url)

        user_info = ""
        try:
            page = craw_object.get_page()
            user_info = page_parser_from_search_for_UserInfo(page, url)
        except Exception:
            # Report the failure instead of swallowing it silently.
            print(traceback.format_exc())
            if is_again:
                # Retry with the original identifier. Passing the already
                # built URL here would be quoted into a bogus /n/ URL.
                return self.crawl(uid_or_nickname, is_again=False)

        return user_info
示例#7
0
    def crawl(self, url, is_again=True):
        """Fetch ``url`` through the current proxy and extract a uid/uname.

        If fetching or parsing raises, the proxy is dropped (only if it is
        still the one this call started with) and the crawl is retried once.
        When the retry fails too, ``url`` is put on
        ``self.second_url_queue``.

        Args:
            url: Page URL to crawl.
            is_again: Whether one more attempt may be made after a failure.

        Returns:
            Whatever ``page_parser_from_search_for_uid`` returns, or ``""``
            when every attempt raised.
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " start to crawl ! " + url)

        uid_or_uname = ""
        try:
            page = craw_object.get_page()
            uid_or_uname = page_parser_from_search_for_uid(page)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # can still stop the crawler.
            print(traceback.format_exc())
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            try:
                # Only drop the proxy if nobody else has replaced it already.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Always release, even if del_proxy()/logging raises.
                crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            self.second_url_queue.put(url)
            return uid_or_uname
        return uid_or_uname
示例#8
0
    def crawl(self):
        """Search weibo.cn for ``self.keyword`` and return the parsed results.

        Keeps trying until the parser returns a non-empty result. After
        every failed attempt the current proxy is dropped and the call
        sleeps for a random 0-9 whole seconds before trying again.

        Returns:
            The non-empty result of ``page_parser_from_search``.
        """
        # Real-time, original weibo only.
        self.data['advancedfilter'] = '1'
        self.data['keyword'] = self.keyword
        self.data['hasori'] = '1'
        # 'nick', 'starttime' and 'endtime' are not set here.
        self.data['sort'] = 'time'
        self.data['smblog'] = '搜索'
        url = 'http://weibo.cn/search/'

        # Loop instead of recursing: a long outage would otherwise exhaust
        # the interpreter's recursion limit.
        while True:
            loginer = Loginer()
            cookie = loginer.get_cookie()
            proxy = loginer.get_proxy()
            craw_object = Crawler_with_proxy(url, cookie, proxy)

            try:
                page = craw_object.get_page_with_form(self.data)
                weibo_list = page_parser_from_search(page)
            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit can break out of the retry loop.
                print(traceback.format_exc())
                failure = " proxy exception , change proxy !"
            else:
                if len(weibo_list) != 0:
                    return weibo_list
                failure = " get nothing, change proxy !"

            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(
                self.name + failure)
            time.sleep(int(random.random() * 10))
示例#9
0
 def crawl(self, url, is_again=True):
     """Fetch ``url`` through the current proxy and parse it into weibo items.

     A fetch/parse exception, or an empty result that
     ``zero_aviable_check_validity`` does not confirm as genuinely empty,
     counts as a failure: the proxy is dropped (only if it is still the
     one this call started with) and the crawl is retried once. When the
     retry fails too, ``url`` is put on ``self.second_url_queue``.

     Args:
         url: Page URL. When items are found, the text after its last
             ``=`` is parsed with ``int`` (presumably the page number).
         is_again: Whether one more attempt may be made after a failure.

     Returns:
         The parsed list on success, otherwise the empty result list.
     """
     loginer = Loginer()
     cookie = loginer.get_cookie()
     proxy = loginer.get_proxy()
     craw_object = Crawler_with_proxy(url, cookie, proxy)

     WeiboSearchLog().get_scheduler_logger().info(
         self.name + " start to crawl ! " + url)

     weibo_list = []
     try:
         page = craw_object.get_page()
         weibo_list = page_parser_from_search(page)
     except Exception:
         # Exception (not a bare except) so KeyboardInterrupt/SystemExit
         # can still stop the crawler.
         print(traceback.format_exc())
         crawl_set_time_with_keyword.del_proxy_lock.acquire()
         try:
             # Only drop the proxy if nobody else has replaced it already.
             if proxy == loginer.get_proxy():
                 loginer.del_proxy()
                 WeiboSearchLog().get_scheduler_logger().warning(
                     self.name + " proxy exception , change proxy !")
         finally:
             # Always release, even if del_proxy()/logging raises.
             crawl_set_time_with_keyword.del_proxy_lock.release()
         if is_again:
             return self.crawl(url, is_again=False)
         self.second_url_queue.put(url)
         return weibo_list

     if len(weibo_list) == 0:
         if zero_aviable_check_validity(page):
             WeiboSearchLog().get_scheduler_logger().info(
                 self.name + " get nothing, sina does not have ! " + url)
             return weibo_list
         if weibo_guangchang_forbidden(page):
             # Logged only; the proxy is still rotated below.
             WeiboSearchLog().get_scheduler_logger().info(
                 self.name + " get nothing, forbidden ! ! " + url)

         crawl_set_time_with_keyword.del_proxy_lock.acquire()
         try:
             if proxy == loginer.get_proxy():
                 loginer.del_proxy()
                 WeiboSearchLog().get_scheduler_logger().warning(
                     self.name + " get nothing, change proxy ! " + url)
         finally:
             crawl_set_time_with_keyword.del_proxy_lock.release()
         if is_again:
             return self.crawl(url, is_again=False)
         self.second_url_queue.put(url)
         return weibo_list

     # Only when the trailing number is 1 are the follow-up URLs queued,
     # using the total count carried by the first parsed item.
     if int(url[url.rfind('=') + 1:]) == 1:
         total_num = weibo_list[0].all_weibo_num
         self.put_second_and_more_url_queue(total_num, url)
     WeiboSearchLog().get_scheduler_logger().info(
         self.name + " crawl success! " + url)
     return weibo_list
示例#10
0
    def crawl(self, url, is_again=True, two_again=True):
        """Fetch ``url`` through the current proxy and parse it into reposts.

        A fetch/parse exception or an empty result counts as a failure: the
        proxy is dropped (only if it is still the one this call started
        with) and the crawl is retried. With the default flags this gives
        up to three attempts in total.

        Args:
            url: Page URL to crawl.
            is_again: Whether the first retry is still available.
            two_again: Whether the second retry is still available.

        Returns:
            The parsed repost list, or the empty result when every attempt
            failed.
        """
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(
            self.name + " start to crawl ! " + url)

        repost_list = []
        page = ""
        try:
            page = craw_object.get_page()
            # Parse the page into individual repost records.
            repost_list = page_parser_from_search_for_repost(page)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # can still stop the crawler.
            print(traceback.format_exc())
            crawl_repost.del_proxy_lock.acquire()
            try:
                # Only drop the proxy if nobody else has replaced it already.
                if proxy == loginer.get_proxy():
                    loginer.del_proxy()
                    WeiboSearchLog().get_scheduler_logger().warning(
                        self.name + " proxy exception , change proxy !")
            finally:
                # Always release, even if del_proxy()/logging raises.
                crawl_repost.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return repost_list

        if len(repost_list) != 0:
            return repost_list

        # An earlier no_one_commented(page) early-return was disabled here,
        # so an empty result is always treated as a failure.
        crawl_repost.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " repost_list is 0 , change proxy !")
        finally:
            crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        if two_again:
            return self.crawl(url, is_again=False, two_again=False)
        return repost_list
示例#11
0
 def crawl(self, url, is_again=True):
     """Fetch ``url`` through the current proxy and extract a uid/uname.

     If fetching or parsing raises, the proxy is dropped (only if it is
     still the one this call started with) and the crawl is retried once.
     When the retry fails too, ``url`` is put on
     ``self.second_url_queue``.

     Args:
         url: Page URL to crawl.
         is_again: Whether one more attempt may be made after a failure.

     Returns:
         Whatever ``page_parser_from_search_for_uid`` returns, or ``""``
         when every attempt raised.
     """
     loginer = Loginer()
     cookie = loginer.get_cookie()
     proxy = loginer.get_proxy()
     craw_object = Crawler_with_proxy(url, cookie, proxy)

     WeiboSearchLog().get_scheduler_logger().info(
         self.name + " start to crawl ! " + url)

     uid_or_uname = ""
     try:
         page = craw_object.get_page()
         uid_or_uname = page_parser_from_search_for_uid(page)
     except Exception:
         # Exception (not a bare except) so KeyboardInterrupt/SystemExit
         # can still stop the crawler.
         print(traceback.format_exc())
         crawl_set_time_with_keyword.del_proxy_lock.acquire()
         try:
             # Only drop the proxy if nobody else has replaced it already.
             if proxy == loginer.get_proxy():
                 loginer.del_proxy()
                 WeiboSearchLog().get_scheduler_logger().warning(
                     self.name + " proxy exception , change proxy !")
         finally:
             # Always release, even if del_proxy()/logging raises.
             crawl_set_time_with_keyword.del_proxy_lock.release()
         if is_again:
             return self.crawl(url, is_again=False)
         self.second_url_queue.put(url)
         return uid_or_uname
     return uid_or_uname
示例#12
0
class Helper:
    """Logs a student in and tries to reserve a seat on their behalf."""

    def __init__(self, student_id):
        """Log in and create the reservation and seat-lookup helpers.

        ``student_id`` is handed to ``Loginer`` for both of its arguments.
        """
        self.loginer = Loginer(student_id, student_id)
        active_session = self.loginer.login()
        self.reserver = Reserver(active_session)
        self.seat_getter = SeatGetter(student_id, active_session)

    def _reserve_all(self, room_code, date_number, start_time, end_time):
        """Try the seats from ``choose_seat`` in order until one succeeds.

        Returns True as soon as one ``reserve`` call is truthy (later seats
        are not attempted), otherwise False.
        """
        _url, candidate_seats = self.seat_getter.choose_seat(room_code)
        return any(
            self.reserver.reserve(seat, date_number, start_time, end_time)
            for seat in candidate_seats)

    def run(self, room_code, date_number, start_time, end_time):
        """Attempt a reservation in ``room_code``; return whether it worked."""
        return self._reserve_all(room_code, date_number, start_time, end_time)
示例#13
0
    def crawl(self):
        """Search weibo.cn for ``self.keyword`` and return the parsed results.

        Keeps trying until the parser returns a non-empty result. After
        every failed attempt the current proxy is dropped and the call
        sleeps for a random 0-9 whole seconds before trying again.

        Returns:
            The non-empty result of ``page_parser_from_search``.
        """
        # Real-time, original weibo only.
        self.data['advancedfilter'] = '1'
        self.data['keyword'] = self.keyword
        self.data['hasori'] = '1'
        # 'nick', 'starttime' and 'endtime' are not set here.
        self.data['sort'] = 'time'
        self.data['smblog'] = '搜索'
        url = 'http://weibo.cn/search/'

        # Loop instead of recursing: a long outage would otherwise exhaust
        # the interpreter's recursion limit.
        while True:
            loginer = Loginer()
            cookie = loginer.get_cookie()
            proxy = loginer.get_proxy()
            craw_object = Crawler_with_proxy(url, cookie, proxy)

            try:
                page = craw_object.get_page_with_form(self.data)
                weibo_list = page_parser_from_search(page)
            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit can break out of the retry loop.
                print(traceback.format_exc())
                failure = " proxy exception , change proxy !"
            else:
                if len(weibo_list) != 0:
                    return weibo_list
                failure = " get nothing, change proxy !"

            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(
                self.name + failure)
            time.sleep(int(random.random() * 10))
示例#14
0
                                     log_type=self.data.typeid,
                                     fail_code=OtherException.ERROR_CODE_DICT["weibo hot topic OtherError"],
                                     keyword="",
                                     page_num=self.data.page_num
                                    )
            return False
        
        return True
###############################################################################################
try:
    from storage_manager import connect_db
    from loginer import Loginer
except:
    s = traceback.format_exc()
    print s

if __name__ == '__main__':
    # Manual run: call connect_db('test'), build the hot-topic crawler,
    # log in, print whether login() returned a truthy value, then crawl(0).
    # NOTE(review): the login arguments are hard-coded literals -- confirm
    # they are placeholders and not real credentials.
    connect_db('test')

    url_wrapper = HotTopicURLWrapper()
    page_parser = HotTopicPageParser(url_wrapper)
    hot_topic_crawler = HotTopicCrawler(url_wrapper, page_parser)

    login_result = Loginer().login('*****@*****.**', '900119',
                                   './weibo_login_cookies.dat')
    print(bool(login_result))

    hot_topic_crawler.crawl(0)
示例#15
0
                    page_num=self.data.page_num)
            return False

        return True


###############################################################################################
try:
    from storage_manager import connect_db
    from loginer import Loginer
except:
    s = traceback.format_exc()
    print s

if __name__ == '__main__':
    # Manual run: call connect_db('test'), build the hot-topic crawler,
    # log in, print whether login() returned a truthy value, then crawl(0).
    # NOTE(review): the login arguments are hard-coded literals -- confirm
    # they are placeholders and not real credentials.
    connect_db('test')

    url_wrapper = HotTopicURLWrapper()
    page_parser = HotTopicPageParser(url_wrapper)
    hot_topic_crawler = HotTopicCrawler(url_wrapper, page_parser)

    login_result = Loginer().login('*****@*****.**', '900119',
                                   './weibo_login_cookies.dat')
    print(bool(login_result))

    hot_topic_crawler.crawl(0)
示例#16
0
    def __init__(self, student_id):
        """Log in and create the reservation and seat-lookup helpers.

        ``student_id`` is handed to ``Loginer`` for both of its arguments;
        the value returned by ``login()`` is shared by ``Reserver`` and
        ``SeatGetter``.
        """
        self.loginer = Loginer(student_id, student_id)
        active_session = self.loginer.login()
        self.reserver = Reserver(active_session)
        self.seat_getter = SeatGetter(student_id, active_session)