def crawl(self, url, is_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    weibo_list = []
    try:
        page = craw_object.get_page()
        weibo_list = page_parser_from_search(page)
    except:
        print traceback.format_exc()
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            self.second_url_queue.put(url)
        return weibo_list
    if len(weibo_list) == 0:
        if zero_aviable_check_validity(page):
            WeiboSearchLog().get_scheduler_logger().info(self.name + " get nothing, sina does not have ! " + url)
            return weibo_list
        if weibo_guangchang_forbidden(page):
            WeiboSearchLog().get_scheduler_logger().info(self.name + " get nothing, forbidden ! ! " + url)
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " get nothing, change proxy ! " + url)
        crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            self.second_url_queue.put(url)
        return weibo_list
    else:
        if int(url[url.rfind('=') + 1:]) == 1:
            total_num = weibo_list[0].all_weibo_num
            self.put_second_and_more_url_queue(total_num, url)
        WeiboSearchLog().get_scheduler_logger().info(self.name + " crawl success! " + url)
        return weibo_list
def setUpClass():
    driver = webdriver.Firefox()
    driver.implicitly_wait(60)
    loging_instance = Loginer(LOGIN, PASSWORD, SOCIAL_NETWORK, driver)
    loging_instance.log_in()
    main_page = MainPage(driver, TESTED_URL)
    main_page.login_with(SOCIAL_NETWORK)
    bubble_main = main_page.go_bubble_page()
    TestCase.achives_page = bubble_main.continue_quest()
    TestCase.achives_page.skip_explaining()
    TestCase.driver = driver
def crawl(self, url, is_again=True, two_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    comment_list = []
    page = ""
    try:
        page = craw_object.get_page()
        comment_list = page_parser_from_search_for_comment(page)  # parse the page into individual comments
    except:
        print traceback.format_exc()
        crawl_comment.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
        return comment_list
    if len(comment_list) == 0:
        # no one has commented on this weibo yet
        if no_one_commented(page):
            WeiboSearchLog().get_scheduler_logger().info(self.name + " nobody commented !")
            return comment_list
        crawl_comment.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " comment_list is 0 , change proxy !")
        crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
        return comment_list
    else:
        return comment_list
def setUpClass():
    driver = webdriver.Firefox()
    driver.implicitly_wait(60)
    loging_instance = Loginer(LOGIN, PASSWORD, SOCIAL_NETWORK, driver)
    loging_instance.log_in()
    main_page = MainPage(driver, TESTED_URL)
    main_page = main_page.login_with(SOCIAL_NETWORK)
    panda_main = main_page.go_panda_page()
    TestCase.achives_page = panda_main.continue_quest()
    TestCase.achives_page.skip_explaining()
    TestCase.driver = driver
    TestCase.config = SafeConfigParser()
    TestCase.config.read(CONFIG_PATH)
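# A minimal matching tearDownClass sketch, not part of the original snippets: it
# assumes the driver stored on TestCase by the setUpClass examples above and
# simply quits it so the browser is closed after the suite runs.
def tearDownClass():
    TestCase.driver.quit()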
def crawl(self, uid_or_nickname, is_again=False):
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    url = ''
    # if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or \
    #         len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
    #     WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
    #     return "nothing"
    quote_uid_or_nickname = ""
    try:
        quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
    except:
        print traceback.format_exc()
        print uid_or_nickname
    url = "http://weibo.cn/" + uid_or_nickname + "/info"
    # if quote_uid_or_nickname == uid_or_nickname:
    #     url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    # else:
    #     url = "http://weibo.cn/n/" + quote_uid_or_nickname
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    user_info = ""
    try:
        page = craw_object.get_page()
        user_info = page_parser_from_search_for_UserInfo(page, url)
    except:
        if is_again:
            # retry once with the original identifier, not the already-built url
            return self.crawl(uid_or_nickname, is_again=False)
        else:
            return user_info
    return user_info
def crawl(self, uid_or_nickname, is_again=False):
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    url = ''
    if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or \
            len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
        WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
        return "nothing"
    quote_uid_or_nickname = ""
    try:
        quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
    except:
        print traceback.format_exc()
        print uid_or_nickname
    # url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    if quote_uid_or_nickname == uid_or_nickname:
        url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    else:
        url = "http://weibo.cn/n/" + quote_uid_or_nickname
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    user_info = ""
    try:
        page = craw_object.get_page()
        user_info = page_parser_from_search_for_UserInfo(page, url)
    except:
        if is_again:
            # retry once with the original identifier, not the already-built url
            return self.crawl(uid_or_nickname, is_again=False)
        else:
            return user_info
    return user_info
def crawl(self, url, is_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    uid_or_uname = ""
    try:
        page = craw_object.get_page()
        uid_or_uname = page_parser_from_search_for_uid(page)
    except:
        print traceback.format_exc()
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            self.second_url_queue.put(url)
        return uid_or_uname
    return uid_or_uname
def crawl(self):
    # search for real-time, original-only weibo posts
    self.data['advancedfilter'] = '1'
    self.data['keyword'] = self.keyword
    self.data['hasori'] = '1'
    # self.data['nick'] = ''
    # self.data['starttime'] = ''
    # self.data['endtime'] = ''
    self.data['sort'] = 'time'
    self.data['smblog'] = '搜索'  # literal label of the search submit button
    url = 'http://weibo.cn/search/'
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    weibo_list = []
    try:
        page = craw_object.get_page_with_form(self.data)
        weibo_list = page_parser_from_search(page)
    except:
        print traceback.format_exc()
        loginer.del_proxy()
        WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        time.sleep(int(random.random() * 10))
        return self.crawl()
    if len(weibo_list) == 0:
        loginer.del_proxy()
        WeiboSearchLog().get_scheduler_logger().warning(self.name + " get nothing, change proxy !")
        time.sleep(int(random.random() * 10))
        return self.crawl()
    else:
        return weibo_list
def crawl(self, url, is_again=True, two_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    repost_list = []
    page = ""
    try:
        page = craw_object.get_page()
        repost_list = page_parser_from_search_for_repost(page)  # parse the page into individual reposts
    except:
        print traceback.format_exc()
        crawl_repost.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
        return repost_list
    if len(repost_list) == 0:
        # no one has commented on this weibo yet
        # if no_one_commented(page):
        #     WeiboSearchLog().get_scheduler_logger().info(self.name + " nobody commented !")
        #     return repost_list
        crawl_repost.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " repost_list is 0 , change proxy !")
        crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
        return repost_list
    else:
        return repost_list
class Helper:
    def __init__(self, student_id):
        self.loginer = Loginer(student_id, student_id)
        session = self.loginer.login()
        self.reserver = Reserver(session)
        self.seat_getter = SeatGetter(student_id, session)

    def _reserve_all(self, room_code, date_number, start_time, end_time):
        url, seats = self.seat_getter.choose_seat(room_code)
        for seat in seats:
            reserve_status = self.reserver.reserve(seat, date_number, start_time, end_time)
            if reserve_status:
                return True
        return False

    def run(self, room_code, date_number, start_time, end_time):
        flag = self._reserve_all(room_code, date_number, start_time, end_time)
        return flag
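# A minimal usage sketch for the Helper class above; the student id, room code,
# date number and time-slot values below are hypothetical placeholders, not
# values taken from the original project.
if __name__ == '__main__':
    helper = Helper('20150001')                         # assumed student id
    booked = helper.run('101', 1, '08:00', '22:00')     # assumed room/date/slot format
    print 'reserved a seat' if booked else 'no seat available'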
                log_type=self.data.typeid,
                fail_code=OtherException.ERROR_CODE_DICT["weibo hot topic OtherError"],
                keyword="",
                page_num=self.data.page_num)
            return False
        return True

###############################################################################################
try:
    from storage_manager import connect_db
    from loginer import Loginer
except:
    s = traceback.format_exc()
    print s

if __name__ == '__main__':
    connect_db('test')
    wrapper = HotTopicURLWrapper()
    parser = HotTopicPageParser(wrapper)
    crawler = HotTopicCrawler(wrapper, parser)
    loginer = Loginer()
    print bool(loginer.login('*****@*****.**', '900119', './weibo_login_cookies.dat'))
    crawler.crawl(0)