def crawl(self, page_count=1, comments=False):
    '''
    Crawl the weibo search result pages and collect them in self.results.

    page_count: how many result pages (numbered from 1) are crawled
    comments: when true, self.crawl_comments() is called once all the
        pages have been collected

    Returns None; the collected items are stored in self.results, which
    is reset to an empty list at the start of every call.
    '''
    self.results = []
    # visit the result pages in a random order
    pages = list(range(1, page_count + 1))
    random.shuffle(pages)

    # NOTE(review): the pass name is never handed to get_search_url, so
    # both passes request the same page numbers -- confirm whether the
    # search URL was meant to depend on it.
    for _pass_name in ('hot', 'time'):
        for page in pages:
            url_to_crawl = self.get_search_url(page)
            logging.info('crawling page {}:{}'.format(page, url_to_crawl))
            self.driver.get(url_to_crawl)
            # wait (at most 5 seconds) for the page to load the content
            try:
                WebDriverWait(self.driver, 5).until(
                    lambda x: x.find_elements_by_class_name('feed_list'))
            except TimeoutException:
                logging.info(
                    'there is no weibo content in {}'.format(url_to_crawl))
                logging.info('you are considered as a robot')
                logging.info(self.driver.current_url)
                self.driver.get_screenshot_as_file('./screenshot/error.png')
                # let user input the verification code
                verify_user(self.driver, 'search')
            # the current page source is parsed whether or not the wait
            # timed out
            weibo_list = self.get_weibo_list(self.driver.page_source)
            self.results.extend(weibo_list)

    logging.info('total result {}'.format(len(self.results)))
    if comments:
        logging.info('crawling the comments')
        self.crawl_comments()
def crawl(self, page_count=1, comments=False):
    '''
    Crawl the weibo search result pages and collect them in self.results.

    page_count: how many result pages (numbered from 1) are crawled
    comments: when true, self.crawl_comments() is called once all the
        pages have been collected

    Returns None; the collected items are stored in self.results, which
    is reset to an empty list at the start of every call.
    '''
    self.results = []
    # visit the result pages in a random order
    pages = list(range(1, page_count+1))
    random.shuffle(pages)

    # NOTE(review): the pass name is never handed to get_search_url, so
    # both passes request the same page numbers -- confirm whether the
    # search URL was meant to depend on it.
    for _pass_name in ('hot', 'time'):
        for page in pages:
            url_to_crawl = self.get_search_url(page)
            logging.info('crawling page {}:{}'.format(page, url_to_crawl))
            self.driver.get(url_to_crawl)
            # wait (at most 5 seconds) for the page to load the content
            try:
                WebDriverWait(self.driver, 5).until(
                    lambda x: x.find_elements_by_class_name('feed_list')
                )
            except TimeoutException:
                logging.info('there is no weibo content in {}'.format(url_to_crawl))
                logging.info('you are considered as a robot')
                logging.info(self.driver.current_url)
                self.driver.get_screenshot_as_file('./screenshot/error.png')
                # let user input the verification code
                verify_user(self.driver, 'search')
            # the current page source is parsed whether or not the wait
            # timed out
            weibo_list = self.get_weibo_list(self.driver.page_source)
            self.results.extend(weibo_list)

    logging.info('total result {}'.format(len(self.results)))
    if comments:
        logging.info('crawling the comments')
        self.crawl_comments()
def login_once(self):
    '''
    Make a single attempt to log in through the weibo login page.

    Loads the login page, types self.username and self.passwd into the
    login form, calls verify_user(self.driver, 'login') for the
    verification step, clicks the submit button and then waits for an
    element with class 'WB_left_nav'.

    Returns True when that element is found within 10 seconds; returns
    False when either the login page or the post-login page times out.
    Screenshots are written under ./screenshot/.
    '''
    self.driver.get('http://www.weibo.com/login.php')
    try:
        # wait (at most 10 seconds) for the login page to render
        WebDriverWait(self.driver, 10).until(
            lambda x: x.find_element_by_css_selector('div.info_list')
        )
        self.driver.maximize_window()

        user_input = self.driver.find_element_by_xpath(
            '//div[@node-type="normal_form"]//input[@name="username"]')
        user_input.click()
        user_input.clear()
        user_input.send_keys(self.username)

        passwd_input = self.driver.find_element_by_xpath(
            '//div[@node-type="normal_form"]//input[@name="password"]')
        passwd_input.click()
        passwd_input.clear()
        passwd_input.send_keys(self.passwd)

        submit_button = self.driver.find_element_by_xpath(
            '//div[@node-type="normal_form"]//a[@class="W_btn_g"]')
        self.driver.get_screenshot_as_file('./screenshot/screenshot.png')
    except TimeoutException:
        logging.info('load login page failed')
        return False

    logging.info('user name: {}'.format(user_input.get_attribute('value')))
    # Never write the password itself to the log; only report whether the
    # field actually received a value.
    passwd_state = '<hidden>' if passwd_input.get_attribute('value') else '<empty>'
    logging.info('passwd: {}'.format(passwd_state))

    time.sleep(1)  # wait the page to load the verification code
    verify_user(self.driver, 'login')
    submit_button.click()

    try:
        WebDriverWait(self.driver, 10).until(
            lambda x: x.find_element_by_class_name('WB_left_nav')
        )
    except TimeoutException:
        logging.info('login failed: {}'.format(self.driver.current_url))
        self.driver.get_screenshot_as_file('./screenshot/login_failed.png')
        return False

    logging.info('login success')
    return True
def login_once(self):
    """
    Make a single attempt to log in through the weibo login page.

    Loads the login page, types self.username and self.passwd into the
    login form, calls verify_user(self.driver, "login") for the
    verification step, clicks the submit button and then waits for an
    element with class "WB_left_nav".

    Returns True when that element is found within 10 seconds; returns
    False when either the login page or the post-login page times out.
    Screenshots are written under ./screenshot/.
    """
    self.driver.get("http://www.weibo.com/login.php")
    try:
        # wait (at most 10 seconds) for the login page to render
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_css_selector("div.info_list"))
        self.driver.maximize_window()

        user_input = self.driver.find_element_by_xpath('//div[@node-type="normal_form"]//input[@name="username"]')
        user_input.click()
        user_input.clear()
        user_input.send_keys(self.username)

        passwd_input = self.driver.find_element_by_xpath('//div[@node-type="normal_form"]//input[@name="password"]')
        passwd_input.click()
        passwd_input.clear()
        passwd_input.send_keys(self.passwd)

        submit_button = self.driver.find_element_by_xpath('//div[@node-type="normal_form"]//a[@class="W_btn_g"]')
        self.driver.get_screenshot_as_file("./screenshot/screenshot.png")
    except TimeoutException:
        logging.info("load login page failed")
        return False

    logging.info("user name: {}".format(user_input.get_attribute("value")))
    # Never write the password itself to the log; only report whether the
    # field actually received a value.
    passwd_state = "<hidden>" if passwd_input.get_attribute("value") else "<empty>"
    logging.info("passwd: {}".format(passwd_state))

    time.sleep(1)  # wait the page to load the verification code
    verify_user(self.driver, "login")
    submit_button.click()

    try:
        WebDriverWait(self.driver, 10).until(lambda x: x.find_element_by_class_name("WB_left_nav"))
    except TimeoutException:
        logging.info("login failed: {}".format(self.driver.current_url))
        self.driver.get_screenshot_as_file("./screenshot/login_failed.png")
        return False

    logging.info("login success")
    return True