import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def return_html_code(url, use_proxy):
    # spoof a desktop Chrome user agent for the PhantomJS session
    dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
    )

    if use_proxy:
        # route the session through a random proxy from the fastest-proxy list
        proxy_address = random.choice(get_proxy_fastest())
        proxy_type = 'https'
        print proxy_address, proxy_type
        service_args = [
            '--proxy=' + proxy_address,
            '--proxy-type=' + proxy_type,
        ]
        driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
    else:
        driver = webdriver.PhantomJS(desired_capabilities=dcap)

    driver.maximize_window()
    driver.get(url)
    print 'Loading initial page'

    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    except TimeoutException:
        print 'No tweets here'
        driver.quit()
        return False

    # scroll down to the last tweet until no more tweets are loaded
    print 'Scrolling tweets'
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets,

        # move to the top and then to the bottom 5 times in a row
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, 0)")
            driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
            time.sleep(0.5)

        try:
            wait.until(wait_for_more_than_n_elements_to_be_present(
                (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break

    html_full_source = driver.page_source
    driver.quit()
    print "_" * 15
    # with open("check.html", 'w') as f: f.write(html_full_source)
    return html_full_source
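Both versions wait on a custom expected condition, wait_for_more_than_n_elements_to_be_present, which is not defined in the snippets. A minimal sketch of what such a condition is assumed to look like: any class whose __call__ takes the driver and returns a truthy value will work with WebDriverWait.until.

from selenium.common.exceptions import StaleElementReferenceException


class wait_for_more_than_n_elements_to_be_present(object):
    """Assumed helper (not shown in the original code): expected condition
    that passes once more than `count` elements match `locator`, i.e. a new
    batch of tweets has been appended to the timeline."""

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            # the DOM is being re-rendered; report "not yet" and retry
            return False

WebDriverWait.until keeps invoking the condition with the driver until it returns a truthy value or the timeout expires with TimeoutException, which is what ends the scrolling loop above.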
from xvfbwrapper import Xvfb


def return_html_code(url, proxy_use):
    # NOTE: proxy_use is currently unused; a proxy is always attempted
    # run Firefox inside a virtual display so the scraper works headlessly
    vdisplay = Xvfb()
    vdisplay.start()

    proxy_address_list = get_proxy_fastest()
    if proxy_address_list != False:
        proxy_address = random.choice(proxy_address_list)
        ip, port = proxy_address.split(":")
        print ip, port
        # configure Firefox to use the chosen HTTP proxy
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.http", ip)
        profile.set_preference("network.proxy.http_port", int(port))  # port must be an integer
        profile.set_preference("network.proxy.type", 1)  # 1 = manual proxy configuration
        driver = webdriver.Firefox(firefox_profile=profile)
    else:
        print "Using localhost, unable to get proxy"
        driver = webdriver.Firefox()

    driver.maximize_window()
    driver.get(url)

    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 30)
    try:
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    except TimeoutException:
        driver.quit()
        vdisplay.stop()
        return False

    # scroll down to the last tweet until no more tweets are loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        print len(tweets)  # added in edit 1
        number_of_tweets = len(tweets)
        driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present(
                (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break

    html_full_source = driver.page_source
    driver.quit()  # quit() rather than close() so the browser process is released
    vdisplay.stop()
    return html_full_source
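get_proxy_fastest is also not shown. Judging from its call sites, it must return either a list of "ip:port" strings (suitable for random.choice and .split(":")) or False. The following is only a hypothetical stand-in that honours that contract; the placeholder addresses and the latency check are assumptions, not the original helper.

import requests


def get_proxy_fastest():
    """Hypothetical stand-in for the real proxy helper (not shown above).
    Returns a list of working "ip:port" strings, fastest first, or False
    when nothing usable is available."""
    candidates = [
        "203.0.113.10:3128",  # placeholder addresses (TEST-NET-3 range)
        "203.0.113.11:8080",
    ]
    timed = []
    for proxy in candidates:
        try:
            # keep only proxies that answer within 5 seconds, record latency
            r = requests.get("https://twitter.com", timeout=5,
                             proxies={"http": "http://" + proxy,
                                      "https": "http://" + proxy})
            timed.append((r.elapsed.total_seconds(), proxy))
        except requests.RequestException:
            continue
    timed.sort()
    return [proxy for _, proxy in timed] if timed else False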