class GoogleImages():
    """Scrape Google image-search results with Chrome on a virtual display."""

    def __init__(self):
        """Start a hidden 800x600 display and launch one Chrome instance."""
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.base_url = 'https://www.google.com/search?q=%s&tbm=isch'
        self.path_to_chromedriver = './chromedriver'
        # Launch exactly one browser. A second, argument-less Chrome() used to
        # overwrite this attribute and orphan the first browser process.
        self.browser = webdriver.Chrome(executable_path=self.path_to_chromedriver)

    def crawl(self, qry):
        """Return ``(image_url, referring_page_url)`` tuples for a query.

        ``qry`` is joined with ``'+'`` to build the search string, so it is
        expected to be an iterable of search terms.
        """
        url = self.base_url % ('+'.join(qry))
        self.browser.get(url)
        # Scroll to the bottom once and give the page time to load more tiles.
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)

        soup = BeautifulSoup(self.browser.page_source, 'lxml')
        tiles = soup.findAll('div', 'rg_di rg_el ivg-i')
        print(len(tiles))

        imgurls = []
        for tile in tiles:
            href = tile.findAll('a')[0]['href']
            # The href carries "imgurl=<image>&imgrefurl=<page>&..." parameters.
            params = href.split('imgurl=')[1].split('&')
            imgurls.append((params[0], params[1].replace('imgrefurl=', '')))
        return imgurls

    def stop(self):
        """Quit the browser and stop the virtual display."""
        self.browser.quit()
        self.display.stop()
def run(self):
    """Run the SelScraper.

    Starts a hidden virtual display, obtains a webdriver, tiles the browser
    window on a 4-column grid based on ``self.browser_num``, runs the search
    when ``self.startable`` is set and closes the webdriver window. The
    virtual display is always stopped, even when the search raises.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        if not self._get_webdriver():
            raise_or_log('{}: Aborting due to no available selenium webdriver.'.format(self.name),
                         exception_obj=SeleniumMisconfigurationError)

        try:
            self.webdriver.set_window_size(400, 400)
            # Integer floor division already yields the grid row.
            self.webdriver.set_window_position(400 * (self.browser_num % 4),
                                               400 * (self.browser_num // 4))
        except WebDriverException as e:
            out('Cannot set window size: {}'.format(e), lvl=4)

        super().before_search()

        if self.startable:
            self.build_search()
            self.search()

        if self.webdriver:
            self.webdriver.close()
    finally:
        # The display used to be left running after every call.
        display.stop()
def webthumb(url, filename, is_flash=False):
    """Render ``url`` in Firefox on a virtual display and save a thumbnail.

    The page is screenshotted to ``<filename>.tmp``, resized to
    ``LIBRARYFILE_THUMB_WIDTH`` wide and written to ``filename``. With
    ``is_flash`` the thumbnail is forced to
    ``LIBRARYFILE_THUMB_WIDTH x LIBRARYFILE_THUMB_HEIGHT``; otherwise the
    aspect ratio of the screenshot is kept.

    Returns True on success. The browser, the display and the temporary
    screenshot are released even when a step fails.
    """
    script = """
    var s = document.createElement('script');
    s.src = 'http://cruels.net/sb/flashfix.js';
    document.body.appendChild(s);
    """
    print("webthumb(%s, %s)" % (url, filename))

    tmpfile = "%s.tmp" % filename
    display = Display(visible=0, size=(1200, 900))
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            browser.get(url)
            # NOTE(review): the collapsed original does not show whether the
            # 6 second wait belongs to the else branch only - confirm.
            if is_flash:
                time.sleep(1)
            else:
                browser.execute_script(script)
                time.sleep(6)

            browser.get_screenshot_as_file(tmpfile)
            img = pil.open(tmpfile)
            width, height = img.size
            if is_flash:
                target_size = (LIBRARYFILE_THUMB_WIDTH, LIBRARYFILE_THUMB_HEIGHT)
            else:
                ratio = float(width) / float(height)
                target_size = (LIBRARYFILE_THUMB_WIDTH,
                               int(LIBRARYFILE_THUMB_WIDTH / ratio))
            resized = img.resize(target_size, pil.ANTIALIAS)
            resized.save(filename)
        finally:
            try:
                # The screenshot may never have been written if a step failed.
                if os.path.exists(tmpfile):
                    os.remove(tmpfile)
            finally:
                browser.quit()
    finally:
        display.stop()

    print("Saved %s." % filename)
    return True
def get_driver(browser, display):
    """Return a Selenium driver that uses a random user agent.

    Args:
        browser: Browser name. A value containing ``chrome`` (any case)
            selects Chrome; anything else, including ``None``, selects Firefox.
        display: Pass ``0`` to start a hidden 800x600 virtual display before
            the browser is launched. Any other value is left untouched.

    Returns:
        A ``webdriver.Chrome`` or ``webdriver.Firefox`` instance.
    """
    if display == 0:
        # NOTE: the started display is not returned, so callers cannot stop it.
        display = Display(visible=0, size=(800, 600))
        display.start()

    user_agent = random.choice(USER_AGENTS)

    if browser and 'chrome' in browser.lower():
        options = webdriver.ChromeOptions()
        options.add_argument('--user-agent={}'.format(user_agent))
        # The former add_experimental_option("prefs", prefs) call referenced
        # an undefined name and raised NameError, so it is gone. The PhantomJS
        # fallback sat behind a constant flag and was unreachable.
        return webdriver.Chrome(chrome_options=options)

    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', user_agent)
    # Save zip downloads without prompting.
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/zip')
    return webdriver.Firefox(profile)
class UITestCase(LiveServerTestCase):
    """Live-server test case that falls back to Xvfb when no UI is available."""

    def use_xvfb(self):
        """Start an Xvfb-backed display and create the driver inside it."""
        from pyvirtualdisplay import Display

        self.display = Display('xvfb', visible=1, size=(1280, 1024))
        self.display.start()
        self.driver = WebDriver()

    def setUp(self):
        """Create the driver, retrying under Xvfb if the first attempt fails."""
        try:
            self.driver = WebDriver()
        except WebDriverException:
            needs_virtual_display = True
        else:
            needs_virtual_display = False

        if needs_virtual_display:
            self.use_xvfb()

        self.driver.implicitly_wait(10)
        super(UITestCase, self).setUp()

    def tearDown(self):
        """Quit the driver, then stop the display if one was started."""
        self.driver.quit()
        started_display = hasattr(self, 'display')
        if started_display:
            self.display.stop()
        super(UITestCase, self).tearDown()
def main():
    '''business logic for when running this module as the primary one!

    Polls ``find_cl_post()`` forever, sleeping ``SLEEP_SECONDS`` between
    polls, and calls ``send_cl_email`` whenever the post title differs from
    the previously seen one. The virtual display is stopped when the loop is
    interrupted, for example by Ctrl-C.
    '''
    display = Display(visible=0, size=(1024, 768))
    display.start()

    prev_cl_post = {"title": "", "link": ""}

    try:
        while True:
            fresh_cl_post = find_cl_post()

            try:
                if fresh_cl_post['title'] != prev_cl_post['title']:
                    # The post is recorded before mailing, so a failed send
                    # is not retried on the next cycle.
                    prev_cl_post = fresh_cl_post
                    send_cl_email(fresh_cl_post)
            except Exception:
                # Best effort: report and keep polling. KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                print("Failed to test & send mail at: " + str(datetime.datetime.now()))

            gc.collect()
            time.sleep(SLEEP_SECONDS)
    finally:
        # This was unreachable when it followed the infinite loop.
        display.stop()
class Xvfb(object):
    """Context manager that runs code under a pyvirtualdisplay ``Display``.

    Args:
        width: Display width in pixels.
        height: Display height in pixels.
        visible: Passed through to ``Display(visible=...)``.
    """

    def __init__(self, width=1366, height=768, visible=0):
        self.__virtual_display = None
        self.width = width
        self.height = height
        self.visible = visible

    def __init_display(self):
        """Create and start the display unless one is already running."""
        if self.__virtual_display is None:
            display = Display(visible=self.visible, size=(self.width, self.height))
            display.start()
            # Remember the display only once it has started successfully.
            self.__virtual_display = display

    def __enter__(self):
        self.__init_display()
        # Returning self makes "with Xvfb() as xvfb:" usable; the previous
        # implicit None is not relied on by run().
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._close_display()

    def _close_display(self):
        """Stop the display if it is running; safe to call repeatedly."""
        display, self.__virtual_display = self.__virtual_display, None
        if display is None:
            return
        try:
            # Display exposes stop(), not close(). The old close() call raised
            # AttributeError, which was swallowed, so the display kept running.
            display.stop()
        except Exception:
            # Shutdown stays best effort, as before.
            pass

    @staticmethod
    def run(func, *args, **kwargs):
        """Call ``func(*args, **kwargs)`` under a default display; return its result."""
        runner = Xvfb()
        with runner:
            return func(*args, **kwargs)
class BCCVLTestCase(unittest.TestCase):
    """Base Selenium test case for the BCCVL site, configured from environment variables."""

    def setUp(self):
        """Read the configuration, optionally start Xvfb, and open the home page."""
        # Acquire URL, username and password from environment variables, or
        # use default values for the dev environment.
        self.username = os.getenv("BCCVL_TEST_USERNAME", "admin")
        self.password = os.getenv("BCCVL_TEST_PASSWORD", "admin")
        self.url = os.getenv("BCCVL_TEST_URL", "https://192.168.100.200/")

        # The amount of time selenium will potentially wait in searching for
        # elements. This is blocking.
        implicit_wait = int(os.getenv("BCCVL_TEST_IMPLICIT_WAIT", "15"))

        # Run tests in a virtual display (xvfb)
        virtual_display = os.getenv("BCCVL_TEST_VIRTUAL_DISPLAY", "false") == "true"

        # Set up the virtual display
        if virtual_display:
            self.display = Display(visible=0, size=(1920, 1080))
            self.display.start()
        else:
            self.display = None

        # Set up the Firefox webdriver
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(implicit_wait)

        # A fixed window size is used instead of maximizing the window.
        self.driver.set_window_size(1200, 800)

        # Go to the bccvl homepage
        self.driver.get(self.url)

    def tearDown(self):
        """Quit the browser first, then stop the display it was running on."""
        try:
            # The browser must shut down while its X server still exists.
            self.driver.quit()
        finally:
            if self.display:
                self.display.stop()
def getupc(data, sleeptime):
    """Look up a UPC for every item in ``data`` via Google and upcitemdb links.

    Args:
        data: Iterable of dicts with a ``'name'`` key. Items for which a
            ``http://www.upcitemdb.com/upc...`` result link is found get a
            ``'upc'`` key holding the last path segment of that link.
        sleeptime: Seconds to pause after loading Google and after each search.

    Returns:
        ``data``, mutated in place.
    """
    upc_prefix = 'http://www.upcitemdb.com/upc'
    search_box = (By.XPATH, "//input[@type='text']")

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        a = webdriver.Firefox()
        try:
            a.get('https://www.google.com/ncr')
            time.sleep(sleeptime)
            search = WebDriverWait(a, 5).until(EC.element_to_be_clickable(search_box))
            for i in data:
                ActionChains(a).move_to_element(search).click(search).send_keys(
                    i['name'] + ' upc', Keys.ENTER).perform()
                time.sleep(sleeptime)
                contents = WebDriverWait(a, 5).until(
                    EC.presence_of_all_elements_located((By.XPATH, "//div[@class='g']")))

                # Resolve each result link once instead of twice per result.
                links = (result.find_element_by_tag_name('a').get_attribute('href')
                         for result in contents)
                upc = next((link.rsplit('/', 1)[-1] for link in links
                            if link and link.startswith(upc_prefix)), None)
                if upc is not None:
                    i['upc'] = upc

                # The results page replaces the search box, so locate it again.
                search = WebDriverWait(a, 5).until(EC.element_to_be_clickable(search_box))
                search.clear()
        finally:
            # quit() ends the whole session, also when a wait times out.
            a.quit()
    finally:
        display.stop()
    return data
def get_screenshot(site_id, update_id):
    """
    Create a screenshot and save it to the database

    The portion visible here looks up the ``Site`` and ``Update`` rows, starts
    a hidden display and a Firefox browser, sets a 15 second page-load timeout
    and requests the site URL with a random ``x`` query parameter.
    NOTE(review): nothing in this portion captures the screenshot or releases
    the browser and display - confirm whether that lives elsewhere.
    """
    # Get the objects we're working with
    site = Site.objects.get(id=site_id)
    update = Update.objects.get(id=update_id)

    # Fire up a headless display to work in
    display = Display(visible=0, size=(1680, 1050))
    display.start()

    # Fire up a Selenium browser
    browser = webdriver.Firefox()

    # Set a timeout for the pageload by registering the raw wire command.
    seconds = 15
    browser.command_executor._commands['setPageLoadTimeout'] = (
        'POST', '/session/$sessionId/timeouts'
    )
    browser.execute("setPageLoadTimeout", {
        'ms': 1000 * seconds,
        'type': 'page load'
    })

    # Snap a screenshot of the target site
    logger.debug("Opening %s", site.url)
    timestamp = timezone.now()
    try:
        browser.get(site.url + "?x=" + get_random_string())
        logger.debug("Response received for %s", site.url)
    except TimeoutException:
        # "except X, e" only parses on Python 2; this form works on both.
        logger.error("Request for %s timed out", site.url)
def load(self):
    """Log in to c9.io and open the target workspace, then reschedule itself.

    The next run is scheduled with ``threading.Timer`` after a random delay
    between ``min_time`` and ``max_time`` seconds, before any browser work
    starts. Screenshots are written to ``self.directory_img`` and the final
    page source is passed to ``self.log``.
    """
    min_time = 3600  # 1 hour in seconds
    max_time = 7179  # 2 hours in seconds (less 21)
    tasktime = randint(min_time, max_time)
    threading.Timer(tasktime, self.load).start()

    total_minutes, tasktime_s = divmod(tasktime, 60)
    tasktime_h, tasktime_m = divmod(total_minutes, 60)
    output_content = "Load execution - waiting %dh %02dmin %02dsec for the next time." % (
        tasktime_h, tasktime_m, tasktime_s)
    print("[KeepUp] %s" % output_content)

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from pyvirtualdisplay import Display

    display = Display(visible=0, size=(1600, 900))
    display.start()
    try:
        profile = webdriver.FirefoxProfile()
        profile.set_preference("browser.cache.disk.enable", False)
        profile.set_preference("browser.cache.memory.enable", False)
        profile.set_preference("browser.cache.offline.enable", False)
        profile.set_preference("network.http.use-cache", False)
        # The profile was built but never handed to the driver, so the cache
        # settings above had no effect.
        driver = webdriver.Firefox(firefox_profile=profile)
        try:
            driver.get("https://c9.io/dashboard.html")
            driver.save_screenshot(self.directory_img + 'login.png')

            # Username
            username = driver.find_element_by_id("id-username")
            username.click()
            username.clear()
            username.send_keys(self.user, Keys.ARROW_DOWN)

            # Password
            password = driver.find_element_by_id("id-password")
            password.click()
            password.clear()
            password.send_keys(self.password, Keys.ARROW_DOWN)

            # Submit
            submit_button = driver.find_element_by_css_selector("button[type=submit]")
            submit_button.click()
            time.sleep(5)
            driver.save_screenshot(self.directory_img + 'user_profile.png')

            # Target workspace
            driver.get(self.target_workspace)
            time.sleep(10)
            self.log({'log_html': driver.page_source, 'log_file': output_content})
            driver.save_screenshot(self.directory_img + 'final_workspace.png')
        finally:
            driver.quit()
    finally:
        display.stop()
def main(param):
    """Take one Firefox screenshot per target.

    Args:
        param: Two-item sequence ``(paths, targets)``. ``paths`` must offer
            ``get('path', 'output.shotsdir')`` returning the output
            directory, optionally wrapped in double quotes. ``targets`` is a
            non-empty sequence of ``(url, basename)`` pairs.

    Exits with status -9 when ``param`` does not have two items and -8 when
    ``targets`` is empty.
    """
    if len(param) != 2:
        sys.exit(-9)
    if len(param[1]) <= 0:
        sys.exit(-8)

    paths, targets = param[0], param[1]
    shotsdir = paths.get('path', 'output.shotsdir').strip('"')

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        binary = FirefoxBinary('/opt/firefox/firefox')
        browser = webdriver.Firefox(firefox_binary=binary)
        try:
            tgt_len = len(targets)
            for i, tgt in enumerate(targets):
                browser.get(tgt[0])
                browser.save_screenshot(shotsdir + '/' + tgt[1] + '.png')
                print('( %3d / %3d ) Took %s.png' % (i + 1, tgt_len, tgt[1]))
        finally:
            # Release the browser even when a page fails to load.
            browser.quit()
    finally:
        display.stop()
def rzhd():
    """Poll ticket searches until the user confirms every ticket was bought.

    Collects search URLs interactively, then checks each one every 60
    seconds with ``find_train``. A hit triggers ``send_email``; the URL is
    dropped once the user confirms the purchase.

    Returns:
        True once all directions have been bought.
    """
    directions = [create_url(), ]
    while raw_input('Want to add more directions? y/n ') == 'y':
        directions.append(create_url())
        print("------------------")

    n = 60  # seconds between polling rounds
    place = choose_place()
    i = 0

    display = Display(visible=0, size=(5, 5))
    display.start()  # start the virtual display
    try:
        while len(directions) != 0:
            i += 1
            print("")
            print("----------------->Searching for PLATSKART<-----------------")
            print("try # %s" % i)
            print(time.asctime())
            print("")
            # Iterate over a copy: removing from the list being iterated
            # skipped the URL that followed each removed one.
            for url in list(directions):
                if find_train(url, place) == True:
                    send_email('*****@*****.**', url)
                    if raw_input('Did you buy ticket? y/n ') == 'y':
                        directions.remove(url)
                        if len(directions) == 0:
                            print("Successfully bought all tickets!")
                            return True
            print(str(n) + " seconds until next try...")
            time.sleep(float(n))
    finally:
        # Stop the virtual display. The early "return True" used to skip this.
        display.stop()
def loadSite(url):
    """Load ``url`` in Firefox through an HTTP proxy and print table cells.

    Prints the text of every ``<td>`` inside ``table.network-info`` rows.

    Returns:
        Always 1.
    """
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)  # 1 = manual proxy configuration
    profile.set_preference("network.proxy.http", "74.84.131.34")
    profile.set_preference("network.proxy.http_port", 80)
    profile.update_preferences()

    # Firefox is driven through geckodriver; the old variable name mentioned chromedriver.
    path_to_geckodriver = '/home/alexandr/www/html/python/prs/files/geckodriver'

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        browser = webdriver.Firefox(firefox_profile=profile,
                                    executable_path=path_to_geckodriver)
        try:
            browser.delete_all_cookies()
            browser.get(url)
            tree = etree.HTML(browser.page_source)
        finally:
            # Release the browser even when the page fails to load.
            browser.quit()
    finally:
        display.stop()

    nodes = tree.xpath('//table[@class="network-info"]//tr/td')
    for node in nodes:
        print(node.text)
    return 1
def process_install_form(self):
    """Fill in and submit the Omeka web installer form with Firefox.

    When ``self.args.xvfb`` is set, the browser runs on a hidden virtual
    display. After submitting, waits up to 10 seconds for a link whose text
    contains "Tableau".
    """
    display = None
    if self.args.xvfb:
        print("Omeka is being installed in: " + self.folder_name)
        display = Display(visible=0, size=(800, 600))
        display.start()

    try:
        driver = webdriver.Firefox()
        try:
            driver.get("http://localhost/omeka/" + self.folder_name + "/install")

            form_values = (
                ("username", self.omeka_user),
                ("password", self.omeka_passwd),
                ("password_confirm", self.omeka_passwd),
                ("super_email", "*****@*****.**"),
                ("administrator_email", "*****@*****.**"),
                ("site_title", self.omeka_title),
            )
            inputElement = None
            for field_name, value in form_values:
                inputElement = driver.find_element_by_name(field_name)
                inputElement.send_keys(value)
            # Submitting any field submits the enclosing form.
            inputElement.submit()

            WebDriverWait(driver, 10).until(
                lambda driver: driver.find_element_by_partial_link_text("Tableau"))
        finally:
            # Now also runs when filling in the form fails.
            driver.quit()
    finally:
        # The display used to be left running.
        if display is not None:
            display.stop()
class TestCase(unittest.TestCase):
    """Fund-extraction tests backed by an in-memory database and Firefox."""

    def setUp(self):
        """Configure the app for testing, create tables and start a browser."""
        app.config['TESTING'] = True
        app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
        self.app = app.test_client()
        db.create_all()

        virtual_display = Display(visible=0, size=(800, 600))
        self.display = virtual_display
        virtual_display.start()
        self.driver = webdriver.Firefox()

    def tearDown(self):
        """Drop the database, then shut down the browser and the display."""
        db.session.remove()
        db.drop_all()
        self.driver.quit()
        self.display.stop()

    def test_extract_funds(self):
        """The live fund list should contain more than 110 entries."""
        # A local HTML fixture is not used because the page relies on
        # JavaScript that has not been mocked.
        funds = extract_funds(self.driver)
        fund_count = len(funds)
        self.assertTrue(fund_count > 110)
def main(args):
    """Log in to the PATRIC web interface and exercise the home and workspace pages.

    NOTE: the ``args`` parameter is overwritten by ``parser.parse_args()``,
    which reads ``sys.argv``; that behaviour is unchanged here.

    Returns:
        0 after the browser and the virtual display have been shut down.
    """
    parser = argparse.ArgumentParser(description="Program for running tests on the PATRIC web interface.")
    parser.add_argument("user", metavar="user", help="Patric login username.")
    parser.add_argument("passwd", metavar="passwd", help="Patric login password.")
    parser.add_argument("--firebug", action="store_true", help="Open Firebug during test.")
    args = parser.parse_args()

    fp = webdriver.FirefoxProfile()
    if args.firebug:
        fp.add_extension(extension='extras/firebug-2.0.9.xpi')
        # Setting the current version avoids the Firebug startup screen.
        fp.set_preference("extensions.firebug.currentVersion", "2.0.9")
        fp.set_preference("extensions.firebug.console.enableSites", "true")
        fp.set_preference("extensions.firebug.net.enableSites", "true")
        fp.set_preference("extensions.firebug.script.enableSites", "true")
        fp.set_preference("extensions.firebug.allPagesActivation", "on")

    # Create virtual display
    display = Display(visible=0, size=(1400, 950))
    display.start()
    try:
        # Create webdriver and retrieve url
        driver = webdriver.Firefox(firefox_profile=fp)
        try:
            driver.get(SITE_URL + '/login')

            # Wait for username input box to appear
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
                EC.presence_of_element_located((By.ID, "dijit_form_TextBox_0")))

            # Set username and password, click login button
            userElement = driver.find_element_by_id("dijit_form_TextBox_0")
            pwdElement = driver.find_element_by_id("dijit_form_TextBox_1")
            userElement.send_keys(args.user)
            pwdElement.send_keys(args.passwd)
            loginElement = driver.find_element_by_id("dijit_form_Button_1")
            loginElement.click()
            time.sleep(3)

            # Retrieve home page, wait for an expected page element to load,
            # take a screenshot
            driver.get(SITE_URL + '/portal/portal/patric/Home')
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
                EC.presence_of_element_located((By.ID, "cart")))
            driver.set_window_size(1400, 950)
            driver.execute_script("window.scrollTo(0,0);")
            driver.get_screenshot_as_file("homepage_after_login.jpg")
            print("Saved screenshot to: homepage_after_login.jpg\n")

            # Retrieve ws url, wait for create folder button to appear
            ws_url = SITE_URL + '/workspace/' + args.user + '@patricbrc.org/home'
            workspace_ready = EC.presence_of_element_located(
                (By.CLASS_NAME, "ActionButtonContainer"))
            driver.get(ws_url)
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(workspace_ready)
            time.sleep(5)

            # Have to reload page, because often the workspace is empty on first load
            driver.get(ws_url)
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(workspace_ready)

            time.sleep(30)
        finally:
            # Release the browser even when a wait times out.
            driver.quit()
    finally:
        display.stop()
    return 0
def get_news():
    """Return the ``<h3>`` headlines from the deutschlandfunk.de front page.

    Returns:
        A list of headline strings, or None (after printing an error) when
        ``check_wlan()`` reports no connection.
    """
    if not check_wlan():
        print("Error: Not connected to the internet")
        return None

    from pyvirtualdisplay import Display
    import re

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        driver = webdriver.Firefox()
        try:
            driver.get("http://www.deutschlandfunk.de/")
            source = driver.find_element_by_xpath(
                '//*[@id="wrapper"]/div/section[2]/div[1]').get_attribute('innerHTML')
        finally:
            # Release the browser even when the element lookup fails.
            driver.quit()
    finally:
        display.stop()

    n_articles = source.count('<article')
    print(str(n_articles) + " articles found.")
    return re.findall('<h3>(.+)</h3>', source)
class FunctionalTest(StaticLiveServerTestCase):
    """Base class for browser tests against a live or externally supplied server."""

    @classmethod
    def setUpClass(cls):
        """Use the server named by a ``liveserver`` argument, else start one."""
        external = next((arg for arg in sys.argv if 'liveserver' in arg), None)
        if external is not None:
            cls.server_url = 'http://' + external.split('=')[1]
            return
        super().setUpClass()
        cls.server_url = cls.live_server_url

    @classmethod
    def tearDownClass(cls):
        """Tear down the live server only if this class started it."""
        started_own_server = cls.server_url == cls.live_server_url
        if started_own_server:
            super().tearDownClass()

    def setUp(self):
        """Start a hidden display and a Firefox browser for each test."""
        self.display = Display(visible=0, size=(1024, 768))
        self.display.start()
        self.browser = webdriver.Firefox()

    def tearDown(self):
        """Quit the browser, then stop the display."""
        self.browser.quit()
        self.display.stop()

    def check_for_row_in_list_table(self, row_text):
        """Assert that some row of ``#id_list_table`` has exactly ``row_text``."""
        list_table = self.browser.find_element_by_id('id_list_table')
        row_texts = []
        for row in list_table.find_elements_by_tag_name('tr'):
            row_texts.append(row.text)
        self.assertIn(row_text, row_texts)
class AdminTestCase(LiveServerTestCase):
    """Browser-level test of the payment flow."""

    def setUp(self):
        """Start a hidden display and a Firefox browser before each test."""
        virtual_display = Display(visible=0, size=(800, 600))
        self.display = virtual_display
        virtual_display.start()
        self.selenium = webdriver.Firefox()
        super(AdminTestCase, self).setUp()

    def tearDown(self):
        """Quit the browser and stop the display after each test."""
        self.selenium.quit()
        self.display.stop()
        super(AdminTestCase, self).tearDown()

    def test_payment(self):
        """
        payment will be successful.
        """
        browser = self.selenium
        browser.get("%s/pay" % self.live_server_url)
        browser.implicitly_wait(20)
        browser.maximize_window()

        amount_field = browser.find_element_by_name("amount")
        amount_field.send_keys("100000")

        browser.find_element_by_xpath('//input[@value="pay"]').click()
        browser.find_element_by_id("btn3").click()

        self.assertIn("successful", browser.page_source)
def work():
    """Take the weekly Tower screenshot using credentials from the config.

    When the module-level ``DISPLAY`` flag is falsy the browser runs on a
    hidden virtual display, which is stopped on every exit path.

    Returns:
        True on success. False when the config is missing or
        ``getTowerWeeklyScreenshot`` reports failure.
    """
    logging.info("start weeklys screenshot work")
    print("start ... ")

    display = None
    if not DISPLAY:
        print("hide display ... ")
        display = Display(visible=0, size=(1366, 768))
        display.start()

    try:
        config = getConfigObj()
        if config is None:
            return False

        userName = config.get("USER", "UserName")
        userPWD = config.get("USER", "userPWD")

        ret = getTowerWeeklyScreenshot(userName, userPWD, DEFAULT_SAVE_PATH)
        if not ret:
            print('Error, abort. Please check the log file "%s"' % LOG_FILE)
            return False

        logging.info("finish all work, exit.")
        return True
    finally:
        # The early "return False" paths used to leave the display running.
        if display is not None:
            display.stop()
def get_image(self):
    """Find a new picture on ``self.scrape_site`` and save it.

    Scans the page's ``<img>`` tags and skips ad-server images. The id
    between ``/photo/`` and the next ``/`` in ``src`` is checked with
    ``_check_id`` and ``_check_ratios``. The first image that passes sets
    ``self.img_id`` and ``self.description`` and is stored through
    ``_save_hd_image``.

    Raises:
        Exception: when ``self.img_id`` is still unset after the scan.
    """
    url = self.scrape_site
    # virtual display for headless runs
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        with closing(Firefox()) as browser:
            browser.get(url)
            time.sleep(5)
            # TODO: fix with something less static, but still multipurpose
            # considering scrape_site as a db var
            for img in browser.find_elements_by_tag_name('img'):
                src = img.get_attribute('src')
                # TODO: fix this temporary workaround that prevents ad server
                # data from reaching the image checks. Images without a src
                # are skipped too, since they used to raise TypeError.
                if not src or 'adsrvr' in src:
                    continue
                match = re.search("/photo/(.+?)/", src)
                if match is None:
                    # Not a photo URL; indexing the empty findall result
                    # used to raise IndexError here.
                    continue
                image_id = match.group(1)
                if self._check_id(image_id) and self._check_ratios(src):
                    self.img_id = image_id
                    self.description = img.get_attribute('alt')
                    self._save_hd_image()
                    break
    finally:
        display.stop()

    if self.img_id:
        return
    raise Exception('Failed to find a suitable image: all out or bugged')
def get_all_items():
    """Scrape every report form listed on the Federal Reserve report-forms page.

    Each option of the form drop-down, except the first, is selected and
    submitted, and the detail fields of the resulting page are collected.

    Returns:
        A list with one dict per form. Keys are ``Description``, ``OMB``,
        ``Background``, ``RespondentPanel``, ``Frequency``, ``PublicRelease``
        (None when the page has no such field) and ``FormNumber``.
    """
    start_url = "http://www.federalreserve.gov/apps/reportforms/default.aspx"
    menu_selector = "#MainContent_ddl_ReportForms"
    field_names = ('Description', 'OMB', 'Background', 'RespondentPanel',
                   'Frequency', 'PublicRelease')

    # list to store all scraped data
    all_items = list()

    display = Display(visible=0, size=(1024, 768))
    display.start()
    try:
        driver = webdriver.Firefox()
        try:
            driver.get(start_url)
            # The drop-down is located by the id seen in the page markup.
            main_menu = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, menu_selector)))
            form_options = main_menu.find_elements_by_tag_name("option")
            option_count = len(form_options)

            # Index 0 is skipped, as in the original loop.
            for form_i in range(1, option_count):
                form = form_options[form_i]
                form.click()
                # Keep the option text: it is the form number.
                form_id = form.text

                submit_button = WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#MainContent_btn_GetForm")))
                submit_button.click()

                a = dict.fromkeys(field_names)
                for el in field_names:
                    try:
                        item = driver.find_element_by_css_selector("#MainContent_lbl_" + el + "_data")
                        a[el] = item.text
                    except Exception:
                        # Case when there is no such field: leave it as None.
                        pass
                a['FormNumber'] = form_id
                all_items.append(a)

                # The option elements go stale after navigation
                # (StaleElementReferenceException), so reload the start page
                # and locate them again.
                driver.get(start_url)
                main_menu = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, menu_selector)))
                form_options = main_menu.find_elements_by_tag_name("option")
        finally:
            # Release the browser even when a wait times out.
            driver.quit()
    finally:
        display.stop()
    return all_items
def openurl(companyname=first_arg):
    """Google "<companyname> crunchbase", open the first result and save its HTML.

    On success the page source is appended to ``0515<name>.html`` and the
    file name is recorded in ``smallname.txt``. On any failure the company
    name is recorded in ``missedname.txt`` instead.

    Returns:
        The page source, or the string ``'none'`` when the lookup failed.
    """
    display = Display(visible=0, size=(1024, 768))
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            time.sleep(randint(8, 10))
            try:
                browser.get('http://www.google.com')
                time.sleep(5)
                search = browser.find_element_by_name('q')
                input_text = companyname + str(" crunchbase")
                search.send_keys(input_text)
                time.sleep(randint(10, 15))
                search.send_keys(Keys.RETURN)
                time.sleep(randint(10, 15))

                # First result heading; the part before " | " names the file.
                gn = browser.find_element_by_tag_name('h3').text
                gnc = str(gn).split(' | ')[0].replace(" ", "")
                output_file = '0515' + gnc + '.html'
                browser.find_element_by_link_text(gn).click()
                time.sleep(randint(55, 60))
                company_html = browser.page_source
                time.sleep(randint(5, 10))

                with open("smallname.txt", 'a') as myfile:
                    json.dump(output_file, myfile)
                with open(output_file, 'a+') as myfile:
                    myfile.write(company_html)
            except Exception:
                # Best effort: record the miss and carry on. KeyboardInterrupt
                # and SystemExit are no longer swallowed.
                company_html = 'none'
                with open("missedname.txt", "a") as myfile:
                    json.dump(companyname, myfile)
            time.sleep(1)
        finally:
            browser.close()
        time.sleep(1)
    finally:
        display.stop()
    return company_html
def process_screenshots(app, env):
    """Run the external screenshot command for all collected screenshots.

    Does nothing when ``env`` has no ``screenshot_all_screenshots``, when
    ``app.config['screenshots_create']`` is falsy, or when ``gettext``
    appears in ``sys.argv`` (message-collection builds). If the environment
    variable ``SPHINX_SS_USE_PVD`` equals ``"true"`` the command runs under
    a hidden virtual display.

    Raises:
        Exception: when the screenshot process exits with a nonzero code.
    """
    if not hasattr(env, 'screenshot_all_screenshots'):
        return

    if not app.config['screenshots_create']:
        print("Not doing screenshots on maggies farm no more")
        return

    # Don't bother building screenshots if we're just collecting messages.
    # This is checked before the display starts: the early return used to
    # leave a started display running.
    if "gettext" in sys.argv:
        return

    display = None
    if os.environ.get('SPHINX_SS_USE_PVD') == "true":
        from pyvirtualdisplay import Display
        # Start a virtual headless display
        display = Display(visible=0, size=(1024, 768))
        display.start()

    try:
        # A list, not a map object, so json.dumps also works on Python 3.
        all_args = [shot['from_str_arg'] for shot in env.screenshot_all_screenshots]

        command = SCREENSHOT_COMMAND + SCREENSHOT_COMMAND_OPTS + \
            [re.sub(r"\s", r"", "--from-str={0}".format(json.dumps(all_args)))]
        # If building in a different language, start the server in that language
        language = env.config.language
        if language:
            command += ["--lang={0}".format(language)]

        # Named "proc" so the local does not shadow the subprocess module.
        proc = Popen(command)
        proc.wait()
        if proc.returncode:
            raise Exception("Screenshot process had nonzero return code: {0}".format(proc.returncode))
    finally:
        if display:
            display.stop()
class BrowserManager:
    """Own one browser on a virtual display and hand it out one user at a time."""

    def __init__(self):
        self._lock = False

    @staticmethod
    def _proxy_preferences(proxy_url):
        """Translate a proxy URL into Firefox manual-proxy preferences.

        Returns an empty dict when no host can be extracted. The port
        defaults to 80 when the URL does not carry one.
        """
        try:
            from urllib.parse import urlparse
        except ImportError:  # Python 2
            from urlparse import urlparse

        if '://' not in proxy_url:
            # Bare "host:port" values would otherwise be parsed as a scheme.
            proxy_url = 'http://' + proxy_url
        parsed = urlparse(proxy_url)
        if not parsed.hostname:
            return {}
        port = parsed.port or 80
        return {
            'network.proxy.type': 1,  # 1 = manual proxy configuration
            'network.proxy.http': parsed.hostname,
            # Port preferences are integers; a string port is not applied.
            'network.proxy.http_port': port,
            # Firefox names its HTTPS proxy preferences "ssl"; the former
            # "network.proxy.https*" keys do not exist.
            'network.proxy.ssl': parsed.hostname,
            'network.proxy.ssl_port': port,
        }

    def bootup(self):
        """Start the virtual display and the browser, honouring ``HTTP_PROXY``."""
        self._display = Display(visible=0, size=(1024, 768))
        self._display.start()
        profile = {}
        if 'HTTP_PROXY' in os.environ:
            profile = self._proxy_preferences(os.environ['HTTP_PROXY'])
        self.browser = Browser(profile_preferences=profile)

    def obtain(self, background):
        """Wait until the browser is free, mark it taken and return it.

        ``background.wait('Browser lock', 15)`` is called while another
        caller holds the browser.
        """
        while self._lock:
            background.wait('Browser lock', 15)
        self._lock = True
        return self.browser

    def release(self, background):
        """Mark the browser as free again."""
        self._lock = False

    def shutdown(self):
        """Quit the browser and stop the virtual display."""
        self.browser.quit()
        self._display.stop()
class SeleniumRunner(object):
    """Decorator and context manager supplying a Chrome driver on a hidden display."""

    def __call__(self, f):
        """Wrap ``f`` so it receives a fresh driver right after its first argument."""
        @functools.wraps(f)
        def wrapper(_self, *args, **kwargs):
            with self as active_driver:
                return f(_self, active_driver, *args, **kwargs)
        return wrapper

    def __enter__(self):
        """Start the display, launch Chrome and return the driver."""
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome()
        return self.driver

    def __exit__(self, *args, **kwargs):
        """Quit the driver and stop the display, ignoring missing attributes."""
        # An AttributeError means someone has tampered with the browser or
        # the display; it is deliberately ignored.
        for holder, shutdown in (('driver', 'quit'), ('display', 'stop')):
            try:
                getattr(getattr(self, holder), shutdown)()
            except (AttributeError,):
                pass
class Spider(scrapy.Spider):
    """Scrape mayoral election results per county from www.cec.gov.tw."""

    name = "mayors"
    allowed_domains = ["www.cec.gov.tw"]
    start_urls = ["https://www.cec.gov.tw/pc/zh_TW/IDX/indexC.html", ]
    download_delay = 1

    def __init__(self, ad=None, *args, **kwargs):
        """Start a hidden display and a Chrome driver for rendering the index page."""
        super(Spider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")

    def closed(self, reason):
        """Release the browser and display when Scrapy closes the spider."""
        # Nothing in this class connects spider_closed to a signal, so the
        # cleanup is routed through the hook Scrapy calls on its own.
        self.spider_closed(self)

    def spider_closed(self, spider):
        """Quit the driver and stop the display; safe to call more than once."""
        driver, self.driver = getattr(self, 'driver', None), None
        display, self.display = getattr(self, 'display', None), None
        try:
            if driver is not None:
                driver.quit()
        finally:
            if display is not None:
                # Display has stop(), not close(); the old call raised AttributeError.
                display.stop()

    def parse(self, response):
        """Render the index page in Chrome and follow every county link."""
        self.driver.get(response.url)
        nodes = scrapy.Selector(text=self.driver.page_source).xpath('//a[@target="_top"]')
        for node in nodes:
            county = node.xpath('text()').extract_first()
            print(county)
            yield response.follow(node, callback=self.parse_list, meta={'meta': county})

    def parse_list(self, response):
        """Yield one result dict per ``tr.trT`` row of the county results table."""
        for tr in response.css(u'table.tableT tr.trT'):
            d = {}
            d['type'] = 'mayors'
            d['county'] = response.meta['meta']
            d['constituency'] = 0
            d['elected'] = tr.xpath('td[1]/text()').extract_first().strip()
            d['number'] = int(tr.xpath('td[2]/text()').extract_first())
            # Strip thousands separators and any other non-digits.
            d['votes'] = int(re.sub(r'\D', '', tr.xpath('td[5]/text()').extract_first()))
            d['votes_percentage'] = tr.xpath('td[6]/text()').extract_first()
            yield d
def virtual_display_if_enabled(enabled):
    """Return a started hidden 800x600 display, or a ``NoopDisplay`` when disabled."""
    if not enabled:
        return NoopDisplay()
    started = Display(visible=0, size=(800, 600))
    started.start()
    return started
def run_selenium(landmark):
    """Load ``http://www.<landmark>`` in Firefox while tcpdump captures traffic.

    The capture interface is read from the first line of ``iface.txt``.
    ``window.performance.timing`` is pickled to a ``perfdata_*`` file under
    ``EXP_DIR``. The browser, the display and tcpdump are shut down in that
    order, also when a step fails.
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    browser = None
    ptcpdmp = None
    try:
        logTo(TEST_LOG, 'Selenium : Starting Selenium for ' + landmark, 'INFO', 'a')
        with open(HOME_DIR + '/Desktop/one-time-test-suite/iface.txt', 'r') as interFace:
            iface = interFace.readlines()[0].split('\n')[0]
        tmpstmp = datetime.now().strftime("%s")

        profile = webdriver.FirefoxProfile()
        profile.update_preferences()
        browser = webdriver.Firefox(firefox_profile=profile)  # assign profile to browser
        browser.delete_all_cookies()

        logTo(TEST_LOG, ' Selenium : Starting tcpdump .. ', 'INFO', 'a')
        tcpcmd = 'tcpdump -i ' + iface + ' -w ' + EXP_DIR + '/' + 'tcpdump_' + landmark.split('.')[0] + '_' + tmpstmp
        ptcpdmp = sub.Popen(shlex.split(tcpcmd))
        time.sleep(10)

        logTo(TEST_LOG, ' Selenium : Starting get ' + landmark, 'INFO', 'a')
        browser.get('http://www.' + landmark)
        time.sleep(5)
        perfData = browser.execute_script('return window.performance.timing')

        fname = EXP_DIR + '/' + 'perfdata_' + landmark.split('/')[0]
        fname = fname.replace('.', '-')
        # Close the output file deterministically instead of leaking the handle.
        with open(fname, 'wb') as perf_file:
            pickle.dump(perfData, perf_file)
        logTo(TEST_LOG, 'Selenium : Writing done to ' + EXP_DIR + '/perfdata_' + landmark, 'INFO', 'a')
    finally:
        try:
            if browser is not None:
                browser.quit()
        finally:
            try:
                display.stop()
            finally:
                if ptcpdmp is not None:
                    ptcpdmp.terminate()
    logTo(TEST_LOG, 'Finished Selenium for ' + landmark, 'INFO', 'a')
class Order:
    """Automate logging in and placing an order for the product at ``url``."""

    def __init__(self, username, password, url):
        """Store the credentials and start Chrome on a hidden 1920x1080 display."""
        self.username = username
        self.password = password
        self.url = url
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.browser = webdriver.Chrome()
        self.browser.implicitly_wait(15)

    def goToPage(self):
        """Open the product page and print its title."""
        self.browser.get(self.url)
        print(self.browser.title)

    def login(self):
        """Sign in when the account link reads "Hello. Sign in"."""
        loginButton = self.browser.find_element_by_css_selector(
            "a#nav-link-yourAccount span.nav-line-1")
        print(loginButton.text)
        if loginButton.text == "Hello. Sign in":
            loginButton.click()
            email = self.browser.find_element_by_id("ap_email")
            pw = self.browser.find_element_by_id("ap_password")
            email.clear()
            pw.clear()
            email.send_keys(self.username)
            pw.send_keys(self.password)
            submit = self.browser.find_element_by_id("signInSubmit")
            submit.click()
        else:
            print("Already logged in.")
        # Look the link up again, since the page may have changed after sign-in.
        loginButton = self.browser.find_element_by_css_selector(
            "a#nav-link-yourAccount span.nav-line-1")
        print(loginButton.text)

    def placeOrder(self):
        """Add the item to the cart, check out and confirm the order.

        Each step waits up to 10 seconds for the expected page title.
        """
        print(self.browser.title)
        print("Placing order.")
        wait = WebDriverWait(self.browser, 10)

        addToCart = self.browser.find_element_by_css_selector(
            "input#add-to-cart-button")
        addToCart.click()
        time.sleep(10)
        print(self.browser.title)
        wait.until(EC.title_contains('Amazon.com Shopping Cart'))

        checkout = self.browser.find_element_by_css_selector(
            "a#hlb-ptc-btn-native")
        checkout.click()
        time.sleep(10)
        print(self.browser.title)
        wait.until(EC.title_contains('Amazon.com Checkout'))

        placeOrder = self.browser.find_element_by_name("placeYourOrder1")
        placeOrder.click()
        time.sleep(20)
        print(self.browser.title)
        wait.until(EC.title_contains('Amazon.com Thanks You'))

    def kill(self):
        """End the browser session and stop the virtual display."""
        try:
            # quit() ends the whole driver session; close() only closes the
            # current window and can leave the driver process behind.
            self.browser.quit()
        finally:
            self.display.stop()

    def start(self):
        """Run the full flow; always release the browser and the display."""
        try:
            self.goToPage()
            self.login()
            self.placeOrder()
        except Exception:
            print("Exception Raised")
            raise
        finally:
            self.kill()
class Scraper():
    """Scraper parent class, child classes are media streaming sites."""

    def __init__(self):
        """Sets creds for each instance (read from ``creds.json``)."""
        with open('creds.json', 'r') as f:
            self.creds = json.load(f)

    def start_driver(self, window_size='--window-size=1920,1080'):
        """Starts headless chrome browser/driver.

        :param window_size: Chrome command-line switch passed through as-is.
        """
        logging.info('starting driver')
        self.display = Display(visible=0)
        self.display.start()
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')  # likely necessary
        options.add_argument(window_size)
        self.driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
        self.driver.implicitly_wait(10)  # seconds

    def stop_driver(self):
        """Stops headless browser/driver.

        The browser is quit before its display is torn down, and the display
        is stopped even when quitting the browser raises.
        """
        logging.info('stopping driver')
        try:
            self.driver.quit()
        finally:
            self.display.stop()

    def lookup_and_write_medias(self, medias, mtype):
        """Takes list of movies or shows, searches themoviedb, creates object
        to write to database, then inserts if new or updates timestamp if not
        new.

        :param medias: list of dicts with at least ``title``; ``link`` and
            ``year`` are optional.
        :param mtype: ``'movie'`` selects the movie fields of the search
            result; any other value uses the show fields.
        """
        logging.info('len(medias) before take unique: {}'.format(len(medias)))
        # get unique: list of dict into list of tuples, set, back to dict
        medias = [dict(t) for t in {tuple(d.items()) for d in medias}]
        logging.info('len(medias) after take unique: {}'.format(len(medias)))

        for m in medias:
            source_to_write = dict(self.source)

            # if media link exists, set source link, try link db lookup / update
            if 'link' in m:
                source_to_write['link'] = m['link']
                full_media = flaskapp.db_lookup_via_link(m['link'])
                if full_media:
                    flaskapp.update_media_with_source(full_media,
                                                      source_to_write)
                    continue

            # link url was not in database, therefore do themoviedb search
            sleep(0.2)
            year = m.get('year', '')
            results = flaskapp.themoviedb_search(m['title'], mtype, year=year)

            # exit iteration if search not complete or no results
            if 'total_results' not in results:
                logging.error(u'tmdb search not complete for {}: {} {}'.format(
                    mtype, m['title'], year))
                continue

            if results['total_results'] < 1:
                logging.warning(u'tmdb 0 results for {}: {} {}'.format(
                    mtype, m['title'], year))
                if 'link' not in m:
                    # The placeholder record is keyed on the link; without one
                    # there is nothing to store (this used to raise KeyError).
                    continue
                # empty media for db write, prevent re-searching
                full_media = dict()
                full_media['title'] = m['title']
                full_media['mtype'] = mtype
                full_media['year'] = year
                full_media['id'] = m['link']
                full_media['sources'] = []
            else:
                # assume top result is best match and use it
                full_media = results['results'][0]

                # append data so dict can be saved to database
                full_media['mtype'] = mtype
                full_media['sources'] = []
                # Date fields may be missing or empty on a result; fall back
                # to an empty year instead of failing the whole batch.
                if mtype == 'movie':
                    full_media['year'] = (
                        full_media.get('release_date') or '')[:4]
                else:
                    full_media['title'] = full_media['name']
                    full_media['year'] = (
                        full_media.get('first_air_date') or '')[:4]

                # check if titles are not exact match, in future may not
                # append these
                if not flaskapp.doTitlesMatch(m['title'], full_media['title']):
                    logging.warning(u'not exact titles: {} | {}'.format(
                        m['title'], full_media['title']))

            # write db media if new
            flaskapp.insert_media_if_new(full_media)

            # update db media with source
            flaskapp.update_media_with_source(full_media, source_to_write)

    def update_watchlist_amz(self):
        """For watchlist items check if amazon prime and amazon pay are sources
        and add to db"""
        wl_unique = flaskapp.get_all_watchlist_in_db()
        for m in wl_unique:
            media = flaskapp.themoviedb_lookup(m['mtype'], m['id'])
            flaskapp.amz_prime_check(media)
            sleep(2.5)
            flaskapp.amz_pay_check(media)
            sleep(2.5)
# -*- coding: utf-8 -*-
import re
import time
import unittest

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

import login

# The virtual display is shared by every test in this module; it is started at
# import time and released in tearDownModule().
display = Display(visible=0, size=(1024, 768))
display.start()


def tearDownModule():
    """Stop the module-wide virtual display once all tests have run."""
    display.stop()


class TestVerCursoEditarNombre(unittest.TestCase):
    """UI test: open a course as a logged-in user and edit its name."""

    def setUp(self):
        self.driver = webdriver.Firefox()
        # Registered immediately so the browser is closed even if the rest of
        # setUp, or the test itself, fails.
        self.addCleanup(self.driver.quit)
        self.driver.implicitly_wait(30)
        self.base_url = "http://bakhan.accionstem.cl/"
        self.verificationErrors = []
        self.accept_next_alert = True

    def test_ver_curso_editar_nombre(self):
        driver = login.test_login_utp(self)
        self.assertEqual("1ro basico A 2016 Test",
                         driver.find_element_by_css_selector("font").text)
        driver.find_element_by_link_text("Ver Curso").click()
        driver.find_element_by_css_selector("button.editButton").click()
        driver.find_element_by_id("input_nombre").clear()
        driver.find_element_by_id("input_nombre").send_keys(u"prueba máil1")
        driver.find_element_by_id("button_editar").click()

    def tearDown(self):
        # Soft failures collected during the test must fail it.
        self.assertEqual([], self.verificationErrors)
def deploy_firefox(
    status_queue: Queue,
    browser_params: BrowserParamsInternal,
    manager_params: ManagerParamsInternal,
    crash_recovery: bool,
) -> Tuple[webdriver.Firefox, Path, Optional[Display]]:
    """
    launches a firefox instance with parameters set by the input dictionary

    Progress is reported through ``status_queue`` as ``("STATUS", step, data)``
    tuples, in order: "Profile Created", "Profile Tar", "Display",
    "Launch Attempted" and "Browser Launched".

    Returns the driver, the path of the temporary profile directory, and the
    Xvfb ``Display`` (``None`` unless ``display_mode`` is ``"xvfb"``).

    Raises ``RuntimeError`` if Xvfb cannot be started or the Firefox process
    id cannot be determined.
    """
    firefox_binary_path = get_firefox_binary_path()
    root_dir = os.path.dirname(__file__)  # directory of this file

    browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
    status_queue.put(("STATUS", "Profile Created", browser_profile_path))

    # Use Options instead of FirefoxProfile to set preferences since the
    # Options method has no "frozen"/restricted options.
    # https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039
    fo = Options()
    # Set a custom profile that is used in-place and is not deleted by geckodriver.
    # https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html
    # Using FirefoxProfile breaks stateful crawling:
    # https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093
    fo.add_argument("-profile")
    fo.add_argument(str(browser_profile_path))

    assert browser_params.browser_id is not None
    if browser_params.seed_tar and not crash_recovery:
        logger.info(
            "BROWSER %i: Loading initial browser profile from: %s",
            browser_params.browser_id,
            browser_params.seed_tar,
        )
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.seed_tar,
        )
    elif browser_params.recovery_tar:
        logger.debug(
            "BROWSER %i: Loading recovered browser profile from: %s",
            browser_params.browser_id,
            browser_params.recovery_tar,
        )
        load_profile(
            browser_profile_path,
            manager_params,
            browser_params,
            browser_params.recovery_tar,
        )
    status_queue.put(("STATUS", "Profile Tar", None))

    display_mode = browser_params.display_mode
    display_pid = None
    display_port = None
    display = None
    if display_mode == "headless":
        fo.headless = True
        fo.add_argument("--width={}".format(DEFAULT_SCREEN_RES[0]))
        fo.add_argument("--height={}".format(DEFAULT_SCREEN_RES[1]))
    if display_mode == "xvfb":
        try:
            display = Display(visible=0, size=DEFAULT_SCREEN_RES)
            display.start()
            display_pid, display_port = display.pid, display.display
        except EasyProcessError as e:
            # Adjacent literals instead of backslash continuations, so the
            # message no longer carries the source file's indentation.
            raise RuntimeError(
                "Xvfb could not be started. "
                "Please ensure it's on your path. "
                "See www.X.org for full details. "
                "Commonly solved on ubuntu with `sudo apt install xvfb`"
            ) from e
    # Must do this for all display modes,
    # because status_queue is read off no matter what.
    status_queue.put(("STATUS", "Display", (display_pid, display_port)))

    if browser_params.extension_enabled:
        # Write config file
        extension_config: Dict[str, Any] = dict()
        extension_config.update(browser_params.to_dict())
        extension_config["logger_address"] = manager_params.logger_address
        extension_config[
            "storage_controller_address"
        ] = manager_params.storage_controller_address
        extension_config["testing"] = manager_params.testing
        ext_config_file = browser_profile_path / "browser_params.json"
        with open(ext_config_file, "w") as f:
            json.dump(extension_config, f, cls=ConfigEncoder)
        logger.debug(
            "BROWSER %i: Saved extension config file to: %s",
            browser_params.browser_id,
            ext_config_file,
        )

        # TODO restore detailed logging
        # fo.set_preference("*****@*****.**", "all")

    # Geckodriver currently places the user.js file in the wrong profile
    # directory, so we have to create it manually here.
    # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when
    # to remove this workaround.
    # Load existing preferences from the profile's user.js file
    prefs = configure_firefox.load_existing_prefs(browser_profile_path)
    # Load default geckodriver preferences
    prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS)
    # Pick an available port for Marionette (https://stackoverflow.com/a/2838309)
    # This has a race condition, as another process may get the port
    # before Marionette, but we don't expect it to happen often
    with socket.socket() as s:  # closed even if bind() fails
        s.bind(("", 0))
        marionette_port = s.getsockname()[1]
    prefs["marionette.port"] = marionette_port

    # Configure privacy settings
    configure_firefox.privacy(browser_params, prefs)

    # Set various prefs to improve speed and eliminate traffic to Mozilla
    configure_firefox.optimize_prefs(prefs)

    # Intercept logging at the Selenium level and redirect it to the
    # main logger.
    interceptor = FirefoxLogInterceptor(browser_params.browser_id)
    interceptor.start()

    # Set custom prefs. These are set after all of the default prefs to allow
    # our defaults to be overwritten.
    for name, value in browser_params.prefs.items():
        logger.info(
            "BROWSER %i: Setting custom preference: %s = %s",
            browser_params.browser_id,
            name,
            value,
        )
        prefs[name] = value

    # Write all preferences to the profile's user.js file
    configure_firefox.save_prefs_to_profile(prefs, browser_profile_path)

    # Launch the webdriver
    status_queue.put(("STATUS", "Launch Attempted", None))
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    driver = webdriver.Firefox(
        firefox_binary=fb,
        options=fo,
        log_path=interceptor.fifo,
        # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for
        # when to remove this
        service_args=["--marionette-port", str(marionette_port)],
    )

    # Add extension
    if browser_params.extension_enabled:
        # Install extension
        ext_loc = os.path.join(root_dir, "../Extension/firefox/openwpm.xpi")
        ext_loc = os.path.normpath(ext_loc)
        driver.install_addon(ext_loc, temporary=True)
        logger.debug(
            "BROWSER %i: OpenWPM Firefox extension loaded",
            browser_params.browser_id,
        )

    # set window size
    driver.set_window_size(*DEFAULT_SCREEN_RES)

    # Get browser process pid
    if hasattr(driver, "service") and hasattr(driver.service, "process"):
        pid = driver.service.process.pid
    elif hasattr(driver, "binary") and hasattr(driver.binary, "process"):
        pid = driver.binary.process.pid
    else:
        raise RuntimeError("Unable to identify Firefox process ID.")

    status_queue.put(("STATUS", "Browser Launched", int(pid)))

    return driver, browser_profile_path, display
def selecting_data(star_name):
    '''
    Search the INES website for a specified star and download its data.

    The browser runs in a virtual display and saves the download into
    ``iue/<star_name>``; every ``*.gz`` archive found there afterwards is
    extracted in place and then deleted.

    :param star_name: name of the star (string)
    :return: None
    '''
    from pyvirtualdisplay import Display

    folder_data = 'iue/' + star_name
    if not os.path.isdir(folder_data):
        os.makedirs(folder_data)

    # Chrome runs in a virtual display, so no browser window is shown.
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        # Define global Chrome properties
        options = webdriver.ChromeOptions()
        # Absolute path: resolves to the same directory as before, but does
        # not depend on how Chrome interprets a relative download directory.
        prefs = {"download.default_directory": os.path.abspath(folder_data)}
        options.add_experimental_option("prefs", prefs)
        browser = webdriver.Chrome(chrome_options=options)
        try:
            # Define web source and open it
            ines_site = "http://sdc.cab.inta-csic.es/cgi-ines/IUEdbsMY"
            browser.get(ines_site)

            # Selecting all data
            mySelect = Select(browser.find_element_by_name("limit"))
            mySelect.select_by_value("all")
            time.sleep(3)

            # Selecting some stars
            browser.find_element_by_name("object").send_keys(star_name)
            browser.find_element_by_name(".submit").click()

            # Taking the data
            try:
                browser.find_element_by_name("markRebin").click()
                browser.find_element_by_name(".submitNH").click()
                # Fixed wait for the download to finish.
                time.sleep(10)
            except Exception:
                print('There is no data for this star!')
        finally:
            # Always end the session, even when a lookup above failed.
            browser.quit()
    finally:
        display.stop()

    # Unzip files. Paths are joined explicitly, so the working directory is
    # never changed and cannot be left pointing at the data folder on error.
    for fname in glob(os.path.join(folder_data, '*.gz')):
        # NOTE(review): the archive comes from a remote server; extractall()
        # trusts the member paths inside it.
        with tarfile.open(fname, "r:gz") as tar:
            tar.extractall(path=folder_data)
        os.remove(fname)

    return
class BrowserWebdriver(BrowserBase):
    """Selenium-WebDriver implementation of the ``BrowserBase`` hooks.

    Subclasses must provide ``_browser_parse_logs`` and
    ``_browser_get_events``.
    """

    # Substrings matched against request network locations in _skip_url().
    skip_urls = []

    def __init__(self, *args, **kwargs):
        BrowserBase.__init__(self, *args, **kwargs)
        self._first_navigation_ts = None
        self._first_navigation_netloc = None
        self._ts_offset = None

    def _skip_url(self, page, url):
        """Return True for a request to a skipped host made from a page whose
        own host is not itself in ``skip_urls``."""
        if not url:
            return False
        _, req_netloc, _ = parse_url(url)
        for su in self.skip_urls:
            if su in req_netloc:
                _, page_netloc, _ = parse_url(page.url)
                if not any(x in page_netloc for x in self.skip_urls):
                    self.log_debug("skipping URL %s" % req_netloc)
                    return True
        return False

    def _browser_clear_caches(self):
        """Clear caches by quitting the driver and starting a new browser."""
        BrowserBase._browser_clear_caches(self)
        self.driver.quit()
        self.pid = self.browser_start()

    def _browser_navigate(self, location, cached=True, name=None):
        """Navigate to ``location`` (a ``Page`` or a URL) and return a new ``Page``."""
        url = location.url if isinstance(location, Page) else location
        real_navigation = self._http_get(url)
        return Page(self, url, cached, name=name,
                    real_navigation=real_navigation)

    def _browser_wait(self, page, timeout=None):
        """Wait for ``loadEventEnd`` and then for outstanding requests.

        The first half of ``timeout`` is spent waiting for the load event, the
        rest polling for incomplete requests. Timeouts are logged, not raised;
        ``page.complete()`` is always called.
        """
        self.log_info("_browser_wait()...")
        if timeout is None:
            timeout = self.nav_timeout
        load_end_js = "return window.performance.timing.loadEventEnd"

        start = time.time()
        while time.time() - start < timeout / 2:
            time.sleep(0.2)
            if self.driver.execute_script(load_end_js):
                break
            # onload event has not been processed yet, so need to wait and retry
            self.log_info("Waiting for loadEventEnd ... ")

        while time.time() - start < timeout:
            time.sleep(self.ajax_threshold)
            # hack. Execute something in browser context to flush logs...
            self.driver.execute_script(load_end_js)
            self._browser_get_events(page)
            ir = page.get_incomplete_reqs()
            if not ir:
                break
            self.log_info(
                "Waiting for incomplete requests:\n %s" %
                ("\n ".join(["%s - %s" % (r.id, r.url) for r in ir])))

        if time.time() - start >= timeout:
            if not self.driver.execute_script(load_end_js):
                self.log_error(
                    "Page '%s' load timeout, window.performance.timing.loadEventEnd = 0"
                    % page.url)
            ir = page.get_incomplete_reqs()
            if ir:
                self.log_error(
                    "Can't wait for page '%s' load completion, "
                    "see '%s' for details\nincomplete requests:\n %s" %
                    (page.url, self.log_path, "\n ".join(
                        ["%s - %s" % (r.id, r.url) for r in ir])))
        page.complete(self)

    def _browser_warmup_page(self, location, name=None):
        self.navigate_to(location, cached=False, stats=False, name=name)

    def _browser_display_init(self, headless, resolution):
        """Start a virtual display when ``headless``; otherwise leave it None."""
        if headless:
            try:
                from pyvirtualdisplay import Display
            except ImportError as e:
                abort(e)
            self.display = Display(visible=0, size=resolution)
            self.display.start()
        else:
            self.display = None

    def _browser_execute_script(self, js):
        """Evaluate the JavaScript expression ``js`` and return its value."""
        val = self.driver.execute_script("return %s" % js)
        self.log_debug("%s = %s" % (js, val))
        return val

    def browser_get_name(self):
        c = self.driver.capabilities
        return c['browserName']

    def browser_get_version(self):
        c = self.driver.capabilities
        return self._get_val(c, ['version', 'browserVersion'])

    def browser_get_platform(self):
        c = self.driver.capabilities
        return self._get_val(c, ['platform', 'platformName'])

    def browser_get_screenshot_as_file(self, filename):
        self.driver.get_screenshot_as_file(filename)

    def browser_get_page_timeline(self, page):
        """Collect the ``window.performance.timing`` values for ``page``."""
        values = {}
        for t in PageTimeline.types:
            if t in PageTimeline.jstypes:
                js = "window.performance.timing.%s" % PageTimeline.jstypes[t]
                values[t] = self._browser_execute_script(js)
        return PageTimeline(page, values)

    def browser_get_current_url(self):
        return self.driver.current_url

    def browser_get_screenshot(self, filename):
        self.driver.get_screenshot_as_file(filename)

    def browser_stop(self):
        """Quit the driver and stop the virtual display, if any.

        A ``URLError`` from the driver is ignored as before; the display is
        now stopped even when quitting the driver fails.
        """
        try:
            if self.driver:
                self.driver.quit()
        except URLError:
            pass
        finally:
            self.driver = None
            try:
                if self.display:
                    self.display.stop()
            finally:
                self.display = None

    def _xpath_click(self, xpath):
        """Click the first element matching ``xpath`` or a ``%23``/``#`` variant.

        Raises ``BrowserExc`` wrapping the last lookup error if none matched.
        """
        exc = None
        # take into account possible replacements of %23/#
        xpaths = [xpath]
        if "%23" in xpath:
            xpaths.append(xpath.replace("%23", "#"))
        if "#" in xpath:
            xpaths.append(xpath.replace("#", "%23"))
        for x in xpaths:
            self.log_debug("Looking for xpath: %s ..." % x)
            try:
                el = self.driver.find_element_by_xpath(x)
                el.click()
                self.log_debug("Looking for xpath: %s ... OK" % x)
                return
            except NoSuchElementException as e:
                self.log_debug(
                    "Looking for xpath: %s ... Failed, no such element" % x)
                exc = e
            except ElementNotVisibleException as e:
                self.log_warning(
                    "Looking for xpath: %s ... Failed, element not visible" % x)
                exc = e
        self.log_error("NoSuchElementException, xpath: %s, see debug log" %
                       xpath)
        self.log_debug("page source:\n%s" %
                       self.driver.page_source.encode('ascii', 'ignore'))
        # ``exc`` keeps the last failure; the ``except ... as e`` name is
        # unbound once its handler ends on Python 3.
        raise BrowserExc(exc)

    def _http_get(self, url, validator=None):
        """Load ``url``; return True for a real navigation.

        A URL of the form ``<url>^<xpath>`` clicks the xpath instead of
        navigating and returns False. ``validator`` is unused here.
        """
        self.log_debug("Execute GET request: %s" % url)
        if not self._first_navigation_ts:
            self._first_navigation_ts = time.time()
            _, self._first_navigation_netloc, _ = parse_url(url)
        ar = url.split("^")
        if len(ar) > 1:
            self._xpath_click(ar[1])
            return False
        try:
            self.driver.get(url)
        except WebDriverException as e:
            raise BrowserExc(e)
        return True

    @staticmethod
    def _get_val(d, keys):
        """Return ``d[key]`` for the first key present, else ``"unknown"``."""
        for key in keys:
            if key in d:
                return d[key]
        return "unknown"

    def print_browser_info(self):
        self.print_stats_title("Browser summary")
        print(" - platform: %s" % self.browser_get_platform())
        print(" - browser: %s %s" % (self.browser_get_name(),
                                     self.browser_get_version()))
        print(" - PID: %d" % self.pid)
        print(" - log file: %s" % self.log_path)

    def print_log_file_path(self):
        self.print_stats_title("Browser log file")
        print(" %s" % self.log_path)

    # === virtual methods that must be implemented in every webdriver-based browser === #

    def _browser_parse_logs(self, page, logs):
        raise BrowserExcNotImplemented()

    def _browser_get_events(self, page):
        raise BrowserExcNotImplemented()

    # === webdriver specific === #

    def dom_wait_element_stale(self, el, timeout_s=None, name=None):
        """Block until ``el`` goes stale; raise ``BrowserExcTimeout`` otherwise."""
        start_time = time.time()
        if timeout_s is None:
            timeout_s = self.nav_timeout
        # http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html
        while time.time() < start_time + timeout_s:
            try:
                el.find_elements_by_id('doesnt-matter')
            except StaleElementReferenceException:
                break
            time.sleep(0.1)
        else:
            # Reached only when the deadline passed without the element going
            # stale, so a success on the final poll is not reported as a
            # timeout.
            msg = "DOM element '%s' click() timeout: %.1fs" % (
                name, time.time() - start_time)
            self.log_error(msg)
            raise BrowserExcTimeout(msg)

    def dom_click(self, el, timeout_s=None, name=None, wait_callback=None,
                  wait_callback_obj=None):
        """Click ``el`` and wait for the resulting page activity to settle.

        ``wait_callback(wait_callback_obj, el, timeout_s, name)`` replaces the
        default wait-for-stale step when given.
        """
        self.log_debug("dom_click(%s, %s)" % (str(el), str(name)))
        if timeout_s is None:
            timeout_s = self.nav_timeout
        p = Page(self, self.browser_get_current_url(), True, name=name,
                 real_navigation=False)
        p.start()

        # 1. click on the element (the <html> lookup before the click is kept
        #    from the original; its result is not used)
        self.driver.find_element_by_tag_name('html')
        el.click()

        # 2. wait for selenium onclick completion
        if wait_callback:
            self.log_debug(
                "wait callback: %s, %s" %
                (str(wait_callback.__name__), str(wait_callback_obj)))
            wait_callback(wait_callback_obj, el, timeout_s, name)
        else:
            self.log_debug("wait stale: %s, %s, %s" % (el, timeout_s, name))
            self.dom_wait_element_stale(el, timeout_s, name)

        # 3. wait for ajax completion, because browser URL can be update only
        #    after that
        self._browser_wait(p, timeout=timeout_s)
        p.url = self.browser_get_current_url()
        time.sleep(0.2)

    def dom_find_element_by_id(self, id):
        try:
            return self.driver.find_element_by_id(id)
        except NoSuchElementException as e:
            raise BrowserExc(e)

    def dom_find_element_by_name(self, name):
        try:
            return self.driver.find_element_by_name(name)
        except NoSuchElementException as e:
            raise BrowserExc(e)

    def dom_find_element_by_xpath(self, xpath):
        try:
            return self.driver.find_element_by_xpath(xpath)
        except NoSuchElementException as e:
            raise BrowserExc(e)

    def dom_find_frames(self):
        """Return all ``frame`` and ``iframe`` elements of the current document."""
        frames = []
        for name in ("frame", "iframe"):
            try:
                frames += self.driver.find_elements_by_tag_name(name)
            except NoSuchElementException:
                pass
        return frames

    def dom_switch_to_frame(self, frame):
        self.log_info("Switching to frame %s" % frame)
        return self.driver.switch_to.frame(frame)

    def dom_switch_to_default_content(self):
        self.log_info("Switching to default content")
        return self.driver.switch_to.default_content()

    def dom_send_keys(self, el, keys):
        """Type ``keys`` into ``el``; fall back to setting ``value`` via script.

        Returns True once the element's value equals ``keys``, else False.
        """
        val = el.get_attribute('value')
        if val != '':
            # clear initial value
            self.log_info("Element value is not empty, clear content...")
            self.driver.execute_script("arguments[0].value = ''", el)
            time.sleep(2.0)
        for ch in keys:
            el.send_keys(ch)
            time.sleep(0.2)
        val = el.get_attribute('value')
        if val == keys:
            return True
        self.log_warning("Bogus selenium send_keys(). Entered: '%s', "
                         "but see: '%s', using set_attribute()..." % (keys, val))
        time.sleep(2.0)
        # Pass the text as a script argument instead of splicing it into the
        # JavaScript source, which broke on values containing quotes or
        # backslashes.
        self.driver.execute_script("arguments[0].value = arguments[1]", el,
                                   keys)
        time.sleep(2.0)
        val = el.get_attribute('value')
        if val == keys:
            self.log_info("Ok, set_attribute() works fine")
            return True
        self.log_error(
            "Bogus selenium send_keys() and set_attribute(), can't enter value into the element"
        )
        return False

    # === some predefined scenarios === #

    def _do_send_keys(self, title, keys, tag_names, tag_ids):
        """Enter ``keys`` into the first matching input.

        Candidates are tried by ``name``, then by ``label`` attribute, then by
        ``id``; an element only matches when its tag equals the expected one.
        """
        lookups = []
        for tag, name in tag_names:
            lookups.append((tag, self.dom_find_element_by_name, name))
        for tag, name in tag_names:
            lookups.append((tag, self.dom_find_element_by_xpath,
                            '//*[@label="{}"]'.format(name)))
        for tag, elem_id in tag_ids:
            lookups.append((tag, self.dom_find_element_by_id, elem_id))

        for tag, find, locator in lookups:
            try:
                el = find(locator)
                if el.tag_name != tag:
                    continue
                if not self.dom_send_keys(el, keys):
                    self.log_error("Couldn't enter %s" % title)
                    return False
                return True
            except BrowserExc:
                pass
        self.log_info("Couldn't find %s input field" % title)
        return False

    def _do_login(self, url, user, password, login_form, timeout_s=None):
        """Fill in and submit ``login_form`` in the current frame.

        Login counts as successful when the submit control can no longer be
        found after clicking it.
        """
        if not self._do_send_keys('user name', user, login_form.user_tags,
                                  login_form.user_ids):
            return False
        time.sleep(1)
        if not self._do_send_keys('password', password, login_form.pass_tags,
                                  login_form.pass_ids):
            return False
        time.sleep(1)

        submit_form_found = False
        for tag, name in login_form.sbmt_tags:
            try:
                el = self.dom_find_element_by_name(name)
                if el.tag_name != tag:
                    continue
                submit_form_found = True
                self.dom_click(el, name=name, timeout_s=timeout_s)
                try:
                    el = self.dom_find_element_by_name(name)
                except BrowserExc:
                    self.log_info("Login succeed")
                    return True
            except BrowserExc:
                pass

        for tag, elem_id in login_form.sbmt_ids:
            try:
                el = self.dom_find_element_by_id(elem_id)
                if el.tag_name != tag:
                    continue
                submit_form_found = True
                self.dom_click(el, name=elem_id, timeout_s=timeout_s)
                try:
                    el = self.dom_find_element_by_id(elem_id)
                except BrowserExc:
                    self.log_info("Login succeed")
                    return True
            except BrowserExc:
                pass

        for x in login_form.sbmt_xpath:
            try:
                el = self.dom_find_element_by_xpath(x)
                submit_form_found = True
                # Label the click with this xpath; the previous code passed a
                # leftover ``id`` from the loop above (or the builtin).
                self.dom_click(el, name=x, timeout_s=timeout_s)
                try:
                    el = self.dom_find_element_by_xpath(x)
                except BrowserExc:
                    self.log_info("Login succeed")
                    return True
            except BrowserExc:
                pass

        if not submit_form_found:
            self.log_info("Couldn't find login submit form")
        self.log_info("Login failed")
        return False

    def do_login(self, url, user, password, login_form, timeout_s=None):
        """Log in at ``url``, trying the top document and then each frame."""
        self.log_info("Trying to login to '%s' under user %s" % (url, user))
        self.navigate_to(url, cached=None)
        if self._do_login(url, user, password, login_form,
                          timeout_s=timeout_s):
            return True
        for frame in self.dom_find_frames():
            self.dom_switch_to_frame(frame)
            if self._do_login(url, user, password, login_form,
                              timeout_s=timeout_s):
                return True
        self.log_info("Login to '%s' under user '%s' has been failed" %
                      (url, user))
        return False
class SlackSpider():
    """Crawl a team's slackarchive.io pages and store messages as SlackArchive rows."""

    def __init__(self):
        self.channelList = []
        self.dataList = []
        self.pageSize = 0
        self.urlsToHit = []
        self.TeamName = ''
        self.ChannelName = ''

    # Open headless chromedriver
    def start_driver(self):
        print('starting driver...')
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")
        sleep(randint(9, 10))

    # Close chromedriver
    def close_driver(self):
        """Quit the browser, then stop its display (even if quitting fails)."""
        print('closing driver...')
        try:
            self.driver.quit()
        finally:
            self.display.stop()
        print('closed!')

    # Tell the browser to get a page
    def get_page(self, url):
        print('getting page...{0}'.format(url))
        self.driver.get(url)
        sleep(randint(9, 10))

    # Grab items from divisions
    def grab_list_items(self):
        """Return the parsed messages of the current page.

        A message without its own avatar inherits the most recent non-empty
        avatar seen on the page.
        """
        print('grabbing list of items...')
        senderAvatar = ''
        all_items = []
        for div in self.driver.find_elements_by_xpath(
                '//ul[@class="messages"]//li'):
            data = self.process_elements(div, senderAvatar)
            if data:
                all_items.append(data)
                if data.senderAvatar != '':
                    senderAvatar = data.senderAvatar
        return all_items

    # Process division elements
    def process_elements(self, div, senderAvatar):
        """Build a SlackArchive from one message element.

        Returns None when sender, time or body is missing or empty.
        ``senderAvatar`` is used when the element has no avatar of its own.
        """
        try:
            msg_sender = div.find_element_by_class_name(
                "msg-user").get_attribute('innerText')
            msg_time = div.find_element_by_class_name(
                "msg-time").get_attribute('innerText')
            msg_body = div.find_element_by_class_name(
                "msg-body").get_attribute('innerText')
        except Exception:
            print('element not found exception')
            return None

        try:
            avatar = div.find_element_by_xpath('.//*[@class="msg-avatar"]')
            msg_sender_avatar = avatar.find_element_by_class_name(
                'msg-thumb').get_attribute('src')
        except Exception:
            msg_sender_avatar = senderAvatar

        if not (msg_sender and msg_time and msg_body):
            return None

        archiveObj = SlackArchive()
        archiveObj.teamName = self.TeamName
        archiveObj.channelName = self.ChannelName
        archiveObj.messageBody = msg_body
        archiveObj.senderAvatar = msg_sender_avatar
        archiveObj.messageTime = msg_time
        archiveObj.messageSender = msg_sender
        return archiveObj

    # Parse the URL
    def parse(self, url):
        self.get_page(url)
        return self.grab_list_items()

    # Get list of channels in a team
    def getChannelList(self):
        for channelName in self.driver.find_elements_by_xpath(
                '//ul[@class="channels-list"]//li//a'):
            self.channelList.append(channelName.text)

    # Get the total number of pages in each channel in each page
    def getPageSize(self, url_Template):
        """Set ``self.pageSize`` from the active pagination item of the page.

        The count is reset first so a channel without a pagination element
        does not inherit the previous channel's page count. ``url_Template``
        is unused.
        """
        self.pageSize = 0
        for page in self.driver.find_elements_by_xpath(
                '//ul[@class="pagination pagination-vertical"]//li[@class="page-item active"]'
        ):
            self.pageSize = int(page.text)

    # Build the list of URL's to hit
    def buildTarget(self, teamName):
        """Fill ``self.urlsToHit`` with [team, channel, page-url] entries."""
        url_Template = "https://{0}.slackarchive.io/".format(teamName)
        self.get_page(url_Template)
        self.getChannelList()
        if teamName == 'buffercommunity':
            self.channelList = self.channelList[7:]
        for channel in self.channelList:
            # Drop the first character of the link text before using it in
            # the URL.
            channelName = channel[1:].strip()
            urlA = url_Template + channelName + "/"
            self.get_page(urlA)
            self.getPageSize(urlA)
            print('Page size: {0}'.format(self.pageSize))
            for i in range(1, self.pageSize + 1):
                self.urlsToHit.append(
                    [teamName, channelName, urlA + "page-" + str(i)])

    # Run the crawler
    def runSpider(self, teamName):
        """Crawl every page of every channel of ``teamName`` and save the rows."""
        self.buildTarget(teamName)
        Utils.get_Connection_SNA4Slack()
        sync_table(SlackArchive)
        for url in self.urlsToHit:
            self.TeamName = url[0]
            self.ChannelName = url[1]
            count = 0
            for data in self.parse(url[2]):
                if data:
                    count += 1
                    node_object = SlackArchive(
                        id=uuid.uuid1(),
                        teamName=data.teamName,
                        channelName=data.channelName,
                        messageSender=data.messageSender.strip(),
                        messageBody=data.messageBody.strip(),
                        senderAvatar=data.senderAvatar,
                        messageTime=dateutil.parser.parse(data.messageTime))
                    node_object.save()
            if count > 0:
                print('{0} rows saved'.format(count))
            else:
                print(url[2])
                print('No data found')
class LexisNexisSpider(scrapy.Spider):
    """Scrape LexisNexis search results for a keyword with Selenium-driven Firefox.

    Scrapy is only used to trigger :meth:`parse`; all navigation goes through
    ``self.driver``.
    """

    name = 'lexisnexis'
    start_urls = []
    s_date = ''
    e_date = ''
    c_date = ''
    page_cnt = 1
    dont_filter = True
    agency_list = []

    def __init__(self, keyword='nation', *args, **kwargs):
        """Start the virtual display and Firefox, then open the search URL.

        :param keyword: search term inserted into the query URL.
        """
        # Defined first so cleanup is safe even if start-up fails part-way.
        self.driver = None
        self.display = None
        self.keyword = keyword
        self.start_urls = ['http://www.google.com']
        super(LexisNexisSpider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(1280, 1024))
        self.display.start()
        profile = webdriver.FirefoxProfile()
        profile.native_events_enabled = True
        self.driver = webdriver.Firefox(profile)
        self.driver.get(self.get_query_url(self.keyword))
        time.sleep(3)

    def _shutdown(self):
        """Quit the browser and stop the display; safe to call repeatedly."""
        driver = getattr(self, 'driver', None)
        display = getattr(self, 'display', None)
        self.driver = None
        self.display = None
        if driver is None and display is None:
            return
        try:
            if driver is not None:
                driver.quit()
        finally:
            if display is not None:
                display.stop()
        print('************************************************************************')
        print('CLOSED!!!')

    def closed(self, reason):
        """Scrapy hook run when the spider closes; releases browser and display."""
        self._shutdown()

    def __del__(self):
        # Fallback only: object finalization is not guaranteed to run.
        self._shutdown()

    def get_query_url(self, keyword):
        """Return the search URL for ``keyword`` limited to the last 25 days."""
        since = datetime.now() + timedelta(days=-25)
        since_str = since.strftime('%m/%d/%Y')
        # Source ids in ``csi`` according to the original author's notes:
        #   6742 The New York Times, 8213 USA TODAY,
        #   8142 Wall Street Journal Abstracts, 8075 The Washington Post,
        #   11810 Post-Dispatch, 306884 The Baltimore Sun,
        #   247189 The Philadelphia Inquirer, 163823 Chicago Daily Herald,
        #   301477 Arizona Capitol Times (8006 was not annotated).
        # NOTE(review): ``keyword`` is inserted without URL-encoding.
        return ('http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28'
                + keyword + '%29%20and%20Date%28geq%28' + since_str
                + '%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075%2C11810%2C306884%2C247189%2C163823%2C301477&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true')

    def next_page(self, start_index):
        """Open the result list starting at ``start_index``.

        Returns False, without navigating, when the current page has no
        "next" pagination button; True otherwise.
        """
        try:
            self.driver.find_element_by_xpath(
                '//table//table//table//table//table//table//td[@align="right"]/a/img[@src="images/IconPaginationNext.gif"]'
            )
        except Exception:
            return False

        risb = self.driver.find_element_by_xpath(
            '//input[@name="risb"]').get_attribute("value")
        nexpage = "http://www.lexisnexis.com/lnacui2api/results/listview/listview.do?start=" + str(
            start_index) + "&sort=RELEVANCE&format=GNBLIST&risb=" + risb
        self.driver.get(nexpage)
        time.sleep(2)
        source = self.driver.find_element_by_xpath(
            '//frame[@title="Results Content Frame"]')
        self.driver.get(source.get_attribute("src"))
        time.sleep(2)
        return True

    def parse(self, response):
        """Starting point: collect result links, then visit each article.

        Args:
            response - the response object pertaining to the search results
                page (unused; the Selenium driver holds the real page)

        Yields one ``LexisnexisArticleItem`` per listed article. Fields beyond
        title/url/agency are present only when the article page was read
        successfully.
        """
        try:
            # The lookup now sits inside the guard, so a missing button is
            # reported instead of aborting the crawl.
            self.driver.find_element_by_xpath('//a[@id="firstbtn"]').click()
        except Exception:
            print("can't find continue button ")

        source = self.driver.find_element_by_xpath(
            '//frame[@title="Results Content Frame"]')
        self.driver.get(source.get_attribute("src"))
        time.sleep(5)

        item_list = []
        start_id = 1
        while self.next_page(start_id):
            noshade_list = self.driver.find_elements_by_xpath(
                '//tr[@class="noshaderow1st"]')
            shade_list = self.driver.find_elements_by_xpath(
                '//tr[@class="shaderow1st"]')
            for news in noshade_list + shade_list:
                button = news.find_element_by_xpath('.//a')
                article = LexisnexisArticleItem()
                article['title'] = button.text
                article['url'] = button.get_attribute("href")
                article['agency'] = news.find_element_by_xpath(
                    './/span[@class="notranslate"]').text
                item_list.append(article)
            start_id += 25

        print("++++++++++++++++++ %d" % len(item_list))

        for article in item_list:
            self.driver.get(article['url'])
            time.sleep(2)
            try:
                source = self.driver.find_element_by_xpath(
                    '//frame[@title="Results Document Content Frame"]')
                self.driver.get(source.get_attribute('src'))
                time.sleep(2)

                date_str = self.driver.find_element_by_xpath(
                    '//span[@class="verdana"]/center').text
                # NOTE(review): parse_date is not defined in this class as
                # shown; confirm it is provided elsewhere.
                news_date = self.parse_date(date_str)

                news_id = self.driver.find_element_by_xpath(
                    '//input[@name="docIdentifier"]')
                news_id = news_id.get_attribute('value')

                news_content_list = self.driver.find_elements_by_xpath(
                    '//span[@class="verdana"]/p[@class="loose"]')
                news_content = '.'.join([n.text for n in news_content_list])

                # Get keywords
                rake = Rake()
                keywords_list = rake.run(news_content)
                keywords = '\n'.join(keywords_list)
                tag = rake.get_tagged_text()

                article['aid'] = news_id
                article['date'] = news_date
                article['contents'] = news_content
                article['keywords'] = keywords
                article['tagged_text'] = tag
            except Exception:
                print('ERROR!!!!!!!!!!!!! URL : %s' % article['url'])
                # print_exc() writes the traceback itself and returns None,
                # so its result must not be printed.
                traceback.print_exc(file=sys.stdout)

            yield article
def __init__(self):
    """Start a hidden 1120x600 virtual display and a Chrome webdriver.

    The display handle is stored on ``self.display`` so the owner can call
    ``self.display.stop()`` when finished. If Chrome cannot be started, the
    display is stopped again before the error propagates.
    """
    self.display = Display(visible=0, size=(1120, 600))
    self.display.start()
    try:
        self.driver = webdriver.Chrome()
    except Exception:
        # Do not leave the display server running when no browser uses it.
        self.display.stop()
        raise
    self.url = 'https://edit.yahoo.com/forgot?stage=fe100'
def display(request):
    """Start a hidden 1920x1080 virtual display.

    ``request`` is accepted but not used by the visible code, and nothing is
    returned.

    NOTE(review): the previous docstring described a logged-in user session
    (the address was obfuscated); no login happens in this function.
    """
    from pyvirtualdisplay import Display

    screen_size = (1920, 1080)
    virtual_display = Display(visible=0, size=screen_size)
    virtual_display.start()
def selenium_browser(self):
    """Return a Chrome webdriver configured with ``self.chrome_options``.

    When the ``BROWSER`` environment variable equals ``"HEADLESS"``, a hidden
    800x600 virtual display is started first.
    """
    headless_requested = os.getenv('BROWSER') == "HEADLESS"
    if headless_requested:
        virtual_display = Display(visible=0, size=(800, 600))
        virtual_display.start()
    return webdriver.Chrome(chrome_options=self.chrome_options)
def get_urls(query, url, verbose=False, warning=True, user_agent=None, proxy=None, **kwargs):
    """
    Bypass Google captchas and Google API by using selenium-webdriver to gather
    the Google URL. This will open a robot controlled browser window and attempt
    to get a URL from Google that will be used for scraping afterwards.

    Only downside to this method is that your IP and user agent will be visible
    until the application pulls the URL.

    Args:
        query: text typed into the search box (the element named ``q``).
        url: address of the search engine page to open.
        verbose: emit additional debug log messages.
        warning: log a notice that the browser is being automated.
        user_agent: value for Firefox's ``general.useragent.override``.
        proxy: optional one-entry mapping; its first value is used as the
            HTTP, FTP and SSL proxy.
        **kwargs: accepted for call compatibility; not used here.

    Returns:
        The browser's current URL three seconds after the query is submitted.

    The browser and the virtual display are shut down on every exit path,
    including exceptions.
    """
    if verbose:
        logger.debug(set_color(
            "setting up the virtual display to hide the browser...", level=10
        ))
    ff_display = Display(visible=0, size=(800, 600))
    ff_display.start()
    browser = None
    try:
        logger.info(set_color(
            "firefox browser display will be hidden while it performs the query..."
        ))
        if warning:
            logger.warning(set_color(
                "your web browser will be automated in order for Zeus to successfully "
                "bypass captchas and API calls. this is done in order to grab the URL "
                "from the search and parse the results. please give selenium time to "
                "finish it's task...", level=30
            ))
        if verbose:
            logger.debug(set_color(
                "running selenium-webdriver and launching browser...", level=10
            ))
        if verbose:
            logger.debug(set_color(
                "adjusting selenium-webdriver user-agent to '{}'...".format(user_agent), level=10
            ))
        if proxy is not None:
            # list() keeps the indexing below valid on Python 3, where
            # dict.keys() returns a non-subscriptable view.
            proxy_type = list(proxy.keys())
            proxy_address = proxy[proxy_type[0]]
            proxy_to_use = Proxy({
                "proxyType": ProxyType.MANUAL,
                "httpProxy": proxy_address,
                "ftpProxy": proxy_address,
                "sslProxy": proxy_address,
                "noProxy": ""
            })
            if verbose:
                logger.debug(set_color(
                    "setting selenium proxy to '{}'...".format(
                        ''.join(proxy_type) + "://" + ''.join(proxy.values())
                    ), level=10
                ))
        else:
            proxy_to_use = None

        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", user_agent)
        browser = webdriver.Firefox(profile, proxy=proxy_to_use)
        logger.info(set_color("browser will open shortly..."))
        browser.get(url)
        if verbose:
            logger.debug(set_color(
                "searching search engine for the 'q' element (search button)...", level=10
            ))
        search = browser.find_element_by_name('q')
        logger.info(set_color(
            "searching '{}' using query '{}'...".format(url, query)
        ))
        search.send_keys(query)
        search.send_keys(Keys.RETURN)  # hit return after you enter search text
        time.sleep(3)
        if verbose:
            logger.debug(set_color(
                "obtaining URL from selenium..."
            ))
        retval = browser.current_url
        if verbose:
            logger.debug(set_color(
                "found current URL from selenium browser '{}'...".format(retval), level=10
            ))
        logger.info(set_color(
            "closing the browser and continuing process.."
        ))
        return retval
    finally:
        # Always release the browser and the display, even when a step above
        # raised; quit() ends the whole driver session, not just the window.
        try:
            if browser is not None:
                browser.quit()
        finally:
            ff_display.stop()
def AdjustResolution():
    """Start a hidden 800x800 virtual display and return its handle.

    Returns:
        The started ``Display``. Earlier versions returned nothing, which
        left callers no way to stop the display; callers that ignore the
        return value are unaffected.
    """
    display = Display(visible=0, size=(800, 800))
    display.start()
    return display
class PinterestImages():
    """Log in to Pinterest in a hidden Chrome and dump image search results.

    Call ``crawl(qry)`` to write the result list to disk and ``stop()`` to
    release the browser and the virtual display.
    """

    def __init__(self):
        """Start the virtual display, launch Chrome and submit the login form."""
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.srchurl = 'https://in.pinterest.com/search/pins/?q=%s'
        self.base_url = self.srchurl
        self.path_to_chromedriver = './chromedriver'
        # Only one browser is created. The previous code first built
        # Chrome(executable_path=...) and immediately overwrote it with
        # Chrome(), leaking the first instance. The instance that was
        # actually used (driver resolved from PATH) is the one kept.
        self.browser = webdriver.Chrome()
        self.browser.get('https://in.pinterest.com/login/')
        # NOTE(review): credentials are hard-coded in source; consider
        # loading them from configuration or the environment.
        self.elem = self.browser.find_elements_by_name("username_or_email")
        self.elem[0].send_keys("*****@*****.**")
        self.elem = self.browser.find_elements_by_name("password")
        self.elem[0].send_keys("qawsedrf")
        self.elem = self.browser.find_elements_by_xpath(
            "/html/body/div[1]/div[1]/div[1]/div/div/div/form/div[4]/div/button"
        )
        self.elem[0].click()
        self.buton = '//*[@id="yui_3_5_1_1_1440135195051_1805"]'

    def crawl(self, qry):
        """Search for ``qry`` and write "<url>\\t<title>" lines to a file.

        Args:
            qry: iterable of search words; they are joined with ``+``.

        The output file is named from the command-line arguments joined by
        ``_`` plus the suffix ``_Pinterest``. Nothing is returned.
        """
        def noImages(psource):
            """Count pin image containers in ``psource`` (0 for ``None``)."""
            if psource is None:
                return 0
            soup = BeautifulSoup(psource, 'lxml')
            return len(soup.findAll('div', 'Image Module pinUiImage'))

        url = self.base_url % ('+'.join(qry))
        self.browser.get(url)
        time.sleep(1)

        pps = None  # page source kept from the previous scroll step
        cps = None  # page source of the current scroll step
        for i in range(1, 20):
            self.browser.execute_script("window.scrollTo(0, %d);" % (i * 10000))
            time.sleep(10)
            cps = self.browser.page_source
            # NOTE(review): stops only when the count shrinks; if the intent
            # was "stop once no new images load" this should be `<=`.
            if noImages(cps) < noImages(pps):
                break
            pps = cps

        pagesource = pps
        soup = BeautifulSoup(pagesource, 'lxml')
        imgs = soup.findAll('div', 'Image Module pinUiImage')
        extractedUrls = []
        for img in imgs:
            imgd = img.findAll('img')
            if not imgd:
                # Container without an <img>: nothing to record.
                continue
            url = imgd[0]['src']
            # Drop non-ASCII characters, then decode back to text so the
            # concatenation below never mixes bytes with str.
            title = imgd[0]['alt'].encode('ascii', 'ignore').decode('ascii')
            extractedUrls.append(url.replace('236x', '736x') + '\t' + title)

        with open('_'.join(sys.argv[1:]) + '_Pinterest', 'w') as outfile:
            for x in extractedUrls:
                outfile.write(x + '\n')

    def stop(self):
        """Quit the browser and stop the virtual display."""
        try:
            self.browser.quit()
        finally:
            # Stop the display even if the browser refuses to quit.
            self.display.stop()
class Scrape:
    """Scrape the IEX TOPS quotes table with Selenium and append it to a CSV.

    A Chrome driver is created per platform; on Linux a hidden virtual
    display is started first. Call ``tear_down()`` to release both.
    """

    def __init__(self):
        """Create the webdriver (and, on Linux, the virtual display)."""
        self.site_url = 'https://www.iextrading.com/apps/tops/'
        self.csv_file = None
        # Only the Linux branch starts a display; default the handle so that
        # tear_down() can test it on every platform.
        self.display = None
        if platform == 'darwin':  # OSX
            self.driver = webdriver.Chrome()
        elif platform in ('linux', 'linux2'):
            # headless
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()
            self.driver = webdriver.Chrome()
        else:
            # windows
            self.driver = webdriver.Chrome('chromedriver.exe')

    def scrape_soup(self, r):
        """Load the site, wait for the quotes table and write it to CSV.

        Args:
            r: report value to select in the ``lists`` drop-down; a falsy
                value keeps the default report, labelled ``top`` in the
                file name.

        The table is polled up to 30 times, one second apart. If it still
        shows placeholders it is scraped anyway. The CSV file name is
        ``<YYYYmmddHHMM>_<report>.csv``.
        """
        self.driver.get(self.site_url)
        if r:
            self._select_report(r)
            report_name = r
        else:
            report_name = 'top'
        self.csv_file = "{}_{}.csv".format(
            time.strftime("%Y%m%d%H%M", time.localtime()), report_name)

        for _ in range(30):
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            table = soup.find("table", {"id": "quotesTable"})
            table_ = self._scrape_table(table)
            if table_:
                logger.info('Table scraped! ')
                break
            time.sleep(1)
        else:
            logger.info(
                'Is the site live? Found placeholders on page. Scrape either way'
            )
            table_ = self._scrape_table(table, True)

        self._write_row([
            'No', 'Ticker', 'Mkt %', 'Shares', 'Bid Quantity', 'Bid Price',
            'Ask Price', 'Ask Quantity', 'Last Sale Price',
            'Last Sale Quantity'
        ])
        # Skip the first scraped row (the page's own header). Slicing also
        # copes with an empty or missing table, where pop(0) would raise.
        for wr in (table_[1:] if table_ else []):
            self._write_row(wr)

    @staticmethod
    def _scrape_table(table, force=False):
        """Convert the quotes ``<table>`` into a list of row lists.

        Args:
            table: BeautifulSoup table element, or ``None`` when the page
                has no such table yet.
            force: scrape even if cells still contain ``-`` placeholders.

        Returns:
            A list with one list of cell strings per ``<tr>``, or ``False``
            when the table is missing or (without ``force``) not ready.
        """
        if table is None:
            logger.info('Table not ready for scraping')
            return False
        rows = table.findAll("tr")
        tbl_data = []
        for row in rows:
            tds_ = row.find_all("td")
            row_data = []
            for ctr, td in enumerate(tds_):
                text = td.text.strip()
                # A dash that is not part of a '--:--:' time stamp marks a
                # cell that has not been filled in yet.
                if text.count('-') and not text.count('--:--:'):
                    if not force:
                        logger.info('Table not ready for scraping')
                        return False
                # drop time: columns 1 and 6 lose their last 8 characters
                d = text[:-8] if ctr in [1, 6] else text
                # split "a x b" cells (multiplication sign) into two values
                if d.count(u'\xd7'):
                    dsplit = d.split(u'\xd7')
                    row_data.append(dsplit[0].strip())
                    row_data.append(dsplit[1].strip())
                elif d.count('%'):
                    row_data.append(d.rstrip('%'))
                else:
                    row_data.append(d)
            tbl_data.append(row_data)
            logger.info('Scraped row data: {}'.format(row_data))
        return tbl_data

    def _write_row(self, row):
        """Append ``row`` to ``self.csv_file`` as one CSV record."""
        # NOTE(review): binary append mode matches the Python 2 csv module;
        # Python 3's csv.writer expects a text-mode file -- confirm target.
        with open(self.csv_file, 'ab') as hlr:
            wrt = csv.writer(hlr,
                             delimiter=',',
                             quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)
            wrt.writerow(row)
            logger.info('Add row: {}'.format(row))

    def tear_down(self):
        """Quit the webdriver and stop the virtual display if one was started."""
        try:
            if self.driver:
                self.driver.quit()
        finally:
            if self.display is not None:
                self.display.stop()
                self.display = None

    def stdout_options(self):
        """Log and return the values offered by the ``lists`` drop-down.

        Returns:
            List of option values, or ``None`` if the element did not appear
            within 10 seconds.
        """
        self.driver.get(self.site_url)
        try:
            o = []
            op_el = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "lists")))
            select = Select(op_el)
            opts_ = select.options
            for opt_ in opts_:
                option = opt_.get_attribute('value')
                logger.info('report: {}'.format(option))
                o.append(option)
            return o
        except TimeoutException:
            logger.error(
                'Is the page live? Timed out on the reports select element')

    def _select_report(self, value):
        """Select ``value`` in the ``lists`` drop-down; log an error on timeout."""
        try:
            op_el = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "lists")))
            select = Select(op_el)
            select.select_by_value(value)
        except TimeoutException:
            logger.error(
                'Is the page live? Timed out on the reports select element')
class BaseCase(unittest.TestCase):
    '''
    A base test case that wraps methods for enhanced usage.
    You can also add your own methods here.

    Most methods are thin wrappers around ``page_actions`` helpers that pass
    ``self.driver`` along, followed by an optional demo-mode pause.
    '''

    def __init__(self, *args, **kwargs):
        super(BaseCase, self).__init__(*args, **kwargs)
        try:
            self.driver = WebDriver()
        except Exception:
            # Deliberate best effort: under pytest, setUp() assigns the real
            # driver later.
            pass
        self.environment = None

    def open(self, url):
        """Navigate to ``url``, optionally waiting for the page to be ready."""
        self.driver.get(url)
        if settings.WAIT_FOR_RSC_ON_PAGE_LOADS:
            self.wait_for_ready_state_complete()
        self._demo_mode_pause_if_active()

    def open_url(self, url):
        """ In case people are mixing up self.open() with open(),
            use this alternative. """
        self.open(url)

    def click(self, selector, by=By.CSS_SELECTOR,
              timeout=settings.SMALL_TIMEOUT):
        """Wait until the element is visible, then click it."""
        element = page_actions.wait_for_element_visible(
            self.driver, selector, by, timeout=timeout)
        self._demo_mode_scroll_if_active(selector, by)
        element.click()
        if settings.WAIT_FOR_RSC_ON_CLICKS:
            self.wait_for_ready_state_complete()
        self._demo_mode_pause_if_active()

    def click_chain(self, selectors_list, by=By.CSS_SELECTOR,
                    timeout=settings.SMALL_TIMEOUT, spacing=0):
        """ This method clicks on a list of elements in succession.
            'spacing' is the amount of time to wait between clicks. (sec) """
        for selector in selectors_list:
            self.click(selector, by=by, timeout=timeout)
            if spacing > 0:
                time.sleep(spacing)

    def click_link_text(self, link_text, timeout=settings.SMALL_TIMEOUT):
        """Wait until the link with ``link_text`` is visible, then click it."""
        element = self.wait_for_link_text_visible(link_text, timeout=timeout)
        element.click()
        if settings.WAIT_FOR_RSC_ON_CLICKS:
            self.wait_for_ready_state_complete()
        self._demo_mode_pause_if_active()

    def add_text(self, selector, new_value, timeout=settings.SMALL_TIMEOUT):
        """ The more-reliable version of driver.send_keys()
            Similar to update_text(), but won't clear the text field first. """
        element = self.wait_for_element_visible(selector, timeout=timeout)
        element.send_keys(new_value)
        self._demo_mode_pause_if_active()

    def send_keys(self, selector, new_value, timeout=settings.SMALL_TIMEOUT):
        """ Same as add_text() -> more reliable, but less name confusion. """
        self.add_text(selector, new_value, timeout=timeout)

    def update_text_value(self, selector, new_value,
                          timeout=settings.SMALL_TIMEOUT, retry=False):
        """ This method updates an element's text value with a new value.
            @Params
            selector - the selector with the value to update
            new_value - the new value for setting the text field
            timeout - how long to wait for the selector to be visible
            retry - if True, use jquery if the selenium text update fails
        """
        element = self.wait_for_element_visible(selector, timeout=timeout)
        element.clear()
        self._demo_mode_pause_if_active(tiny=True)
        element.send_keys(new_value)
        # A trailing newline submits the form, so the field value cannot be
        # compared afterwards; skip the jQuery fallback in that case.
        if (retry and element.get_attribute('value') != new_value and (
                not new_value.endswith('\n'))):
            logging.debug('update_text_value is falling back to jQuery!')
            selector = self.jq_format(selector)
            self.set_value(selector, new_value)
        self._demo_mode_pause_if_active()

    def update_text(self, selector, new_value,
                    timeout=settings.SMALL_TIMEOUT, retry=False):
        """ The shorter version of update_text_value(), which
            clears existing text and adds new text into the text field.
            We want to keep the old version for backward compatibility. """
        self.update_text_value(selector, new_value,
                               timeout=timeout, retry=retry)

    def is_element_present(self, selector, by=By.CSS_SELECTOR):
        return page_actions.is_element_present(self.driver, selector, by)

    def is_element_visible(self, selector, by=By.CSS_SELECTOR):
        return page_actions.is_element_visible(self.driver, selector, by)

    def is_link_text_visible(self, link_text):
        return page_actions.is_element_visible(self.driver, link_text,
                                               by=By.LINK_TEXT)

    def is_text_visible(self, text, selector, by=By.CSS_SELECTOR):
        return page_actions.is_text_visible(self.driver, text, selector, by)

    def find_visible_elements(self, selector, by=By.CSS_SELECTOR):
        return page_actions.find_visible_elements(self.driver, selector, by)

    def execute_script(self, script):
        return self.driver.execute_script(script)

    def set_window_size(self, width, height):
        """Resize the browser window, then apply the demo-mode pause."""
        # The pause used to sit after ``return`` and was never reached.
        result = self.driver.set_window_size(width, height)
        self._demo_mode_pause_if_active()
        return result

    def maximize_window(self):
        """Maximize the browser window, then apply the demo-mode pause."""
        # The pause used to sit after ``return`` and was never reached.
        result = self.driver.maximize_window()
        self._demo_mode_pause_if_active()
        return result

    def activate_jquery(self):
        """ If "jQuery is not defined", use this method to activate it for use.
            This happens because jQuery is not always defined on web sites. """
        try:
            # Let's first find out if jQuery is already defined.
            self.driver.execute_script("jQuery('html')")
            # Since that command worked, jQuery is defined. Let's return.
            return
        except Exception:
            # jQuery is not currently defined. Let's proceed by defining it.
            pass
        self.driver.execute_script(
            '''var script = document.createElement("script"); '''
            '''script.src = "https://ajax.googleapis.com/ajax/libs/jquery/1/'''
            '''jquery.min.js"; document.getElementsByTagName("head")[0]'''
            '''.appendChild(script);''')
        for x in xrange(30):
            # jQuery needs a small amount of time to activate. (At most 3s)
            try:
                self.driver.execute_script("jQuery('html')")
                return
            except Exception:
                time.sleep(0.1)
        # Since jQuery still isn't activating, give up and raise an exception
        raise Exception("Exception: WebDriver could not activate jQuery!")

    def scroll_to(self, selector):
        """Scroll the element matching ``selector`` into view using jQuery."""
        self.wait_for_element_visible(selector, timeout=settings.SMALL_TIMEOUT)
        scroll_script = "jQuery('%s')[0].scrollIntoView()" % selector
        try:
            self.driver.execute_script(scroll_script)
        except Exception:
            # The likely reason this fails is because: "jQuery is not defined"
            self.activate_jquery()  # It's a good thing we can define it here
            self.driver.execute_script(scroll_script)
        self._demo_mode_pause_if_active(tiny=True)

    def scroll_click(self, selector):
        self.scroll_to(selector)
        self.click(selector)

    def jquery_click(self, selector):
        self.scroll_to(selector)
        self.driver.execute_script("jQuery('%s').click()" % selector)
        self._demo_mode_pause_if_active()

    def jq_format(self, code):
        return page_utils.jq_format(code)

    def set_value(self, selector, value):
        """Set the element's value through jQuery's ``.val()``."""
        self.scroll_to(selector)
        # json.dumps produces a properly quoted JavaScript literal.
        val = json.dumps(value)
        self.driver.execute_script("jQuery('%s').val(%s)" % (selector, val))
        self._demo_mode_pause_if_active()

    def jquery_update_text_value(self, selector, new_value,
                                 timeout=settings.SMALL_TIMEOUT):
        """Set a text field through jQuery; send a newline if one trails."""
        element = self.wait_for_element_visible(selector, timeout=timeout)
        self.scroll_to(selector)
        self.driver.execute_script("""jQuery('%s').val('%s')"""
                                   % (selector, self.jq_format(new_value)))
        if new_value.endswith('\n'):
            element.send_keys('\n')
        self._demo_mode_pause_if_active()

    def jquery_update_text(self, selector, new_value,
                           timeout=settings.SMALL_TIMEOUT):
        self.jquery_update_text_value(selector, new_value, timeout=timeout)

    def hover_on_element(self, selector):
        self.wait_for_element_visible(selector, timeout=settings.SMALL_TIMEOUT)
        self.scroll_to(selector)
        time.sleep(0.05)  # Settle down from scrolling before hovering
        return page_actions.hover_on_element(self.driver, selector)

    def hover_and_click(self, hover_selector, click_selector,
                        click_by=By.CSS_SELECTOR,
                        timeout=settings.SMALL_TIMEOUT):
        self.wait_for_element_visible(hover_selector, timeout=timeout)
        self.scroll_to(hover_selector)
        # Settle down from the scrolling before hovering
        element = page_actions.hover_and_click(
            self.driver, hover_selector, click_selector, click_by, timeout)
        self._demo_mode_pause_if_active()
        return element

    def wait_for_element_present(self, selector, by=By.CSS_SELECTOR,
                                 timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_element_present(
            self.driver, selector, by, timeout)

    def wait_for_element_visible(self, selector, by=By.CSS_SELECTOR,
                                 timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_element_visible(
            self.driver, selector, by, timeout)

    def wait_for_text_visible(self, text, selector, by=By.CSS_SELECTOR,
                              timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_text_visible(
            self.driver, text, selector, by, timeout)

    def wait_for_link_text_visible(self, link_text,
                                   timeout=settings.LARGE_TIMEOUT):
        return self.wait_for_element_visible(
            link_text, by=By.LINK_TEXT, timeout=timeout)

    def wait_for_element_absent(self, selector, by=By.CSS_SELECTOR,
                                timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_element_absent(
            self.driver, selector, by, timeout)

    def wait_for_element_not_visible(self, selector, by=By.CSS_SELECTOR,
                                     timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_element_not_visible(
            self.driver, selector, by, timeout)

    def wait_for_ready_state_complete(self, timeout=settings.EXTREME_TIMEOUT):
        return page_actions.wait_for_ready_state_complete(self.driver, timeout)

    def wait_for_and_accept_alert(self, timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_and_accept_alert(self.driver, timeout)

    def wait_for_and_dismiss_alert(self, timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_and_dismiss_alert(self.driver, timeout)

    def wait_for_and_switch_to_alert(self, timeout=settings.LARGE_TIMEOUT):
        return page_actions.wait_for_and_switch_to_alert(self.driver, timeout)

    def save_screenshot(self, name, folder=None):
        return page_actions.save_screenshot(self.driver, name, folder)

    def _demo_mode_pause_if_active(self, tiny=False):
        """Sleep in demo mode: the full wait, or a third of it when ``tiny``."""
        if self.demo_mode:
            if self.demo_sleep:
                wait_time = float(self.demo_sleep)
            else:
                wait_time = settings.DEFAULT_DEMO_MODE_TIMEOUT
            if not tiny:
                time.sleep(wait_time)
            else:
                time.sleep(wait_time/3.0)

    def _demo_mode_scroll_if_active(self, selector, by):
        """In demo mode, scroll to the element (CSS selectors only)."""
        if self.demo_mode:
            if by == By.CSS_SELECTOR:
                self.scroll_to(selector)

    # PyTest-Specific Code #

    def setUp(self):
        """
        pytest-specific code
        Be careful if a subclass of BaseCase overrides setUp()
        You'll need to add the following line to the subclass setUp() method:
        super(SubClassOfBaseCase, self).setUp()
        """
        self.is_pytest = None
        try:
            # This raises an exception if the test is not coming from pytest
            self.is_pytest = pytest.config.option.is_pytest
        except Exception:
            # Not using pytest (probably nosetests)
            self.is_pytest = False
        if self.is_pytest:
            self.with_selenium = pytest.config.option.with_selenium
            self.headless = pytest.config.option.headless
            self.headless_active = False
            self.with_testing_base = pytest.config.option.with_testing_base
            self.log_path = pytest.config.option.log_path
            self.browser = pytest.config.option.browser
            self.data = pytest.config.option.data
            self.demo_mode = pytest.config.option.demo_mode
            self.demo_sleep = pytest.config.option.demo_sleep
            if self.headless:
                self.display = Display(visible=0, size=(1200, 800))
                self.display.start()
                self.headless_active = True
            if self.with_selenium:
                self.driver = browser_launcher.get_driver(self.browser)

    def tearDown(self):
        """
        pytest-specific code
        Be careful if a subclass of BaseCase overrides tearDown()
        You'll need to add the following line to the subclass's tearDown():
        super(SubClassOfBaseCase, self).tearDown()
        """
        if self.is_pytest:
            try:
                if self.with_selenium:
                    try:
                        # Save a screenshot if logging is on when an
                        # exception occurs
                        if self.with_testing_base and (
                                sys.exc_info()[1] is not None):
                            test_id = "%s.%s.%s" % (
                                self.__class__.__module__,
                                self.__class__.__name__,
                                self._testMethodName)
                            test_logpath = self.log_path + "/" + test_id
                            if not os.path.exists(test_logpath):
                                os.makedirs(test_logpath)
                            # Handle screenshot logging
                            log_helper.log_screenshot(
                                test_logpath, self.driver)
                            # Handle basic test info logging
                            log_helper.log_test_failure_data(
                                test_logpath, self.driver, self.browser)
                            # Handle page source logging
                            log_helper.log_page_source(
                                test_logpath, self.driver)
                    finally:
                        # Finally close the browser, even if failure logging
                        # itself raised.
                        self.driver.quit()
            finally:
                # Stop the virtual display even if the browser failed to
                # quit, so no display server is left running.
                if self.headless:
                    if self.headless_active:
                        self.display.stop()
"""pecli command line interface""" import sys # import datetime from getpass import getpass from tabulate import tabulate import click import inquirer import botcore if sys.platform == "linux" or sys.platform == "linux2": # headless executable on Ubuntu from pyvirtualdisplay import Display DISPLAY = Display(visible=0, size=(800, 600)) DISPLAY.start() elif sys.platform == "darwin": pass elif sys.platform == "win32": raise "I don't give a shit to Windows system." @click.group() def cli(): """pecli command line interface""" pass @cli.command() def register(): """Register class"""
def start_pyvirtualdisplay(rows=1400, columns=900):
    """Start a hidden virtual display and return its handle.

    Args:
        rows: first element of the ``size`` tuple passed to ``Display``.
        columns: second element of the ``size`` tuple passed to ``Display``.

    NOTE(review): pyvirtualdisplay documents ``size`` as (width, height), so
    despite the names ``rows`` acts as the width and ``columns`` as the
    height -- confirm. The names are kept for keyword-argument callers.

    Returns:
        The started ``Display``. Earlier versions returned nothing, which
        left callers no way to stop it; callers that ignore the return value
        are unaffected.
    """
    from pyvirtualdisplay import Display

    display = Display(visible=0, size=(rows, columns))
    display.start()
    return display
def parse(self, response):
    """Take a screenshot of the page and yield it as a ``ScreenshotItem``.

    Steps:
      1. Create a temporary ``.png`` file for the screenshot.
      2. Unless ``DEBUG_MODE`` is set, start a hidden virtual display.
      3. Optionally fetch ``self.product_url`` with ``requests``; if the
         status is not 200, yield an empty item and stop.
      4. Take the screenshot with the driver from ``self.init_driver()``;
         on any failure retry once with the driver named by
         ``self._choose_another_driver()``.
      5. Optionally crop the image, read it back and base64-encode it.

    The driver and the virtual display are released on every exit path,
    including exceptions.

    Yields:
        ScreenshotItem: an empty item for a non-200 response, otherwise the
        filled item (only when image bytes were read).
    """
    # NOTE(review): int(self.timeout) raises when timeout is None, although
    # the check further down treats a falsy timeout as valid -- confirm.
    socket.setdefaulttimeout(int(self.timeout))

    # temporary file for the output image
    t_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
    t_file.close()
    print('Created temporary image file: %s' % t_file.name)
    self.log('Created temporary image file: %s' % t_file.name)

    display = None
    if not DEBUG_MODE:
        display = Display(visible=int(bool(DEBUG_MODE)),
                          size=(self.width, self.height))
        display.start()

    try:
        # we will use requesocks for checking response code
        r_session = requests.session()
        if self.timeout:
            self.timeout = int(self.timeout)
            r_session.timeout = self.timeout
        # Proxies activated again because of walmart bans
        if self.proxy:
            proxy_url = "{}://{}".format(self.proxy_type, self.proxy)
            r_session.proxies = {"http": proxy_url, "https": proxy_url}
        if self.user_agent:
            r_session.headers = {'User-Agent': self.user_agent}

        # check if the page returns code != 200
        if self.code_200_required and str(
                self.code_200_required).lower() not in ('0', 'false', 'off'):
            page_code = r_session.get(self.product_url,
                                      verify=False).status_code
            if page_code != 200:
                self.log(
                    'Page returned code %s at %s' %
                    (page_code, self.product_url), ERROR)
                yield ScreenshotItem()  # return empty item
                return

        driver = self.init_driver()
        try:
            item = ScreenshotItem()
            if self.proxy:
                ip_via_proxy = URL2ScreenshotSpider._get_proxy_ip(driver)
                item['via_proxy'] = ip_via_proxy
                print('IP via proxy: %s' % ip_via_proxy)
                self.log('IP via proxy: %s' % ip_via_proxy)
            try:
                self.prepare_driver(driver)
                self.make_screenshot(driver, t_file.name)
                self.log('Screenshot was made for file %s' % t_file.name)
            except Exception as e:
                self.log(
                    'Exception while getting response using selenium! %s' %
                    str(e))
                # lets try with another driver
                another_driver_name = self._choose_another_driver()
                try:
                    if not DEBUG_MODE:
                        driver.quit()  # clean RAM
                except Exception:
                    pass
                driver = self.init_driver(name=another_driver_name)
                self.prepare_driver(driver)
                self.make_screenshot(driver, t_file.name)
                self.log('Screenshot was made for file %s (2nd attempt)' %
                         t_file.name)
        finally:
            # Release the driver even when the second attempt fails too.
            try:
                if not DEBUG_MODE:
                    driver.quit()
            except Exception:
                pass

        # crop the image if needed
        if self.crop_width and self.crop_height:
            self.crop_width = int(self.crop_width)
            self.crop_height = int(self.crop_height)
            from PIL import Image
            # size is width/height
            img = Image.open(t_file.name)
            box = (self.crop_left, self.crop_top,
                   self.crop_left + self.crop_width,
                   self.crop_top + self.crop_height)
            area = img.crop(box)
            area.save(t_file.name, 'png')
            self.log('Screenshot was cropped and saved to %s' % t_file.name)
            if self.image_copy:  # save a copy of the file if needed
                area.save(self.image_copy, 'png')

        with open(t_file.name, 'rb') as fh:
            img_content = fh.read()
        self.log('Screenshot content was read, size: %s bytes' %
                 len(img_content))

        if self.remove_img is True:
            os.unlink(t_file.name)  # remove old output file
            self.log('Screenshot file was removed: %s' % t_file.name)

        # fill the item
        item['url'] = response.url
        item['image'] = base64.b64encode(img_content)
        item['site_settings'] = getattr(self, '_site_settings_activated_for',
                                        None)
        item['creation_datetime'] = datetime.datetime.utcnow().isoformat()
    finally:
        # Stop the virtual display on success, early return and failure.
        if display is not None:
            display.stop()

    self.log('Item image key length: %s' % len(item.get('image', '')))
    if img_content:
        yield item
def setUp(self):
    """Start a hidden 1366x768 virtual display and a Firefox webdriver.

    The display handle is kept on ``self.display`` so a ``tearDown`` can
    stop it. If Firefox cannot be started, the display is stopped again
    before the error propagates.
    """
    self.display = Display(visible=0, size=(1366, 768))
    self.display.start()
    try:
        self.driver = webdriver.Firefox()
    except Exception:
        # No browser came up, so do not leave the display server running.
        self.display.stop()
        raise
def init_display(visible, size):
    """Create a ``Display`` with the given settings, start it and return it.

    Args:
        visible: forwarded to ``Display`` as ``visible``.
        size: forwarded to ``Display`` as ``size``.

    Returns:
        The started ``Display`` instance.
    """
    virtual_display = Display(size=size, visible=visible)
    virtual_display.start()
    return virtual_display
class TflCrawler(): def __init__(self): ''' Constructor method that instantiate the TflCrawler. ''' self.__site = 'http://cycling.data.tfl.gov.uk/' self.__elements = {} # initialise an empty dictionary self._file_type = 'CSV file' self.__folder_dir = os.path.abspath(os.path.dirname(__file__)) def _start_crawling(self, driver_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'chromedriver'))): ''' Start crawling process, creating and invisible browser display, with 800 by 600 dimension. Additionally, the location of Chrome driver is specified. :param driver_dir: defines the location of Chrome driver. The directory of the driver is specified as a relative path of the user working directory. ''' try: print('start driver...') self._display = Display( visible= 0, size = (800,600)) # create a chrome display with 800*600 dimension self._display.start() # starts the browser self._driver = webdriver.Chrome(driver_dir) # set the location of web driver except Exception as e: print(f'[No driver was identified. Identified files: {os.listdir(driver_dir)}]') def _stop_crawling(self): ''' It closes the browser display that was initialised by the start_crawling method. The driver it also stops. ''' print('closing driver...') self._display.stop() self._driver.quit() def _get_site(self, url): ''' The current method performs a request on http://cycling.data.tfl.gov.uk/ server and gets a response. The content of the response is converted in HTML and is returned by the method. :param url: the url of http://cycling.data.tfl.gov.uk/' ''' try: self._driver.get(url) # navigates to page sleep(5) # stops the code execution so that the HTML content to be loaded (5 to 10 seconds) return self._driver.execute_script('return document.body.innerHTML') # load the HTML content except Exception as e: print(f'[Unable to reach {self.__site}. 
Error : {str(e)}]') def _populate_dictionary(self, html): ''' The HTML structure that has been retrieved by __get_site method is analysed such that a dictionary of all csv files within the website is constructed. The dictionary is populated by each csv file that may be uploaded on TFL website. Each csv file is encapsulated as a dictionary, containing keys such that [name, url, date, size]. :param html: the html structure that is created by the __get_site method ''' try: print('get the content...') soup = BeautifulSoup(html, 'html.parser') # creates a soup object and defines how the HTML will be parsed # finds all tr elements with an attribute of data-level=3 main_content = soup.find_all('tr', attrs= { 'data-level' : '3' }) # iterate over the tr elements for i,item in enumerate(main_content): td = item.find_all('td') # retrieves the td elements within the tr # checks if the type of the 4th td element is CSV if (td[3].string == self._file_type): # Populates the dictionary self.__elements[i] ={ 'name' : td[0].a.string, 'url' : td[0].a['href'], 'date' : td[1].string, 'size' : td[2].string } except Exception as e: print(f'[Unable to parse the content of {self.__site}. Error: {str(e)}]') def parse(self): ''' Performs the entire process to parse the TFL website. In particular, starts the Chrome driver, waits until the site to load the HTML content, and therefore performs a request to the website. Then, the response is parsed, populating a dictionary that maintains all the csv files that might exist on that site :param driver_dir: defines the Google driver relative directory ''' self._start_crawling(os.path.join(self.__folder_dir,'chromedriver')) html = self._get_site(self.__site) self._populate_dictionary(html) self._stop_crawling() def retrieve_csv_files(self, DNS,rel_path): ''' Iterates over the constructed dictionary and retrieves each csv file that is identified. The csv files are saved locally. 
Additionally, the corresponded relations of the DB are created :param path: the relative path, which determines the location that the created csv file would be stored. ''' def populate_stations_pairs_relation(df): def insert(l): if len(l) > 1: # adds a colon at the end of the statement l[-1] = no_space_join([l[-1][:-1], ';']) # joins the insert statements statement = no_space_join(l) # insert the query execute(statement) conn.commit() # Drops duplicate routes, that have a start-end station which already exists dfrout= df[['StartStation Id','EndStation Id']].drop_duplicates() # drop OD routes that started and ended at the same station dfrout = dfrout.drop(dfrout[(dfrout['StartStation Id'] == dfrout['EndStation Id'])].index) # Variables to avoid overheading execute = cur.execute fetchall = cur.fetchall # corresponds to the stations that already exists in the DB AND have a location execute('SELECT station_id,st_asText(location) FROM webapp_stations WHERE location IS NOT NULL') # gets the stations that have a location stations = dict([(station[0], station[1].replace('MULTIPOINT', '')) for station in fetchall()]) # stations that in do not have a location in the database, are removed from the data frame sids = [s for s in stations.keys()] dfrout = dfrout[dfrout['StartStation Id'].isin(sids) == dfrout['EndStation Id'].isin(sids)] # requests the pairs of stations that exist in the database execute('SELECT start_station_id,end_station_id FROM webapp_stations_pairs_routes') pairs_dict = dict([(pair,pair) for pair in fetchall()]) # Variables that will used to construct the request url #plan = '&plan=' #plan_options = ['fastest','balanced','quietest'] plan = '&plan=balanced' default_url = 'https://www.cyclestreets.net/api/journey.json?key=112d0fc4c69f3951&itinerarypoints=' nPairs = dfrout.shape[0] try: # Variables out of the for loop #l = ['INSERT INTO 
webapp_stations_pairs_routes(start_station_id,end_station_id,fastest_ref_dist,fastest_ref_time,fastest_ref_geom,balanced_ref_dist,balanced_ref_time,balanced_ref_geom,quietest_ref_dist,quietest_ref_time,quietest_ref_geom) VALUES '] l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,balanced_ref_dist,balanced_ref_time,balanced_ref_geom) VALUES '] comma_join = ','.join no_space_join = ''.join pipe_join = '|'.join for i_pair,pair in enumerate(dfrout.itertuples()): # every 100 requests, stop the execution for 10 seconds (request policy) if i_pair % 1000 == 0 and i_pair > 0: sleep(5) print(f'Pair : {i_pair+1} of {nPairs}') start_station_id = int(pair[1]) end_station_id = int(pair[2]) # checks for OD pairs that do not exist in the DP (if the ) if (start_station_id,end_station_id) not in pairs_dict: try: start_coords = stations[start_station_id][1:-1].replace(' ',',') end_coords = stations[end_station_id][1:-1].replace(' ',',') #time,distance,coords = [],[],[] #atime = time.append #adistance = distance.append #acoords = coords.append #for option in plan_options: # request the link from www.cyclestreet.com response = requests.get(no_space_join([default_url, pipe_join([start_coords,end_coords]), plan])).json()['marker'][0]['@attributes'] # loads the json file into a python object(dictionary) time = response['time'] distance = response['length'] coords = f"st_GeomFromText('LINESTRING({response['coordinates'].replace(' ','?').replace(',',' ').replace('?',',')})',4326)" #response_json = loads(response)['marker'][0]['@attributes'] #atime(response['time']) #adistance(response['length']) #acoords(f"st_GeomFromText('LINESTRING({response['coordinates'].replace(' ','?').replace(',',' ').replace('?',',')})',4326)") except (KeyError,AttributeError): continue # creates a statement of the current pair #statement = 
no_space_join(['(',comma_join([str(start_station_id),str(end_station_id),distance[0],time[0],coords[0],distance[1],time[1],coords[1],distance[2],time[2],coords[2]]),'),']) statement = no_space_join(['(',comma_join([str(start_station_id),str(end_station_id),distance,time,coords]),'),']) l.append(statement) if i_pair % 100 == 0: insert(l) l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,balanced_ref_dist,balanced_ref_time,balanced_ref_geom) VALUES '] #l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,fastest_ref_dist,fastest_ref_time,fastest_ref_geom,balanced_ref_dist,balanced_ref_time,balanced_ref_geom,quietest_ref_dist,quietest_ref_time,quietest_ref_geom) VALUES '] except Exception as e: print('Error while data of webapp_stations_ref_routes were requested...') try: insert(l) return stations except: print('Error while the INSERT statement was executed for the webapp_stations_ref_routes relation') def insert_values_db(values, table_attributes,relation,null_stations): # Local Variables statement = [f"INSERT INTO {table_attributes} VALUES "] append = statement.append # its assign so that we avoid the overheating inside the loop replace = str.replace # its assign so that we avoid overheating inside the loop n = values.shape[0]-1 # number of observations # If the relation that is examined is the stations, receive the spatial location of each station if relation == 'webapp_stations': # stations_location =[(randint(0,89) + random() ,randint(0,89) + random()) for e in range(values.shape[0])] try: stations_location, null_stations = get_station_location(driver_dir= os.path.join(self.__folder_dir,'chromedriver'), url ='https://api.tfl.gov.uk/swagger/ui/index.html?url=/swagger/docs/v1#!/BikePoint/BikePoint_Search' , stations = values['StartStation Name'].values.tolist(), null_stations = null_stations) except Exception as e: print('Error - line 228') elif relation == 'webapp_routes': stations = 
populate_stations_pairs_relation(values) # returns a dictionary, with all the stations that have a location cur.execute('SELECT id,start_station_id,end_station_id FROM webapp_stations_pairs_routes') pairs = dict([((pair[1],pair[2]),pair[0]) for pair in cur.fetchall()]) # Iterate over each observation and create the corresponded INSERT statement for irow, row in enumerate(values.itertuples()): pk = row[1] # assign the value of pk to a local variable try: if relation == 'webapp_bikes': append(replace(f"({pk}),", "\\'", "''")) elif relation == 'webapp_stations': try: append(replace(f"({pk},'{row[2]}', ST_GeomFromText('MULTIPOINT({stations_location[irow][0]} {stations_location[irow][1]})',4326)),", "\\'", "''")) except: continue elif relation == 'webapp_routes': # get only the routes that i) do not have the same starting and ending station and i) have a start or end station that contains a location in the db if (row[6] != row[7]) and (row[6] in stations) and (row[7] in stations) : pair_id = pairs[(row[6],row[7])] append(replace(f"({pk},'{row[2]}','{row[3]}',{abs(row[4])},{row[5]},{pair_id}),", "\\'", "''")) except (ValueError,KeyError): continue # Constructs the INSERT statement if len(statement) > 1: statement[-1] = ''.join([statement[-1][:-1] + ';']) statement = ''.join(statement) # INSERT the new values into the database sql_execute(statement) conn.commit() # commit the transaction if relation =='webapp_stations': return null_stations def populate_relation(df, df_main_all_names, relation, pk , table_attributes, null_stations): # Local variables def process_df(df, df_main_all_names,relation): # in order to avoid error in subsequent procedures, we need to receive the Id of the starting and ending stations if relation == 'webapp_stations': start_stations_df = df[df_main_all_names[1]].dropna() scol = start_stations_df.columns end_stations_df = df[['EndStation Id','EndStation Name']].dropna() end_stations_df.columns = [scol[0],scol[1]] ndf = 
pd.concat([start_stations_df,end_stations_df], axis= 0).drop_duplicates([df_main_all_names[0]]) else: # drops the duplicates from the primary key for the webapp_routes and webapp_bikes relation ndf = dataframe(df[df_main_all_names[1]]).drop_duplicates([df_main_all_names[0]]).dropna() return ndf new_values = [] append = new_values.append dataframe = pd.DataFrame # Retrieves the csv sub-dataframe that defines a relation try: ndf = process_df(df,df_main_all_names,relation) except (TypeError,IndexError,KeyError): df.columns = ['Rental Id','Duration','Bike Id','End Date','EndStation Id','EndStation Name','Start Date','StartStation Id', 'StartStation Name'] ndf = process_df(df,df_main_all_names,relation) # Dimensions of the table n = ndf.shape[1] # Performs a SELECT query that will return current values within the db sql_execute(f"SELECT {pk[1]} FROM {relation};") # identify the pk of each entity - a dictionary is used for more efficient search stored_pks= dict([(e[pk[0]],e[pk[0]]) for e in cur.fetchall()]) try: # Look for new values for row in ndf.itertuples(): if (row[1] not in stored_pks): append(row[1]) if len(stored_pks) != 0: if n == 1: # 1 Dimensional relations if len(new_values) > 0: insert_values_db(dataframe({f'{df_main_all_names[0]}' : new_values}), table_attributes, relation,null_stations) else: # n Dimensional relations if len(new_values) > 0: new_values_joined = dataframe({ df_main_all_names[0]: new_values}).merge(ndf,how='left',left_on= df_main_all_names[0], right_on = df_main_all_names[0]) if relation == 'webapp_stations': null_stations = insert_values_db(new_values_joined[df_main_all_names[1]], table_attributes, relation,null_stations) return null_stations else: insert_values_db(new_values_joined[df_main_all_names[1]], table_attributes, relation,null_stations) else: if relation == 'webapp_stations': null_stations = insert_values_db(dataframe(ndf[df_main_all_names[1]]), table_attributes, relation,null_stations) return null_stations else: 
insert_values_db(dataframe(ndf[df_main_all_names[1]]), table_attributes, relation, null_stations) except psycopg2.InternalError: conn.rollback() process_df(df, df_main_all_names, relation) except Exception as e: print(f'Line 327 - {e}') #------------------------------------------------------------------------------------------------------------------------- try: # Local Variables join = os.path.join exists = os.path.exists size = os.path.getsize cd = self.__folder_dir # gives the directory of tflcrawler read_csv = pd.read_csv # establish a connection with o PostgreSQL database, based on the given DNS parameter conn = psycopg2.connect(DNS) cur = conn.cursor() # initialise a cursor sql_execute = cur.execute # cur.execute command is assigned as local variable (avoid dot overheating) null_stations = ['Bourne Street, Belgravia'] # list that will check if a station is null path = join(cd,rel_path) # Defines the path where the csv files will be stored print('starts to retrieve the csv files...') elements = self.__elements # assign the current dictionary to a local variable # iterate over the dictionary elements for value in tqdm(elements.values()): name = value['name'] # file name try: csv_path = join(path, name) # assign a full path fof the file print(csv_path) # if the file does not exist or the file exists, having a size of zero (nothing within it) if (not exists(csv_path)) or (exists(csv_path) and size(csv_path) == 0): # request the csv file from the server try: response = requests.get(value['url']) except (requests.ConnectionError, requests.ConnectTimeout, requests.HTTPError, requests.TooManyRedirects) as error: print(str(error)) # convert the text to a generator splitted_text = response.iter_lines(chunk_size= 512) # opens and write the file with open(csv_path, 'w') as file: for line in splitted_text: file.write(str(line)[2:-1] + '\n') file.close() # reads the created csv file df = read_csv(filepath_or_buffer= csv_path, delimiter=',' ,encoding= 'utf-8') # Populates 
the Bikes entity populate_relation(df = df, df_main_all_names= ('Bike Id', 'Bike Id'), relation= 'webapp_bikes' , pk = (0,'bike_id'), table_attributes= 'webapp_bikes(bike_id)', null_stations = null_stations) # Populates the Stations entity condition = True # initialise a boolean variable that checks if the populate_relation function of stations has been correctly executed while(condition): try: # populate the db with the corresponded values of stations null_stations = populate_relation(df = df, df_main_all_names= ('StartStation Id', ['StartStation Id', 'StartStation Name']) , relation ='webapp_stations', pk = (0,'station_id'), table_attributes= 'webapp_stations(station_id,station_name,location)', null_stations = null_stations) # set the condition to false and exit from the while loop condition = False except ValueError: # If the function returns an error due to unsimilarity of the file, SKIP the file condition = False except Exception as e: # If the function returns any other error, execute the function again # The function may do not executed correctly due to problems with the connection with the API and other requests print('POPULATE_RELATION IS EXECUTED AGAIN...') continue # Populates the Routes entity populate_relation(df = df, df_main_all_names=('Rental Id', ['Rental Id','Start Date','End Date', 'Duration','Bike Id','StartStation Id', 'EndStation Id']), relation= 'webapp_routes', pk =(0,'rental_id'), table_attributes='webapp_routes(rental_id,start_date,end_date,duration,bike_id,station_pairs_id)', null_stations = null_stations) except Exception as e: print(f'[Error of file {name} - Inside the FOR loop]') continue except Exception as e: # Close the cursor and database connection as well cur.close() conn.close() print(f'[ Error while the files are retrieved. Error: {str(e)}]') @property def elements(self): return self.__elements @property def site(self): return self.__site
class LinkedinPy:
    """Automation session for a single LinkedIn account."""

    def __init__(self,
                 username=None,
                 userid=None,
                 password=None,
                 nogui=False,
                 selenium_local_session=True,
                 use_firefox=False,
                 browser_profile_path=None,
                 page_delay=25,
                 show_logs=True,
                 headless_browser=False,
                 proxy_address=None,
                 proxy_chrome_extension=None,
                 proxy_port=None,
                 disable_image_load=False,
                 bypass_suspicious_attempt=False,
                 bypass_with_mobile=False,
                 multi_logs=True):
        """Store the session configuration and bring up logging, DB and browser.

        Truthy values returned by ``parse_cli_args()`` take precedence over
        the matching constructor arguments.  ``userid`` is accepted but not
        used by this constructor.  When no username/password is supplied the
        ``LINKEDIN_USER`` / ``LINKEDIN_PW`` environment variables are used.

        Raises:
            SocialPyError: when ``get_workspace(Settings)`` reports that no
                workspace is available.
        """
        cli = parse_cli_args()

        # command-line switches override the keyword arguments when set
        username = cli.username or username
        password = cli.password or password
        use_firefox = cli.use_firefox or use_firefox
        page_delay = cli.page_delay or page_delay
        headless_browser = cli.headless_browser or headless_browser
        proxy_address = cli.proxy_address or proxy_address
        proxy_port = cli.proxy_port or proxy_port
        disable_image_load = cli.disable_image_load or disable_image_load
        bypass_suspicious_attempt = (
            cli.bypass_suspicious_attempt or bypass_suspicious_attempt)
        bypass_with_mobile = cli.bypass_with_mobile or bypass_with_mobile

        if not get_workspace(Settings):
            raise SocialPyError(
                "Oh no! I don't have a workspace to work at :'(")

        # a virtual display is only started when running without a GUI
        self.nogui = nogui
        if nogui:
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()

        # browser / network configuration
        self.browser = None
        self.headless_browser = headless_browser
        self.proxy_address = proxy_address
        self.proxy_port = proxy_port
        self.proxy_chrome_extension = proxy_chrome_extension
        self.selenium_local_session = selenium_local_session
        self.bypass_suspicious_attempt = bypass_suspicious_attempt
        self.bypass_with_mobile = bypass_with_mobile
        self.disable_image_load = disable_image_load

        # credentials fall back to the environment
        self.username = username or os.environ.get('LINKEDIN_USER')
        self.password = password or os.environ.get('LINKEDIN_PW')
        Settings.profile["name"] = self.username

        self.page_delay = page_delay
        self.switch_language = True
        self.use_firefox = use_firefox
        Settings.use_firefox = self.use_firefox
        self.browser_profile_path = browser_profile_path

        # per-session counters, all starting from zero
        for counter in ("liked_img", "already_liked", "liked_comments",
                        "commented", "replied_to_comments", "connected",
                        "already_connected", "unconnected", "connected_by",
                        "connecting_num", "inap_img", "not_valid_users"):
            setattr(self, counter, 0)
        self.connect_times = 1
        self.start_time = time.time()

        # logging
        self.show_logs = show_logs
        Settings.show_logs = show_logs or None
        self.multi_logs = multi_logs
        self.logfolder = get_logfolder(self.username, self.multi_logs,
                                       Settings)
        self.logger = self.get_linkedinpy_logger(self.show_logs)

        # the database must exist before the browser session is created
        get_database(Settings, make=True)
        if self.selenium_local_session is True:
            self.set_selenium_local_session(Settings)
def get_linkedinpy_logger(self, show_logs):
    """Return the logger registered for ``self.username``, building it once.

    A logger already stored in ``Settings.loggers`` under this username is
    returned as-is.  Otherwise a DEBUG-level logger writing to
    ``<logfolder>general.log`` is created (plus a console handler when
    ``show_logs is True``), wrapped in a ``LoggerAdapter`` that injects the
    username, and registered in ``Settings.loggers`` / ``Settings.logger``.
    """
    cached = Settings.loggers.get(self.username)
    if cached is not None:
        return cached

    formatter = logging.Formatter(
        '%(levelname)s [%(asctime)s] [%(username)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')

    base_logger = logging.getLogger(self.username)
    base_logger.setLevel(logging.DEBUG)

    # the file handler is always attached; the console one is optional
    handlers = [logging.FileHandler('{}general.log'.format(self.logfolder))]
    if show_logs is True:
        handlers.append(logging.StreamHandler())
    for handler in handlers:
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)
        base_logger.addHandler(handler)

    adapter = logging.LoggerAdapter(base_logger, {"username": self.username})
    Settings.loggers[self.username] = adapter
    Settings.logger = adapter
    return adapter


def set_selenium_local_session(self, Settings):
    """Create the browser via the module-level helper of the same name.

    The helper's first result is stored on ``self.browser``.

    Raises:
        SocialPyError: carrying the helper's message when it returns a
            non-empty error string.
    """
    session = set_selenium_local_session(
        self.proxy_address,
        self.proxy_port,
        self.proxy_chrome_extension,
        self.headless_browser,
        self.use_firefox,
        self.browser_profile_path,
        self.disable_image_load,
        self.page_delay,
        self.logger,
        Settings)
    self.browser, err_msg = session

    if len(err_msg) == 0:
        return
    raise SocialPyError(err_msg)


def login(self):
    """Log in with the stored credentials and return ``self`` for chaining.

    A failed login is only reported (critical highlight); nothing is raised.
    After a successful login the account progress is saved on a best-effort
    basis: a failure there is logged as a warning and otherwise ignored.
    """
    logged_in = login_user(self.browser,
                           self.username,
                           None,
                           self.password,
                           self.logger,
                           self.logfolder,
                           self.switch_language,
                           self.bypass_suspicious_attempt,
                           self.bypass_with_mobile)

    if not logged_in:
        highlight_print(Settings, self.username, "Wrong login data!",
                        "login", "critical", self.logger)
        return self

    highlight_print(Settings, self.username, "Logged in successfully!",
                    "login", "info", self.logger)
    try:
        save_account_progress(self.browser,
                              "https://www.linkedin.com/",
                              self.username,
                              self.logger)
    except Exception:
        self.logger.warning(
            'Unable to save account progress, skipping data update')
    return self
def withdraw_old_invitations(self, skip_pages=10, sleep_delay=6):
    """Tick and withdraw sent invitations whose age text contains "month".

    Pages ``skip_pages + 1`` up to 100 of the sent-invitations manager are
    visited.  On each page every invitation card whose ``<time>`` text
    contains "month" is ticked; if anything was ticked the toolbar button is
    clicked when its text contains "Withdraw", and the same page number is
    visited again on the next round.  The walk stops when the site redirects
    to the un-paged URL or a page shows no invitation cards.

    All per-card and per-page failures are logged/printed and skipped.
    """
    sent_url = "https://www.linkedin.com/mynetwork/invitation-manager/sent/"
    card_selector = "li.invitation-card div.pl5"

    page_no = skip_pages
    while page_no < 100:
        page_no += 1
        try:
            web_address_navigator(Settings, self.browser,
                                  sent_url + "?page=" + str(page_no))
            print("Starting page:", page_no)

            redirected = self.browser.current_url == sent_url
            if redirected or len(
                    self.browser.find_elements_by_css_selector(
                        card_selector)) == 0:
                print("============Last Page Reached==============")
                break

            checked_in_page = 0
            card_count = len(
                self.browser.find_elements_by_css_selector(card_selector))
            for idx in range(card_count):
                try:
                    # cards are looked up again each round
                    card = self.browser.find_elements_by_css_selector(
                        card_selector)[idx]

                    # the profile name is informational only
                    try:
                        anchor = card.find_element_by_css_selector("div > a")
                        profile_link = anchor.get_attribute("href")
                        user_name = profile_link.split('/')[4]
                        self.logger.info("user_name : {}".format(user_name))
                    except Exception as e:
                        print("Might be a stale profile", e)

                    sent_when = card.find_element_by_css_selector(
                        "div > time")
                    self.logger.info("time : {}".format(sent_when.text))

                    check_button = card.find_element_by_css_selector(
                        "div > div:nth-child(1) > input")
                    check_status = check_button.get_attribute(
                        "data-artdeco-is-focused")
                    self.logger.info(
                        "check_status : {}".format(check_status))

                    self.browser.execute_script(
                        "window.scrollTo(0, " + str((idx + 1) * 104) + ");")

                    if "month" not in sent_when.text:
                        continue

                    (ActionChains(self.browser)
                     .move_to_element(check_button).click().perform())
                    self.logger.info("check_button clicked")
                    checked_in_page += 1
                    sleep(random.randint(ceil(sleep_delay * 0.42),
                                         ceil(sleep_delay * 0.57)))
                except Exception as e:
                    self.logger.error(e)

            if checked_in_page > 0:
                self.logger.info("Widraw to be pressed")
                try:
                    self.browser.execute_script("window.scrollTo(0, 0);")
                    withdraw_button = \
                        self.browser.find_element_by_css_selector(
                            "ul > li.mn-list-toolbar__right-button > button")
                    self.logger.info("withdraw_button : {}".format(
                        withdraw_button.text))
                    if "Withdraw" in withdraw_button.text:
                        (ActionChains(self.browser)
                         .move_to_element(withdraw_button).click().perform())
                        self.logger.info("withdraw_button clicked")
                        # stay on this page number for the next round
                        page_no -= 1
                        sleep(random.randint(ceil(sleep_delay * 0.85),
                                             ceil(sleep_delay * 1.14)))
                except Exception as e:
                    print(
                        "For some reason there is no withdraw_button inspite of checkings",
                        e)
            else:
                self.logger.info("Nothing checked in this page")
        except Exception as e:
            self.logger.error(e)
        self.logger.info("============Next Page==============")
def search_1stconnects_and_savetodb(self,
                                    query,
                                    city_code,
                                    school_code=None,
                                    past_company=None,
                                    random_start=True,
                                    max_pages=10,
                                    max_connects=25,
                                    sleep_delay=6):
    """Search first-degree connections and record them as connected.

    Result pages 1..100 of a people search restricted to the first-degree
    network (``facetNetwork=["F"]``) are visited.  Every result that shows a
    "Message" button is written through ``connect_restriction("write", ...)``.
    The walk stops at the first page without results.

    ``random_start``, ``max_pages``, ``max_connects`` and ``sleep_delay`` are
    accepted for signature parity with ``search_and_connect`` but are not
    used here.

    Per-item and per-page failures are logged and skipped.
    """
    self.logger.info(
        "Searching for: query={}, city_code={}, school_code={}".format(
            query, city_code, school_code))

    search_url = ("https://www.linkedin.com/search/results/people/"
                  "?&facetNetwork=%5B%22F%22%5D")
    if city_code:
        search_url += "&facetGeoRegion=" + city_code
    if school_code:
        search_url += "&facetSchool=" + school_code
    if past_company:
        search_url += "&facetPastCompany=" + past_company
    search_url += "&keywords=" + query
    search_url += "&origin=" + "FACETED_SEARCH"

    result_selector = "div.search-result__wrapper"
    item_selector = ("li.search-result div.search-entity "
                     "div.search-result__wrapper")

    for page_no in range(1, 101):
        try:
            web_address_navigator(Settings, self.browser,
                                  search_url + "&page=" + str(page_no))
            self.logger.info("Starting page: {}".format(page_no))

            # scroll in steps so the page renders all of its results
            for jc in range(2, 11):
                sleep(1)
                self.browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight/" +
                    str(jc) + ");")

            if len(self.browser.find_elements_by_css_selector(
                    result_selector)) == 0:
                self.logger.info(
                    "============Last Page Reached or asking for Premium membership=============="
                )
                break

            result_count = len(
                self.browser.find_elements_by_css_selector(result_selector))
            for i in range(result_count):
                try:
                    res_item = self.browser.find_elements_by_css_selector(
                        item_selector)[i]
                    link = res_item.find_element_by_css_selector("div > a")
                    profile_link = link.get_attribute("href")
                    user_name = profile_link.split('/')[4]
                    self.logger.info("user_name : {}".format(user_name))

                    # The leading "." keeps the search inside this result
                    # item; a bare "//" matches from the document root and
                    # would return the first "Message" button on the page
                    # for every item.
                    msg_button = res_item.find_element_by_xpath(
                        ".//div[3]/div/div/button[text()='Message']")
                    print(msg_button.text, "present")
                    if msg_button.text == "Message":
                        connect_restriction("write", user_name, None,
                                            self.logger)
                        self.logger.info(
                            "saved {} to db".format(user_name))
                except Exception as e:
                    self.logger.error(e)
        except Exception as e:
            self.logger.error(e)
        self.logger.info("============Next Page==============")
def test_page(self, search_url, page_no, css_selector_identifier):
    """Load ``search_url`` and tell whether the selector matches anything.

    ``page_no`` is only used for the log line.  Returns ``True`` when at
    least one element matches ``css_selector_identifier``, else ``False``.
    """
    web_address_navigator(Settings, self.browser, search_url)
    self.logger.info("Testing page: {}".format(page_no))
    matches = self.browser.find_elements_by_css_selector(
        css_selector_identifier)
    return len(matches) > 0


def search_and_connect(self,
                       query,
                       connection_relationship_code,
                       city_code,
                       school_code=None,
                       past_company=None,
                       random_start=True,
                       max_pages=10,
                       max_connects=25,
                       sleep_delay=6):
    """Run a people search and send connection invites from its results.

    The first result page is probed; with ``random_start`` a start page
    below 5 is then picked at random (up to five probes).  From the start
    page, at most ``max_pages`` pages are processed.  The walk ends early
    when a page adds no new invites, when a page shows no results, or when
    ``max_connects`` invites have been sent.

    Returns:
        int: number of invites sent by this call.  ``0`` is returned when
        the quota supervisor answers "jump" or the first page is empty.

    Per-item and per-page failures are logged/printed and skipped.
    """
    if quota_supervisor(Settings, "connects") == "jump":
        return 0

    self.logger.info(
        "Searching for: query={}, connection_relationship_code={}, city_code={}, school_code={}"
        .format(query, connection_relationship_code, city_code, school_code))

    connects = 0
    prev_connects = -1
    result_selector = "div.search-result__wrapper"
    item_selector = ("li.search-result div.search-entity "
                     "div.search-result__wrapper")
    click_script = (
        "var evt = document.createEvent('MouseEvents');" +
        "evt.initMouseEvent('click',true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0,null);"
        + "arguments[0].dispatchEvent(evt);")

    search_url = "https://www.linkedin.com/search/results/people/?"
    if connection_relationship_code:
        search_url += "&facetNetwork=" + connection_relationship_code
    if city_code:
        search_url += "&facetGeoRegion=" + city_code
    if school_code:
        search_url += "&facetSchool=" + school_code
    if past_company:
        search_url += "&facetPastCompany=" + past_company
    search_url += "&keywords=" + query
    search_url += "&origin=" + "FACETED_SEARCH"

    temp_search_url = search_url + "&page=1"
    print(temp_search_url)
    time.sleep(10)
    if not self.test_page(search_url=temp_search_url,
                          page_no=1,
                          css_selector_identifier=result_selector):
        self.logger.info(
            "============Definitely no Result, Next Query==============")
        return 0

    if random_start:
        # shrink the candidate range on every miss; test_page leaves the
        # browser on the page it just probed
        trial = 0
        st = 5
        while trial < 5 and st > 1:
            st = random.randint(1, st - 1)
            temp_search_url = search_url + "&page=" + str(st)
            if self.test_page(temp_search_url, st, result_selector):
                break
            trial += 1
    else:
        st = 1

    for page_no in range(st, st + max_pages):
        if prev_connects == connects:
            self.logger.info(
                "============Limits might have exceeded or all Invites pending from this page(let's exit either case)=============="
            )
            break
        prev_connects = connects

        try:
            temp_search_url = search_url + "&page=" + str(page_no)
            # Page `st` is already loaded by test_page above; every later
            # page must be opened explicitly.  (The previous extra
            # `st > 1` condition kept the browser on page 1 forever when
            # the walk started there.)
            if page_no > st:
                web_address_navigator(Settings, self.browser,
                                      temp_search_url)
            self.logger.info("Starting page: {}".format(page_no))

            for jc in range(2, 11):
                sleep(1)
                self.browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight/" +
                    str(jc) + "-100);")

            if len(self.browser.find_elements_by_css_selector(
                    result_selector)) == 0:
                self.logger.info(
                    "============Last Page Reached or asking for Premium membership=============="
                )
                break

            result_count = len(
                self.browser.find_elements_by_css_selector(result_selector))
            for i in range(result_count):
                try:
                    res_item = self.browser.find_elements_by_css_selector(
                        item_selector)[i]
                    link = res_item.find_element_by_css_selector("div > a")
                    profile_link = link.get_attribute("href")
                    self.logger.info("Profile : {}".format(profile_link))
                    user_name = profile_link.split('/')[4]
                    name = res_item.find_element_by_css_selector(
                        "h3 > span > span > span")
                    self.logger.info("Name : {}".format(name.text))

                    if connect_restriction("read", user_name,
                                           self.connect_times, self.logger):
                        self.logger.info("already connected")
                        continue

                    try:
                        # ".//" scopes the lookup to this result item, so
                        # the button that is clicked belongs to the same
                        # profile that is recorded below.
                        connect_button = res_item.find_element_by_xpath(
                            ".//div[3]/div/button[text()='Connect']")
                        self.logger.info(
                            "Connect button found, connecting...")
                        self.browser.execute_script(click_script,
                                                    connect_button)
                        self.logger.info("Clicked {}".format(
                            connect_button.text))
                        sleep(2)
                    except Exception:
                        invite_sent_button = res_item.find_element_by_xpath(
                            ".//div[3]/div/button[text()='Invite Sent']")
                        self.logger.info("Already {}".format(
                            invite_sent_button.text))
                        continue

                    try:
                        modal = self.browser.find_element_by_css_selector(
                            "div.modal-wormhole-content > div")
                        if modal:
                            try:
                                # NOTE(review): these "//" paths match from
                                # the document root, not from `modal`; left
                                # as-is because the modal markup is not
                                # visible here - confirm before scoping.
                                sendnow_or_done_button = \
                                    modal.find_element_by_xpath(
                                        "//div[1]/div/section/div/div[2]/button[2]"
                                    )
                                self.logger.info(
                                    sendnow_or_done_button.text)
                                if sendnow_or_done_button.text not in (
                                        'Done', 'Send now'):
                                    raise Exception(
                                        "Send Now or Done button not found"
                                    )
                                if sendnow_or_done_button.is_enabled():
                                    (ActionChains(self.browser)
                                     .move_to_element(sendnow_or_done_button)
                                     .click().perform())
                                    self.logger.info("Clicked {}".format(
                                        sendnow_or_done_button.text))
                                    connects += 1
                                    connect_restriction(
                                        "write", user_name, None,
                                        self.logger)
                                    try:
                                        # update server calls
                                        update_activity(Settings, 'connects')
                                    except Exception as e:
                                        self.logger.error(e)
                                    sleep(2)
                                else:
                                    try:
                                        close_button = \
                                            modal.find_element_by_xpath(
                                                "//div[1]/div/section/div/header/button"
                                            )
                                        (ActionChains(self.browser)
                                         .move_to_element(close_button)
                                         .click().perform())
                                        print(sendnow_or_done_button.text,
                                              "disabled, clicked close")
                                        sleep(2)
                                    except Exception as e:
                                        print(
                                            "close_button not found, Failed with:",
                                            e)
                            except Exception as e:
                                print(
                                    "sendnow_or_done_button not found, Failed with:",
                                    e)
                        else:
                            self.logger.info("Popup not found")
                    except Exception as e:
                        print("Popup not found, Failed with:", e)
                        # fall back to the second button of the newer popup
                        try:
                            new_popup_buttons = \
                                self.browser.find_elements_by_css_selector(
                                    "#artdeco-modal-outlet div.artdeco-modal-overlay div.artdeco-modal div.artdeco-modal__actionbar button.artdeco-button"
                                )
                            gotit_button = new_popup_buttons[1]
                            (ActionChains(self.browser)
                             .move_to_element(gotit_button)
                             .click().perform())
                            print(gotit_button.text, " clicked")
                            sleep(2)
                        except Exception as e:
                            print("New Popup also not found, Failed with:",
                                  e)

                    self.logger.info(
                        "Connects sent in this iteration: {}".format(
                            connects))
                    sleep(random.randint(ceil(sleep_delay * 0.85),
                                         ceil(sleep_delay * 1.14)))

                    if connects >= max_connects:
                        self.logger.info(
                            "max_connects({}) for this iteration reached , Returning..."
                            .format(max_connects))
                        # return the count like every other exit path
                        # (this used to be a bare `return`, i.e. None)
                        return connects
                except Exception as e:
                    self.logger.error(e)
        except Exception as e:
            self.logger.error(e)
        self.logger.info("============Next Page==============")

    return connects
def endorse(self, profile_link, sleep_delay):
    """Open a profile and click the endorse button of its first listed skill.

    The page is scrolled down in nine steps first.  The click only happens
    when the skills section's first text line is "Skills & Endorsements" and
    the first skill's button icon has type "plus-icon"; afterwards the
    method sleeps for a random time derived from ``sleep_delay``.

    Nothing is returned and nothing is raised: every failure is logged.
    """
    skills_section = ("div.profile-detail > div.pv-deferred-area > div > "
                      "section.pv-profile-section.pv-skill-categories-section")
    first_skill_button_css = (skills_section +
                              " > ol > li > div > div > div > button")
    click_script = (
        "var evt = document.createEvent('MouseEvents');" +
        "evt.initMouseEvent('click',true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0,null);"
        + "arguments[0].dispatchEvent(evt);")

    try:
        web_address_navigator(Settings, self.browser, profile_link)

        for step in range(1, 10):
            sleep(1)
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight*" +
                str(step) + "/10);")

        skills_pane = self.browser.find_element_by_css_selector(
            skills_section)
        if skills_pane.text.split('\n')[0] != 'Skills & Endorsements':
            self.logger.info('Skill & Endorsements pane not found')
            return

        try:
            icon = self.browser.find_element_by_css_selector(
                first_skill_button_css + " > li-icon")
            button_type = icon.get_attribute("type")
            if button_type != 'plus-icon':
                self.logger.info(
                    'button_type already {}'.format(button_type))
                return

            first_skill_button = self.browser.find_element_by_css_selector(
                first_skill_button_css)
            self.browser.execute_script(click_script, first_skill_button)

            first_skill_title = self.browser.find_element_by_css_selector(
                skills_section + " > ol > li > div > div > p > a > span")
            print(first_skill_title.text, "clicked")

            sleep(random.randint(ceil(sleep_delay * 0.85),
                                 ceil(sleep_delay * 1.14)))
        except Exception as e:
            self.logger.error(e)
    except Exception as e:
        self.logger.error(e)
def search_and_endorse(self,
                       query,
                       city_code,
                       school_code,
                       random_start=True,
                       max_pages=3,
                       max_endorsements=25,
                       sleep_delay=6):
    """Search first-degree connections and endorse each one found.

    One result page is processed: page 1, or with ``random_start`` a page
    picked at random from 1..3 (up to three attempts to find a non-empty
    one).  The profile links on that page are collected and ``endorse`` is
    called for each until ``max_endorsements`` is reached.

    ``max_pages`` is accepted but not used; only a single page is walked.

    Returns ``None`` in every case, including when the quota supervisor
    answers "jump".  Per-item and per-page failures are logged and skipped.
    """
    if quota_supervisor(Settings, "connects") == "jump":
        return

    print("Searching for: ", query, city_code, school_code)

    search_url = "https://www.linkedin.com/search/results/people/?"
    if city_code:
        search_url += "&facetGeoRegion=" + city_code
    if school_code:
        search_url += "&facetSchool=" + school_code
    search_url += "&facetNetwork=%5B%22F%22%5D"
    search_url += "&keywords=" + query
    search_url += "&origin=" + "FACETED_SEARCH"

    result_selector = "div.search-result__wrapper"

    # page currently shown by the browser (None: search not opened yet)
    loaded_page = None

    if random_start:
        for _ in range(3):
            st = random.randint(1, 3)
            web_address_navigator(Settings, self.browser,
                                  search_url + "&page=" + str(st))
            loaded_page = st
            # the placeholder was missing, so the page number never
            # reached the log
            self.logger.info("Testing page: {}".format(st))
            if self.browser.find_elements_by_css_selector(result_selector):
                break
    else:
        st = 1

    connects = 0
    for page_no in range(st, st + 1):
        collected_profile_links = []
        try:
            # Open the page unless the random probe above already did.
            # Without this, random_start=False never loaded the search at
            # all and the results were read from whatever page was open.
            if page_no != loaded_page:
                web_address_navigator(Settings, self.browser,
                                      search_url + "&page=" + str(page_no))
                loaded_page = page_no
            self.logger.info("Starting page: {}".format(page_no))

            for jc in range(2, 11):
                sleep(1)
                self.browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight/" +
                    str(jc) + "-100);")

            result_items = self.browser.find_elements_by_css_selector(
                result_selector)
            for result_item in result_items:
                try:
                    link = result_item.find_element_by_css_selector(
                        "div > a")
                    self.logger.info("Profile : {}".format(
                        link.get_attribute("href")))
                    collected_profile_links.append(
                        link.get_attribute("href"))
                    name = result_item.find_element_by_css_selector(
                        "h3 > span > span > span")
                    self.logger.info("Name : {}".format(name.text))
                except Exception as e:
                    self.logger.error(e)
        except Exception as e:
            self.logger.error(e)

        # endorsing navigates away, so it runs after the links are collected
        for collected_profile_link in collected_profile_links:
            self.endorse(collected_profile_link, sleep_delay=sleep_delay)
            connects += 1
            if connects >= max_endorsements:
                self.logger.info(
                    "max_endorsements({}) for this iteration reached , Returning..."
                    .format(max_endorsements))
                return
        self.logger.info("============Next Page==============")
def dump_connect_restriction(self, profile_name, logger, logfolder):
    """Dump this profile's connect-restriction rows to a local JSON file.

    Rows of ``connectRestriction`` for the current profile id are read from
    the SQLite database returned by ``get_database(Settings)``.  When rows
    exist they are stored under ``profile_name`` in
    ``<logfolder>connectRestriction.json``, keeping the entries other
    profiles already have in that file.

    Nothing is raised: any failure is reported through ``logger.error``.
    """
    # Bound before the `try` so the `finally` clause can test it even when
    # get_database() or sqlite3.connect() fails; otherwise an
    # UnboundLocalError would replace the error that was just logged.
    conn = None
    try:
        # get a DB and start a connection
        db, profile_id = get_database(Settings)
        conn = sqlite3.connect(db)

        # `with conn` only wraps a transaction; it does not close the
        # connection, hence the explicit close() below
        with conn:
            conn.row_factory = sqlite3.Row
            cur = conn.cursor()
            cur.execute(
                "SELECT * FROM connectRestriction WHERE profile_id=:var",
                {"var": profile_id})
            data = cur.fetchall()

        if data:
            # get the existing data
            filename = "{}connectRestriction.json".format(logfolder)
            if os.path.isfile(filename):
                with open(filename) as connectResFile:
                    current_data = json.load(connectResFile)
            else:
                current_data = {}

            # pack the new data: second column -> third column of each row
            current_data[profile_name] = {
                user_data[1]: user_data[2] for user_data in data
            }

            # dump the fresh connect data to a local human readable JSON
            with open(filename, 'w') as connectResFile:
                json.dump(current_data, connectResFile)

    except Exception as exc:
        logger.error(
            "Pow! Error occurred while dumping connect restriction data to a "
            "local JSON:\n\t{}".format(str(exc).encode("utf-8")))

    finally:
        if conn:
            # close the open connection
            conn.close()
def end(self):
    """Close the session: browser, virtual display, dumps and final report.

    After the browser is closed, the rest runs inside
    ``interruption_handler()``: the virtual display is stopped when
    ``nogui`` is set, the connect-restriction data is dumped, the
    ``connected`` counter is written to ``<logfolder>connected.txt`` and the
    live report is logged.
    """
    close_browser(self.browser, False, self.logger)

    with interruption_handler():
        # close virtual display
        if self.nogui:
            self.display.stop()

        # write useful information
        self.dump_connect_restriction(self.username, self.logger,
                                      self.logfolder)

        with open('{}connected.txt'.format(self.logfolder), 'w') \
                as connectFile:
            connectFile.write(str(self.connected))

        # output live stats before leaving
        self.live_report()

        message = "Session ended!"
        highlight_print(Settings, self.username, message, "end", "info",
                        self.logger)
        print("\n\n")


def set_quota_supervisor(self,
                         Settings,
                         enabled=False,
                         sleep_after=None,
                         sleepyhead=False,
                         stochastic_flow=False,
                         notify_me=False,
                         peak_likes=(None, None),
                         peak_comments=(None, None),
                         peak_connects=(None, None),
                         peak_unconnects=(None, None),
                         peak_server_calls=(None, None)):
    """Store the Quota Supervisor (QS) settings in ``Settings.QS_config``.

    Each ``peak_*`` argument must be a 2-tuple ``(hourly, daily)`` whose
    items are ``None`` or non-negative ``int``.  When all of them qualify
    the configuration is updated with the given values.  Otherwise QS is
    switched off (``state`` becomes ``False``) and, if ``enabled`` was
    ``True``, a warning is logged.

    ``sleep_after`` may be a list or a single value (wrapped in a list);
    ``None`` means an empty list.
    """
    from sys import version_info

    # take a reference of the global configuration
    configuration = Settings.QS_config

    # a fresh list per call instead of one shared default object
    if sleep_after is None:
        sleep_after = []

    # strong type checking on peaks entered
    peak_values_combined = [
        peak_likes, peak_comments, peak_connects, peak_unconnects,
        peak_server_calls
    ]
    peaks_are_eligible = False
    if all(type(item) is tuple for item in peak_values_combined):
        peak_values_merged = [
            i for sub in peak_values_combined for i in sub
        ]
        peaks_are_provided = all(
            len(item) == 2 for item in peak_values_combined)
        peaks_are_valid = all(
            type(item) is int or item is None for item in peak_values_merged)
        peaks_are_good = all(
            item >= 0 for item in peak_values_merged
            if isinstance(item, int))
        peaks_are_eligible = (peaks_are_provided and peaks_are_valid
                              and peaks_are_good)

    if not peaks_are_eligible:
        # Turn off QS for the rest of the session since peak values are
        # ineligible.  A real boolean is stored: the string "False" used
        # before is truthy and does not read as "disabled".
        configuration.update(state=False)
        # user should be warned only if has had QS turned on
        if enabled is True:
            self.logger.warning("Quota Supervisor: peak rates are misfit! "
                                "Please use supported formats."
                                "\t~disabled QS")
        return

    named_peaks = (("likes", peak_likes),
                   ("comments", peak_comments),
                   ("connects", peak_connects),
                   ("unconnects", peak_unconnects),
                   ("server_calls", peak_server_calls))
    peaks = {
        label: {"hourly": pair[0], "daily": pair[1]}
        for label, pair in named_peaks
    }

    if not isinstance(sleep_after, list):
        sleep_after = [sleep_after]

    rt = time.time()
    stochasticity = {
        "enabled": stochastic_flow,
        "latesttime": {"hourly": rt, "daily": rt},
        # original peaks always remain static
        "original_peaks": deepcopy(peaks),
    }

    # Compare version numbers, not strings: "2.7.9" < "2.7.15" is False
    # when compared lexicographically.
    if platform.startswith("win32") and version_info < (2, 7, 15):
        # UPDATE ME: remove this block once plyer is
        # verified to work on [very] old versions of Python 2
        notify_me = False

    # update QS configuration with the fresh settings
    configuration.update({
        "state": enabled,
        "sleep_after": sleep_after,
        "sleepyhead": sleepyhead,
        "stochasticity": stochasticity,
        "notify": notify_me,
        "peaks": peaks
    })


def live_report(self):
    """Log the session counters together with the session run time.

    When every counter is zero a short "nothing to show" message is logged
    instead of the full table.
    """
    print('')

    # every counter printed in the table takes part in the "anything to
    # show?" decision, including liked_comments / replied_to_comments
    stats = [
        self.liked_img, self.already_liked, self.commented, self.connected,
        self.already_connected, self.unconnected, self.liked_comments,
        self.replied_to_comments, self.inap_img, self.not_valid_users
    ]

    if self.connecting_num and self.connected_by:
        owner_relationship_info = (
            "On session start was connectING {} users"
            " & had {} connectERS".format(self.connecting_num,
                                          self.connected_by))
    else:
        owner_relationship_info = ''

    sessional_run_time = self.run_time()
    if sessional_run_time < 60:
        run_time_info = "{} seconds".format(sessional_run_time)
    elif sessional_run_time < 3600:
        run_time_info = "{} minutes".format(
            truncate_float(sessional_run_time / 60, 2))
    else:
        run_time_info = "{} hours".format(
            truncate_float(sessional_run_time / 60 / 60, 2))
    run_time_msg = "[Session lasted {}]".format(run_time_info)

    if any(stats):
        self.logger.info(
            "Sessional Live Report:\n"
            "\t|> LIKED {} images | ALREADY LIKED: {}\n"
            "\t|> COMMENTED on {} images\n"
            "\t|> connected {} users | ALREADY connected: {}\n"
            "\t|> UNconnected {} users\n"
            "\t|> LIKED {} comments\n"
            "\t|> REPLIED to {} comments\n"
            "\t|> INAPPROPRIATE images: {}\n"
            "\t|> NOT VALID users: {}\n"
            "\n{}\n{}".format(self.liked_img, self.already_liked,
                              self.commented, self.connected,
                              self.already_connected, self.unconnected,
                              self.liked_comments, self.replied_to_comments,
                              self.inap_img, self.not_valid_users,
                              owner_relationship_info, run_time_msg))
    else:
        self.logger.info("Sessional Live Report:\n"
                         "\t|> No any statistics to show\n"
                         "\n{}\n{}".format(owner_relationship_info,
                                           run_time_msg))


def run_time(self):
    """Return the seconds elapsed since ``self.start_time``.

    The value is passed through ``truncate_float(..., 2)``.
    """
    elapsed = time.time() - self.start_time
    return truncate_float(elapsed, 2)


def search_and_apply(self):
    """Open the jobs page, type a fixed title and location, and search.

    The search terms are hard-coded ("Python Developer" / "San Jose,
    California, United States") and the method blocks on ``input()`` at the
    end.

    NOTE: the class defines ``search_and_apply`` a second time further down
    with a different signature; that later definition replaces this one.
    """
    usualjobslink = "https://www.linkedin.com/jobs"
    web_address_navigator(Settings, self.browser, usualjobslink)

    job_title_XP = '//input[contains(@id,"jobs-search-box-keyword-id")]'
    txt_job_title = self.browser.find_element_by_xpath(job_title_XP)
    print('Entering Job Title')
    (ActionChains(self.browser).move_to_element(
        txt_job_title).click().send_keys("Python Developer").perform())

    job_location_XP = '//input[contains(@id,"jobs-search-box-location-id")]'
    txt_job_location = self.browser.find_element_by_xpath(job_location_XP)
    print('Entering Job Location')
    (ActionChains(self.browser).move_to_element(
        txt_job_location).click().send_keys(
            "San Jose, California, United States").perform())

    # update server calls for both 'click' and 'send_keys' actions
    for i in range(2):
        update_activity(Settings)
    sleep(1)

    print("Clicking Search Button")
    job_search_XP = '//button[contains(@class,"jobs-search-box__submit-button")]'
    btn_job_search = self.browser.find_element_by_xpath(job_search_XP)
    print(btn_job_search)
    (ActionChains(self.browser).move_to_element(
        btn_job_search).click().perform())

    # update server calls
    update_activity(Settings)
    sleep(10)
    input("Press Enter to continue...")
self.browser.find_element_by_xpath(job_title_XP) print('Entering Job Title') (ActionChains(self.browser).move_to_element( txt_job_title).click().send_keys("Python Developer").perform()) job_location_XP = '//input[contains(@id,"jobs-search-box-location-id")]' txt_job_location = self.browser.find_element_by_xpath(job_location_XP) print('Entering Job Location') (ActionChains( self.browser).move_to_element(txt_job_location).click().send_keys( "San Jose, California, United States").perform()) # update server calls for both 'click' and 'send_keys' actions for i in range(2): update_activity(Settings) sleep(1) print("Clicking Search Button") job_search_XP = '//button[contains(@class,"jobs-search-box__submit-button")]' btn_job_search = self.browser.find_element_by_xpath(job_search_XP) print(btn_job_search) (ActionChains( self.browser).move_to_element(btn_job_search).click().perform()) # update server calls update_activity(Settings) sleep(10) input("Press Enter to continue...") def search_and_apply(self, job_title, job_location, distance=50, random_start=True, max_pages=20, max_connects=25, sleep_delay=6): self.logger.info( "Searching for: job_title={}, job_location={}, radius={}".format( job_title, job_location, distance)) connects = 0 prev_connects = -1 # https://www.linkedin.com/jobs/search/?keywords=python%20developer&location=San%20Jose%2C%20California%2C%20United%20States&distance=50 job_search_url = "https://www.linkedin.com/jobs/search/?" 
if job_title: job_search_url = job_search_url + "keywords=" + job_title if job_location: job_search_url = job_search_url + "&location=" + job_location if distance: job_search_url = job_search_url + "&distance=" + str(distance) temp_job_search_url = job_search_url + "&start=0" print(temp_job_search_url) time.sleep(10) if self.test_page( search_url=temp_job_search_url, page_no=1, css_selector_identifier="div.jobs-search-results ") == False: self.logger.info( "============Definitely no Result, Next Query==============") return 0 if random_start: trial = 0 st = 5 while True and trial < 5 and st > 1: st = random.randint(1, st - 1) temp_job_search_url = job_search_url + "&start=" + str(st * 25) if self.test_page(temp_job_search_url, st, "div.jobs-search-results"): break trial = trial + 1 else: st = 1 for page_no in list(range(st, st + max_pages)): try: temp_job_search_url = job_search_url + "&start=" + str(page_no) if page_no > st and st > 1: web_address_navigator(Settings, self.browser, temp_job_search_url) self.logger.info("Starting page: {}".format(page_no)) for jc in range(2, 11): sleep(1) self.browser.execute_script( "window.scrollTo(0, document.body.scrollHeight/" + str(jc) + "-100);") if len( self.browser.find_elements_by_css_selector( "div.jobs-search-results")) == 0: self.logger.info( "============Last Page Reached or asking for Premium membership==============" ) break for i in range( 0, len( self.browser.find_elements_by_css_selector( "div.jobs-search-results"))): print(i) except Exception as e: self.logger.error(e) input("Press Enter to continue...")
class CraigslistBot:
    """Selenium-driven bot that logs into Craigslist and publishes listings.

    On non-Windows hosts the browser is started inside a pyvirtualdisplay
    ``Display``; on Windows ``self.display`` stays an empty string.
    """

    # Every ``waitFor*`` helper polls this many times, sleeping this many
    # seconds between attempts, before giving up.
    _WAIT_ATTEMPTS = 30
    _WAIT_INTERVAL = 2

    @staticmethod
    def debug(inString):
        """Print *inString* with a ``[BOT]`` prefix.

        NOTE(review): the message is UTF-8 encoded before formatting; on
        Python 3 this prints the ``b'...'`` repr -- confirm target version.
        """
        print(" [BOT] - %s" % inString.encode('utf-8').strip())

    def __init__(self,
                 protonLogin="",
                 protonPassword="",
                 loginEmail="",
                 loginPass="",
                 contactNumber="",
                 contactName="",
                 postCode="",
                 listingsFile="",
                 waitTime=10,
                 waitTimeBetweenPosts=30):
        """Start the display (non-Windows) and Firefox, and store settings.

        ``waitTime`` is the pause, in seconds, used between form steps in
        ``createpost``.
        """
        self.display = ""
        if not os.name == 'nt':
            self.display = Display(visible=1, size=(1248, 1000))  # 800x600
            self.display.start()

        self.client = webdriver.Firefox()
        self.isLoggedIn = False
        self.protonLogin = protonLogin
        self.protonPassword = protonPassword
        self.loginEmail = loginEmail
        self.loginPass = loginPass
        self.contactNumber = contactNumber
        self.contactName = contactName
        self.postCode = postCode
        self.listingsFile = listingsFile
        self.waitTime = waitTime
        self.waitTimeBetweenPosts = waitTimeBetweenPosts
        # nyc asks for more location data, which is not implemented yet
        self.locationCode = "chi"

    def __del__(self):
        """Quit the browser, then stop the virtual display if one exists.

        Attributes are looked up defensively because ``__del__`` also runs
        when ``__init__`` failed before they were assigned.
        """
        client = getattr(self, "client", None)
        display = getattr(self, "display", None)
        try:
            if client is not None:
                client.quit()
        finally:
            # ``display`` is "" on Windows; only a real Display has ``stop``.
            if hasattr(display, "stop"):
                display.stop()
        return 0

    def login(self, oneTimeLoginLink=""):
        """Log in with ``loginEmail``/``loginPass``.

        Opens ``oneTimeLoginLink`` when given, otherwise the standard login
        page. Sets ``self.isLoggedIn`` to True only when an element matching
        ``.tab`` is found after submitting the form.
        """
        self.debug("Logging in...")

        if oneTimeLoginLink == "":
            self.client.get("https://accounts.craigslist.org/login")
        else:
            self.client.get(oneTimeLoginLink)

        self.waitForId("inputEmailHandle")
        self.client.find_element_by_css_selector(
            "#inputEmailHandle").send_keys(self.loginEmail)
        self.client.find_element_by_css_selector(
            "#inputPassword").send_keys(self.loginPass)
        self.client.find_element_by_id("login").click()

        try:
            self.client.find_element_by_css_selector('.tab')
        except NoSuchElementException:
            self.debug("Not logged in")
            return

        self.debug("Successfully logged in!")
        self.isLoggedIn = True

    def createpost(self, listing):
        """Fill in and publish the posting form for *listing*.

        Reads ``listing.name``, ``listing.description`` (both passed through
        ``spintax.spin``) and ``listing.imagePathList``. Returns 0 when the
        bot is not logged in; otherwise returns the ``href`` of the first
        link inside ``ul.ul`` on the final page.
        """
        if not self.isLoggedIn:
            self.debug("ERROR: You're not logged in!")
            return 0

        client = self.client
        client.get("https://post.craigslist.org/c/" + self.locationCode)

        self.waitForCss("input[value='1']")
        client.find_element_by_css_selector("input[value='1']").click()

        # fso = for sale by owner
        # so = service offered
        client.find_element_by_css_selector("input[value='fso']").click()
        time.sleep(self.waitTime)

        # 199 = computer parts
        # 7 = computers
        # 96 = electronics
        client.find_element_by_css_selector("input[value='96']").click()
        time.sleep(self.waitTime)

        # Contact preferences: phone and text allowed.
        self.waitForName("show_phone_ok")
        client.find_element_by_name("show_phone_ok").click()
        client.find_element_by_name("contact_phone_ok").click()
        client.find_element_by_name("contact_text_ok").click()

        client.find_element_by_name("contact_phone").send_keys(
            self.contactNumber)
        client.find_element_by_name("contact_name").send_keys(
            self.contactName)

        # Title, postal code and body.
        client.find_element_by_name("PostingTitle").send_keys(
            spintax.spin(listing.name))
        client.find_element_by_id("postal_code").send_keys(self.postCode)
        client.find_element_by_name("PostingBody").send_keys(
            spintax.spin(listing.description))

        # "Okay to contact for other offers"
        self.waitForName("contact_ok")
        client.find_element_by_name("contact_ok").click()

        # Continue, then publish.
        client.find_element_by_name("go").click()
        self.waitForClass("bigbutton")
        client.find_element_by_class_name('bigbutton').click()

        # Determine if we need to switch to classic uploading.
        time.sleep(self.waitTime)
        if len(client.find_elements_by_id('classic')) != 0:
            self.waitForId("classic")
            time.sleep(self.waitTime)
            client.find_element_by_id('classic').click()
            # must wait for classic to pop into the viewport
            time.sleep(self.waitTime)

        self.waitForName("file")
        for imagePath in listing.imagePathList:
            # os.path.join keeps an already-absolute imagePath intact.
            fullPath = os.path.join(os.getcwd(), imagePath)
            self.debug("Attempting to upload image: " + fullPath)
            client.find_element_by_name("file").send_keys(fullPath)
            time.sleep(self.waitTime)

        self.debug("Clicking done with images")
        self.waitForClass("bigbutton")
        client.find_element_by_class_name('bigbutton').click()

        self.debug("Click publish (again)")
        self.waitForName("go")
        client.find_element_by_name('go').click()

        # Check if we need to verify the post.
        self.debug("Check if the post needs verified")
        time.sleep(self.waitTime)
        htmlText = client.find_element_by_css_selector("body").text
        if "FURTHER ACTION REQUIRED" in htmlText:
            # wait for the email to come through and then verify it
            self.debug("must verify post")
            time.sleep(45)
            self.validatePostInEmail()

        return client.find_element_by_css_selector(
            "ul.ul").find_elements_by_css_selector("a")[0].get_attribute(
                "href")

    # region WaitFor methods
    def _waitFor(self, finder, locator):
        """Poll ``finder(locator)`` until it returns a non-empty result.

        Returns True as soon as something is found, False once all attempts
        are used up (callers may ignore the result).
        """
        for _ in range(self._WAIT_ATTEMPTS):
            if len(finder(locator)) != 0:
                return True
            time.sleep(self._WAIT_INTERVAL)
        return False

    def waitForName(self, name):
        """Wait for an element with the given ``name`` attribute."""
        return self._waitFor(self.client.find_elements_by_name, name)

    def waitForId(self, idName):
        """Wait for an element with the given id."""
        return self._waitFor(self.client.find_elements_by_id, idName)

    def waitForCss(self, css):
        """Wait for an element matching the CSS selector."""
        return self._waitFor(self.client.find_elements_by_css_selector, css)

    def waitForClass(self, className):
        """Wait for an element with the given class name."""
        return self._waitFor(self.client.find_elements_by_class_name,
                             className)

    # endregion

    def validatePostInEmail(self):
        """Open the verification email in ProtonMail and follow its link.

        Returns the ``href`` of the first anchor on the page reached through
        the verification link, or None when that page has no anchors.
        """
        self.debug("NOW, WE VALIDATE!")
        client = self.client
        client.get("https://mail.protonmail.com/login")
        self.waitForId("username")
        client.find_element_by_id("username").send_keys(self.protonLogin)
        client.find_element_by_id("password").send_keys(self.protonPassword)
        client.find_element_by_id("login_btn").click()

        # we're looking for the first link (our craigslistBot email folder)
        # in the first "menuLabel-item" list
        self.waitForClass("menuLabel-item")
        labelItem = client.find_elements_by_class_name("menuLabel-item")[0]
        labelLink = labelItem.find_elements_by_css_selector(
            "a")[0].get_attribute('href')
        client.get(labelLink)

        # click the newest email
        self.waitForClass("conversation")
        client.find_elements_by_class_name("conversation")[0].click()

        # find the newest message in that email
        self.waitForClass("message")
        correctMessage = client.find_elements_by_class_name("message")[-1]

        # get the one time link, typically the last link in the list
        self.waitForCss("a")
        messageLinks = correctMessage.find_elements_by_css_selector("a")
        oneTimeLink = messageLinks[-1].get_attribute('href')

        # if the last link is a support page, select the second to last
        # link, which should be our verification link
        if oneTimeLink == "https://www.craigslist.org/about/scams?lang=en&cc=us":
            oneTimeLink = messageLinks[-2].get_attribute('href')

        # navigate to the verification link
        client.get(oneTimeLink)

        # The element references gathered above belong to the mail page and
        # are stale after navigation, so the anchor is read from the current
        # page instead.
        # NOTE(review): which anchor holds the new post link is unverified.
        self.waitForCss("a")
        pageLinks = client.find_elements_by_css_selector("a")
        newPostLink = pageLinks[0].get_attribute('href') if pageLinks else None
        time.sleep(2)
        return newPostLink
def parse(self, response):
    """Scrape outdoor unit sizes and rates from one Public Storage page.

    The page is rendered with Firefox inside a virtual display; *response*
    is not used. Yields one header dict (``date``, ``name``) followed by one
    dict per distinct outdoor unit size found (``special``, ``rate``,
    ``size``, ``types``).
    """
    url = 'https://www.publicstorage.com/missouri/self-storage-st-charles-mo/63303-self-storage/918?PID=PSLocalSearch&CID=1341&CHID=LL'
    outside_units = [
        "5' x 5'", "5' x 10'", "5' x 15'", "8' x 10'", "10' x 10'",
        "10' x 20'", "10' x 25'", "10' x 30'", "10' x 24'", "10' x 15'"
    ]

    display = Display(visible=0, size=(800, 600))
    display.start()
    driver = None
    try:
        driver = webdriver.Firefox()
        driver.get(url)
        # Pause before snapshotting the DOM so the wait actually affects
        # what ends up in page_source.
        time.sleep(3)
        html = driver.page_source
    finally:
        # Release the browser and the display even when loading fails.
        if driver is not None:
            driver.quit()
        display.stop()

    soup = BeautifulSoup(html, 'html.parser')
    sizeTagz = soup.findAll('div', {"class": "srp_label srp_font_14"})
    rateTagz = soup.findAll('div', {"class": "srp_label alt-price"})
    specialTagz2 = soup.findAll('div', {"class": "srp_res_clm srp_clm90"})
    typesTagz = soup.findAll('ul', {"class": "srp_list"})

    yield {
        'date': datetime.datetime.now().strftime("%m-%d"),
        'name': "Public Storage"
    }

    seen_sizes = set()
    for n, size_tag in enumerate(sizeTagz):
        # Debug output of the matching "special" cell.
        print(specialTagz2[n].get_text())

        if "Outside" not in typesTagz[n].get_text():
            continue
        size_text = size_tag.get_text()
        if size_text not in outside_units or size_text in seen_sizes:
            continue

        # First time this outdoor size is seen: emit it.
        seen_sizes.add(size_text)
        print("logic hit")
        yield {
            "special": "Incomplete",
            "rate": rateTagz[n].get_text(),
            'size': size_text,
            "types": "Outside"
        }
def parseData(linkedinProfile):
    """Scrape one LinkedIn profile page and append a row to ``data.csv``.

    The row holds, in order: name text, summary description text, a
    space-joined string of skills, a list of work-experience dicts
    (``role``, ``organisation``, ``description``) and a list of project
    dicts (``title``, ``description``).
    """
    display = Display(visible=0, size=(800, 600))
    display.start()
    driver = None
    try:
        driver = webdriver.Chrome('/usr/local/share/chromedriver')
        driver.get(linkedinProfile)
        html = driver.page_source
    finally:
        # Release the browser and the display even when the page fails.
        if driver is not None:
            driver.quit()
        display.stop()

    soup = BeautifulSoup(html, 'lxml')
    data = []

    # Parse the name of the user
    data.append(soup.find("h1", {"id": "name"}).text)

    # Parse the summary
    summary = soup.find("section", {"id": "summary"})
    data.append(summary.find("div", {"class": "description"}).text)

    # Parse the skills, skipping the "see more"/"see less" toggles
    skillData = ''
    for skill in soup.find("ul", {"class": "pills"}).find_all('li'):
        # ``get("class")`` is None for an <li> without a class attribute.
        classes = skill.get("class") or []
        if 'see-less' in classes or 'see-more' in classes:
            continue
        skillData = skillData + " " + skill.a.span.text
    data.append(skillData)

    # Parse the Work Experience
    workExperienceData = []
    positions = soup.find("ul", {"class": "positions"}).find_all('li')
    for workExperience in positions:
        header = workExperience.header
        workExperienceData.append({
            'role': header.find("h4", {"class": "item-title"}).text,
            'organisation': header.find("h5", {
                "class": "item-subtitle"
            }).text,
            'description': workExperience.find("p", {
                "class": "description"
            }).text,
        })
    data.append(workExperienceData)

    # Parse the projects.
    projectData = [{
        'title': project.header.text,
        'description': project.p.text
    } for project in soup.find_all("li", {"class": "project"})]
    data.append(projectData)

    # Open the CSV only after parsing succeeded.
    with open('data.csv', 'a') as csvWriterFile:
        csv.writer(csvWriterFile).writerow(data)
def gethtml1(url_root):
    """Fetch *url_root* with the module-level ``driver`` and return its HTML.

    Makes up to 10 attempts. A page shorter than 5 KiB is treated as a block
    and triggers a retry. A captcha page is answered by downloading the
    captcha image, solving it with ``main(address)`` and submitting the
    result. On any exception the browser processes are killed and a fresh
    display plus Chrome driver are started (rebinding the global ``driver``)
    before the next attempt.

    Returns "" when the site reports the product page as missing; otherwise
    whatever HTML was last read.
    """
    global driver
    attempts = 0
    html = ''
    while attempts < 10:
        # Original idea: pair this with a timer signal so that a hung request
        # raises and is retried from the except branch; the signal setup is
        # currently disabled.
        # (Sometimes network or browser-emulation problems make the program
        # hang without raising; a timeout would surface that as an error.)
        try:
            # Emulates Chrome without a visible window so that it can run on
            # a server.
            attempts += 1
            driver.get(url_root.replace('amp;', '').replace('&th=1', ''))
            # print the current URL
            print(driver.current_url)
            html = driver.page_source.encode('utf-8', 'ignore').decode()

            if html.find('您输入的网址在我们的网站上无法正常显示网页') != -1:
                print('good is missing')
                html = ""
            elif len(html) < 1024 * 5:
                print('block 2...', len(html))
                raise RuntimeError()
            else:
                while html.find('抱歉,我们只是想确认一下当前访问者并非自动程序') != -1:
                    print('block 1...')
                    with open('tmp/fp.txt', 'w') as page_dump:
                        page_dump.write(html)

                    # locate the captcha image with a regular expression
                    # (findall returns a list)
                    pi = r'<img src="(.*?)" />'
                    pi_url = re.findall(pi, str(html), re.S | re.M)
                    if len(pi_url) == 0:
                        print('pi_url is null')
                        # Without an image there is nothing to solve; raise
                        # so the except branch restarts the browser instead
                        # of submitting an unbound or stale answer.
                        raise RuntimeError('captcha image not found')

                    # download the image
                    data = urllib.request.urlopen(pi_url[0]).read()
                    address = 'pi/pi.jpg'
                    with open(address, 'wb') as image_file:
                        image_file.write(data)
                    de = main(address)

                    # enter the captcha
                    driver.find_element_by_id(
                        "captchacharacters").send_keys(str(de))
                    driver.find_element_by_class_name('a-button-text').click()
                    time.sleep(0.5)
                    driver.get(
                        driver.current_url.replace('amp;', '').replace(
                            '&th=1', ''))
                    # print the current URL
                    print(driver.current_url)
                    html = driver.page_source.encode('utf-8',
                                                     'ignore').decode()
            break
        except Exception as e:
            print('time exceeded', attempts, e)
            try:
                # Remove leftover browser state and processes, then restart.
                os.system('rm -rf /tmp/.com.google.Chrome*')
                os.system('rm -rf /tmp/.org.chromium*')
                os.system('pkill -9 chrome')
                os.system('pkill -9 Xvfb')
                os.system('pkill -9 chromedriver')
                os.system('pkill -9 geckodriver')
                print('sleep...')
                time.sleep(random.randint(1, 3))
                print('starting')
                display = Display(visible=0, size=(800, 800))
                display.start()
                driver = webdriver.Chrome()
                driver.delete_all_cookies()
                print('started')
            except Exception as restart_error:
                print('!', restart_error)
    return html
class WebAssay:
    """
    This is a base class that is built on top of a Selenium driver.
    Inherit from this class to
    1. parse web pages,
    2. calculate the area and position of elements, and
    3. stain the HTML page for parsed elements.

    It can be used as a base class for variants of WebAssay.
    You must implement a `run` function to use the base class.
    """

    def __init__(self,
                 user_agent: str,
                 window_size: tuple,
                 headless=False,
                 parser_functions: Union[List, None] = None,
                 color_palette: Union[Dict, None] = None,
                 warpped_height_px: int = 700,
                 reset_driver_after: int = 50):
        """
        `headless` should be set to True if you want a headless web browser.
        `color_palette` is a dictionary that maps from element category to a
          hex color.
        `parser_functions` is a list of parser functions, where a parser
          function takes bs4 and returns a list of dictionaries. Be sure one
          of those keys is `category` if you're using a `color_palette` and
          want to stain images.
        `warpped_height_px` is the minimum y-distance in pixels to consider
          an element warpped.

        Raises ValueError when `parser_functions` is missing or empty.
        """
        # functions that take bs4 and return a list of dicts.
        # None defaults avoid sharing one mutable object across instances.
        self.parser_functions = (parser_functions
                                 if parser_functions is not None else [])
        if len(self.parser_functions) == 0:
            raise ValueError("Please assign parser_functions!")

        # browser params
        self.window_size = window_size
        self.width, self.height = window_size
        self.user_agent = user_agent
        self.headless = headless
        self._init_browser()

        # optional params
        # dictionary of category to color.
        self.color_palette = color_palette if color_palette is not None else {}
        # skip elements whose height exceeds this.
        self.warpped_height = warpped_height_px

        # friends we make along the way
        self.error_files = []  # which files are not parsed correctly?
        self.element_metadata = pd.DataFrame()  # most recent element metadata.
        self.driver_reset_counter = 0  # driver resets at `reset_driver_after`.
        self.reset_driver_after = reset_driver_after

    def _init_browser(self):
        """
        Initializes a selenium browser with proper `user_agent` and window
        `size`. Set `headless` to True to have a headless browser.
        Keep the default as False to help debug.
        """
        # `False` marks "no virtual display"; see `close_driver`.
        self.display = False
        if self.headless:
            self.display = Display(visible=0,
                                   size=(self.width + 10, self.height + 10))
            self.display.start()

        # Set up user agent
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)

        # Copy so the library-wide capabilities dict is not mutated.
        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX.copy()
        firefox_capabilities['marionette'] = True

        driver = webdriver.Firefox(profile, capabilities=firefox_capabilities)
        driver.set_window_size(*self.window_size)
        self.driver = driver

    def close_driver(self):
        """Closes the driver, then the virtual display if one was started."""
        try:
            self.driver.quit()
        finally:
            if not isinstance(self.display, bool):
                self.display.stop()

    def restart_driver(self):
        """Restarts drivers and display"""
        self.close_driver()
        self._init_browser()
        self.driver_reset_counter = 0
        time.sleep(2)

    def save_source(self, fn: str):
        """Saves the source code of a page."""
        with open(fn, 'w') as f:
            f.write(self.driver.page_source)

    def screenshot_full(self, fn: str):
        """
        Takes a full screenshot. There are other methods that work better
        with a headless browser (such as expanding the window).

        The screenshot is resized to the original dimensions.
        For whatever reason, I get higher res images by the default
        screenshot. The standard size allows us to mark up the screenshot
        with the element metadata in `paint_abstract_representation`.
        """
        body = self.driver.find_element_by_tag_name('body')
        body.screenshot(fn)

        # resize image
        img = Image.open(fn)
        img.thumbnail((body.rect['width'], 1e6), Image.ANTIALIAS)
        img.save(fn)

    def identify_elements(
            self, body: Union[element.Tag, element.NavigableString]) -> List:
        """
        Runs every parser in `self.parser_functions` through the web page.
        The results are appended to the `data` output.
        """
        data = []
        for parser in self.parser_functions:
            data.extend(parser(body))
        return data

    @staticmethod
    def _merged_style(existing, extra: str) -> str:
        """Append `extra` to an element's `existing` style, if there is one."""
        if existing:
            return existing + '; ' + extra
        return extra

    def _set_attribute(self, elm, name: str, value: str) -> None:
        """Set an attribute on `elm` through JavaScript.

        The name and value are passed as script arguments rather than pasted
        into the script text, so quotes inside `value` cannot break it.
        """
        self.driver.execute_script(
            "arguments[0].setAttribute(arguments[1], arguments[2])", elm,
            name, value)

    def stain_element(self,
                      xpath: str,
                      category: str,
                      color: str = '#ffffff',
                      opacity: float = 0.7) -> bool:
        """
        Alters the HTML of a page.
        Stains elements located in `xpath` with `color` by overwriting the
        style attribute. Also sets a new attribute
        markup_category = `category`.

        Returns False when the element cannot be found or is not displayed,
        True otherwise. `opacity` is accepted but not used.
        """
        try:
            elm = self.driver.find_element_by_xpath(xpath)
        except Exception:  # couldn't find element
            return False

        if not elm.is_displayed():
            return False

        background = f"background-color: {color} !important; "
        style = elm.get_attribute('style')
        self._set_attribute(elm, 'markup_category', category)

        if elm.tag_name == 'img':
            self._set_attribute(
                elm, 'style',
                self._merged_style(
                    style, background + "transition: all 0.5s linear;"
                    "mix-blend-mode: multiply !important;"))

            # Colour the nearest enclosing <div> as well.
            parent = elm.find_element_by_xpath('ancestor::div[1]')
            self._set_attribute(
                parent, 'style',
                self._merged_style(parent.get_attribute('style'), background))
            return True

        self._set_attribute(
            elm, 'style',
            self._merged_style(style,
                               background + "transition: all 0.5s linear;"))

        # Blend the colour into any visible images and videos inside.
        media_style = (background + "mix-blend-mode: multiply !important; "
                       "z-index:99 !important;")
        for tag in ('img', 'video'):
            for media in elm.find_elements_by_tag_name(tag):
                if media.is_displayed():
                    self._set_attribute(
                        media, 'style',
                        self._merged_style(media.get_attribute('style'),
                                           media_style))

        if elm.tag_name == 'a':
            for child in elm.find_elements_by_tag_name("div"):
                if child.is_displayed():
                    # Extend the child's own style, not the anchor's.
                    self._set_attribute(
                        child, 'style',
                        self._merged_style(child.get_attribute('style'),
                                           background))
        return True

    def calculate_element_area(self, xpath: str) -> Dict:
        """
        Selenium will try to find an element based on the `xpath`.
        If it is found, calculate the `area` that element occupies
        on first screen (`area`) and whole page (`area_page`).

        Returns an empty dict when the element is not found or not
        displayed, and {'is_warpped': True} when its height reaches
        `self.warpped_height`.
        """
        # get the element based on the xpath
        try:
            elm = self.driver.find_element_by_xpath(xpath)
        except Exception:  # couldn't find element
            return {}

        # get dimensions of element
        rect = elm.rect

        # skip warped elements
        if rect['height'] >= self.warpped_height:
            return {'is_warpped': True}

        if not elm.is_displayed():
            return {}

        # adjust the dimensions by clipping if necessary.
        # "Area" is the first screen
        area = calc_area(rect,
                         location=rect,
                         width=self.width,
                         height_bottom=self.height)
        area_page = calc_area(rect, location=rect, width=self.width)
        return {
            'xpath': xpath,
            'dimensions': elm.size,
            'location': elm.location,
            'area': area,
            'area_page': area_page,
        }

    def open_local_html(self, fn):
        """Opens a local HTML page in the emulator."""
        local_file = 'file://' + os.path.abspath(fn)
        if self.driver.current_url != local_file:
            self.driver.get(local_file)

    def run(self):
        """
        This function must be overwritten in the inherited class.

        Should contain the following steps:
        1. Read either the current page on the driver or a local HTML file
           `fn` into bs4...
        2. Identify elements by sending the contents of the HTML through each
           parser in `parser_functions`. Do this by calling
           `self.identify_elements()` on the page.
        3. For each element, `self.calculate_element_area()`, and optionally
           `self.stain_element()` if self.stain = True.
        4. Assign `self.element_metadata` with the latest element metadata.

        And then anything else is up to you.
        """
        raise NotImplementedError
def _count_words(text):
    """Count the non-empty space-separated tokens in *text*."""
    return sum(1 for w in text.replace('\n', ' ').split(' ') if not w == '')


def _normalize_keywords(raw):
    """Turn a comma-separated keyword string into CamelCase tokens."""
    keywords = []
    for part in raw.split(','):
        part = part.strip(' ')
        if part == '':
            continue
        token = ''.join(string.capwords(part).split(' '))
        if '-' in token:
            # Hyphenated keywords are re-capitalised piece by piece.
            token = ''.join(string.capwords(kk) for kk in token.split('-'))
        keywords.append(token)
    return keywords


def crawl_meta(meta_hdf5=None, write_meta_name='data.hdf5', crawl_review=False):
    """Return paper metadata, crawling it or loading it from a file.

    When *meta_hdf5* is given, the list is loaded with ``read_meta``.
    Otherwise every URL listed in ``urls.txt`` is opened in headless Chrome,
    one ``PaperMeta`` is built per page, the list is saved with
    ``write_meta(meta_list, write_meta_name)`` and then returned. Review
    word counts are collected only when *crawl_review* is true.
    """
    if meta_hdf5 is not None:
        # Load the meta data from local
        return read_meta(meta_hdf5)

    # Crawl the meta data from OpenReview.
    # Set up a browser to crawl from dynamic web pages.
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from pyvirtualdisplay import Display

    # Load all URLs for all ICLR submissions; blank lines are ignored.
    with open('urls.txt') as f:
        urls = [line.strip() for line in f if line.strip()]

    wait_time = 0.25
    max_try = 1000
    meta_list = []

    def read_fields(browser):
        """Return the texts of the field labels and of the field values."""
        labels = browser.find_elements_by_class_name("note_content_field")
        values = browser.find_elements_by_class_name("note_content_value")
        return [k.text for k in labels], [v.text for v in values]

    display = Display(visible=0, size=(800, 800))
    display.start()
    browser = None
    try:
        options = Options()
        options.add_argument("--headless")
        browser = webdriver.Chrome(options=options,
                                   executable_path='/usr/local/bin/chromedriver')

        for i, url in enumerate(urls):
            browser.get(url)
            time.sleep(wait_time)

            key, value = read_fields(browser)

            # title
            title = string.capwords(
                browser.find_element_by_class_name("note_content_title").text)
            author = string.capwords(
                browser.find_element_by_class_name("meta_row").text).split(
                    ', ')

            # abstract: poll until the field has been rendered
            tries = 0
            while 'Abstract:' not in key and tries < max_try:
                time.sleep(wait_time)
                tries += 1
                key, value = read_fields(browser)

            withdrawn = 'Withdrawal Confirmation:' in key
            desk_reject = 'Desk Reject Comments:' in key

            if 'Abstract:' in key:
                abstract = ' '.join(value[key.index('Abstract:')].split('\n'))
            else:
                # Keep crawling instead of failing on the missing field.
                print('Reached max try: {} ({})'.format(title, url))
                abstract = ''

            # keyword
            if 'Keywords:' in key:
                keyword = _normalize_keywords(value[key.index('Keywords:')])
            else:
                keyword = []

            # rating: the number before the first ':' of each rating value
            rating = [
                int(value[idx].split(":")[0])
                for idx, label in enumerate(key) if label == "Rating:"
            ]

            review_len = None
            if crawl_review:
                review_len = [
                    _count_words(value[idx])
                    for idx, label in enumerate(key) if label == "Review:"
                ]

            # decision; the meta review is the value right after it
            if 'Decision:' in key:
                decision_idx = key.index('Decision:')
                decision = value[decision_idx]
                meta_review = value[decision_idx + 1]
            else:
                decision = 'N/A'
                meta_review = ''
            meta_review_len = _count_words(meta_review)

            # log
            log_str = '[{}] ratings: {}'.format(
                i + 1,
                rating,
            )
            log_str += ', meta review len: {}'.format(meta_review_len)
            if not decision == 'N/A':
                log_str += ', decision: {}'.format(decision)
            log_str += '] {}'.format(title)
            log_str += ' by {}'.format(', '.join(author))
            if withdrawn:
                log_str += ' (withdrawn)'
            if desk_reject:
                log_str += ' (desk_reject)'
            print(log_str)

            meta_list.append(
                PaperMeta(
                    title,
                    abstract,
                    keyword,
                    rating,
                    url,
                    withdrawn,
                    desk_reject,
                    decision,
                    author,
                    review_len,
                    meta_review_len,
                ))
    finally:
        # Release the browser and the display even when crawling fails.
        if browser is not None:
            browser.quit()
        display.stop()

    # Save the crawled data
    write_meta(meta_list, write_meta_name)
    return meta_list
class UITestCase(LiveServerTestCase):
    """Base test case that drives a browser against the live test server.

    ``setUp`` creates ``self.driver``; if the browser cannot be started
    directly, it is started inside a virtual display (``self.display``).
    The remaining methods are thin helpers for locating elements,
    navigating and waiting on page state.
    """

    def use_xvfb(self):
        """Start a virtual display and launch the browser inside it."""
        from pyvirtualdisplay import Display
        self.display = Display('xvfb', visible=1, size=(1280, 1024))
        self.display.start()
        try:
            self.driver = WebDriver()
        except Exception:
            # When this runs from setUp, a failure here means tearDown is
            # never invoked, so the display has to be stopped right away.
            self.display.stop()
            raise

    def setUp(self):
        """Create the browser, falling back to a virtual display."""
        try:
            self.driver = WebDriver()
        except WebDriverException:
            # The browser could not be started directly; retry under xvfb.
            self.use_xvfb()

        self.driver.implicitly_wait(10)

        clear_caches()
        setup_for_ui_test()

        super(UITestCase, self).setUp()

    def tearDown(self):
        """Quit the browser, stop the virtual display and clear caches.

        Each step runs even if an earlier one raises, so a failing
        ``quit()`` cannot leave the display or the cache behind.
        """
        try:
            self.driver.quit()
        finally:
            try:
                if hasattr(self, 'display'):
                    self.display.stop()
            finally:
                ContentType.objects.clear_cache()
                super(UITestCase, self).tearDown()

    def click(self, selector):
        """Click the element matching CSS ``selector``."""
        self.find(selector).click()

    def click_when_visible(self, selector):
        """Wait for the element matching ``selector`` to be displayed, then click it."""
        element = self.find(selector)
        self.wait_until_visible(element)
        element.click()

    def find(self, selector):
        """Return the element matching CSS ``selector``."""
        return self.driver.find_element_by_css_selector(selector)

    def find_name(self, name):
        """Return the element whose ``name`` attribute is ``name``."""
        return self.driver.find_element_by_name(name)

    def find_id(self, id):
        """Return the element whose ``id`` attribute is ``id``."""
        return self.driver.find_element_by_id(id)

    def process_login_form(self, username, password):
        """Fill in the username and password fields and click the form button."""
        username_elmt = self.wait_until_present('[name="username"]')
        password_elmt = self.find_name('password')

        username_elmt.send_keys(username)
        password_elmt.send_keys(password)

        self.click('form * button')

    def browse_to_url(self, url):
        """Load ``url`` (a path) relative to the live server root."""
        self.driver.get(self.live_server_url + url)

    def browse_to_instance_url(self, url, instance=None):
        """Load ``url`` below an instance's URL prefix.

        Args:
            url: Path relative to the instance root.
            instance: Object providing ``url_name``; defaults to
                ``self.instance``.
        """
        instance = instance if instance is not None else self.instance
        # Use the resolved instance, not self.instance, so that an explicit
        # argument is honored.
        self.driver.get('%s/%s/%s' % (self.live_server_url,
                                      instance.url_name,
                                      url))

    def find_anchor_by_url(self, url):
        """Return the element whose ``href`` attribute equals ``url``."""
        return self.find("[href='%s']" % url)

    def wait_until_present(self, selector, timeout=10):
        """
        Wait until an element with CSS 'selector' exists on the page.
        Useful for detecting that an operation loads the page you're expecting.

        Returns the element that was found.
        """
        # until() retries while the lookup fails and hands back the first
        # truthy result, i.e. the located element.
        return WebDriverWait(self.driver, timeout).until(
            lambda driver: self.find(selector))

    def wait_until_text_present(self, text, timeout=10):
        """
        Wait until 'text' exists on the page.
        Useful for detecting that an operation loads the page you're expecting.
        """
        WebDriverWait(self.driver, timeout).until(
            lambda driver: text in driver.page_source)

    def wait_until_enabled(self, element_or_selector, timeout=10):
        """
        Wait until 'element_or_selector' is enabled.

        Returns the element.
        """
        element = self._get_element(element_or_selector)
        WebDriverWait(self.driver, timeout).until(
            lambda driver: element.get_attribute("disabled") is None)
        return element

    def wait_until_visible(self, element_or_selector, timeout=10):
        """
        Wait until 'element_or_selector' (known to already exist on the page)
        is displayed.

        Returns the element.
        """
        element = self._get_element(element_or_selector)
        WebDriverWait(self.driver, timeout).until(
            lambda driver: element.is_displayed())
        return element

    def wait_until_invisible(self, element_or_selector, timeout=10):
        """
        Wait until 'element_or_selector' (known to already exist on the page)
        is not displayed.

        Returns the element.
        """
        element = self._get_element(element_or_selector)

        def is_invisible(driver):
            try:
                return not element.is_displayed()
            except StaleElementReferenceException:
                # A stale reference is treated as "no longer displayed".
                return True

        WebDriverWait(self.driver, timeout).until(is_invisible)
        return element

    def _get_element(self, element_or_selector):
        """Return an element given either an element or a CSS selector string."""
        try:
            string_types = basestring  # noqa: F821 -- Python 2 only
        except NameError:
            # Python 3 has no basestring; str is the only text type.
            string_types = str

        if isinstance(element_or_selector, string_types):
            return self.find(element_or_selector)
        return element_or_selector