def get_bookmark(url, oid, owner):
    display = Display(visible=0, size=(800, 600))
    display.start()
    result = {}
    # url = 'http://kcy.me/wge8'
    result['original-url'] = url
    req = requests.get(url)
    result['url'] = req.url
    soup = BeautifulSoup(req.text, 'lxml')
    result['title'] = soup.title.string.encode('utf-8')
    result['description'] = soup.findAll(attrs={"name": "description"})[0]['content'].encode('utf-8')
    driver = webdriver.Firefox()
    driver.get(req.url)
    filename = oid + '.png'
    fname = os.path.join(os.path.dirname(__file__), filename)
    driver.get_screenshot_as_file(fname)
    result['screenshot'] = upload_image_as_png_to_s3(filename, owner)
    driver.close()
    display.stop()
    # delete temp file; the screenshot was written to fname (the module
    # directory), so unlink that path rather than the bare filename, which
    # only matches when the script runs from the module directory
    os.unlink(fname)
    return result
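# A hedged sketch (not part of the original source): the same pattern as
# get_bookmark above, but with try/finally so the Xvfb display, the browser,
# and the temp file are released even if the request, parsing, screenshot, or
# upload raises. upload_image_as_png_to_s3 is assumed to behave as in
# get_bookmark; everything else is standard requests/BeautifulSoup/Selenium.
import os

import requests
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver


def get_bookmark_safe(url, oid, owner):
    display = Display(visible=0, size=(800, 600))
    display.start()
    driver = None
    fname = os.path.join(os.path.dirname(__file__), oid + '.png')
    try:
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'lxml')
        driver = webdriver.Firefox()
        driver.get(req.url)
        driver.get_screenshot_as_file(fname)
        return {
            'original-url': url,
            'url': req.url,
            'title': soup.title.string,
            'screenshot': upload_image_as_png_to_s3(fname, owner),
        }
    finally:
        # teardown runs on success and on error alike
        if driver is not None:
            driver.quit()
        display.stop()
        if os.path.exists(fname):
            os.unlink(fname)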
class UITestCase(LiveServerTestCase):

    def use_xvfb(self):
        from pyvirtualdisplay import Display
        self.display = Display('xvfb', visible=1, size=(1280, 1024))
        self.display.start()
        self.driver = WebDriver()

    def setUp(self):
        try:
            self.driver = WebDriver()
            ui_is_not_available = False
        except WebDriverException:
            ui_is_not_available = True
        if ui_is_not_available:
            self.use_xvfb()
        self.driver.implicitly_wait(10)
        super(UITestCase, self).setUp()

    def tearDown(self):
        self.driver.quit()
        if hasattr(self, 'display'):
            self.display.stop()
        super(UITestCase, self).tearDown()
def main():
    '''Business logic for when running this module as the primary one!'''
    display = Display(visible=0, size=(1024, 768))
    display.start()
    fresh_cl_post = find_cl_post()
    prev_cl_post = {"title": "", "link": ""}
    old_cl_post = {"title": "", "link": ""}  # find_cl_post()
    while True:
        # print "TEST" + str(datetime.date.today())
        fresh_cl_post = find_cl_post()
        try:
            if fresh_cl_post['title'] != prev_cl_post['title']:
                old_cl_post = prev_cl_post
                prev_cl_post = fresh_cl_post
                send_cl_email(fresh_cl_post)
        except Exception:
            print "Failed to test & send mail at: " + str(datetime.datetime.now())
        gc.collect()
        time.sleep(SLEEP_SECONDS)
    # unreachable: the loop above never exits, so the display is never stopped
    display.stop()
def retrieveTTdata(url):
    print "processing tenTeb ..."
    display = Display(visible=0, size=(1024, 1024))
    display.start()
    # driver = webdriver.Firefox()
    # http://stackoverflow.com/questions/8255929/running-webdriver-chrome-with-selenium
    driver = webdriver.Chrome()
    driver.get(url)
    sleep(5)
    html = driver.page_source
    driver.quit()
    display.stop()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    for div_element in tree.getiterator("div"):
        if "class" in div_element.keys() and div_element.attrib["class"] == "types_bg":
            tree = div_element
    for div_element in tree.getiterator("div"):
        if "class" in div_element.keys() and div_element.attrib["class"] == "bets ml":
            parse_div_element(div_element)
def webthumb(url, filename, is_flash=False):
    script = """
    var s = document.createElement('script');
    s.src = 'http://cruels.net/sb/flashfix.js';
    document.body.appendChild(s);
    """
    print "webthumb(%s, %s)" % (url, filename)
    display = Display(visible=0, size=(1200, 900))
    display.start()
    browser = webdriver.Firefox()
    browser.get(url)
    if is_flash:
        time.sleep(1)
    else:
        browser.execute_script(script)
        time.sleep(6)
    tmpfile = "%s.tmp" % filename
    browser.get_screenshot_as_file(tmpfile)
    img = pil.open(tmpfile)
    width, height = img.size
    if is_flash:
        resized = img.resize((LIBRARYFILE_THUMB_WIDTH, LIBRARYFILE_THUMB_HEIGHT), pil.ANTIALIAS)
    else:
        ratio = float(width) / float(height)
        resized = img.resize((LIBRARYFILE_THUMB_WIDTH, int(LIBRARYFILE_THUMB_WIDTH / ratio)), pil.ANTIALIAS)
    resized.save(filename)
    os.remove(tmpfile)
    print "Saved %s." % filename
    browser.quit()
    display.stop()
    return True
class BCCVLTestCase(unittest.TestCase):

    def setUp(self):
        # acquire URL, username and password from environment variables, or use default values for dev env.
        self.username = os.getenv("BCCVL_TEST_USERNAME", "admin")
        self.password = os.getenv("BCCVL_TEST_PASSWORD", "admin")
        self.url = os.getenv("BCCVL_TEST_URL", "https://192.168.100.200/")
        # The amount of time selenium will potentially wait in searching for elements. This is blocking.
        implicit_wait = int(os.getenv("BCCVL_TEST_IMPLICIT_WAIT", "15"))
        # Run tests in a virtual display (xvfb)
        virtual_display = os.getenv("BCCVL_TEST_VIRTUAL_DISPLAY", "false") == "true"
        # Setup the virtual display
        if virtual_display:
            self.display = Display(visible=0, size=(1920, 1080))
            self.display.start()
        else:
            self.display = None
        # Setup the Firefox Profile and webdriver
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(implicit_wait)
        # Maximize the window
        # self.driver.maximize_window()
        self.driver.set_window_size(1200, 800)
        # Go to the bccvl homepage
        self.driver.get(self.url)

    def tearDown(self):
        if self.display:
            self.display.stop()
        self.driver.quit()
def getupc(data, sleeptime):
    display = Display(visible=0, size=(800, 600))
    display.start()
    a = webdriver.Firefox()
    a.get('https://www.google.com/ncr')
    time.sleep(sleeptime)
    search = WebDriverWait(a, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//input[@type='text']")))
    for i in data:
        ActionChains(a).move_to_element(search).click(search).send_keys(
            i['name'] + ' upc', Keys.ENTER).perform()
        time.sleep(sleeptime)
        contents = WebDriverWait(a, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@class='g']")))
        try:
            upc = next(
                (re.split(r'/', href.find_element_by_tag_name('a').get_attribute('href'))[-1]
                 for href in contents
                 if href.find_element_by_tag_name('a').get_attribute('href').startswith(
                     'http://www.upcitemdb.com/upc')))
            i['upc'] = upc
        except StopIteration:
            pass
        search = WebDriverWait(a, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@type='text']")))
        search.clear()
    a.close()
    display.stop()
    return data
def rzhd():
    directions = [create_url()]
    while raw_input('Want to add more directions? y/n ') == 'y':
        directions.append(create_url())
        print "------------------"
    # n = raw_input('Check tickets every ...(seconds)? ')
    n = 60
    place = choose_place()
    i = 0
    display = Display(visible=0, size=(5, 5))
    display.start()  # start the virtual display
    while len(directions) != 0:
        i += 1
        print
        print "----------------->Searching for PLATSKART<-----------------"
        print "try #", i
        print time.asctime()
        print
        for url in directions:
            if find_train(url, place) == True:
                send_email('*****@*****.**', url)
                if raw_input('Did you buy ticket? y/n ') == 'y':
                    directions.remove(url)
                    if len(directions) == 0:
                        print "Successfully bought all tickets!"
                        return True
        print str(n) + " seconds until next try..."
        time.sleep(float(n))  # give the browser time to shut down cleanly
    display.stop()  # stop the virtual display
def load(self):
    min_time = 3600  # 1 hour in seconds
    max_time = 7179  # 2 hours in seconds (less 21)
    tasktime = randint(min_time, max_time)
    threading.Timer(tasktime, self.load).start()
    tasktime_m, tasktime_s = divmod(tasktime, 60)
    tasktime_h, tasktime_m = divmod(tasktime_m, 60)
    output_content = "Load execution - waiting %dh %02dmin %02dsec for the next time." % (
        tasktime_h, tasktime_m, tasktime_s)
    print "[KeepUp]", output_content

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as ec
    from selenium.webdriver.common.keys import Keys
    from pyvirtualdisplay import Display

    # Initial
    display = Display(visible=0, size=(1600, 900))
    display.start()
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.cache.disk.enable", False)
    profile.set_preference("browser.cache.memory.enable", False)
    profile.set_preference("browser.cache.offline.enable", False)
    profile.set_preference("network.http.use-cache", False)
    # pass the profile to Firefox; creating the driver without it (as the
    # original did) silently discards the cache preferences set above
    driver = webdriver.Firefox(firefox_profile=profile)
    driver.get("https://c9.io/dashboard.html")
    driver.save_screenshot(self.directory_img + 'login.png')

    # Username
    username = driver.find_element_by_id("id-username")
    username.click()
    username.clear()
    username.send_keys(self.user, Keys.ARROW_DOWN)

    # Password
    password = driver.find_element_by_id("id-password")
    password.click()
    password.clear()
    password.send_keys(self.password, Keys.ARROW_DOWN)

    # Submit
    submit_button = driver.find_element_by_css_selector("button[type=submit]")
    # print submit_button.text
    # Click submission
    submit_button.click()
    time.sleep(5)
    driver.save_screenshot(self.directory_img + 'user_profile.png')

    # Target dir
    driver.get(self.target_workspace)
    time.sleep(10)
    self.log({'log_html': driver.page_source, 'log_file': output_content})  # make log
    driver.save_screenshot(self.directory_img + 'final_workspace.png')

    # End
    driver.quit()
    display.stop()
def main(param):
    if len(param) != 2:
        sys.exit(-9)
    if len(param[1]) <= 0:
        sys.exit(-8)
    paths = param[0]
    shotsdir = paths.get('path', 'output.shotsdir').lstrip('"').rstrip('"')
    targets = param[1]
    display = Display(visible=0, size=(800, 600))
    display.start()
    binary = FirefoxBinary('/opt/firefox/firefox')
    browser = webdriver.Firefox(firefox_binary=binary)
    tgt_len = len(targets)
    for i, tgt in enumerate(targets):
        browser.get(tgt[0])
        browser.save_screenshot(shotsdir + '/' + tgt[1] + '.png')
        print '( %3d / %3d ) Took %s.png' % (i + 1, tgt_len, tgt[1])
    browser.quit()
    display.stop()
class TestContext(object):

    def open_browser(self):
        # if test_config.SELENIUM_USE_REMOTE:
        #     dc = getattr(DesiredCapabilities, self.driver.upper())
        #     dc['name'] = test_config.SELENIUM_TEST_NAME
        #     cmd_exec = test_config.SELENIUM_REMOTE_CMD_EXEC
        #     self.browser = webdriver.Remote(desired_capabilities=dc, command_executor=cmd_exec)
        if test_config.SELENIUM_USE_VIRTUALDISPLAY:
            self.virtualdisplay = Display(backend=test_config.SELENIUM_VIRTUALDISPLAY_BACKEND,
                                          size=(600, 800)).start()
        self.browser = webdriver.Firefox(
            firefox_binary=FirefoxBinary(test_config.SELENIUM_FIREFOX_PATH))
        self.browser.implicitly_wait(test_config.SELENIUM_PAGE_WAIT)

    def close(self):
        self.browser.quit()
        if hasattr(self, 'virtualdisplay'):
            self.virtualdisplay.stop()

    def get(self, url):
        self.browser.get(url)
        self.url = url

    def follow_link(self, link):
        link.click()
        self.url = self.browser.current_url

    def wait_for(self, by, thing):
        wait = WebDriverWait(self.browser, test_config.SELENIUM_PAGE_WAIT)
        wait.until(EC.presence_of_element_located((by, thing)))
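# Hypothetical usage of TestContext above (not from the original source);
# test_config and the Selenium By import are assumed in scope as in the class.
ctx = TestContext()
ctx.open_browser()
try:
    ctx.get('https://example.com/login')
    ctx.wait_for(By.NAME, 'username')
finally:
    # quits the browser and stops the virtual display if one was started
    ctx.close()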
def main(args):
    parser = argparse.ArgumentParser(description="Program for running tests on the PATRIC web interface.")
    parser.add_argument("user", metavar="user", help="Patric login username.")
    parser.add_argument("passwd", metavar="passwd", help="Patric login password.")
    parser.add_argument("--firebug", action="store_true", help="Open Firebug during test.")
    args = parser.parse_args()
    fp = webdriver.FirefoxProfile()
    if args.firebug:
        fp.add_extension(extension='extras/firebug-2.0.9.xpi')
        fp.set_preference("extensions.firebug.currentVersion", "2.0.9")  # Avoid startup screen
        fp.set_preference("extensions.firebug.console.enableSites", "true")
        fp.set_preference("extensions.firebug.net.enableSites", "true")
        fp.set_preference("extensions.firebug.script.enableSites", "true")
        fp.set_preference("extensions.firebug.allPagesActivation", "on")

    # Create virtual display
    display = Display(visible=0, size=(1400, 950))
    display.start()

    # Create webdriver and retrieve url
    driver = webdriver.Firefox(firefox_profile=fp)
    driver.get(SITE_URL + '/login')

    # Wait for username input box to appear
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
        EC.presence_of_element_located((By.ID, "dijit_form_TextBox_0")))

    # Set username and password, click login button
    userElement = driver.find_element_by_id("dijit_form_TextBox_0")
    pwdElement = driver.find_element_by_id("dijit_form_TextBox_1")
    userElement.send_keys(args.user)
    pwdElement.send_keys(args.passwd)
    loginElement = driver.find_element_by_id("dijit_form_Button_1")
    loginElement.click()
    time.sleep(3)

    # Retrieve home page, wait for an expected page element to load, take a screenshot
    driver.get(SITE_URL + '/portal/portal/patric/Home')
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
        EC.presence_of_element_located((By.ID, "cart")))
    driver.set_window_size(1400, 950)
    driver.execute_script("window.scrollTo(0,0);")
    driver.get_screenshot_as_file("homepage_after_login.jpg")
    print "Saved screenshot to: homepage_after_login.jpg\n"

    # Retrieve ws url, wait for create folder button to appear
    ws_url = SITE_URL + '/workspace/' + args.user + '@patricbrc.org/home'
    driver.get(ws_url)
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ActionButtonContainer")))
    time.sleep(5)

    # Have to reload the page, because often the workspace is empty on first load
    driver.get(ws_url)
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ActionButtonContainer")))
    # createFolderButton = driver.find_element_by_class_name("ActionButton fa icon-folder-plus fa-2x")
    # createFolderButton.click()
    time.sleep(30)
    driver.quit()
    display.stop()
    return 0
def loadSite(url):
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", "74.84.131.34")
    profile.set_preference("network.proxy.http_port", int('80'))
    profile.update_preferences()

    display = Display(visible=0, size=(800, 600))
    display.start()
    # Note: despite the variable name, this points at geckodriver (Firefox), not chromedriver.
    path_to_chromedriver = '/home/alexandr/www/html/python/prs/files/geckodriver'
    browser = webdriver.Firefox(firefox_profile=profile, executable_path=path_to_chromedriver)
    # browser.delete_all_cookies()
    browser.get(url)
    # print(browser.page_source)
    tree = etree.HTML(browser.page_source)

    browser.close()
    display.stop()

    nodes = tree.xpath('//table[@class="network-info"]//tr/td')
    for node in nodes:
        print(node.text)
    return 1
class FunctionalTest(StaticLiveServerTestCase):

    @classmethod
    def setUpClass(cls):
        for arg in sys.argv:
            if 'liveserver' in arg:
                cls.server_url = 'http://' + arg.split('=')[1]
                return
        super().setUpClass()
        cls.server_url = cls.live_server_url

    @classmethod
    def tearDownClass(cls):
        if cls.server_url == cls.live_server_url:
            super().tearDownClass()

    def setUp(self):
        self.display = Display(visible=0, size=(1024, 768))
        self.display.start()
        self.browser = webdriver.Firefox()
        # self.browser.implicitly_wait(3)

    def tearDown(self):
        self.browser.quit()
        self.display.stop()

    def check_for_row_in_list_table(self, row_text):
        table = self.browser.find_element_by_id('id_list_table')
        rows = table.find_elements_by_tag_name('tr')
        self.assertIn(row_text, [row.text for row in rows])
class TestCase(unittest.TestCase):

    def setUp(self):
        app.config['TESTING'] = True
        app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///:memory:'
        self.app = app.test_client()
        db.create_all()
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Firefox()

    def tearDown(self):
        db.session.remove()
        db.drop_all()
        self.driver.quit()
        self.display.stop()

    def test_extract_funds(self):
        funds = extract_funds(
            # some javascript going on that I can't figure out how to mock
            # 'file:///%s/t/test_files/list_mutual_funds.html' % basedir,
            self.driver
        )
        self.assertTrue(len(funds) > 110)
def work():
    logging.info("start weeklys screenshot work")
    print("start ... ")
    if not DISPLAY:
        print("hide display ... ")
        display = Display(visible=0, size=(1366, 768))
        display.start()
    config = getConfigObj()
    if config is None:
        return False
    userName = config.get("USER", "UserName")
    userPWD = config.get("USER", "userPWD")
    ret = getTowerWeeklyScreenshot(userName, userPWD, DEFAULT_SAVE_PATH)
    if not ret:
        print('Error, abort. Please check the log file "%s"' % LOG_FILE)
        return False
    logging.info("finish all work, exit.")
    if not DISPLAY:
        display.stop()
    return True
def get_news():
    if check_wlan():
        from pyvirtualdisplay import Display
        import re
        display = Display(visible=0, size=(800, 600))
        display.start()
        driver = webdriver.Firefox()
        url = "http://www.deutschlandfunk.de/"
        driver.get(url)
        source = driver.find_element_by_xpath(
            '//*[@id="wrapper"]/div/section[2]/div[1]').get_attribute('innerHTML')
        n_articles = source.count('<article')
        print(str(n_articles) + " articles found.")
        lst = re.findall('<h3>(.+)</h3>', source)
        result = lst
        driver.close()
        display.stop()
        return result
    else:
        print("Error: Not connected to the internet")
def run_selenium(landmark):
    display = Display(visible=0, size=(800, 600))
    display.start()
    logTo(TEST_LOG, 'Selenium : Starting Selenium for ' + landmark, 'INFO', 'a')
    interFace = open(HOME_DIR + '/Desktop/one-time-test-suite/iface.txt', 'r')
    tmp = interFace.readlines()
    iface = tmp[0].split('\n')[0]
    tmpstmp = datetime.now().strftime("%s")
    profile = webdriver.FirefoxProfile()
    profile.update_preferences()
    browser = webdriver.Firefox(firefox_profile=profile)  # assign profile to browser
    browser.delete_all_cookies()
    logTo(TEST_LOG, ' Selenium : Starting tcpdump .. ', 'INFO', 'a')
    tcpcmd = 'tcpdump -i ' + iface + ' -w ' + EXP_DIR + '/' + 'tcpdump_' + \
        landmark.split('.')[0] + '_' + tmpstmp
    args = shlex.split(tcpcmd)
    ptcpdmp = sub.Popen(args)
    time.sleep(10)
    logTo(TEST_LOG, ' Selenium : Starting get ' + landmark, 'INFO', 'a')
    browser.get('http://www.' + landmark)
    time.sleep(5)
    perfData = browser.execute_script('return window.performance.timing')
    fname = EXP_DIR + '/' + 'perfdata_' + landmark.split('/')[0]
    fname = fname.replace('.', '-')
    pickle.dump(perfData, open(fname, 'wb'))
    logTo(TEST_LOG, 'Selenium : Writing done to ' + EXP_DIR + '/perfdata_' + landmark, 'INFO', 'a')
    browser.quit()
    display.stop()
    ptcpdmp.terminate()
    logTo(TEST_LOG, 'Finished Selenium for ' + landmark, 'INFO', 'a')
class AdminTestCase(LiveServerTestCase):

    def setUp(self):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.selenium = webdriver.Firefox()
        super(AdminTestCase, self).setUp()

    def tearDown(self):
        self.selenium.quit()
        self.display.stop()
        super(AdminTestCase, self).tearDown()

    def test_payment(self):
        """Payment will be successful."""
        self.selenium.get("%s/pay" % self.live_server_url)
        self.selenium.implicitly_wait(20)
        self.selenium.maximize_window()
        self.selenium.find_element_by_name("amount").send_keys("100000")
        pay_button = self.selenium.find_element_by_xpath('//input[@value="pay"]')
        pay_button.click()
        return_to_site_button = self.selenium.find_element_by_id("btn3")
        return_to_site_button.click()
        self.assertIn("successful", self.selenium.page_source)
def get_all_items():
    # list to store all scraped data
    all_items = list()
    # Display - read about pyvirtualdisplay
    display = Display(visible=0, size=(1024, 768))
    display.start()
    # webdriver - read about selenium.webdriver
    driver = webdriver.Firefox()
    # this is the starting page we are scraping
    driver.get("http://www.federalreserve.gov/apps/reportforms/default.aspx")
    # Every element on the HTML page can be located using CSS selectors.
    # Opening the starting page in Chrome, right-clicking on the drop-down menu and
    # clicking "Inspect", we see a tag highlighted on the right; we copy its id -
    # MainContent_ddl_ReportForms. Knowing the id of the dropdown menu, we can
    # locate it with Selenium like this:
    main_menu = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#MainContent_ddl_ReportForms")))
    # The drop-down menu is an HTML table of options, which can be verified in the
    # Chrome Developer Tools (right click an element and press "Inspect").
    # The following returns all of the options - rows in that table.
    form_options = main_menu.find_elements_by_tag_name("option")
    # We count them
    option_count = len(form_options)
    # Next, we loop over all of them - essentially as if we were scrolling down the
    # drop-down menu and clicking on each and every form
    for form_i in xrange(1, option_count):
        # Get the web element corresponding to a form
        form = form_options[form_i]
        # Click as a mouse click-action in browser
        form.click()
        # Get text, because we need to store the form number
        form_id = form.text
        # Locate the web element corresponding to the submit button, by the CSS
        # selector found by inspection in Chrome (same logic as above)
        submit_button = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#MainContent_btn_GetForm")))
        # Click as a mouse click-action in browser
        submit_button.click()
        # Prepare the data structure to store all the info we want to scrape
        a = dict.fromkeys(['Description', 'OMB', 'Background', 'RespondentPanel',
                           'Frequency', 'PublicRelease'])
        # We are on the page shown after the submit-click; the following searches
        # for the web element corresponding to each item of interest
        for el in a.keys():
            try:
                item = driver.find_element_by_css_selector("#MainContent_lbl_" + el + "_data")
                # Once found, store it in our dictionary; if not, proceed to
                # "except" and do nothing
                a[el] = item.text
            except Exception:
                # case when there is no such field
                pass
        # we need the form number as well
        a['FormNumber'] = form_id
        # Keep them all in one list, which will hold a dictionary per form number -
        # and later, a row per form number in your excel file
        all_items.append(a)
        # Ok, that part bothers me a little: it looks like I have to refresh
        # "form_options" each time... Otherwise I get the following exception:
        # selenium.common.exceptions.StaleElementReferenceException: Message: Element
        # not found in the cache - perhaps the page has changed since it was looked up
        driver.get("http://www.federalreserve.gov/apps/reportforms/default.aspx")
        main_menu = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#MainContent_ddl_ReportForms")))
        form_options = main_menu.find_elements_by_tag_name("option")
    driver.close()
    display.stop()
    return all_items
def get_image(self):
    ## Uses supplied scrape site to find new pictures
    url = self.scrape_site
    # virtual display for headless runs
    display = Display(visible=0, size=(800, 600))
    display.start()
    with closing(Firefox()) as browser:
        browser.get(url)
        time.sleep(5)
        # TODO: fix with something less static, but still
        # multipurpose considering scrape_site as a db var
        imgs = browser.find_elements_by_tag_name('img')
        # TODO: fix this temporary workaround that prevents ad server data
        # from reaching the image checks
        no_ad_imgs = [i for i in imgs if 'adsrvr' not in i.get_attribute('src')]
        for img in no_ad_imgs:
            src = img.get_attribute('src')
            alt = img.get_attribute('alt')
            image_id = re.findall("/photo/(.+?)/", src)[0]
            if self._check_id(image_id) and self._check_ratios(src):
                self.img_id = image_id
                self.description = alt
                self._save_hd_image()
                break
    display.stop()
    if self.img_id:
        return
    raise Exception('Failed to find a suitable image: all out or bugged')
def openurl(companyname=first_arg):
    display = Display(visible=0, size=(1024, 768))
    display.start()
    browser = webdriver.Firefox()
    time.sleep(randint(8, 10))
    try:
        browser.get('http://www.google.com')
        time.sleep(5)
        search = browser.find_element_by_name('q')
        input_text = companyname + str(" crunchbase")
        search.send_keys(input_text)
        time.sleep(randint(10, 15))
        search.send_keys(Keys.RETURN)
        time.sleep(randint(10, 15))
        gn = browser.find_element_by_tag_name('h3').text
        gnc = str(gn).split(' | ')[0].replace(" ", "")
        output_file = '0515' + gnc + '.html'
        browser.find_element_by_link_text(gn).click()
        time.sleep(randint(55, 60))
        company_html = browser.page_source
        time.sleep(randint(5, 10))
        with open("smallname.txt", 'a') as myfile:
            json.dump(output_file, myfile)
        with open(output_file, 'a+') as myfile:
            myfile.write(company_html)
    except Exception:
        company_html = 'none'
        with open("missedname.txt", "a") as myfile:
            json.dump(companyname, myfile)
    time.sleep(1)
    browser.close()
    time.sleep(1)
    display.stop()
    return company_html
class BrowserManager:

    def __init__(self):
        self._lock = False

    def bootup(self):
        self._display = Display(visible=0, size=(1024, 768))
        self._display.start()
        profile = {}
        if 'HTTP_PROXY' in os.environ:
            proxy_url = os.environ['HTTP_PROXY']
            proxy_server = proxy_url.split(':')[1][2:]
            proxy_port = proxy_url.split(':')[-1]
            profile['network.proxy.type'] = 1
            profile['network.proxy.http'] = proxy_server
            profile['network.proxy.http_port'] = proxy_port
            profile['network.proxy.https'] = proxy_server
            profile['network.proxy.https_port'] = proxy_port
        self.browser = Browser(profile_preferences=profile)

    def obtain(self, background):
        while self._lock:
            background.wait('Browser lock', 15)
        self._lock = True
        return self.browser

    def release(self, background):
        self._lock = False

    def shutdown(self):
        self.browser.quit()
        self._display.stop()
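# Hypothetical usage of BrowserManager above (not from the original source).
# The `background` collaborator is assumed: obtain()/release() only require it
# to expose a wait(message, seconds) method used to poll the lock.
manager = BrowserManager()
manager.bootup()
browser = manager.obtain(background)
try:
    browser.visit('https://example.com')
finally:
    manager.release(background)
    manager.shutdown()  # quits the browser and stops the virtual display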
def main():
    if len(sys.argv) < 2:
        print "./fb_auto_commenter.py Brazil/English/French"
        return
    fb_auto_mail.write_file("---------------------------" + sys.argv[1] + "\n")
    try:
        display = Display(visible=0, size=(800, 600))
        display.start()
        # open the region limit (permissions)
        logging.info(">>>>>>>open limit")
        open_limit()
        # read the group info from Google Docs
        logging.info(">>>>>>>read from google docs")
        global french_groups_id
        french_groups_id = read_from_googledocs()
        # french_groups_id = ['309490585766406', '745769152175443', '1393190844256106', '1384933575085078', '1458512047714028', '1581747275377893', '778025652245798', '252563551503667', '1468450793419237']
        logging.info(french_groups_id)
        # start the posting task process
        logging.info(">>>>>>>start post task")
        start_task_process()
        # close the limit
        logging.info(">>>>>>>close limit")
        close_limit()
        logging.info(">>>>>>>send result mail")
        fb_auto_mail.send_mail()
    except Exception as e:
        logging.error(e)
    finally:
        logging.info("end")
        display.stop()
def authorizeToken(requestTokenResponse):
    """
    Given a dict requestTokenResponse with the temporary oauth_token and
    oauth_token_secret, we generate a login link that a user should interact
    with to obtain an authCode <str>.
    This process is automated with Splinter and pyvirtualdisplay.
    """
    resource_owner_key = requestTokenResponse['oauth_token']
    resource_owner_secret = requestTokenResponse['oauth_token_secret']
    redirect_response = 'https://us.etrade.com/e/t/etws/authorize?key={}&token={}'.format(
        client_Consumer_Key, resource_owner_key)
    # print 'go to this link for authorization:', redirect_response
    # cannot parse redirect_response without a browser because the response is not pure json
    # oauth_response = oauth.parse_authorization_response(redirect_response)
    # Open URL in a new tab, if a browser window is already open.
    # webbrowser.open_new_tab(redirect_response)

    # Display allows the script to run in a linux cloud without a screen
    display = Display(visible=0, size=(1024, 768))
    display.start()
    # create a browser using the Splinter library and simulate the workflow of a user
    # logging in; various time.sleep(n) calls are inserted here to make sure login is
    # successful even on slower connections
    with Browser() as browser:
        # Visit URL
        url = redirect_response
        browser.visit(url)
        if browser.is_element_present_by_name('txtPassword', wait_time=0):
            browser.fill('USER', etrade_settings.username)
            time.sleep(3)
            browser.find_by_name('txtPassword').click()
            time.sleep(3)
            # pprint(browser.html)
            browser.fill('PASSWORD', etrade_settings.userpass)
            # Find and click the 'logon' button
            browser.find_by_name('Logon').click()
            time.sleep(3)
        if browser.is_element_present_by_name('continueButton', wait_time=2):
            browser.find_by_name('continueButton').click()
            browser.find_by_value('Accept').click()
            time.sleep(3)
        # authCode = browser.find_by_xpath("//@type='text'").first.value
        authCode = browser.find_by_tag("input").first.value
        time.sleep(3)
    display.stop()
    return authCode
def main():
    display = Display(visible=0, size=(800, 600))
    display.start()
    authurl = "https://firewall.amritanet.edu:8443/auth1.html"
    delay = 3
    print "\n\n[*] Opening a New Session.."
    driver = webdriver.Firefox()
    driver.get(authurl)
    assert "Sonic" in driver.title
    print "\n\n[*] Enumerating Login Page.."
    user = driver.find_element_by_name("userName")
    passwd = driver.find_element_by_name("pwd")
    print "\n\n[*] Sending Credentials .. "
    user.send_keys("<user_name_here>")
    passwd.send_keys("<password_here>")
    passwd.send_keys(Keys.RETURN)
    driver.get("http://www.msftncsi.com/ncsi.txt")
    print "\n\n[*] Login Done!"
    driver.quit()
    display.stop()
def retrieveYRdata(url):
    print "processing yddapRewop ..."
    display = Display(visible=0, size=(1024, 1024))
    display.start()
    # driver = webdriver.Firefox()
    # http://stackoverflow.com/questions/8255929/running-webdriver-chrome-with-selenium
    driver = webdriver.Chrome()
    driver.get(url)
    html = driver.page_source
    driver.quit()
    display.stop()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    first = 1
    table_list = list()
    for table_element in tree.getiterator("table"):
        if "id" in table_element.keys() and "class" in table_element.keys() and "style" in table_element.keys():
            if not first:
                table_list.append(table_element)
            first = 0
    for table_element in table_list:
        parse_table_element(table_element)
class SeleniumRunner(object):

    def __call__(self, f):
        @functools.wraps(f)
        def decorated(_self, *args, **kwargs):
            with self as driver:
                return f(_self, driver, *args, **kwargs)
        return decorated

    def __enter__(self):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome()
        return self.driver

    def __exit__(self, *args, **kwargs):
        try:
            self.driver.quit()
        except AttributeError:
            # Someone has messed with our browser
            pass
        try:
            self.display.stop()
        except AttributeError:
            # Someone has messed with our display
            pass
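# Hypothetical usage of SeleniumRunner above (not from the original source):
# the decorator opens a virtual display plus a Chrome driver around each call
# and passes the driver in as the first argument after self, so the method
# body never manages browser lifetime itself; __exit__ tears both down even
# when the method raises.
class PageChecker(object):

    @SeleniumRunner()
    def title_of(self, driver, url):
        driver.get(url)
        return driver.title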
class Collab(threading.Thread):
    """Docstring for Collab."""

    def __init__(self, selector):
        threading.Thread.__init__(self)
        self.__display = Display(visible=0, size=(800, 600))
        self.__display.start()
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--no-sandbox")
        chrome_options.binary_location = CHROME_LOCATION
        self.__driver = webdriver.Chrome("/opt/selenium/chromedriver",
                                         chrome_options=chrome_options,
                                         service_args=["--verbose", "--log-path=/home/log"])
        self.__driver.get(URL + DOCID)
        self.content_editor = ""
        self.alive = False
        self.select = None
        while self.select is None:
            self.__driver.implicitly_wait(20)
            self.select = self.__driver.find_element_by_class_name(selector)

    def stop(self):
        self.alive = False
        self.__driver.close()
        self.__display.stop()
def process_screenshots(app, env):
    if not hasattr(env, 'screenshot_all_screenshots'):
        return
    if not app.config['screenshots_create']:
        print("Not doing screenshots on maggies farm no more")
        return
    if 'SPHINX_SS_USE_PVD' in os.environ.keys() and os.environ['SPHINX_SS_USE_PVD'] == "true":
        from pyvirtualdisplay import Display
        # Start a virtual headless display
        display = Display(visible=0, size=(1024, 768))
        display.start()
    else:
        display = None
    # Don't bother building screenshots if we're just collecting messages.
    # Just checks if we invoked the build command with "gettext" in there somewhere
    if "gettext" in sys.argv:
        return
    all_args = map(lambda x: x['from_str_arg'], env.screenshot_all_screenshots)
    # If building in a different language, start the server in a different language
    command = SCREENSHOT_COMMAND + SCREENSHOT_COMMAND_OPTS + \
        [re.sub(r"\s", r"", "--from-str={0}".format(json.dumps(all_args)))]
    language = env.config.language
    if language:
        command += ["--lang={0}".format(language)]
    subprocess = Popen(command)
    subprocess.wait()
    try:
        if subprocess.returncode:
            raise Exception("Screenshot process had nonzero return code: {0}".format(
                subprocess.returncode))
    finally:
        if display:
            display.stop()
def run():
    # read input variables
    ABR_ALG = args.abr_alg  # abr algorithm to execute
    TIME = args.time_seconds  # time to sleep in seconds
    SERVER_ADDR = args.server_addr  # server address to open
    STREAM_ID = str(args.stream_id)
    TRACE = args.trace
    EXP_ID = args.result_dir + '/log_' + ABR_ALG + '_' + TRACE + '_' + STREAM_ID  # path to log file
    # print >> sys.stderr, 'udp', args.udp
    if args.udp:
        url = 'http://localhost/' + 'myindex_' + ABR_ALG + '_udp.html'
    else:
        url = 'http://localhost/' + 'myindex_' + ABR_ALG + '.html'

    # timeout signal
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(TIME + 30)

    try:
        port = ABR_SERVER_PORT_OFFSET + args.stream_id
        # Note: all the video servers have to take the same params
        log_file_dir_abr_server = os.path.join(args.result_dir, 'result')
        if not os.path.exists(log_file_dir_abr_server):
            os.makedirs(log_file_dir_abr_server, 0o777)
        python_v = 'python3'
        command = [
            python_v, args.server_module,
            str(port), ABR_ALG, EXP_ID,
            str(TIME), args.result_dir, STREAM_ID
        ]
        if args.debug:
            command.append('--debug')
        if args.display:
            command.append('--display')
        global proc
        cmd = ''
        for x in command:
            cmd += x + ' '
        logger.info("Starting the server located at {}".format(command[1]))
        proc = subprocess.Popen([cmd], stdout=subprocess.PIPE, shell=True)
        sleep(10)

        url += '?p=' + str(port)
        # This has to be the only print statement up to this point, because every
        # print's output is passed to competitive_tests.py using pipes.
        print(port)
        sys.stdout.flush()
        # r = requests.post('http://localhost:' + str(port), json={'suggested_bitrate': 4300})

        # do not display the page in a browser (unless the -b option is given)
        if not args.show_browser:
            logger.info("Not displaying the browser")
            display = Display(visible=0, size=(300, 400))
            display.start()

        # init chrome driver
        '''
        default_chrome_user_dir = 'abr_browser_dir/chrome_data_dir'
        chrome_user_dir = '/tmp/chrome_user_dir_id_'
        os.system('rm -r ' + chrome_user_dir)
        os.system('cp -r ' + default_chrome_user_dir + ' ' + chrome_user_dir)
        chrome_driver = 'abr_browser_dir/chromedriver'
        '''
        options = Options()
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--autoplay-policy=no-user-gesture-required')
        options.add_argument("--disable-infobars")
        options.add_argument('--disable-application-cache')
        options.add_argument('--media-cache-size=1')
        options.add_argument("--disk-cache-size=1")
        options.add_argument("--disable-web-security")  # only needed when running tests over the UDP proxy
        options.add_argument("--explicitly-allowed-ports=6000")
        options.add_argument("--auto-open-devtools-for-tabs")
        logger.info("Options have been added to chrome driver")

        # enable quic
        if args.quic:
            logger.info("Enabling quic")
            options.add_argument('--no-proxy-server')
            options.add_argument('--enable-quic')
            options.add_argument('--quic-version=QUIC_VERSION_39')
            options.add_argument('--quic-host-whitelist="https://' + SERVER_ADDR +
                                 '" "https://' + SERVER_ADDR + '"')
            options.add_argument('--origin-to-force-quic-on=' + SERVER_ADDR)

        # start chrome
        # driver = webdriver.Chrome(chrome_driver, chrome_options=options)
        driver_path = './src/chromedriver'
        driver = webdriver.Chrome(chrome_options=options, executable_path=driver_path)
        driver.set_page_load_timeout(30)
        driver.get(url)
        logger.info("Chrome driver started")

        # run for TIME seconds
        wait_for_video_end(pipe_out=proc.stdout, timeout=TIME)
        logger.info("Video ended")
        driver.quit()
        logger.info("Driver quitted")
        if not args.show_browser:
            logger.info("Stopping display")
            display.stop()

        logger.info("Sending SIGINT to the video server")
        proc.kill()
        proc.wait()
    except Exception as e:
        logging.error(traceback.format_exc())
        try:
            display.stop()
        except:
            logging.error(traceback.format_exc())
        try:
            driver.quit()
        except:
            logging.error(traceback.format_exc())
        try:
            proc.kill()
            proc.wait()
        except:
            logging.error(traceback.format_exc())
# The enclosing loop header is not part of this fragment; a plain index loop
# over webslist is assumed from the body below.
for i in range(len(webslist)):
    print i + 1, str(webslist[i])
    log.write(str(i + 1) + str(webslist[i]) + '\n')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--proxy-server={0}".format(proxy.proxy))
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.set_page_load_timeout(60)
    proxy.new_har()
    web = webslist[i]
    try:
        driver.get(web)
    except:
        print "Error"
        log.write('Error\n')
        driver.quit()
        continue
    else:
        proxy.har  # access the HAR property so the capture is materialized
        filename = 'httpdata22/' + str(i) + '.har'
        out_f = open(filename, 'w')
        json.dump(proxy.har, out_f)
        out_f.close()
        driver.quit()

server.stop()
display.stop()
log.close()
class PassQuiz:

    def __init__(self):
        self.url = 'http://elearning.surgeons.org/course/view.php?id=127&section=0'
        self.display = Display(visible=0, size=(1024, 768))
        self.display.start()
        self.driver = webdriver.Firefox()

    def sign_in(self):
        """Authorization"""
        EMAIL = os.getenv('EMAIL')
        EMAIL_PASSWORD = os.getenv('PASSWORD')
        # find and fill inputs in form
        username = self.driver.find_element_by_name("username")
        username.send_keys(EMAIL)
        password = self.driver.find_element_by_name("password")
        password.send_keys(EMAIL_PASSWORD)
        # send form data
        self.driver.find_element_by_id("regularsubmit").click()

    def get_body(self):
        return self.driver.page_source

    def pass_quiz(self):
        driver = self.driver
        driver.get(self.url)
        self.sign_in()
        for s in range(1, 5):
            for j in range(1, 4):
                driver.get('http://elearning.surgeons.org/course/view.php?id=127&section=0')
                category = driver.find_element_by_xpath(
                    '//li[@id="section-' + str(s) + '"]/div/ul/li[' + str(j) +
                    ']/div/div/div/div/a')
                category.click()
                for i in range(0, 500):
                    re_attempt = driver.find_element_by_xpath(
                        "//input[@value='Re-attempt quiz'] | "
                        "//input[@value='Attempt quiz now']")
                    re_attempt.click()
                    finish_quiz = driver.find_element_by_class_name("endtestlink")
                    finish_quiz.click()
                    finish_all = driver.find_element_by_xpath(
                        "//input[@value='Submit all and finish']")
                    finish_all.click()
                    time.sleep(2)
                    confirm = driver.find_element_by_xpath(
                        "//div[@class='confirmation-dialogue']/div/"
                        "input[@value='Submit all and finish']")
                    confirm.click()
                    finish_review = driver.find_element_by_partial_link_text('Finish review')
                    finish_review.click()

    def __del__(self):
        self.driver.delete_all_cookies()
        self.driver.close()
        self.display.stop()
def main():
    argv = None
    ## Set of test arguments, uncomment to try the crawler
    # argv = ["cookiebot", "-u", "https://purplemath.com/", "-u", "https://gamefly.com/", "-n", "2"]
    # argv = ["onetrust", "-n", "5", "-u", "https://www.metabomb.net/", "-u", "https://www.maytag.com/", "-u", "https://www.aveda.com/", "-u", "https://www.equipmenttrader.com/", "-u", "https://www.tiffany.com/"]
    # argv = ["all", "-n", "1", "-u", "https://www.equipmenttrader.com/"]

    # parse usage docstring and get arguments
    cargs = docopt(__doc__, argv=argv)
    sites: Set[str] = retrieve_cmdline_urls(cargs)
    filtered_sites: List[str] = filter_bad_urls_and_sort(sites)

    # safety check
    if len(filtered_sites) == 0:
        print("Website crawl list is empty. Aborting...", file=sys.stderr)
        return 1

    # set up OpenWPM
    num_browsers = int(cargs["--num_browsers"])
    manager_params, browser_params = TaskManager.load_default_params(num_browsers)
    for i in range(num_browsers):
        setup_browser_config(browser_params[i])

    # define output directories
    manager_params["output_format"] = "local"
    manager_params["log_directory"] = "./logs/"
    os.makedirs(manager_params["log_directory"], exist_ok=True)

    # define log file and database paths
    now = datetime.now().strftime('%Y%m%d_%H%M%S')
    manager_params["log_file"] = f"crawl_{now}.log"

    # Database filename
    if cargs["--use_db"]:
        db_path, db_fn = os.path.split(cargs["--use_db"])
        manager_params["data_directory"] = db_path
        manager_params["database_name"] = db_fn
    else:
        manager_params["data_directory"] = "./collected_data/"
        manager_params["database_name"] = f"crawl_data_{now}.sqlite"
    os.makedirs(manager_params["data_directory"], exist_ok=True)

    # activate pyvirtualdisplay
    disp = Display(backend="xvfb")
    disp.start()

    # prevent shutdown due to failures
    manager_params["failure_limit"] = 16384

    # setting up the TaskManager creates the logger; then we can retrieve a sub-logger and set it up
    manager = TaskManager.TaskManager(manager_params, browser_params)
    logger = manager.logger

    total_commands = len(filtered_sites)

    # callback, executed once a command sequence completes
    def progress_report(success: bool):
        global completed, interrupted
        if success:
            completed += 1
            logger.info("Command sequence completed.")
        else:
            interrupted += 1
            logger.warning("Command sequence has been interrupted!")
        logger.info("%i/%i completed, %i/%i interrupted" %
                    (completed, total_commands, interrupted, total_commands))

    # crawl each site; some parameters here can be altered if needed
    for j, site in enumerate(filtered_sites):
        command_sequence = CommandSequence.CommandSequence(
            site, site_rank=j, reset=True, blocking=False, callback=progress_report)
        if cargs["all"]:
            # CMP crawl and browse functions consolidated into the same command;
            # this is done so that browsing can be aborted early if no CMP is found
            command_sequence.run_consent_crawl(num_links=10, sleep=1.0, timeout=180,
                                               abort_browse_early=True,
                                               subpage_timeout=10.0)
        else:
            # legacy variants of the consent crawler commands; only a single CMP active
            if cargs["cookiebot"]:
                command_sequence.try_extract_cookiebot_data(sleep=1.0, timeout=60)
            elif cargs["onetrust"]:
                command_sequence.try_extract_onetrust_data(sleep=1.0, timeout=60)
            elif cargs["termly"]:
                command_sequence.try_extract_termly_data(sleep=1.0, timeout=60)
            # browse the page to retrieve additional cookies
            command_sequence.browse(num_links=20, sleep=1.0, timeout=120,
                                    subpage_timeout=10.0)
        # Execute the two commands
        manager.execute_command_sequence(command_sequence)

    # shut down the browsers and wait for the data to finish logging
    manager.close()
    disp.stop()
    return 0
def multi_mode(cli_parsed):
    dbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
    dbm.open_connection()
    if not cli_parsed.resume:
        dbm.initialize_db()
    dbm.save_options(cli_parsed)
    m = Manager()
    targets = m.Queue()
    lock = m.Lock()
    multi_counter = m.Value('i', 0)
    display = None

    def exitsig(*args):
        dbm.close()
        if current_process().name == 'MainProcess':
            print ''
            print 'Resume using ./EyeWitness.py --resume {0}'.format(cli_parsed.d + '/ew.db')
        os._exit(1)

    signal.signal(signal.SIGINT, exitsig)
    if cli_parsed.resume:
        pass
    else:
        url_list, rdp_list, vnc_list = target_creator(cli_parsed)
        if any((cli_parsed.web, cli_parsed.headless)):
            for url in url_list:
                dbm.create_http_object(url, cli_parsed)
        for rdp in rdp_list:
            dbm.create_vnc_rdp_object('rdp', rdp, cli_parsed)
        for vnc in vnc_list:
            dbm.create_vnc_rdp_object('vnc', vnc, cli_parsed)
    if any((cli_parsed.web, cli_parsed.headless)):
        if cli_parsed.web and not cli_parsed.show_selenium:
            display = Display(visible=0, size=(1920, 1080))
            display.start()
        multi_total = dbm.get_incomplete_http(targets)
        if multi_total > 0:
            if cli_parsed.resume:
                print 'Resuming Web Scan ({0} Hosts Remaining)'.format(str(multi_total))
            else:
                print 'Starting Web Requests ({0} Hosts)'.format(str(multi_total))
        if multi_total < cli_parsed.threads:
            num_threads = multi_total
        else:
            num_threads = cli_parsed.threads
        for i in xrange(num_threads):
            targets.put(None)
        try:
            workers = [Process(target=worker_thread,
                               args=(cli_parsed, targets, lock,
                                     (multi_counter, multi_total)))
                       for i in xrange(num_threads)]
            for w in workers:
                w.start()
            for w in workers:
                w.join()
        except Exception as e:
            print str(e)
        # Set up UA table here
        if cli_parsed.cycle is not None:
            ua_dict = get_ua_values(cli_parsed.cycle)
            if not cli_parsed.ua_init:
                dbm.clear_table("ua")
                completed = dbm.get_complete_http()
                completed[:] = [x for x in completed if x.error_state is None]
                for item in completed:
                    for browser, ua in ua_dict.iteritems():
                        dbm.create_ua_object(item, browser, ua)
                cli_parsed.ua_init = True
                dbm.clear_table("opts")
                dbm.save_options(cli_parsed)
            for browser, ua in ua_dict.iteritems():
                targets = m.Queue()
                multi_counter.value = 0
                multi_total = dbm.get_incomplete_ua(targets, browser)
                if multi_total > 0:
                    print("[*] Starting requests for User Agent {0}"
                          " ({1} Hosts)").format(browser, str(multi_total))
                if multi_total < cli_parsed.threads:
                    num_threads = multi_total
                else:
                    num_threads = cli_parsed.threads
                for i in xrange(num_threads):
                    targets.put(None)
                workers = [Process(target=worker_thread,
                                   args=(cli_parsed, targets, lock,
                                         (multi_counter, multi_total),
                                         (browser, ua)))
                           for i in xrange(num_threads)]
                for w in workers:
                    w.start()
                for w in workers:
                    w.join()
    if any((cli_parsed.vnc, cli_parsed.rdp)):
        log._LOG_LEVEL = log.Level.ERROR
        multi_total, targets = dbm.get_incomplete_vnc_rdp()
        if multi_total > 0:
            print ''
            print 'Starting VNC/RDP Requests ({0} Hosts)'.format(str(multi_total))
            app = QtGui.QApplication(sys.argv)
            timer = QTimer()
            timer.start(10)
            timer.timeout.connect(lambda: None)
            # add qt4 reactor
            import qt4reactor
            qt4reactor.install()
            from twisted.internet import reactor
            for target in targets:
                if os.path.dirname(cli_parsed.d) != os.path.dirname(target.screenshot_path):
                    target.set_paths(cli_parsed.d)
                tdbm = db_manager.DB_Manager(cli_parsed.d + '/ew.db')
                if target.proto == 'vnc':
                    reactor.connectTCP(
                        target.remote_system, target.port,
                        vnc_module.RFBScreenShotFactory(
                            target.screenshot_path, reactor, app, target, tdbm))
                else:
                    reactor.connectTCP(
                        target.remote_system, int(target.port),
                        rdp_module.RDPScreenShotFactory(
                            reactor, app, 1200, 800, target.screenshot_path,
                            cli_parsed.timeout, target, tdbm))
            reactor.runReturn()
            app.exec_()
    if display is not None:
        display.stop()
    results = dbm.get_complete_http()
    vnc_rdp = dbm.get_complete_vnc_rdp()
    dbm.close()
    m.shutdown()
    write_vnc_rdp_data(cli_parsed, vnc_rdp)
    sort_data_and_write(cli_parsed, results)
    if cli_parsed.ocr:
        for target in targets:
            try:
                rdp_module.parse_screenshot(cli_parsed.d, target)
            except IOError:
                pass
def single_mode(cli_parsed):
    display = None
    if cli_parsed.web:
        create_driver = selenium_module.create_driver
        capture_host = selenium_module.capture_host
        if not cli_parsed.show_selenium:
            display = Display(visible=0, size=(1920, 1080))
            display.start()
    elif cli_parsed.headless:
        if not os.path.isfile(os.path.join(
                os.path.dirname(os.path.realpath(__file__)), 'bin', 'phantomjs')):
            print(" [*] Error: You are missing your phantomjs binary!")
            print(" [*] Please run the setup script!")
            sys.exit(0)
        create_driver = phantomjs_module.create_driver
        capture_host = phantomjs_module.capture_host
    url = cli_parsed.single
    http_object = objects.HTTPTableObject()
    if cli_parsed.active_scan:
        http_object._active_scan = True
    http_object.remote_system = url
    http_object.set_paths(cli_parsed.d, 'baseline' if cli_parsed.cycle is not None else None)
    web_index_head = create_web_index_head(cli_parsed.date, cli_parsed.time)
    if cli_parsed.cycle is not None:
        print 'Making baseline request for {0}'.format(http_object.remote_system)
    else:
        print 'Attempting to screenshot {0}'.format(http_object.remote_system)
    driver = create_driver(cli_parsed)
    result, driver = capture_host(cli_parsed, http_object, driver)
    result = default_creds_category(result)
    if cli_parsed.resolve:
        result.resolved = resolve_host(result.remote_system)
    driver.quit()
    if cli_parsed.cycle is not None and result.error_state is None:
        ua_dict = get_ua_values(cli_parsed.cycle)
        for browser_key, user_agent_value in ua_dict.iteritems():
            print 'Now making web request with: {0} for {1}'.format(
                browser_key, result.remote_system)
            ua_object = objects.UAObject(browser_key, user_agent_value)
            ua_object.copy_data(result)
            driver = create_driver(cli_parsed, user_agent_value)
            ua_object, driver = capture_host(cli_parsed, ua_object, driver)
            ua_object = default_creds_category(ua_object)
            result.add_ua_data(ua_object)
            driver.quit()
    if display is not None:
        display.stop()
    html = result.create_table_html()
    with open(os.path.join(cli_parsed.d, 'report.html'), 'w') as f:
        f.write(web_index_head)
        f.write(create_table_head())
        f.write(html)
        f.write("</table><br>")
class Scrape(object):

    def __init__(self, log=False):
        self.log = log
        self.display = Display(visible=0, size=(800, 2400))
        self.display.start()
        logging.info('Initialized virtual display..')
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_experimental_option(
            'prefs', {
                'download.default_directory': os.getcwd(),
                'download.prompt_for_download': False,
            })
        logging.info('Prepared chrome options..')
        self.browser = webdriver.Chrome(chrome_options=chrome_options)
        logging.info('Initialized chrome browser..')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.browser.quit()
        self.display.stop()

    def click_through(self, button):
        button.click()

        def link_has_gone_stale():
            try:
                # poll the link with an arbitrary call
                button.find_elements_by_id('doesnt-matter')
                return False
            except exceptions.StaleElementReferenceException:
                return True

        wait_for(link_has_gone_stale)

    def scrape(self, url):
        self.browser.get(url)
        logging.info('Accessed %s ..', url)
        logging.info('Page title: %s', self.browser.title)

    def download_nsw(self, year=1989, retry=5):
        '''
        @param retry Number of retry attempts on captcha
        '''
        self.scrape('https://www.apps08.osr.nsw.gov.au/erevenue/ucm/ucm_list.php')
        cobj = Captcha("//img[@id='captcha']", "//input[@id='gd_securityCode']",
                       "//button[@id='captcha']", retry, self.log)
        cobj.bypass(self.browser)
        er = self.browser.find_element_by_xpath("//select[@name='g_range']")
        select = Select(er)
        select.select_by_value('6')
        for y in range(year, 2019):
            for q in range(1, 8, 6):
                if q == 7 and y == 2018:
                    break
                ed = self.browser.find_element_by_xpath("//input[@id='g_date']")
                ed.send_keys(Keys.CONTROL + 'a')
                ed.send_keys('01/0%d/%d' % (q, y))
                esb = self.browser.find_element_by_xpath("//button[@id='g_submit']")
                esb.click()
                time.sleep(1)
                eeb = self.browser.find_element_by_xpath("//button[@id='OpenResultDialog']")
                eeb.click()
                time.sleep(1)
                eeb = self.browser.find_element_by_xpath("//button[@name='export_download']")
                eeb.click()
                time.sleep(20)
                os.rename("ucmlist.slk", "ucmlist-%d-0%d-01.slk" % (y, q))
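# Hypothetical usage of Scrape above (not from the original source): the
# context-manager protocol guarantees the Chrome browser and the virtual
# display are torn down when the block exits, even on an exception.
with Scrape(log=True) as s:
    s.scrape('https://www.apps08.osr.nsw.gov.au/erevenue/ucm/ucm_list.php')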
class Scraper():
    """Scraper parent class, child classes are media streaming sites."""

    def __init__(self):
        """Sets creds for each instance."""
        with open('creds.json', 'r') as f:
            self.creds = json.loads(f.read())

    def start_driver(self, window_size='--window-size=1920,1080'):
        """Starts headless chrome browser/driver."""
        logging.info('starting driver')
        self.display = Display(visible=0)
        # self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')  # likely necessary
        options.add_argument(window_size)
        self.driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
        self.driver.implicitly_wait(10)  # seconds

    def stop_driver(self):
        """Stops headless browser/driver."""
        logging.info('stopping driver')
        self.display.stop()
        self.driver.quit()

    def lookup_and_write_medias(self, medias, mtype):
        """Takes list of movies or shows, searches themoviedb, creates object
        to write to database, then inserts if new or updates timestamp if not new.
        """
        logging.info('len(medias) before take unique: {}'.format(len(medias)))
        # get unique: list of dict into list of tuples, set, back to dict
        medias = [dict(t) for t in set([tuple(d.items()) for d in medias])]
        logging.info('len(medias) after take unique: {}'.format(len(medias)))
        for m in medias:
            source_to_write = dict(self.source)
            # if media link exists, set source link, try link db lookup / update
            if 'link' in m.keys():
                source_to_write['link'] = m['link']
                full_media = flaskapp.db_lookup_via_link(m['link'])
                if full_media:
                    # logging.info(u'db media link found: {}'.format(m['title']))
                    flaskapp.update_media_with_source(full_media, source_to_write)
                    continue
            # link url was not in database, therefore do themoviedb search
            sleep(0.2)
            year = m.get('year', '')
            results = flaskapp.themoviedb_search(m['title'], mtype, year=year)
            # exit iteration if search not complete or no results
            if 'total_results' not in results:
                logging.error(u'tmdb search not complete for {}: {} {}'.format(
                    mtype, m['title'], year))
                continue
            if results['total_results'] < 1:
                logging.warning(u'tmdb 0 results for {}: {} {}'.format(
                    mtype, m['title'], year))
                # empty media for db write, prevent re-searching
                full_media = dict()
                full_media['title'] = m['title']
                full_media['mtype'] = mtype
                full_media['year'] = year
                full_media['id'] = m['link']
                full_media['sources'] = []
            else:
                # assume top result is best match and use it
                full_media = results['results'][0]
                # append data so dict can be saved to database
                full_media['mtype'] = mtype
                full_media['sources'] = []
                if mtype == 'movie':
                    full_media['year'] = full_media['release_date'][:4]
                else:
                    full_media['title'] = full_media['name']
                    full_media['year'] = full_media['first_air_date'][:4]
                # check if titles are not exact match, in future may not append these
                if not flaskapp.doTitlesMatch(m['title'], full_media['title']):
                    logging.warning(u'not exact titles: {} | {}'.format(
                        m['title'], full_media['title']))
            # write db media if new
            flaskapp.insert_media_if_new(full_media)
            # update db media with source
            flaskapp.update_media_with_source(full_media, source_to_write)

    def update_watchlist_amz(self):
        """For watchlist items check if amazon prime and amazon pay are
        sources and add to db"""
        wl_unique = flaskapp.get_all_watchlist_in_db()
        for m in wl_unique:
            media = flaskapp.themoviedb_lookup(m['mtype'], m['id'])
            flaskapp.amz_prime_check(media)
            sleep(2.5)
            flaskapp.amz_pay_check(media)
            sleep(2.5)
class SlackSpider():

    def __init__(self):
        # self.all_items = []
        self.channelList = []
        self.dataList = []
        self.pageSize = 0
        self.urlsToHit = []
        self.TeamName = ''
        self.ChannelName = ''

    # Open headless chromedriver
    def start_driver(self):
        print('starting driver...')
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.driver = webdriver.Chrome("/var/chromedriver/chromedriver")
        sleep(randint(9, 10))

    # Close chromedriver
    def close_driver(self):
        print('closing driver...')
        self.display.stop()
        self.driver.quit()
        print('closed!')

    # Tell the browser to get a page
    def get_page(self, url):
        print('getting page...{0}'.format(url))
        self.driver.get(url)
        sleep(randint(9, 10))

    # Grab items from divisions
    def grab_list_items(self):
        print('grabbing list of items...')
        senderAvatar = ''
        all_items = []
        for div in self.driver.find_elements_by_xpath('//ul[@class="messages"]//li'):
            data = self.process_elements(div, senderAvatar)
            if data:
                all_items.append(data)
                if data.senderAvatar != '':
                    senderAvatar = data.senderAvatar
        return all_items

    # Process division elements
    def process_elements(self, div, senderAvatar):
        msg_sender_avatar = ''
        try:
            msg_sender = div.find_element_by_class_name("msg-user").get_attribute('innerText')
            msg_time = div.find_element_by_class_name("msg-time").get_attribute('innerText')
            msg_body = div.find_element_by_class_name("msg-body").get_attribute('innerText')
        except Exception as error:
            print 'element not found exception'
            return None
        try:
            avatar = div.find_element_by_xpath('.//*[@class="msg-avatar"]')
            msg_sender_avatar = avatar.find_element_by_class_name('msg-thumb').get_attribute('src')
        except Exception as error:
            msg_sender_avatar = senderAvatar
        if msg_sender and msg_time and msg_body:
            archiveObj = SlackArchive()
            archiveObj.teamName = self.TeamName
            archiveObj.channelName = self.ChannelName
            archiveObj.messageBody = msg_body
            archiveObj.senderAvatar = msg_sender_avatar
            archiveObj.messageTime = msg_time
            archiveObj.messageSender = msg_sender
            return archiveObj
        else:
            return None

    # Parse the URL
    def parse(self, url):
        self.get_page(url)
        return self.grab_list_items()

    # Get list of channels in a team
    def getChannelList(self):
        for channelName in self.driver.find_elements_by_xpath(
                '//ul[@class="channels-list"]//li//a'):
            self.channelList.append(channelName.text)

    # Get the total number of pages in each channel
    def getPageSize(self, url_Template):
        for page in self.driver.find_elements_by_xpath(
                '//ul[@class="pagination pagination-vertical"]//li[@class="page-item active"]'):
            self.pageSize = int(page.text)

    # Build the list of URL's to hit
    def buildTarget(self, teamName):
        url_Template = "https://{0}.slackarchive.io/".format(teamName)
        self.get_page(url_Template)
        self.getChannelList()
        if teamName == 'buffercommunity':
            self.channelList = self.channelList[7:]
        for channel in self.channelList:
            channelName = channel[1:].strip()
            urlA = url_Template + channelName + "/"
            self.get_page(urlA)
            self.getPageSize(urlA)
            print 'Page size: {0}'.format(self.pageSize)
            for i in range(1, self.pageSize + 1):
                urlObject = []
                urlObject.append(teamName)
                urlObject.append(channelName)
                urlObject.append(urlA + "page-" + str(i))
                self.urlsToHit.append(urlObject)

    # Run the crawler
    def runSpider(self, teamName):
        self.buildTarget(teamName)
        Utils.get_Connection_SNA4Slack()
        sync_table(SlackArchive)
        for url in self.urlsToHit:
            self.TeamName = url[0]
            self.ChannelName = url[1]
            count = 0
            for data in self.parse(url[2]):
                if data:
                    count += 1
                    node_object = SlackArchive(
                        id=uuid.uuid1(),
                        teamName=data.teamName,
                        channelName=data.channelName,
                        messageSender=data.messageSender.strip(),
                        messageBody=data.messageBody.strip(),
                        senderAvatar=data.senderAvatar,
                        messageTime=dateutil.parser.parse(data.messageTime))
                    node_object.save()
            if count > 0:
                print '{0} rows saved'.format(count)
            else:
                print url[2]
                print 'No data found'
def wd_instance(driver_name, time_to_wait=30):
    """:param time_to_wait: Sets a sticky timeout to implicitly wait for an element to be found"""
    display = None
    remurl = 'http://%s:%s' % (get_config('CHROMEDRIVER_HOST'), get_config('CHROMEDRIVER_PORT'))
    if driver_name == 'firefox':
        driver = webdriver.Firefox()
    elif driver_name in user_agents.keys():
        capabilities = {}
        if driver_name == 'mobile_chrome':
            capabilities = selenium.webdriver.DesiredCapabilities.CHROME
            capabilities["chromeOptions"] = {
                'args': ['user-agent=%s' % user_agents[driver_name]],
                'extensions': []
            }
        elif driver_name != 'chrome':
            capabilities["chromeOptions"] = {
                'args': ["user-agent=%s" % user_agents[driver_name]],
                'extensions': []
            }
        if 'chromeOptions' not in capabilities:
            capabilities['chromeOptions'] = {'args': []}
        capabilities['chromeOptions']['args'].append('--test-type')
        if get_config('CHROMEDRIVER_VIRTUAL_DISPLAY'):
            chromedriver = get_config('CHROMEDRIVER_BINARY')
            os.environ['webdriver.chrome.driver'] = chromedriver
            display = Display(visible=0, size=DEFAULT_DISPLAY_SIZE)
            display.start()
            options = selenium.webdriver.ChromeOptions()
            if driver_name in user_agents:
                options.add_argument('--user-agent="%s"' % user_agents[driver_name])
            try:
                driver = selenium.webdriver.Chrome(chromedriver, chrome_options=options,
                                                   desired_capabilities=capabilities)
                driver.set_window_size(*DEFAULT_DISPLAY_SIZE)
                driver.set_window_position(0, 0)
            except Exception as expt:
                print(expt)
                print('-' * 80)
                raise Exception('failed to instantiate webdriver '
                                'with binary path %s' % chromedriver)
        else:
            try:
                if get_config('CHROMEDRIVER_HOST') not in ['localhost', '127.0.0.1']:
                    remurl += '/wd/hub'
                capabilities = selenium.webdriver.DesiredCapabilities.CHROME
                driver = selenium.webdriver.Remote(remurl, desired_capabilities=capabilities)
                driver.set_window_size(*DEFAULT_DISPLAY_SIZE)
                driver.set_window_position(0, 0)
            except Exception as expt:
                print('-' * 80)
                raise Exception('could not connect to selenium at %s; '
                                'CHECK THAT YOU HAVE CHROMEDRIVER RUNNING - '
                                'http://code.google.com/p/chromedriver/downloads/list '
                                'Exception: %s' % (remurl, str(expt)))
    else:
        raise Exception('Driver not defined!')
    if display is not None:
        driver.display_stop = lambda: display.stop()
    driver.implicitly_wait(time_to_wait)
    return driver
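# Hypothetical usage of wd_instance above (not from the original source).
# When CHROMEDRIVER_VIRTUAL_DISPLAY is set, the function attaches a
# display_stop hook to the driver; the caller must invoke it after quitting,
# since quit() alone does not stop the Xvfb display.
driver = wd_instance('chrome', time_to_wait=15)
try:
    driver.get('https://example.com')
finally:
    driver.quit()
    if hasattr(driver, 'display_stop'):
        driver.display_stop()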
class LexisNexisSpider(scrapy.Spider):
    name = 'lexisnexis'
    start_urls = []
    s_date = ''
    e_date = ''
    c_date = ''
    page_cnt = 1
    dont_filter = True
    agency_list = []
    '''
    today = datetime.now() + timedelta(days = -3)
    date = str(today)[0:10]
    year = date[0:4]
    month = date[5:7]
    day = date[8:10]
    '''

    ''' Constructor '''
    def __init__(self, keyword='nation', *args, **kwargs):
        self.keyword = keyword
        self.start_urls = ['http://www.google.com']
        super(LexisNexisSpider, self).__init__(*args, **kwargs)
        self.display = Display(visible=0, size=(1280, 1024))
        self.display.start()
        profile = webdriver.FirefoxProfile()
        profile.native_events_enabled = True
        self.driver = webdriver.Firefox(profile)
        # self.driver2 = webdriver.Firefox(profile)
        self.driver.get(self.get_query_url(self.keyword))
        time.sleep(3)

    def __del__(self):
        self.driver.quit()  # quit() is enough; a separate close() first is redundant
        self.display.stop()
        print '************************************************************************'
        print 'CLOSED!!!'

    ''' Get the query url '''
    def get_query_url(self, keyword):
        today = datetime.now() + timedelta(days=-25)
        date = str(today)[0:10]
        year = date[0:4]
        month = date[5:7]
        day = date[8:10]
        return 'http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28' + keyword + '%29%20and%20Date%28geq%28' + month + '/' + day + '/' + year + '%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075%2C11810%2C306884%2C247189%2C163823%2C301477&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true'
    '''
    csi source codes used above:
    #The New York Times
    +'%2C6742' \ # USA TODAY
    +'%2C8213' \ #Wall Street Journal Abstracts
    +'%2C8142' \ #The Washington Post
    +'%2C8075' \ #Post-Dispatch
    +'%2C11810' \ #The Baltimore Sun
    +'%2C306884' \ #The Philadelphia Inquirer
    +'%2C247189' \ #Chicago Daily Herald
    +'%2c163823' #Arizona Capitol Times
    +'%2c301477'
    '''
    #return 'http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28' + keyword + '%29%20and%20Date%28geq%28' + month + '/' + day + '/' + year + '%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true'
    #return 'http://www.lexisnexis.com/lnacui2api/api/version1/sr?sr=%28' + keyword + '%29%20and%20Date%28geq%284/5/2011%29%29&csi=8006%2C6742%2C8213%2C8142%2C8075&oc=00006&hgn=t&hl=t&hes=t&hnsl=t&hsl=t&hdym=t&hfb=t&ssl=f&stp=bool&icvrpg=true'

    def next_page(self, start_index):
        try:
            next_button = self.driver.find_element_by_xpath(
                '//table//table//table//table//table//table//td[@align="right"]/a/img[@src="images/IconPaginationNext.gif"]')
        except:
            return False
        risb = self.driver.find_element_by_xpath(
            '//input[@name="risb"]').get_attribute("value")
        nexpage = "http://www.lexisnexis.com/lnacui2api/results/listview/listview.do?start=" + str(
            start_index) + "&sort=RELEVANCE&format=GNBLIST&risb=" + risb
        self.driver.get(nexpage)
        time.sleep(2)
        source = self.driver.find_element_by_xpath(
            '//frame[@title="Results Content Frame"]')
        self.driver.get(source.get_attribute("src"))
        time.sleep(2)
        return True

    '''
    Starting point.
    Retrieve the news links from the list of search results.
    Args:
        response - the response object pertaining to the search results page
    '''
    def parse(self, response):
        button_continue = self.driver.find_element_by_xpath(
            '//a[@id="firstbtn"]')
        try:
            button_continue.click()
        except:
            print "can't find continue button"
        source = self.driver.find_element_by_xpath(
            '//frame[@title="Results Content Frame"]')
        self.driver.get(source.get_attribute("src"))
        time.sleep(5)
        item_list = list()
        start_id = 1
        while self.next_page(start_id):
            noshade_list = self.driver.find_elements_by_xpath(
                '//tr[@class="noshaderow1st"]')
            shade_list = self.driver.find_elements_by_xpath(
                '//tr[@class="shaderow1st"]')
            for news in noshade_list + shade_list:
                button = news.find_element_by_xpath('.//a')
                news_title = button.text
                news_url = button.get_attribute("href")
                news_agency = news.find_element_by_xpath(
                    './/span[@class="notranslate"]').text
                article = LexisnexisArticleItem()
                article['title'] = news_title
                article['url'] = news_url
                article['agency'] = news_agency
                item_list.append(article)
            start_id += 25
        print "++++++++++++++++++", len(item_list)
        for article in item_list:
            self.driver.get(article['url'])
            time.sleep(2)
            try:
                source = self.driver.find_element_by_xpath(
                    '//frame[@title="Results Document Content Frame"]')
                self.driver.get(source.get_attribute('src'))
                time.sleep(2)
                date_str = self.driver.find_element_by_xpath(
                    '//span[@class="verdana"]/center').text
                news_date = self.parse_date(date_str)
                news_id = self.driver.find_element_by_xpath(
                    '//input[@name="docIdentifier"]')
                news_id = news_id.get_attribute('value')
                news_content_list = self.driver.find_elements_by_xpath(
                    '//span[@class="verdana"]/p[@class="loose"]')
                news_content_list = [n.text for n in news_content_list]
                news_content = '.'.join(news_content_list)
                # Get keywords
                rake = Rake()
                keywords_list = rake.run(news_content)
                keywords = '\n'.join(keywords_list)
                tag = rake.get_tagged_text()
                article['aid'] = news_id
                article['date'] = news_date
                article['contents'] = news_content
                article['keywords'] = keywords
                article['tagged_text'] = tag
            except Exception, e:
                print 'ERROR!!!!!!!!!!!!! URL :'
                traceback.print_exc(file=sys.stdout)
            yield article
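Both next_page() and parse() reuse the same trick for the LexisNexis frames: rather than switch_to.frame(), they read the frame's src attribute and navigate to it directly, so subsequent lookups run against a plain document. A minimal sketch of that helper, with the frame title as a parameter:

def follow_frame(driver, title):
    # navigate into the frame by loading its source URL directly
    frame = driver.find_element_by_xpath('//frame[@title="%s"]' % title)
    driver.get(frame.get_attribute("src"))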
class BrowserWebdriver(BrowserBase): skip_urls = [] def __init__(self, *args, **kwargs): BrowserBase.__init__(self, *args, **kwargs) self._first_navigation_ts = None self._first_navigation_netloc = None self._ts_offset = None def _skip_url(self, page, url): if not url: return False _, req_netloc, _ = parse_url(url) for su in self.skip_urls: if su in req_netloc: _, page_netloc, _ = parse_url(page.url) if not any(x in page_netloc for x in self.skip_urls): self.log_debug("skipping URL %s" % req_netloc) return True return False def _browser_clear_caches(self): BrowserBase._browser_clear_caches(self) self.driver.quit() self.pid = self.browser_start() def _browser_navigate(self, location, cached=True, name=None): url = location.url if isinstance(location, Page) else location real_navigation = self._http_get(url) return Page(self, url, cached, name=name, real_navigation=real_navigation) def _browser_wait(self, page, timeout=None): self.log_info("_browser_wait()...") if timeout is None: timeout = self.nav_timeout start = time.time() while time.time() - start < timeout / 2: time.sleep(0.2) if self.driver.execute_script( "return window.performance.timing.loadEventEnd"): break # onload event has not been processed yet, so need to wait and retry self.log_info("Waiting for loadEventEnd ... ") while time.time() - start < timeout: time.sleep(self.ajax_threshold) # hack. Execute something in browser context to flush logs... self.driver.execute_script( "return window.performance.timing.loadEventEnd") self._browser_get_events(page) ir = page.get_incomplete_reqs() if not ir: break self.log_info( "Waiting for incomplete requests:\n %s" % ("\n ".join(["%s - %s" % (r.id, r.url) for r in ir]))) if time.time() - start >= timeout: if not self.driver.execute_script( "return window.performance.timing.loadEventEnd"): self.log_error( "Page '%s' load timeout, window.performance.timing.loadEventEnd = 0" % page.url) ir = page.get_incomplete_reqs() if ir: self.log_error( "Can't wait for page '%s' load completion, " "see '%s' for details\nincomplete requests:\n %s" % (page.url, self.log_path, "\n ".join( ["%s - %s" % (r.id, r.url) for r in ir]))) page.complete(self) def _browser_warmup_page(self, location, name=None): self.navigate_to(location, cached=False, stats=False, name=name) def _browser_display_init(self, headless, resolution): if headless: try: from pyvirtualdisplay import Display except ImportError as e: abort(e) self.display = Display(visible=0, size=resolution) self.display.start() else: self.display = None def _browser_execute_script(self, js): val = self.driver.execute_script("return %s" % js) self.log_debug("%s = %s" % (js, val)) return val def browser_get_name(self): c = self.driver.capabilities return c['browserName'] def browser_get_version(self): c = self.driver.capabilities return self._get_val(c, ['version', 'browserVersion']) def browser_get_platform(self): c = self.driver.capabilities return self._get_val(c, ['platform', 'platformName']) def browser_get_screenshot_as_file(self, filename): self.driver.get_screenshot_as_file(filename) def browser_get_page_timeline(self, page): values = {} for t in PageTimeline.types: if t in PageTimeline.jstypes: js = "window.performance.timing.%s" % PageTimeline.jstypes[t] values[t] = self._browser_execute_script(js) return PageTimeline(page, values) # def browser_set_session(self, domain, session_id): # self._http_get(domain) # self.driver.add_cookie({'name': 'sessionid', 'value': session_id}) def browser_get_current_url(self): return self.driver.current_url def 
browser_get_screenshot(self, filename): self.driver.get_screenshot_as_file(filename) def browser_stop(self): try: if self.driver: self.driver.quit() self.driver = None if self.display: self.display.stop() self.display = None except URLError: pass def _xpath_click(self, xpath): exc = None # take into account possible replacements of %23/# xpaths = [xpath] if "%23" in xpath: xpaths.append(xpath.replace("%23", "#")) if "#" in xpath: xpaths.append(xpath.replace("#", "%23")) for x in xpaths: self.log_debug("Looking for xpath: %s ..." % x) try: el = self.driver.find_element_by_xpath(x) el.click() self.log_debug("Looking for xpath: %s ... OK" % x) return except NoSuchElementException as e: self.log_debug( "Looking for xpath: %s ... Failed, no such element" % x) exc = e except ElementNotVisibleException as e: self.log_warning( "Looking for xpath: %s ... Failed, element not visible" % x) exc = e self.log_error("NoSuchElementException, xpath: %s, see debug log" % xpath) self.log_debug("page source:\n%s" % self.driver.page_source.encode('ascii', 'ignore')) raise BrowserExc(e) def _http_get(self, url, validator=None): self.log_debug("Execute GET request: %s" % url) if not self._first_navigation_ts: self._first_navigation_ts = time.time() _, self._first_navigation_netloc, _ = parse_url(url) ar = url.split("^") if len(ar) > 1: self._xpath_click(ar[1]) return False try: self.driver.get(url) except WebDriverException as e: raise BrowserExc(e) return True @staticmethod def _get_val(d, keys): for key in keys: if key in d: return d[key] return "unknown" def print_browser_info(self): c = self.driver.capabilities self.print_stats_title("Browser summary") print(" - platform: %s" % self.browser_get_platform()) print(" - browser: %s %s" % (self.browser_get_name(), self.browser_get_version())) print(" - PID: %d" % self.pid) print(" - log file: %s" % self.log_path) def print_log_file_path(self): self.print_stats_title("Browser log file") print(" %s" % self.log_path) # === virtual methods that must be implemented in every webdriver-based browser === # def _browser_parse_logs(self, page, logs): raise BrowserExcNotImplemented() def _browser_get_events(self, page): raise BrowserExcNotImplemented() # === webdriver specific === # def dom_wait_element_stale(self, el, timeout_s=None, name=None): start_time = time.time() if timeout_s is None: timeout_s = self.nav_timeout # http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html while time.time() < start_time + timeout_s: try: el.find_elements_by_id('doesnt-matter') pass except StaleElementReferenceException: break time.sleep(0.1) if time.time() > start_time + timeout_s: msg = "DOM element '%s' click() timeout: %.1fs" % ( name, time.time() - start_time) self.log_error(msg) raise BrowserExcTimeout(msg) def dom_click(self, el, timeout_s=None, name=None, wait_callback=None, wait_callback_obj=None): self.log_debug("dom_click(%s, %s)" % (str(el), str(name))) if timeout_s is None: timeout_s = self.nav_timeout p = Page(self, self.browser_get_current_url(), True, name=name, real_navigation=False) p.start() # 1. click on the element old_page = self.driver.find_element_by_tag_name('html') el.click() # 2. wait for selenium onclick completion if wait_callback: self.log_debug( "wait callback: %s, %s" % (str(wait_callback.__name__), str(wait_callback_obj))) wait_callback(wait_callback_obj, el, timeout_s, name) else: self.log_debug("wait stale: %s, %s, %s" % (el, timeout_s, name)) self.dom_wait_element_stale(el, timeout_s, name) # 3. 
wait for ajax completion, because browser URL can be update only after that self._browser_wait(p, timeout=timeout_s) p.url = self.browser_get_current_url() time.sleep(0.2) def dom_find_element_by_id(self, id): try: return self.driver.find_element_by_id(id) except NoSuchElementException as e: raise BrowserExc(e) def dom_find_element_by_name(self, name): try: return self.driver.find_element_by_name(name) except NoSuchElementException as e: raise BrowserExc(e) def dom_find_element_by_xpath(self, xpath): try: return self.driver.find_element_by_xpath(xpath) except NoSuchElementException as e: raise BrowserExc(e) def dom_find_frames(self): frames = [] for name in ("frame", "iframe"): try: frames += self.driver.find_elements_by_tag_name(name) except NoSuchElementException as e: pass return frames def dom_switch_to_frame(self, frame): self.log_info("Switching to frame %s" % frame) return self.driver.switch_to.frame(frame) def dom_switch_to_default_content(self): self.log_info("Switching to default content") return self.driver.switch_to.default_content() def dom_send_keys(self, el, keys): val = el.get_attribute('value') if val != '': # clear initial value self.log_info("Element value is not empty, clear content...") self.driver.execute_script("arguments[0].value = ''", el) time.sleep(2.0) for ch in keys: el.send_keys(ch) time.sleep(0.2) val = el.get_attribute('value') if val == keys: return True self.log_warning("Bogus selenium send_keys(). Entered: '%s', " "but see: '%s', using set_attribute()..." % (keys, val)) time.sleep(2.0) self.driver.execute_script("arguments[0].value = '%s'" % keys, el) time.sleep(2.0) val = el.get_attribute('value') if val == keys: self.log_info("Ok, set_attribute() works fine") return True self.log_error( "Bogus selenium send_keys() and set_attribute(), can't enter value into the element" ) return False # === some predefined scenarios === # def _do_send_keys(self, title, keys, tag_names, tag_ids): for tag, name in tag_names: try: el = self.dom_find_element_by_name(name) if el.tag_name != tag: continue if not self.dom_send_keys(el, keys): self.log_error("Couldn't enter %s" % title) return False return True except BrowserExc as e: pass for tag, name in tag_names: try: el = self.dom_find_element_by_xpath( '//*[@label="{}"]'.format(name)) if el.tag_name != tag: continue if not self.dom_send_keys(el, keys): self.log_error("Couldn't enter %s" % title) return False return True except BrowserExc as e: pass for tag, id in tag_ids: try: el = self.dom_find_element_by_id(id) if el.tag_name != tag: continue if not self.dom_send_keys(el, keys): self.log_error("Couldn't enter %s" % title) return False return True except BrowserExc as e: pass self.log_info("Couldn't find %s input field" % title) return False def _do_login(self, url, user, password, login_form, timeout_s=None): if not self._do_send_keys('user name', user, login_form.user_tags, login_form.user_ids): return False time.sleep(1) if not self._do_send_keys('password', password, login_form.pass_tags, login_form.pass_ids): return False time.sleep(1) submit_form_found = False for tag, name in login_form.sbmt_tags: try: el = self.dom_find_element_by_name(name) if el.tag_name != tag: continue submit_form_found = True self.dom_click(el, name=name, timeout_s=timeout_s) try: el = self.dom_find_element_by_name(name) except BrowserExc: self.log_info("Login succeed") return True except BrowserExc as e: pass for tag, id in login_form.sbmt_ids: try: el = self.dom_find_element_by_id(id) if el.tag_name != tag: continue submit_form_found = 
True self.dom_click(el, name=id, timeout_s=timeout_s) try: el = self.dom_find_element_by_id(id) except BrowserExc: self.log_info("Login succeeded") return True except BrowserExc as e: pass for x in login_form.sbmt_xpath: try: el = self.dom_find_element_by_xpath(x) submit_form_found = True self.dom_click(el, name=x, timeout_s=timeout_s) try: el = self.dom_find_element_by_xpath(x) except BrowserExc: self.log_info("Login succeeded") return True except BrowserExc as e: pass if not submit_form_found: self.log_info("Couldn't find login submit form") self.log_info("Login failed") return False def do_login(self, url, user, password, login_form, timeout_s=None): self.log_info("Trying to log in to '%s' as user %s" % (url, user)) self.navigate_to(url, cached=None) if self._do_login(url, user, password, login_form, timeout_s=timeout_s): return True for frame in self.dom_find_frames(): self.dom_switch_to_frame(frame) if self._do_login(url, user, password, login_form, timeout_s=timeout_s): return True self.log_info("Login to '%s' as user '%s' failed" % (url, user)) return False
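dom_wait_element_stale() hand-rolls the classic wait-for-staleness pattern from the article it links. The same idea can be written with selenium's built-in expected condition; a sketch assuming a plain WebDriver instance:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_and_wait_stale(driver, element, timeout_s=30):
    old_page = driver.find_element_by_tag_name("html")
    element.click()
    # blocks until the old <html> node is detached, i.e. navigation happened
    WebDriverWait(driver, timeout_s).until(EC.staleness_of(old_page))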
class PinterestImages():
    def __init__(self):
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        self.srchurl = 'https://in.pinterest.com/search/pins/?q=%s'
        self.base_url = self.srchurl
        self.path_to_chromedriver = './chromedriver'
        # a single Chrome instance; the original created a second one here,
        # which discarded and leaked the first
        self.browser = webdriver.Chrome(
            executable_path=self.path_to_chromedriver)
        self.browser.get('https://in.pinterest.com/login/')
        self.elem = self.browser.find_elements_by_name("username_or_email")
        self.elem[0].send_keys("*****@*****.**")
        self.elem = self.browser.find_elements_by_name("password")
        self.elem[0].send_keys("qawsedrf")
        self.elem = self.browser.find_elements_by_xpath(
            "/html/body/div[1]/div[1]/div[1]/div/div/div/form/div[4]/div/button")
        self.elem[0].click()
        self.buton = '//*[@id="yui_3_5_1_1_1440135195051_1805"]'

    def crawl(self, qry):
        def noImages(psource):
            if psource is None:
                return 0
            soup = BeautifulSoup(psource, 'lxml')
            imgs = soup.findAll('div', 'Image Module pinUiImage')
            return len(imgs)

        url = self.base_url % ('+'.join(qry))
        self.browser.get(url)
        time.sleep(1)
        pps = None
        cps = None
        for i in range(1, 20):
            self.browser.execute_script("window.scrollTo(0, %d);" % (i * 10000))
            time.sleep(10)
            cps = self.browser.page_source
            if noImages(cps) < noImages(pps):
                break
            pps = cps
        pagesource = pps
        soup = BeautifulSoup(pagesource, 'lxml')
        imgs = soup.findAll('div', 'Image Module pinUiImage')
        extractedUrls = []
        for img in imgs:
            imgd = img.findAll('img')
            url = imgd[0]['src']
            title = imgd[0]['alt'].encode('ascii', 'ignore')
            extractedUrls.append(url.replace('236x', '736x') + '\t' + title)
        with open('_'.join(sys.argv[1:]) + '_Pinterest', 'w') as outfile:
            for x in extractedUrls:
                outfile.write(x + '\n')

    def stop(self):
        self.browser.quit()
        self.display.stop()
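The crawl() loop above keeps scrolling until a scroll round stops producing more pins. A condensed sketch of that infinite-scroll harvest, using the same selector as the snippet (the break condition here uses <= where the original compared strictly, so it also stops on a plateau):

import time
from bs4 import BeautifulSoup

def scroll_until_stable(browser, pause=10, max_rounds=20):
    previous = 0
    for i in range(1, max_rounds + 1):
        browser.execute_script("window.scrollTo(0, %d);" % (i * 10000))
        time.sleep(pause)  # let lazy-loaded pins arrive
        soup = BeautifulSoup(browser.page_source, "lxml")
        count = len(soup.findAll("div", "Image Module pinUiImage"))
        if count <= previous:
            break
        previous = count
    return browser.page_source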
class Order:
    def __init__(self, username, password, url):
        self.username = username
        self.password = password
        self.url = url
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.browser = webdriver.Chrome()
        self.browser.implicitly_wait(15)

    def goToPage(self):
        self.browser.get(self.url)
        print(self.browser.title)

    def login(self):
        loginButton = self.browser.find_element_by_css_selector(
            "a#nav-link-yourAccount span.nav-line-1")
        print(loginButton.text)
        if loginButton.text == "Hello. Sign in":
            loginButton.click()
            email = self.browser.find_element_by_id("ap_email")
            pw = self.browser.find_element_by_id("ap_password")
            email.clear()
            pw.clear()
            email.send_keys(self.username)
            pw.send_keys(self.password)
            submit = self.browser.find_element_by_id("signInSubmit")
            submit.click()
        else:
            print("Already logged in.")
            loginButton = self.browser.find_element_by_css_selector(
                "a#nav-link-yourAccount span.nav-line-1")
            print(loginButton.text)

    def placeOrder(self):
        print(self.browser.title)
        print("Placing order.")
        wait = WebDriverWait(self.browser, 10)
        addToCart = self.browser.find_element_by_css_selector(
            "input#add-to-cart-button")
        addToCart.click()
        time.sleep(10)
        print(self.browser.title)
        wait.until(EC.title_contains('Amazon.com Shopping Cart'))
        checkout = self.browser.find_element_by_css_selector(
            "a#hlb-ptc-btn-native")
        checkout.click()
        time.sleep(10)
        print(self.browser.title)
        wait.until(EC.title_contains('Amazon.com Checkout'))
        placeOrder = self.browser.find_element_by_name("placeYourOrder1")
        placeOrder.click()
        time.sleep(20)
        print(self.browser.title)
        wait.until(EC.title_contains('Amazon.com Thanks You'))

    def kill(self):
        # quit() rather than close(), so the driver process exits with the window
        self.browser.quit()
        self.display.stop()

    def start(self):
        try:
            self.goToPage()
            self.login()
            self.placeOrder()
        except Exception:
            print("Exception Raised")
            raise
        finally:
            self.kill()
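placeOrder() pairs each fixed time.sleep() with a WebDriverWait on the page title; the sleeps are largely redundant, since the wait already polls until the title matches or the timeout expires. The core of that pattern on its own:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_title(browser, fragment, timeout=30):
    # polls every 500ms by default; raises TimeoutException on expiry
    WebDriverWait(browser, timeout).until(EC.title_contains(fragment))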
class TflCrawler(): def __init__(self): ''' Constructor method that instantiate the TflCrawler. ''' self.__site = 'http://cycling.data.tfl.gov.uk/' self.__elements = {} # initialise an empty dictionary self._file_type = 'CSV file' self.__folder_dir = os.path.abspath(os.path.dirname(__file__)) def _start_crawling(self, driver_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'chromedriver'))): ''' Start crawling process, creating and invisible browser display, with 800 by 600 dimension. Additionally, the location of Chrome driver is specified. :param driver_dir: defines the location of Chrome driver. The directory of the driver is specified as a relative path of the user working directory. ''' try: print('start driver...') self._display = Display( visible= 0, size = (800,600)) # create a chrome display with 800*600 dimension self._display.start() # starts the browser self._driver = webdriver.Chrome(driver_dir) # set the location of web driver except Exception as e: print(f'[No driver was identified. Identified files: {os.listdir(driver_dir)}]') def _stop_crawling(self): ''' It closes the browser display that was initialised by the start_crawling method. The driver it also stops. ''' print('closing driver...') self._display.stop() self._driver.quit() def _get_site(self, url): ''' The current method performs a request on http://cycling.data.tfl.gov.uk/ server and gets a response. The content of the response is converted in HTML and is returned by the method. :param url: the url of http://cycling.data.tfl.gov.uk/' ''' try: self._driver.get(url) # navigates to page sleep(5) # stops the code execution so that the HTML content to be loaded (5 to 10 seconds) return self._driver.execute_script('return document.body.innerHTML') # load the HTML content except Exception as e: print(f'[Unable to reach {self.__site}. Error : {str(e)}]') def _populate_dictionary(self, html): ''' The HTML structure that has been retrieved by __get_site method is analysed such that a dictionary of all csv files within the website is constructed. The dictionary is populated by each csv file that may be uploaded on TFL website. Each csv file is encapsulated as a dictionary, containing keys such that [name, url, date, size]. :param html: the html structure that is created by the __get_site method ''' try: print('get the content...') soup = BeautifulSoup(html, 'html.parser') # creates a soup object and defines how the HTML will be parsed # finds all tr elements with an attribute of data-level=3 main_content = soup.find_all('tr', attrs= { 'data-level' : '3' }) # iterate over the tr elements for i,item in enumerate(main_content): td = item.find_all('td') # retrieves the td elements within the tr # checks if the type of the 4th td element is CSV if (td[3].string == self._file_type): # Populates the dictionary self.__elements[i] ={ 'name' : td[0].a.string, 'url' : td[0].a['href'], 'date' : td[1].string, 'size' : td[2].string } except Exception as e: print(f'[Unable to parse the content of {self.__site}. Error: {str(e)}]') def parse(self): ''' Performs the entire process to parse the TFL website. In particular, starts the Chrome driver, waits until the site to load the HTML content, and therefore performs a request to the website. 
Then, the response is parsed, populating a dictionary that maintains all the csv files that might exist on that site :param driver_dir: defines the Google driver relative directory ''' self._start_crawling(os.path.join(self.__folder_dir,'chromedriver')) html = self._get_site(self.__site) self._populate_dictionary(html) self._stop_crawling() def retrieve_csv_files(self, DNS,rel_path): ''' Iterates over the constructed dictionary and retrieves each csv file that is identified. The csv files are saved locally. Additionally, the corresponded relations of the DB are created :param path: the relative path, which determines the location that the created csv file would be stored. ''' def populate_stations_pairs_relation(df): def insert(l): if len(l) > 1: # adds a colon at the end of the statement l[-1] = no_space_join([l[-1][:-1], ';']) # joins the insert statements statement = no_space_join(l) # insert the query execute(statement) conn.commit() # Drops duplicate routes, that have a start-end station which already exists dfrout= df[['StartStation Id','EndStation Id']].drop_duplicates() # drop OD routes that started and ended at the same station dfrout = dfrout.drop(dfrout[(dfrout['StartStation Id'] == dfrout['EndStation Id'])].index) # Variables to avoid overheading execute = cur.execute fetchall = cur.fetchall # corresponds to the stations that already exists in the DB AND have a location execute('SELECT station_id,st_asText(location) FROM webapp_stations WHERE location IS NOT NULL') # gets the stations that have a location stations = dict([(station[0], station[1].replace('MULTIPOINT', '')) for station in fetchall()]) # stations that in do not have a location in the database, are removed from the data frame sids = [s for s in stations.keys()] dfrout = dfrout[dfrout['StartStation Id'].isin(sids) == dfrout['EndStation Id'].isin(sids)] # requests the pairs of stations that exist in the database execute('SELECT start_station_id,end_station_id FROM webapp_stations_pairs_routes') pairs_dict = dict([(pair,pair) for pair in fetchall()]) # Variables that will used to construct the request url #plan = '&plan=' #plan_options = ['fastest','balanced','quietest'] plan = '&plan=balanced' default_url = 'https://www.cyclestreets.net/api/journey.json?key=112d0fc4c69f3951&itinerarypoints=' nPairs = dfrout.shape[0] try: # Variables out of the for loop #l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,fastest_ref_dist,fastest_ref_time,fastest_ref_geom,balanced_ref_dist,balanced_ref_time,balanced_ref_geom,quietest_ref_dist,quietest_ref_time,quietest_ref_geom) VALUES '] l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,balanced_ref_dist,balanced_ref_time,balanced_ref_geom) VALUES '] comma_join = ','.join no_space_join = ''.join pipe_join = '|'.join for i_pair,pair in enumerate(dfrout.itertuples()): # every 100 requests, stop the execution for 10 seconds (request policy) if i_pair % 1000 == 0 and i_pair > 0: sleep(5) print(f'Pair : {i_pair+1} of {nPairs}') start_station_id = int(pair[1]) end_station_id = int(pair[2]) # checks for OD pairs that do not exist in the DP (if the ) if (start_station_id,end_station_id) not in pairs_dict: try: start_coords = stations[start_station_id][1:-1].replace(' ',',') end_coords = stations[end_station_id][1:-1].replace(' ',',') #time,distance,coords = [],[],[] #atime = time.append #adistance = distance.append #acoords = coords.append #for option in plan_options: # request the link from www.cyclestreet.com response = 
requests.get(no_space_join([default_url, pipe_join([start_coords,end_coords]), plan])).json()['marker'][0]['@attributes'] # loads the json file into a python object(dictionary) time = response['time'] distance = response['length'] coords = f"st_GeomFromText('LINESTRING({response['coordinates'].replace(' ','?').replace(',',' ').replace('?',',')})',4326)" #response_json = loads(response)['marker'][0]['@attributes'] #atime(response['time']) #adistance(response['length']) #acoords(f"st_GeomFromText('LINESTRING({response['coordinates'].replace(' ','?').replace(',',' ').replace('?',',')})',4326)") except (KeyError,AttributeError): continue # creates a statement of the current pair #statement = no_space_join(['(',comma_join([str(start_station_id),str(end_station_id),distance[0],time[0],coords[0],distance[1],time[1],coords[1],distance[2],time[2],coords[2]]),'),']) statement = no_space_join(['(',comma_join([str(start_station_id),str(end_station_id),distance,time,coords]),'),']) l.append(statement) if i_pair % 100 == 0: insert(l) l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,balanced_ref_dist,balanced_ref_time,balanced_ref_geom) VALUES '] #l = ['INSERT INTO webapp_stations_pairs_routes(start_station_id,end_station_id,fastest_ref_dist,fastest_ref_time,fastest_ref_geom,balanced_ref_dist,balanced_ref_time,balanced_ref_geom,quietest_ref_dist,quietest_ref_time,quietest_ref_geom) VALUES '] except Exception as e: print('Error while data of webapp_stations_ref_routes were requested...') try: insert(l) return stations except: print('Error while the INSERT statement was executed for the webapp_stations_ref_routes relation') def insert_values_db(values, table_attributes,relation,null_stations): # Local Variables statement = [f"INSERT INTO {table_attributes} VALUES "] append = statement.append # its assign so that we avoid the overheating inside the loop replace = str.replace # its assign so that we avoid overheating inside the loop n = values.shape[0]-1 # number of observations # If the relation that is examined is the stations, receive the spatial location of each station if relation == 'webapp_stations': # stations_location =[(randint(0,89) + random() ,randint(0,89) + random()) for e in range(values.shape[0])] try: stations_location, null_stations = get_station_location(driver_dir= os.path.join(self.__folder_dir,'chromedriver'), url ='https://api.tfl.gov.uk/swagger/ui/index.html?url=/swagger/docs/v1#!/BikePoint/BikePoint_Search' , stations = values['StartStation Name'].values.tolist(), null_stations = null_stations) except Exception as e: print('Error - line 228') elif relation == 'webapp_routes': stations = populate_stations_pairs_relation(values) # returns a dictionary, with all the stations that have a location cur.execute('SELECT id,start_station_id,end_station_id FROM webapp_stations_pairs_routes') pairs = dict([((pair[1],pair[2]),pair[0]) for pair in cur.fetchall()]) # Iterate over each observation and create the corresponded INSERT statement for irow, row in enumerate(values.itertuples()): pk = row[1] # assign the value of pk to a local variable try: if relation == 'webapp_bikes': append(replace(f"({pk}),", "\\'", "''")) elif relation == 'webapp_stations': try: append(replace(f"({pk},'{row[2]}', ST_GeomFromText('MULTIPOINT({stations_location[irow][0]} {stations_location[irow][1]})',4326)),", "\\'", "''")) except: continue elif relation == 'webapp_routes': # get only the routes that i) do not have the same starting and ending station and i) have a start or end station 
that contains a location in the db if (row[6] != row[7]) and (row[6] in stations) and (row[7] in stations) : pair_id = pairs[(row[6],row[7])] append(replace(f"({pk},'{row[2]}','{row[3]}',{abs(row[4])},{row[5]},{pair_id}),", "\\'", "''")) except (ValueError,KeyError): continue # Constructs the INSERT statement if len(statement) > 1: statement[-1] = ''.join([statement[-1][:-1] + ';']) statement = ''.join(statement) # INSERT the new values into the database sql_execute(statement) conn.commit() # commit the transaction if relation =='webapp_stations': return null_stations def populate_relation(df, df_main_all_names, relation, pk , table_attributes, null_stations): # Local variables def process_df(df, df_main_all_names,relation): # in order to avoid error in subsequent procedures, we need to receive the Id of the starting and ending stations if relation == 'webapp_stations': start_stations_df = df[df_main_all_names[1]].dropna() scol = start_stations_df.columns end_stations_df = df[['EndStation Id','EndStation Name']].dropna() end_stations_df.columns = [scol[0],scol[1]] ndf = pd.concat([start_stations_df,end_stations_df], axis= 0).drop_duplicates([df_main_all_names[0]]) else: # drops the duplicates from the primary key for the webapp_routes and webapp_bikes relation ndf = dataframe(df[df_main_all_names[1]]).drop_duplicates([df_main_all_names[0]]).dropna() return ndf new_values = [] append = new_values.append dataframe = pd.DataFrame # Retrieves the csv sub-dataframe that defines a relation try: ndf = process_df(df,df_main_all_names,relation) except (TypeError,IndexError,KeyError): df.columns = ['Rental Id','Duration','Bike Id','End Date','EndStation Id','EndStation Name','Start Date','StartStation Id', 'StartStation Name'] ndf = process_df(df,df_main_all_names,relation) # Dimensions of the table n = ndf.shape[1] # Performs a SELECT query that will return current values within the db sql_execute(f"SELECT {pk[1]} FROM {relation};") # identify the pk of each entity - a dictionary is used for more efficient search stored_pks= dict([(e[pk[0]],e[pk[0]]) for e in cur.fetchall()]) try: # Look for new values for row in ndf.itertuples(): if (row[1] not in stored_pks): append(row[1]) if len(stored_pks) != 0: if n == 1: # 1 Dimensional relations if len(new_values) > 0: insert_values_db(dataframe({f'{df_main_all_names[0]}' : new_values}), table_attributes, relation,null_stations) else: # n Dimensional relations if len(new_values) > 0: new_values_joined = dataframe({ df_main_all_names[0]: new_values}).merge(ndf,how='left',left_on= df_main_all_names[0], right_on = df_main_all_names[0]) if relation == 'webapp_stations': null_stations = insert_values_db(new_values_joined[df_main_all_names[1]], table_attributes, relation,null_stations) return null_stations else: insert_values_db(new_values_joined[df_main_all_names[1]], table_attributes, relation,null_stations) else: if relation == 'webapp_stations': null_stations = insert_values_db(dataframe(ndf[df_main_all_names[1]]), table_attributes, relation,null_stations) return null_stations else: insert_values_db(dataframe(ndf[df_main_all_names[1]]), table_attributes, relation, null_stations) except psycopg2.InternalError: conn.rollback() process_df(df, df_main_all_names, relation) except Exception as e: print(f'Line 327 - {e}') #------------------------------------------------------------------------------------------------------------------------- try: # Local Variables join = os.path.join exists = os.path.exists size = os.path.getsize cd = self.__folder_dir # gives the 
directory of tflcrawler read_csv = pd.read_csv # establish a connection with o PostgreSQL database, based on the given DNS parameter conn = psycopg2.connect(DNS) cur = conn.cursor() # initialise a cursor sql_execute = cur.execute # cur.execute command is assigned as local variable (avoid dot overheating) null_stations = ['Bourne Street, Belgravia'] # list that will check if a station is null path = join(cd,rel_path) # Defines the path where the csv files will be stored print('starts to retrieve the csv files...') elements = self.__elements # assign the current dictionary to a local variable # iterate over the dictionary elements for value in tqdm(elements.values()): name = value['name'] # file name try: csv_path = join(path, name) # assign a full path fof the file print(csv_path) # if the file does not exist or the file exists, having a size of zero (nothing within it) if (not exists(csv_path)) or (exists(csv_path) and size(csv_path) == 0): # request the csv file from the server try: response = requests.get(value['url']) except (requests.ConnectionError, requests.ConnectTimeout, requests.HTTPError, requests.TooManyRedirects) as error: print(str(error)) # convert the text to a generator splitted_text = response.iter_lines(chunk_size= 512) # opens and write the file with open(csv_path, 'w') as file: for line in splitted_text: file.write(str(line)[2:-1] + '\n') file.close() # reads the created csv file df = read_csv(filepath_or_buffer= csv_path, delimiter=',' ,encoding= 'utf-8') # Populates the Bikes entity populate_relation(df = df, df_main_all_names= ('Bike Id', 'Bike Id'), relation= 'webapp_bikes' , pk = (0,'bike_id'), table_attributes= 'webapp_bikes(bike_id)', null_stations = null_stations) # Populates the Stations entity condition = True # initialise a boolean variable that checks if the populate_relation function of stations has been correctly executed while(condition): try: # populate the db with the corresponded values of stations null_stations = populate_relation(df = df, df_main_all_names= ('StartStation Id', ['StartStation Id', 'StartStation Name']) , relation ='webapp_stations', pk = (0,'station_id'), table_attributes= 'webapp_stations(station_id,station_name,location)', null_stations = null_stations) # set the condition to false and exit from the while loop condition = False except ValueError: # If the function returns an error due to unsimilarity of the file, SKIP the file condition = False except Exception as e: # If the function returns any other error, execute the function again # The function may do not executed correctly due to problems with the connection with the API and other requests print('POPULATE_RELATION IS EXECUTED AGAIN...') continue # Populates the Routes entity populate_relation(df = df, df_main_all_names=('Rental Id', ['Rental Id','Start Date','End Date', 'Duration','Bike Id','StartStation Id', 'EndStation Id']), relation= 'webapp_routes', pk =(0,'rental_id'), table_attributes='webapp_routes(rental_id,start_date,end_date,duration,bike_id,station_pairs_id)', null_stations = null_stations) except Exception as e: print(f'[Error of file {name} - Inside the FOR loop]') continue except Exception as e: # Close the cursor and database connection as well cur.close() conn.close() print(f'[ Error while the files are retrieved. Error: {str(e)}]') @property def elements(self): return self.__elements @property def site(self): return self.__site
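insert_values_db() and populate_stations_pairs_relation() above build multi-row INSERT strings by hand. As a point of comparison rather than the author's code: psycopg2's execute_values (psycopg2 2.7+) batches rows and handles quoting for you. A sketch against the webapp_stations relation, with a reduced (station_id, station_name) payload assumed for illustration:

from psycopg2.extras import execute_values

def insert_stations(cur, rows):
    # rows: iterable of (station_id, station_name) tuples
    execute_values(
        cur,
        "INSERT INTO webapp_stations (station_id, station_name) VALUES %s",
        rows)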
def get_urls(query, url, verbose=False, warning=True, user_agent=None, proxy=None, **kwargs):
    """
    Bypass Google captchas and the Google API by using selenium-webdriver to
    gather the Google URL. This will open a robot-controlled browser window
    and attempt to get a URL from Google that will be used for scraping
    afterwards. The only downside to this method is that your IP and user
    agent will be visible until the application pulls the URL.
    """
    if verbose:
        logger.debug(set_color(
            "setting up the virtual display to hide the browser...", level=10))
    ff_display = Display(visible=0, size=(800, 600))
    ff_display.start()
    logger.info(set_color(
        "firefox browser display will be hidden while it performs the query..."))
    if warning:
        logger.warning(set_color(
            "your web browser will be automated in order for Zeus to successfully "
            "bypass captchas and API calls. this is done in order to grab the URL "
            "from the search and parse the results. please give selenium time to "
            "finish its task...", level=30))
    if verbose:
        logger.debug(set_color(
            "running selenium-webdriver and launching browser...", level=10))
        logger.debug(set_color(
            "adjusting selenium-webdriver user-agent to '{}'...".format(user_agent),
            level=10))
    if proxy is not None:
        proxy_type = proxy.keys()
        proxy_to_use = Proxy({
            "proxyType": ProxyType.MANUAL,
            "httpProxy": proxy[proxy_type[0]],
            "ftpProxy": proxy[proxy_type[0]],
            "sslProxy": proxy[proxy_type[0]],
            "noProxy": ""
        })
        if verbose:
            logger.debug(set_color(
                "setting selenium proxy to '{}'...".format(
                    ''.join(proxy_type) + "://" + ''.join(proxy.values())),
                level=10))
    else:
        proxy_to_use = None
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", user_agent)
    browser = webdriver.Firefox(profile, proxy=proxy_to_use)
    logger.info(set_color("browser will open shortly..."))
    browser.get(url)
    if verbose:
        logger.debug(set_color(
            "searching search engine for the 'q' element (search box)...",
            level=10))
    search = browser.find_element_by_name('q')
    logger.info(set_color(
        "searching '{}' using query '{}'...".format(url, query)))
    search.send_keys(query)
    search.send_keys(Keys.RETURN)  # hit return after you enter search text
    time.sleep(3)
    if verbose:
        logger.debug(set_color("obtaining URL from selenium..."))
    retval = browser.current_url
    if verbose:
        logger.debug(set_color(
            "found current URL from selenium browser '{}'...".format(retval),
            level=10))
    logger.info(set_color("closing the browser and continuing process..."))
    browser.close()
    ff_display.stop()
    return retval
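The user-agent override in get_urls() is the standard FirefoxProfile preference trick, shown here on its own; the UA string is whatever the caller supplies:

from selenium import webdriver

def firefox_with_agent(user_agent):
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", user_agent)
    return webdriver.Firefox(profile)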
class CraigslistBot: @staticmethod def debug(inString): print(" [BOT] - %s" % inString.encode('utf-8').strip()) def __init__(self, protonLogin="", protonPassword="", loginEmail="", loginPass="", contactNumber="", contactName="", postCode="", listingsFile="", waitTime=10, waitTimeBetweenPosts=30): self.display = "" if not os.name == 'nt': self.display = Display(visible=1, size=(1248, 1000)) # 800x600 self.display.start() self.client = webdriver.Firefox() self.isLoggedIn = False self.protonLogin = protonLogin self.protonPassword = protonPassword self.loginEmail = loginEmail self.loginPass = loginPass self.contactNumber = contactNumber self.contactName = contactName self.postCode = postCode self.listingsFile = listingsFile self.waitTime = waitTime self.waitTimeBetweenPosts = waitTimeBetweenPosts self.locationCode = "chi" #nyc asks for more location data not implement yet s def __del__(self): if not os.name == 'nt': self.display.stop() self.client.quit() return 0 def login(self, oneTimeLoginLink=""): self.debug("Logging in...") if oneTimeLoginLink == "": self.client.get("https://accounts.craigslist.org/login") else: self.client.get(oneTimeLoginLink) self.waitForId("inputEmailHandle") #self.debug("Inputing information to login screen") self.client.find_element_by_css_selector( "#inputEmailHandle").send_keys(self.loginEmail) self.client.find_element_by_css_selector("#inputPassword").send_keys( self.loginPass) self.client.find_element_by_id("login").click() # if need activation: # otl = self.validatePostInEmail() # self.login(otl) # return try: self.client.find_element_by_css_selector('.tab') except NoSuchElementException: self.debug("Not logged in") return self.debug("Successfully logged in!") self.isLoggedIn = True def createpost(self, listing): if not self.isLoggedIn: self.debug("ERROR: You're not logged in!") return 0 #self.debug("Attempting to post this listing:") #self.debug(listing.tostring() + "\n") #self.debug("Navigating to post page") #self.debug("locationCode: " + self.locationCode) initialPostUrl = "https://post.craigslist.org/c/" + self.locationCode #self.debug("navigating to " + initialPostUrl) self.client.get(initialPostUrl) self.waitForCss("input[value='1']") self.client.find_element_by_css_selector("input[value='1']").click() # fso = for sale by owner # so = service offered self.client.find_element_by_css_selector("input[value='fso']").click() time.sleep(self.waitTime) # 199 = computer parts # 7 = computers # 96 = electronics self.client.find_element_by_css_selector("input[value='96']").click() time.sleep(self.waitTime) """ self.debug("Trying to fill in email") try: self.client.find_element_by_css_selector( '#FromEMail').send_keys(self.loginEmail) except NoSuchElementException: self.debug("Not avaliable") try: self.client.find_element_by_css_selector( '#FromEMail').send_keys(self.loginEmail) except NoSuchElementException: self.debug("Not avaliable") """ #self.debug("Checking 'Okay to contact by phone'") self.waitForName("show_phone_ok") self.client.find_element_by_name("show_phone_ok").click() self.client.find_element_by_name("contact_phone_ok").click() #self.debug("Checking 'Okay to contact by text'") self.client.find_element_by_name("contact_text_ok").click() #self.debug("Filling in contact phone number") self.client.find_element_by_name("contact_phone").send_keys( self.contactNumber) #self.debug("Filling in contact name") self.client.find_element_by_name("contact_name").send_keys( self.contactName) #self.debug("Filling in post title") spinName = spintax.spin(listing.name) 
self.client.find_element_by_name("PostingTitle").send_keys(spinName) #self.debug("Filling in zip code") self.client.find_element_by_id("postal_code").send_keys(self.postCode) #self.debug("Filling in post content") spinDescription = spintax.spin(listing.description) self.client.find_element_by_name("PostingBody").send_keys( spinDescription) #self.debug("Checking 'Okay to contact for other offers'") self.waitForName("contact_ok") self.client.find_element_by_name("contact_ok").click() # self.debug("Unchecking 'Want a map' if checked") # try: # self.client.find_element_by_css_selector("#wantamap:checked") # except NoSuchElementException: # self.debug("Not checked") # finally: # self.client.find_element_by_css_selector("#wantamap:checked").click() # time.sleep(self.waitTime) #self.debug("Clicking continue") self.client.find_element_by_name("go").click() # if "editimage" in self.client.current_url: # FIX tHIS # self.debug("Clicking continue") # self.client.find_element_by_css_selector('button.done').click() # else: # self.debug( # "Could not submit. Maybe a bad email address or phone number") #self.debug("Clicking publish") self.waitForClass("bigbutton") self.client.find_element_by_class_name('bigbutton').click() # determine if we need to switch to classic uploading time.sleep(self.waitTime) if len(self.client.find_elements_by_id('classic')) != 0: #self.debug("clicking use classic image uploader") self.waitForId("classic") time.sleep(self.waitTime) self.client.find_element_by_id('classic').click() time.sleep(self.waitTime ) # must wait for classic to pop into the viewport #self.debug("uploading images") self.waitForName("file") for imagePath in listing.imagePathList: self.debug("Attempting to upload image: " + os.getcwd() + "/" + imagePath) self.client.find_element_by_name("file").send_keys(os.getcwd() + "/" + imagePath) time.sleep(self.waitTime) self.debug("Clicking done with images") self.waitForClass("bigbutton") self.client.find_element_by_class_name('bigbutton').click() self.debug("Click publish (again)") self.waitForName("go") self.client.find_element_by_name('go').click() # check if we need to verify the post self.debug("Check if the post needs verified") time.sleep(self.waitTime) htmlText = self.client.find_element_by_css_selector("body").text # self.debug(htmlText) if "FURTHER ACTION REQUIRED" in htmlText: # wait for the email to come through and then verify it self.debug("must verify post") time.sleep(45) self.validatePostInEmail() return self.client.find_element_by_css_selector( "ul.ul").find_elements_by_css_selector("a")[0].get_attribute( "href") # region WaitFor methods def waitForName(self, name): for i in range(0, 30): #self.debug("waiting for id \"" + name + "\"...") if len(self.client.find_elements_by_name(name)) != 0: break time.sleep(2) def waitForId(self, idName): for i in range(0, 30): #self.debug("waiting for id \"" + idName + "\"...") if len(self.client.find_elements_by_id(idName)) != 0: break time.sleep(2) def waitForCss(self, css): for i in range(0, 30): #self.debug("waiting for css selector \"" + css + "\"...") if len(self.client.find_elements_by_css_selector(css)) != 0: break time.sleep(2) def waitForClass(self, className): for i in range(0, 30): #self.debug("waiting for class \"" + className + "\"...") if len(self.client.find_elements_by_class_name(className)) != 0: break time.sleep(2) # endregion def validatePostInEmail(self): self.debug("NOW, WE VALIDATE!") self.client.get("https://mail.protonmail.com/login") self.waitForId("username") 
self.client.find_element_by_id("username").send_keys(self.protonLogin) self.client.find_element_by_id("password").send_keys( self.protonPassword) self.client.find_element_by_id("login_btn").click() # we're looking for the first link (our craigslistBot email folder) in the first "menuItem-label" list self.waitForClass("menuLabel-item") labelItem = self.client.find_elements_by_class_name( "menuLabel-item")[0] labelLink = labelItem.find_elements_by_css_selector( "a")[0].get_attribute('href') self.client.get(labelLink) # click the newest email self.waitForClass("conversation") self.client.find_elements_by_class_name("conversation")[0].click() # find the newest message in that email self.waitForClass("message") correctMessage = self.client.find_elements_by_class_name("message")[-1] # get the one time link, typically the last link in the list self.waitForCss("a") oneTimeLink = correctMessage.find_elements_by_css_selector( "a")[-1].get_attribute('href') # if the last link is a support page, select the second to last link which should be our verification link if oneTimeLink == "https://www.craigslist.org/about/scams?lang=en&cc=us": oneTimeLink = correctMessage.find_elements_by_css_selector( "a")[-2].get_attribute('href') # navigate to the verification link self.client.get(oneTimeLink) # get the new post link. This may be the incorrect link, look into this. self.waitForCss("a") newPostLink = labelItem.find_elements_by_css_selector( "a")[0].get_attribute('href') time.sleep(2) return newPostLink
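The four waitFor* helpers in CraigslistBot differ only in which find_elements_* lookup they poll. A sketch of a single generic replacement, where finder is any of the driver's find_elements_* methods:

import time

def wait_for(finder, value, retries=30, delay=2):
    # poll until the lookup returns at least one element, or give up
    for _ in range(retries):
        if finder(value):
            return True
        time.sleep(delay)
    return False

# usage sketch:
# wait_for(self.client.find_elements_by_id, "classic")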
class BaseCase(unittest.TestCase): ''' A base test case that wraps methods for enhanced usage. You can also add your own methods here. ''' def __init__(self, *args, **kwargs): super(BaseCase, self).__init__(*args, **kwargs) try: self.driver = WebDriver() except Exception: pass self.environment = None def open(self, url): self.driver.get(url) if settings.WAIT_FOR_RSC_ON_PAGE_LOADS: self.wait_for_ready_state_complete() self._demo_mode_pause_if_active() def open_url(self, url): """ In case people are mixing up self.open() with open(), use this alternative. """ self.open(url) def click(self, selector, by=By.CSS_SELECTOR, timeout=settings.SMALL_TIMEOUT): element = page_actions.wait_for_element_visible( self.driver, selector, by, timeout=timeout) self._demo_mode_scroll_if_active(selector, by) element.click() if settings.WAIT_FOR_RSC_ON_CLICKS: self.wait_for_ready_state_complete() self._demo_mode_pause_if_active() def click_chain(self, selectors_list, by=By.CSS_SELECTOR, timeout=settings.SMALL_TIMEOUT, spacing=0): """ This method clicks on a list of elements in succession. 'spacing' is the amount of time to wait between clicks. (sec) """ for selector in selectors_list: self.click(selector, by=by, timeout=timeout) if spacing > 0: time.sleep(spacing) def click_link_text(self, link_text, timeout=settings.SMALL_TIMEOUT): element = self.wait_for_link_text_visible(link_text, timeout=timeout) element.click() if settings.WAIT_FOR_RSC_ON_CLICKS: self.wait_for_ready_state_complete() self._demo_mode_pause_if_active() def add_text(self, selector, new_value, timeout=settings.SMALL_TIMEOUT): """ The more-reliable version of driver.send_keys() Similar to update_text(), but won't clear the text field first. """ element = self.wait_for_element_visible(selector, timeout=timeout) element.send_keys(new_value) self._demo_mode_pause_if_active() def send_keys(self, selector, new_value, timeout=settings.SMALL_TIMEOUT): """ Same as add_text() -> more reliable, but less name confusion. """ self.add_text(selector, new_value, timeout=timeout) def update_text_value(self, selector, new_value, timeout=settings.SMALL_TIMEOUT, retry=False): """ This method updates an element's text value with a new value. @Params selector - the selector with the value to update new_value - the new value for setting the text field timeout - how long to wait for the selector to be visible retry - if True, use jquery if the selenium text update fails """ element = self.wait_for_element_visible(selector, timeout=timeout) element.clear() self._demo_mode_pause_if_active(tiny=True) element.send_keys(new_value) if (retry and element.get_attribute('value') != new_value and ( not new_value.endswith('\n'))): logging.debug('update_text_value is falling back to jQuery!') selector = self.jq_format(selector) self.set_value(selector, new_value) self._demo_mode_pause_if_active() def update_text(self, selector, new_value, timeout=settings.SMALL_TIMEOUT, retry=False): """ The shorter version of update_text_value(), which clears existing text and adds new text into the text field. We want to keep the old version for backward compatibility. 
""" self.update_text_value(selector, new_value, timeout=timeout, retry=retry) def is_element_present(self, selector, by=By.CSS_SELECTOR): return page_actions.is_element_present(self.driver, selector, by) def is_element_visible(self, selector, by=By.CSS_SELECTOR): return page_actions.is_element_visible(self.driver, selector, by) def is_link_text_visible(self, link_text): return page_actions.is_element_visible(self.driver, link_text, by=By.LINK_TEXT) def is_text_visible(self, text, selector, by=By.CSS_SELECTOR): return page_actions.is_text_visible(self.driver, text, selector, by) def find_visible_elements(self, selector, by=By.CSS_SELECTOR): return page_actions.find_visible_elements(self.driver, selector, by) def execute_script(self, script): return self.driver.execute_script(script) def set_window_size(self, width, height): return self.driver.set_window_size(width, height) self._demo_mode_pause_if_active() def maximize_window(self): return self.driver.maximize_window() self._demo_mode_pause_if_active() def activate_jquery(self): """ If "jQuery is not defined", use this method to activate it for use. This happens because jQuery is not always defined on web sites. """ try: # Let's first find out if jQuery is already defined. self.driver.execute_script("jQuery('html')") # Since that command worked, jQuery is defined. Let's return. return except Exception: # jQuery is not currently defined. Let's proceed by defining it. pass self.driver.execute_script( '''var script = document.createElement("script"); ''' '''script.src = "https://ajax.googleapis.com/ajax/libs/jquery/1/''' '''jquery.min.js"; document.getElementsByTagName("head")[0]''' '''.appendChild(script);''') for x in xrange(30): # jQuery needs a small amount of time to activate. (At most 3s) try: self.driver.execute_script("jQuery('html')") return except Exception: time.sleep(0.1) # Since jQuery still isn't activating, give up and raise an exception raise Exception("Exception: WebDriver could not activate jQuery!") def scroll_to(self, selector): self.wait_for_element_visible(selector, timeout=settings.SMALL_TIMEOUT) scroll_script = "jQuery('%s')[0].scrollIntoView()" % selector try: self.driver.execute_script(scroll_script) except Exception: # The likely reason this fails is because: "jQuery is not defined" self.activate_jquery() # It's a good thing we can define it here self.driver.execute_script(scroll_script) self._demo_mode_pause_if_active(tiny=True) def scroll_click(self, selector): self.scroll_to(selector) self.click(selector) def jquery_click(self, selector): self.scroll_to(selector) self.driver.execute_script("jQuery('%s').click()" % selector) self._demo_mode_pause_if_active() def jq_format(self, code): return page_utils.jq_format(code) def set_value(self, selector, value): self.scroll_to(selector) val = json.dumps(value) self.driver.execute_script("jQuery('%s').val(%s)" % (selector, val)) self._demo_mode_pause_if_active() def jquery_update_text_value(self, selector, new_value, timeout=settings.SMALL_TIMEOUT): element = self.wait_for_element_visible(selector, timeout=timeout) self.scroll_to(selector) self.driver.execute_script("""jQuery('%s').val('%s')""" % (selector, self.jq_format(new_value))) if new_value.endswith('\n'): element.send_keys('\n') self._demo_mode_pause_if_active() def jquery_update_text(self, selector, new_value, timeout=settings.SMALL_TIMEOUT): self.jquery_update_text_value(selector, new_value, timeout=timeout) def hover_on_element(self, selector): self.wait_for_element_visible(selector, 
timeout=settings.SMALL_TIMEOUT) self.scroll_to(selector) time.sleep(0.05) # Settle down from scrolling before hovering return page_actions.hover_on_element(self.driver, selector) def hover_and_click(self, hover_selector, click_selector, click_by=By.CSS_SELECTOR, timeout=settings.SMALL_TIMEOUT): self.wait_for_element_visible(hover_selector, timeout=timeout) self.scroll_to(hover_selector) # Settle down from the scrolling before hovering element = page_actions.hover_and_click( self.driver, hover_selector, click_selector, click_by, timeout) self._demo_mode_pause_if_active() return element def wait_for_element_present(self, selector, by=By.CSS_SELECTOR, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_element_present( self.driver, selector, by, timeout) def wait_for_element_visible(self, selector, by=By.CSS_SELECTOR, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_element_visible( self.driver, selector, by, timeout) def wait_for_text_visible(self, text, selector, by=By.CSS_SELECTOR, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_text_visible( self.driver, text, selector, by, timeout) def wait_for_link_text_visible(self, link_text, timeout=settings.LARGE_TIMEOUT): return self.wait_for_element_visible( link_text, by=By.LINK_TEXT, timeout=timeout) def wait_for_element_absent(self, selector, by=By.CSS_SELECTOR, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_element_absent( self.driver, selector, by, timeout) def wait_for_element_not_visible(self, selector, by=By.CSS_SELECTOR, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_element_not_visible( self.driver, selector, by, timeout) def wait_for_ready_state_complete(self, timeout=settings.EXTREME_TIMEOUT): return page_actions.wait_for_ready_state_complete(self.driver, timeout) def wait_for_and_accept_alert(self, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_and_accept_alert(self.driver, timeout) def wait_for_and_dismiss_alert(self, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_and_dismiss_alert(self.driver, timeout) def wait_for_and_switch_to_alert(self, timeout=settings.LARGE_TIMEOUT): return page_actions.wait_for_and_switch_to_alert(self.driver, timeout) def save_screenshot(self, name, folder=None): return page_actions.save_screenshot(self.driver, name, folder) def _demo_mode_pause_if_active(self, tiny=False): if self.demo_mode: if self.demo_sleep: wait_time = float(self.demo_sleep) else: wait_time = settings.DEFAULT_DEMO_MODE_TIMEOUT if not tiny: time.sleep(wait_time) else: time.sleep(wait_time/3.0) def _demo_mode_scroll_if_active(self, selector, by): if self.demo_mode: if by == By.CSS_SELECTOR: self.scroll_to(selector) # PyTest-Specific Code # def setUp(self): """ pytest-specific code Be careful if a subclass of BaseCase overrides setUp() You'll need to add the following line to the subclass setUp() method: super(SubClassOfBaseCase, self).setUp() """ self.is_pytest = None try: # This raises an exception if the test is not coming from pytest self.is_pytest = pytest.config.option.is_pytest except Exception: # Not using pytest (probably nosetests) self.is_pytest = False if self.is_pytest: self.with_selenium = pytest.config.option.with_selenium self.headless = pytest.config.option.headless self.headless_active = False self.with_testing_base = pytest.config.option.with_testing_base self.log_path = pytest.config.option.log_path self.browser = pytest.config.option.browser self.data = pytest.config.option.data self.demo_mode = 
pytest.config.option.demo_mode self.demo_sleep = pytest.config.option.demo_sleep if self.headless: self.display = Display(visible=0, size=(1200, 800)) self.display.start() self.headless_active = True if self.with_selenium: self.driver = browser_launcher.get_driver(self.browser) def tearDown(self): """ pytest-specific code Be careful if a subclass of BaseCase overrides setUp() You'll need to add the following line to the subclass's tearDown(): super(SubClassOfBaseCase, self).tearDown() """ if self.is_pytest: if self.with_selenium: # Save a screenshot if logging is on when an exception occurs if self.with_testing_base and (sys.exc_info()[1] is not None): test_id = "%s.%s.%s" % (self.__class__.__module__, self.__class__.__name__, self._testMethodName) test_logpath = self.log_path + "/" + test_id if not os.path.exists(test_logpath): os.makedirs(test_logpath) # Handle screenshot logging log_helper.log_screenshot(test_logpath, self.driver) # Handle basic test info logging log_helper.log_test_failure_data( test_logpath, self.driver, self.browser) # Handle page source logging log_helper.log_page_source(test_logpath, self.driver) # Finally close the browser self.driver.quit() if self.headless: if self.headless_active: self.display.stop()
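# The pytest-specific setUp() above reads its settings from
# pytest.config.option.*. A minimal sketch (hypothetical -- the real plugin is
# not shown in this collection) of the conftest.py that would register those
# command-line options so the attribute lookups succeed:
def pytest_addoption(parser):
    parser.addoption("--is_pytest", action="store_true",
                     dest="is_pytest", default=True,
                     help="Lets BaseCase know the test was launched by pytest.")
    parser.addoption("--with_selenium", action="store_true",
                     dest="with_selenium", default=False)
    parser.addoption("--headless", action="store_true",
                     dest="headless", default=False)
    parser.addoption("--with_testing_base", action="store_true",
                     dest="with_testing_base", default=False)
    parser.addoption("--log_path", dest="log_path", default="logs")
    parser.addoption("--browser", dest="browser", default="firefox")
    parser.addoption("--data", dest="data", default=None)
    parser.addoption("--demo_mode", action="store_true",
                     dest="demo_mode", default=False)
    parser.addoption("--demo_sleep", dest="demo_sleep", default=None)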
class LinkedinPy: """Class to be instantiated to use the script""" def __init__(self, username=None, userid=None, password=None, nogui=False, selenium_local_session=True, use_firefox=False, browser_profile_path=None, page_delay=25, show_logs=True, headless_browser=False, proxy_address=None, proxy_chrome_extension=None, proxy_port=None, disable_image_load=False, bypass_suspicious_attempt=False, bypass_with_mobile=False, multi_logs=True): cli_args = parse_cli_args() username = cli_args.username or username password = cli_args.password or password use_firefox = cli_args.use_firefox or use_firefox page_delay = cli_args.page_delay or page_delay headless_browser = cli_args.headless_browser or headless_browser proxy_address = cli_args.proxy_address or proxy_address proxy_port = cli_args.proxy_port or proxy_port disable_image_load = cli_args.disable_image_load or disable_image_load bypass_suspicious_attempt = (cli_args.bypass_suspicious_attempt or bypass_suspicious_attempt) bypass_with_mobile = cli_args.bypass_with_mobile or bypass_with_mobile if not get_workspace(Settings): raise SocialPyError( "Oh no! I don't have a workspace to work at :'(") self.nogui = nogui if nogui: self.display = Display(visible=0, size=(800, 600)) self.display.start() self.browser = None self.headless_browser = headless_browser self.proxy_address = proxy_address self.proxy_port = proxy_port self.proxy_chrome_extension = proxy_chrome_extension self.selenium_local_session = selenium_local_session self.bypass_suspicious_attempt = bypass_suspicious_attempt self.bypass_with_mobile = bypass_with_mobile self.disable_image_load = disable_image_load self.username = username or os.environ.get('LINKEDIN_USER') self.password = password or os.environ.get('LINKEDIN_PW') Settings.profile["name"] = self.username self.page_delay = page_delay self.switch_language = True self.use_firefox = use_firefox Settings.use_firefox = self.use_firefox self.browser_profile_path = browser_profile_path self.liked_img = 0 self.already_liked = 0 self.liked_comments = 0 self.commented = 0 self.replied_to_comments = 0 self.connected = 0 self.already_connected = 0 self.unconnected = 0 self.connected_by = 0 self.connecting_num = 0 self.inap_img = 0 self.not_valid_users = 0 self.connect_times = 1 self.start_time = time.time() # assign logger self.show_logs = show_logs Settings.show_logs = show_logs or None self.multi_logs = multi_logs self.logfolder = get_logfolder(self.username, self.multi_logs, Settings) self.logger = self.get_linkedinpy_logger(self.show_logs) get_database(Settings, make=True) # IMPORTANT: think twice before relocating if self.selenium_local_session is True: self.set_selenium_local_session(Settings) def get_linkedinpy_logger(self, show_logs): """ Handles the creation and retrieval of loggers to avoid re-instantiation. 
""" existing_logger = Settings.loggers.get(self.username) if existing_logger is not None: return existing_logger else: # initialize and setup logging system for the LinkedinPy object logger = logging.getLogger(self.username) logger.setLevel(logging.DEBUG) file_handler = logging.FileHandler('{}general.log'.format( self.logfolder)) file_handler.setLevel(logging.DEBUG) extra = {"username": self.username} logger_formatter = logging.Formatter( '%(levelname)s [%(asctime)s] [%(username)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S') file_handler.setFormatter(logger_formatter) logger.addHandler(file_handler) if show_logs is True: console_handler = logging.StreamHandler() console_handler.setLevel(logging.DEBUG) console_handler.setFormatter(logger_formatter) logger.addHandler(console_handler) logger = logging.LoggerAdapter(logger, extra) Settings.loggers[self.username] = logger Settings.logger = logger return logger def set_selenium_local_session(self, Settings): self.browser, err_msg = \ set_selenium_local_session(self.proxy_address, self.proxy_port, self.proxy_chrome_extension, self.headless_browser, self.use_firefox, self.browser_profile_path, # Replaces # browser User # Agent from # "HeadlessChrome". self.disable_image_load, self.page_delay, self.logger, Settings) if len(err_msg) > 0: raise SocialPyError(err_msg) def login(self): """Used to login the user either with the username and password""" if not login_user(self.browser, self.username, None, self.password, self.logger, self.logfolder, self.switch_language, self.bypass_suspicious_attempt, self.bypass_with_mobile): message = "Wrong login data!" highlight_print(Settings, self.username, message, "login", "critical", self.logger) # self.aborting = True else: message = "Logged in successfully!" highlight_print(Settings, self.username, message, "login", "info", self.logger) # try to save account progress try: save_account_progress(self.browser, "https://www.linkedin.com/", self.username, self.logger) except Exception: self.logger.warning( 'Unable to save account progress, skipping data update') return self def withdraw_old_invitations(self, skip_pages=10, sleep_delay=6): page_no = skip_pages while page_no < 100: page_no = page_no + 1 try: url = "https://www.linkedin.com/mynetwork/invitation-manager/sent/?page=" + str( page_no) web_address_navigator(Settings, self.browser, url) print("Starting page:", page_no) if self.browser.current_url == "https://www.linkedin.com/mynetwork/invitation-manager/sent/" or len( self.browser.find_elements_by_css_selector( "li.invitation-card div.pl5")) == 0: print("============Last Page Reached==============") break checked_in_page = 0 for i in range( 0, len( self.browser.find_elements_by_css_selector( "li.invitation-card div.pl5"))): try: res_item = self.browser.find_elements_by_css_selector( "li.invitation-card div.pl5")[i] try: link = res_item.find_element_by_css_selector( "div > a") profile_link = link.get_attribute("href") user_name = profile_link.split('/')[4] self.logger.info( "user_name : {}".format(user_name)) except Exception as e: print("Might be a stale profile", e) time = res_item.find_element_by_css_selector( "div > time") self.logger.info("time : {}".format(time.text)) check_button = res_item.find_element_by_css_selector( "div > div:nth-child(1) > input") check_status = check_button.get_attribute( "data-artdeco-is-focused") self.logger.info( "check_status : {}".format(check_status)) self.browser.execute_script("window.scrollTo(0, " + str((i + 1) * 104) + ");") if "month" in time.text: 
(ActionChains(self.browser).move_to_element( check_button).click().perform()) self.logger.info("check_button clicked") checked_in_page = checked_in_page + 1 delay_random = random.randint( ceil(sleep_delay * 0.42), ceil(sleep_delay * 0.57)) sleep(delay_random) except Exception as e: self.logger.error(e) if checked_in_page > 0: self.logger.info("Widraw to be pressed") try: self.browser.execute_script("window.scrollTo(0, 0);") withdraw_button = self.browser.find_element_by_css_selector( "ul > li.mn-list-toolbar__right-button > button") self.logger.info("withdraw_button : {}".format( withdraw_button.text)) if "Withdraw" in withdraw_button.text: (ActionChains(self.browser).move_to_element( withdraw_button).click().perform()) self.logger.info("withdraw_button clicked") page_no = page_no - 1 delay_random = random.randint( ceil(sleep_delay * 0.85), ceil(sleep_delay * 1.14)) sleep(delay_random) except Exception as e: print( "For some reason there is no withdraw_button inspite of checkings", e) else: self.logger.info("Nothing checked in this page") except Exception as e: self.logger.error(e) self.logger.info("============Next Page==============") def search_1stconnects_and_savetodb(self, query, city_code, school_code=None, past_company=None, random_start=True, max_pages=10, max_connects=25, sleep_delay=6): """ search linkedin and connect from a given profile """ self.logger.info( "Searching for: query={}, city_code={}, school_code={}".format( query, city_code, school_code)) search_url = "https://www.linkedin.com/search/results/people/?&facetNetwork=%5B%22F%22%5D" if city_code: search_url = search_url + "&facetGeoRegion=" + city_code if school_code: search_url = search_url + "&facetSchool=" + school_code if past_company: search_url = search_url + "&facetPastCompany=" + past_company search_url = search_url + "&keywords=" + query search_url = search_url + "&origin=" + "FACETED_SEARCH" for page_no in range(1, 101): try: temp_search_url = search_url + "&page=" + str(page_no) web_address_navigator(Settings, self.browser, temp_search_url) self.logger.info("Starting page: {}".format(page_no)) for jc in range(2, 11): sleep(1) self.browser.execute_script( "window.scrollTo(0, document.body.scrollHeight/" + str(jc) + ");") if len( self.browser.find_elements_by_css_selector( "div.search-result__wrapper")) == 0: self.logger.info( "============Last Page Reached or asking for Premium membership==============" ) break for i in range( 0, len( self.browser.find_elements_by_css_selector( "div.search-result__wrapper"))): try: res_item = self.browser.find_elements_by_css_selector( "li.search-result div.search-entity div.search-result__wrapper" )[i] link = res_item.find_element_by_css_selector("div > a") profile_link = link.get_attribute("href") user_name = profile_link.split('/')[4] self.logger.info("user_name : {}".format(user_name)) msg_button = res_item.find_element_by_xpath( "//div[3]/div/div/button[text()='Message']") print(msg_button.text, "present") if msg_button.text == "Message": connect_restriction("write", user_name, None, self.logger) self.logger.info( "saved {} to db".format(user_name)) except Exception as e: self.logger.error(e) except Exception as e: self.logger.error(e) self.logger.info("============Next Page==============") def test_page(self, search_url, page_no, css_selector_identifier): web_address_navigator(Settings, self.browser, search_url) self.logger.info("Testing page: {}".format(page_no)) if len( self.browser.find_elements_by_css_selector( css_selector_identifier)) > 0: return True return False 
def search_and_connect(self, query, connection_relationship_code, city_code, school_code=None, past_company=None, random_start=True, max_pages=10, max_connects=25, sleep_delay=6): """ search linkedin and connect from a given profile """ if quota_supervisor(Settings, "connects") == "jump": return 0 self.logger.info( "Searching for: query={}, connection_relationship_code={}, city_code={}, school_code={}" .format(query, connection_relationship_code, city_code, school_code)) connects = 0 prev_connects = -1 search_url = "https://www.linkedin.com/search/results/people/?" if connection_relationship_code: search_url = search_url + "&facetNetwork=" + connection_relationship_code if city_code: search_url = search_url + "&facetGeoRegion=" + city_code if school_code: search_url = search_url + "&facetSchool=" + school_code if past_company: search_url = search_url + "&facetPastCompany=" + past_company search_url = search_url + "&keywords=" + query search_url = search_url + "&origin=" + "FACETED_SEARCH" temp_search_url = search_url + "&page=1" print(temp_search_url) time.sleep(10) if self.test_page( search_url=temp_search_url, page_no=1, css_selector_identifier="div.search-result__wrapper") == False: self.logger.info( "============Definitely no Result, Next Query==============") return 0 if random_start: trial = 0 st = 5 while True and trial < 5 and st > 1: st = random.randint(1, st - 1) temp_search_url = search_url + "&page=" + str(st) if self.test_page(temp_search_url, st, "div.search-result__wrapper"): break trial = trial + 1 else: st = 1 for page_no in list(range(st, st + max_pages)): if prev_connects == connects: self.logger.info( "============Limits might have exceeded or all Invites pending from this page(let's exit either case)==============" ) break else: prev_connects = connects try: temp_search_url = search_url + "&page=" + str(page_no) if page_no > st and st > 1: web_address_navigator(Settings, self.browser, temp_search_url) self.logger.info("Starting page: {}".format(page_no)) for jc in range(2, 11): sleep(1) self.browser.execute_script( "window.scrollTo(0, document.body.scrollHeight/" + str(jc) + "-100);") if len( self.browser.find_elements_by_css_selector( "div.search-result__wrapper")) == 0: self.logger.info( "============Last Page Reached or asking for Premium membership==============" ) break for i in range( 0, len( self.browser.find_elements_by_css_selector( "div.search-result__wrapper"))): try: res_item = self.browser.find_elements_by_css_selector( "li.search-result div.search-entity div.search-result__wrapper" )[i] # div.search-result__actions div button") # pp.pprint(res_item.get_attribute('innerHTML')) link = res_item.find_element_by_css_selector("div > a") profile_link = link.get_attribute("href") self.logger.info("Profile : {}".format(profile_link)) user_name = profile_link.split('/')[4] # self.logger.info("user_name : {}".format(user_name)) name = res_item.find_element_by_css_selector( "h3 > span > span > span") #//span/span/span[1]") self.logger.info("Name : {}".format(name.text)) if connect_restriction("read", user_name, self.connect_times, self.logger): self.logger.info("already connected") continue try: connect_button = res_item.find_element_by_xpath( "//div[3]/div/button[text()='Connect']") self.logger.info( "Connect button found, connecting...") self.browser.execute_script( "var evt = document.createEvent('MouseEvents');" + "evt.initMouseEvent('click',true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0,null);" + "arguments[0].dispatchEvent(evt);", 
res_item.find_element_by_xpath( '//div[3]/div/button[text()="Connect"]')) self.logger.info("Clicked {}".format( connect_button.text)) sleep(2) except Exception: invite_sent_button = res_item.find_element_by_xpath( "//div[3]/div/button[text()='Invite Sent']") self.logger.info("Already {}".format( invite_sent_button.text)) continue try: modal = self.browser.find_element_by_css_selector( "div.modal-wormhole-content > div") if modal: try: sendnow_or_done_button = modal.find_element_by_xpath( "//div[1]/div/section/div/div[2]/button[2]" ) #text()='Send now']") self.logger.info( sendnow_or_done_button.text) if not (sendnow_or_done_button.text == 'Done' or sendnow_or_done_button.text == 'Send now'): raise Exception( "Send Now or Done button not found" ) if sendnow_or_done_button.is_enabled(): (ActionChains( self.browser).move_to_element( sendnow_or_done_button).click( ).perform()) self.logger.info("Clicked {}".format( sendnow_or_done_button.text)) connects = connects + 1 connect_restriction( "write", user_name, None, self.logger) try: # update server calls update_activity( Settings, 'connects') except Exception as e: self.logger.error(e) sleep(2) else: try: #TODO: input("find correct close XPATH") close_button = modal.find_element_by_xpath( "//div[1]/div/section/div/header/button" ) (ActionChains( self.browser).move_to_element( close_button).click(). perform()) print(sendnow_or_done_button.text, "disabled, clicked close") sleep(2) except Exception as e: print( "close_button not found, Failed with:", e) except Exception as e: print( "sendnow_or_done_button not found, Failed with:", e) else: self.logger.info("Popup not found") except Exception as e: print("Popup not found, Failed with:", e) try: new_popup_buttons = self.browser.find_elements_by_css_selector( "#artdeco-modal-outlet div.artdeco-modal-overlay div.artdeco-modal div.artdeco-modal__actionbar button.artdeco-button" ) gotit_button = new_popup_buttons[1] (ActionChains(self.browser).move_to_element( gotit_button).click().perform()) print(gotit_button.text, " clicked") sleep(2) except Exception as e: print("New Popup also not found, Failed with:", e) self.logger.info( "Connects sent in this iteration: {}".format( connects)) delay_random = random.randint(ceil(sleep_delay * 0.85), ceil(sleep_delay * 1.14)) sleep(delay_random) if connects >= max_connects: self.logger.info( "max_connects({}) for this iteration reached , Returning..." 
.format(max_connects)) return except Exception as e: self.logger.error(e) except Exception as e: self.logger.error(e) self.logger.info("============Next Page==============") return connects def endorse(self, profile_link, sleep_delay): try: web_address_navigator(Settings, self.browser, profile_link) for jc in range(1, 10): sleep(1) self.browser.execute_script( "window.scrollTo(0, document.body.scrollHeight*" + str(jc) + "/10);") skills_pane = self.browser.find_element_by_css_selector( "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section" ) if (skills_pane.text.split('\n')[0] == 'Skills & Endorsements'): try: first_skill_button_icon = self.browser.find_element_by_css_selector( "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section > ol > li > div > div > div > button > li-icon" ) button_type = first_skill_button_icon.get_attribute("type") if button_type == 'plus-icon': first_skill_button = self.browser.find_element_by_css_selector( "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section > ol > li > div > div > div > button" ) self.browser.execute_script( "var evt = document.createEvent('MouseEvents');" + "evt.initMouseEvent('click',true, true, window, 0, 0, 0, 0, 0, false, false, false, false, 0,null);" + "arguments[0].dispatchEvent(evt);", first_skill_button) first_skill_title = self.browser.find_element_by_css_selector( "div.profile-detail > div.pv-deferred-area > div > section.pv-profile-section.pv-skill-categories-section > ol > li > div > div > p > a > span" ) print(first_skill_title.text, "clicked") delay_random = random.randint(ceil(sleep_delay * 0.85), ceil(sleep_delay * 1.14)) sleep(delay_random) else: self.logger.info( 'button_type already {}'.format(button_type)) except Exception as e: self.logger.error(e) else: self.logger.info('Skill & Endorsements pane not found') except Exception as e: self.logger.error(e) def search_and_endorse(self, query, city_code, school_code, random_start=True, max_pages=3, max_endorsements=25, sleep_delay=6): """ search linkedin and endose few first connections """ if quota_supervisor(Settings, "connects") == "jump": return #False, "jumped" print("Searching for: ", query, city_code, school_code) search_url = "https://www.linkedin.com/search/results/people/?" 
if city_code: search_url = search_url + "&facetGeoRegion=" + city_code if school_code: search_url = search_url + "&facetSchool=" + school_code search_url = search_url + "&facetNetwork=%5B%22F%22%5D" search_url = search_url + "&keywords=" + query search_url = search_url + "&origin=" + "FACETED_SEARCH" if random_start: trial = 0 while True and trial < 3: st = random.randint(1, 3) temp_search_url = search_url + "&page=" + str(st) web_address_navigator(Settings, self.browser, temp_search_url) self.logger.info("Testing page:".format(st)) result_items = self.browser.find_elements_by_css_selector( "div.search-result__wrapper") if len(result_items) > 0: break trial = trial + 1 else: st = 1 connects = 0 for page_no in list(range(st, st + 1)): collected_profile_links = [] try: temp_search_url = search_url + "&page=" + str(page_no) if page_no > st and st > 1: web_address_navigator(Settings, self.browser, temp_search_url) self.logger.info("Starting page: {}".format(page_no)) for jc in range(2, 11): sleep(1) self.browser.execute_script( "window.scrollTo(0, document.body.scrollHeight/" + str(jc) + "-100);") result_items = self.browser.find_elements_by_css_selector( "div.search-result__wrapper") # print(result_items) for result_item in result_items: try: link = result_item.find_element_by_css_selector( "div > a") self.logger.info("Profile : {}".format( link.get_attribute("href"))) collected_profile_links.append( link.get_attribute("href")) name = result_item.find_element_by_css_selector( "h3 > span > span > span") self.logger.info("Name : {}".format(name.text)) except Exception as e: self.logger.error(e) except Exception as e: self.logger.error(e) for collected_profile_link in collected_profile_links: self.endorse(collected_profile_link, sleep_delay=sleep_delay) connects = connects + 1 if connects >= max_endorsements: self.logger.info( "max_endorsements({}) for this iteration reached , Returning..." .format(max_endorsements)) return self.logger.info("============Next Page==============") def dump_connect_restriction(self, profile_name, logger, logfolder): """ Dump connect restriction data to a local human-readable JSON """ try: # get a DB and start a connection db, id = get_database(Settings) conn = sqlite3.connect(db) with conn: conn.row_factory = sqlite3.Row cur = conn.cursor() cur.execute( "SELECT * FROM connectRestriction WHERE profile_id=:var", {"var": id}) data = cur.fetchall() if data: # get the existing data filename = "{}connectRestriction.json".format(logfolder) if os.path.isfile(filename): with open(filename) as connectResFile: current_data = json.load(connectResFile) else: current_data = {} # pack the new data connect_data = { user_data[1]: user_data[2] for user_data in data or [] } current_data[profile_name] = connect_data # dump the fresh connect data to a local human readable JSON with open(filename, 'w') as connectResFile: json.dump(current_data, connectResFile) except Exception as exc: logger.error( "Pow! 
Error occurred while dumping connect restriction data to a " "local JSON:\n\t{}".format(str(exc).encode("utf-8"))) finally: if conn: # close the open connection conn.close() def end(self): """Closes the current session""" # IS_RUNNING = False close_browser(self.browser, False, self.logger) with interruption_handler(): # close virtual display if self.nogui: self.display.stop() # write useful information self.dump_connect_restriction(self.username, self.logger, self.logfolder) # dump_record_activity(self.username, # self.logger, # self.logfolder, # Settings) with open('{}connected.txt'.format(self.logfolder), 'w') \ as connectFile: connectFile.write(str(self.connected)) # output live stats before leaving self.live_report() message = "Session ended!" highlight_print(Settings, self.username, message, "end", "info", self.logger) print("\n\n") def set_quota_supervisor(self, Settings, enabled=False, sleep_after=[], sleepyhead=False, stochastic_flow=False, notify_me=False, peak_likes=(None, None), peak_comments=(None, None), peak_connects=(None, None), peak_unconnects=(None, None), peak_server_calls=(None, None)): """ Sets aside QS configuration ANY time in a session """ # take a reference of the global configuration configuration = Settings.QS_config # strong type checking on peaks entered peak_values_combined = [ peak_likes, peak_comments, peak_connects, peak_unconnects, peak_server_calls ] peaks_are_tuple = all( type(item) is tuple for item in peak_values_combined) if peaks_are_tuple: peak_values_merged = [ i for sub in peak_values_combined for i in sub ] integers_filtered = filter(lambda e: isinstance(e, int), peak_values_merged) peaks_are_provided = all( len(item) == 2 for item in peak_values_combined) peaks_are_valid = all( type(item) is int or type(item) is type(None) for item in peak_values_merged) peaks_are_good = all(item >= 0 for item in integers_filtered) # set QS if peak values are eligible if (peaks_are_tuple and peaks_are_provided and peaks_are_valid and peaks_are_good): peaks = { "likes": { "hourly": peak_likes[0], "daily": peak_likes[1] }, "comments": { "hourly": peak_comments[0], "daily": peak_comments[1] }, "connects": { "hourly": peak_connects[0], "daily": peak_connects[1] }, "unconnects": { "hourly": peak_unconnects[0], "daily": peak_unconnects[1] }, "server_calls": { "hourly": peak_server_calls[0], "daily": peak_server_calls[1] } } if not isinstance(sleep_after, list): sleep_after = [sleep_after] rt = time.time() latesttime = {"hourly": rt, "daily": rt} orig_peaks = deepcopy(peaks) # original peaks always remain static stochasticity = { "enabled": stochastic_flow, "latesttime": latesttime, "original_peaks": orig_peaks } if (platform.startswith("win32") and python_version() < "2.7.15"): # UPDATE ME: remove this block once plyer is # verified to work on [very] old versions of Python 2 notify_me = False # update QS configuration with the fresh settings configuration.update({ "state": enabled, "sleep_after": sleep_after, "sleepyhead": sleepyhead, "stochasticity": stochasticity, "notify": notify_me, "peaks": peaks }) else: # turn off QS for the rest of the session # since peak values are ineligible configuration.update(state="False") # user should be warned only if has had QS turned on if enabled is True: self.logger.warning("Quota Supervisor: peak rates are misfit! " "Please use supported formats." 
"\t~disabled QS") def live_report(self): """ Report live sessional statistics """ print('') stats = [ self.liked_img, self.already_liked, self.commented, self.connected, self.already_connected, self.unconnected, self.inap_img, self.not_valid_users ] if self.connecting_num and self.connected_by: owner_relationship_info = ( "On session start was connectING {} users" " & had {} connectERS".format(self.connecting_num, self.connected_by)) else: owner_relationship_info = '' sessional_run_time = self.run_time() run_time_info = ( "{} seconds".format(sessional_run_time) if sessional_run_time < 60 else "{} minutes".format(truncate_float( sessional_run_time / 60, 2)) if sessional_run_time < 3600 else "{} hours".format(truncate_float(sessional_run_time / 60 / 60, 2))) run_time_msg = "[Session lasted {}]".format(run_time_info) if any(stat for stat in stats): self.logger.info( "Sessional Live Report:\n" "\t|> LIKED {} images | ALREADY LIKED: {}\n" "\t|> COMMENTED on {} images\n" "\t|> connected {} users | ALREADY connected: {}\n" "\t|> UNconnected {} users\n" "\t|> LIKED {} comments\n" "\t|> REPLIED to {} comments\n" "\t|> INAPPROPRIATE images: {}\n" "\t|> NOT VALID users: {}\n" "\n{}\n{}".format(self.liked_img, self.already_liked, self.commented, self.connected, self.already_connected, self.unconnected, self.liked_comments, self.replied_to_comments, self.inap_img, self.not_valid_users, owner_relationship_info, run_time_msg)) else: self.logger.info("Sessional Live Report:\n" "\t|> No any statistics to show\n" "\n{}\n{}".format(owner_relationship_info, run_time_msg)) def run_time(self): """ Get the time session lasted in seconds """ real_time = time.time() run_time = (real_time - self.start_time) run_time = truncate_float(run_time, 2) return run_time def search_and_apply(self): usualjobslink = "https://www.linkedin.com/jobs" web_address_navigator(Settings, self.browser, usualjobslink) job_title_XP = '//input[contains(@id,"jobs-search-box-keyword-id")]' txt_job_title = self.browser.find_element_by_xpath(job_title_XP) print('Entering Job Title') (ActionChains(self.browser).move_to_element( txt_job_title).click().send_keys("Python Developer").perform()) job_location_XP = '//input[contains(@id,"jobs-search-box-location-id")]' txt_job_location = self.browser.find_element_by_xpath(job_location_XP) print('Entering Job Location') (ActionChains( self.browser).move_to_element(txt_job_location).click().send_keys( "San Jose, California, United States").perform()) # update server calls for both 'click' and 'send_keys' actions for i in range(2): update_activity(Settings) sleep(1) print("Clicking Search Button") job_search_XP = '//button[contains(@class,"jobs-search-box__submit-button")]' btn_job_search = self.browser.find_element_by_xpath(job_search_XP) print(btn_job_search) (ActionChains( self.browser).move_to_element(btn_job_search).click().perform()) # update server calls update_activity(Settings) sleep(10) input("Press Enter to continue...") def search_and_apply(self, job_title, job_location, distance=50, random_start=True, max_pages=20, max_connects=25, sleep_delay=6): self.logger.info( "Searching for: job_title={}, job_location={}, radius={}".format( job_title, job_location, distance)) connects = 0 prev_connects = -1 # https://www.linkedin.com/jobs/search/?keywords=python%20developer&location=San%20Jose%2C%20California%2C%20United%20States&distance=50 job_search_url = "https://www.linkedin.com/jobs/search/?" 
if job_title: job_search_url = job_search_url + "keywords=" + job_title if job_location: job_search_url = job_search_url + "&location=" + job_location if distance: job_search_url = job_search_url + "&distance=" + str(distance) temp_job_search_url = job_search_url + "&start=0" print(temp_job_search_url) time.sleep(10) if self.test_page( search_url=temp_job_search_url, page_no=1, css_selector_identifier="div.jobs-search-results ") == False: self.logger.info( "============Definitely no Result, Next Query==============") return 0 if random_start: trial = 0 st = 5 while True and trial < 5 and st > 1: st = random.randint(1, st - 1) temp_job_search_url = job_search_url + "&start=" + str(st * 25) if self.test_page(temp_job_search_url, st, "div.jobs-search-results"): break trial = trial + 1 else: st = 1 for page_no in list(range(st, st + max_pages)): try: temp_job_search_url = job_search_url + "&start=" + str(page_no) if page_no > st and st > 1: web_address_navigator(Settings, self.browser, temp_job_search_url) self.logger.info("Starting page: {}".format(page_no)) for jc in range(2, 11): sleep(1) self.browser.execute_script( "window.scrollTo(0, document.body.scrollHeight/" + str(jc) + "-100);") if len( self.browser.find_elements_by_css_selector( "div.jobs-search-results")) == 0: self.logger.info( "============Last Page Reached or asking for Premium membership==============" ) break for i in range( 0, len( self.browser.find_elements_by_css_selector( "div.jobs-search-results"))): print(i) except Exception as e: self.logger.error(e) input("Press Enter to continue...")
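# Hypothetical driver script for the LinkedinPy class above (a sketch: the
# credentials and search arguments are placeholders, and the facet code is an
# example URL-encoded value, not taken from the original project).
if __name__ == '__main__':
    session = LinkedinPy(username='me@example.com',
                         password='secret',
                         nogui=True,        # run inside a virtual display
                         use_firefox=False,
                         page_delay=25)
    try:
        session.login()
        session.search_and_connect(query='python developer',
                                   connection_relationship_code='%5B%22S%22%5D',
                                   city_code=None,
                                   max_pages=5,
                                   max_connects=10)
    finally:
        session.end()  # closes the browser and stops the virtual display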
def parse(self, response): socket.setdefaulttimeout(int(self.timeout)) # temporary file for the output image t_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png') t_file.close() print('Created temporary image file: %s' % t_file.name) self.log('Created temporary image file: %s' % t_file.name) if not DEBUG_MODE: display = Display(visible=int(bool(DEBUG_MODE)), size=(self.width, self.height)) display.start() # we will use requesocks for checking response code r_session = requests.session() if self.timeout: self.timeout = int(self.timeout) r_session.timeout = self.timeout # Proxies activated again because of walmart bans if self.proxy: r_session.proxies = {"http": "{}://{}".format(self.proxy_type, self.proxy), \ "https": "{}://{}".format(self.proxy_type, self.proxy)} if self.user_agent: r_session.headers = {'User-Agent': self.user_agent} # check if the page returns code != 200 if self.code_200_required and str( self.code_200_required).lower() not in ('0', 'false', 'off'): page_code = r_session.get(self.product_url, verify=False).status_code if page_code != 200: self.log( 'Page returned code %s at %s' % (page_code, self.product_url), ERROR) yield ScreenshotItem() # return empty item if not DEBUG_MODE: display.stop() return driver = self.init_driver() item = ScreenshotItem() if self.proxy: ip_via_proxy = URL2ScreenshotSpider._get_proxy_ip(driver) item['via_proxy'] = ip_via_proxy print 'IP via proxy:', ip_via_proxy self.log('IP via proxy: %s' % ip_via_proxy) try: self.prepare_driver(driver) self.make_screenshot(driver, t_file.name) self.log('Screenshot was made for file %s' % t_file.name) except Exception as e: self.log('Exception while getting response using selenium! %s' % str(e)) # lets try with another driver another_driver_name = self._choose_another_driver() try: if not DEBUG_MODE: driver.quit() # clean RAM except Exception as e: pass driver = self.init_driver(name=another_driver_name) self.prepare_driver(driver) self.make_screenshot(driver, t_file.name) self.log('Screenshot was made for file %s (2nd attempt)' % t_file.name) try: if not DEBUG_MODE: driver.quit() except: pass # crop the image if needed if self.crop_width and self.crop_height: self.crop_width = int(self.crop_width) self.crop_height = int(self.crop_height) from PIL import Image # size is width/height img = Image.open(t_file.name) box = (self.crop_left, self.crop_top, self.crop_left + self.crop_width, self.crop_top + self.crop_height) area = img.crop(box) area.save(t_file.name, 'png') self.log('Screenshot was cropped and saved to %s' % t_file.name) if self.image_copy: # save a copy of the file if needed area.save(self.image_copy, 'png') with open(t_file.name, 'rb') as fh: img_content = fh.read() self.log('Screenshot content was read, size: %s bytes' % len(img_content)) if self.remove_img is True: os.unlink(t_file.name) # remove old output file self.log('Screenshot file was removed: %s' % t_file.name) # yield the item item['url'] = response.url item['image'] = base64.b64encode(img_content) item['site_settings'] = getattr(self, '_site_settings_activated_for', None) item['creation_datetime'] = datetime.datetime.utcnow().isoformat() if not DEBUG_MODE: display.stop() self.log('Item image key length: %s' % len(item.get('image', ''))) if img_content: yield item
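# Self-contained illustration of the crop performed in parse() above, assuming
# a saved screenshot 'screenshot.png'; PIL's crop box is ordered
# (left, top, right, bottom), so the right/bottom edges come from adding the
# crop width/height to the offsets.
from PIL import Image

crop_left, crop_top = 0, 0
crop_width, crop_height = 1024, 768
img = Image.open('screenshot.png')
box = (crop_left, crop_top, crop_left + crop_width, crop_top + crop_height)
img.crop(box).save('screenshot_cropped.png', 'png')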
class UITestCase(LiveServerTestCase): def use_xvfb(self): from pyvirtualdisplay import Display self.display = Display('xvfb', visible=1, size=(1280, 1024)) self.display.start() self.driver = WebDriver() def setUp(self): try: self.driver = WebDriver() ui_is_not_available = False except WebDriverException: ui_is_not_available = True if ui_is_not_available: self.use_xvfb() self.driver.implicitly_wait(10) clear_caches() setup_for_ui_test() super(UITestCase, self).setUp() def tearDown(self): self.driver.quit() if hasattr(self, 'display'): self.display.stop() ContentType.objects.clear_cache() super(UITestCase, self).tearDown() def click(self, selector): self.find(selector).click() def click_when_visible(self, selector): element = self.find(selector) self.wait_until_visible(element) element.click() def find(self, selector): return self.driver.find_element_by_css_selector(selector) def find_name(self, name): return self.driver.find_element_by_name(name) def find_id(self, id): return self.driver.find_element_by_id(id) def process_login_form(self, username, password): username_elmt = self.wait_until_present('[name="username"]') password_elmt = self.find_name('password') username_elmt.send_keys(username) password_elmt.send_keys(password) self.click('form * button') def browse_to_url(self, url): self.driver.get(self.live_server_url + url) def browse_to_instance_url(self, url, instance=None): instance = instance if instance is not None else self.instance self.driver.get('%s/%s/%s' % (self.live_server_url, self.instance.url_name, url)) def find_anchor_by_url(self, url): return self.find("[href='%s']" % url) def wait_until_present(self, selector, timeout=10): """ Wait until an element with CSS 'selector' exists on the page. Useful for detecting that an operation loads the page you're expecting. """ element = [None] # use list so it can be set by inner scope def is_present(driver): element[0] = self.find(selector) return element[0] is not None WebDriverWait(self.driver, timeout).until(is_present) return element[0] def wait_until_text_present(self, text, timeout=10): """ Wait until 'text' exists on the page. Useful for detecting that an operation loads the page you're expecting. """ WebDriverWait(self.driver, timeout).until(lambda driver: text in driver.page_source) def wait_until_enabled(self, element_or_selector, timeout=10): """ Wait until 'element_or_selector' is enabled. """ element = self._get_element(element_or_selector) WebDriverWait(self.driver, timeout).until( lambda driver: element.get_attribute("disabled") is None) return element def wait_until_visible(self, element_or_selector, timeout=10): """ Wait until 'element_or_selector' (known to already exist on the page) is displayed. """ element = self._get_element(element_or_selector) WebDriverWait(self.driver, timeout).until(lambda driver: element.is_displayed()) return element def wait_until_invisible(self, element_or_selector, timeout=10): """ Wait until 'element_or_selector' (known to already exist on the page) is not displayed. """ element = self._get_element(element_or_selector) def is_invisible(driver): try: return not element.is_displayed() except StaleElementReferenceException: return True WebDriverWait(self.driver, timeout).until(is_invisible) return element def _get_element(self, element_or_selector): if isinstance(element_or_selector, basestring): return self.find(element_or_selector) else: return element_or_selector
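# Hypothetical test built on the UITestCase helpers above; the URL, selector,
# and credentials are placeholders, not from the original suite.
class LoginFlowTest(UITestCase):
    def test_login_shows_dashboard(self):
        self.browse_to_url('/accounts/login/')
        self.process_login_form('admin', 'password')
        # wait_until_text_present polls page_source until the text appears
        self.wait_until_text_present('Dashboard')
        self.click_when_visible('#logout-link')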
"BACKEND": "django.template.backends.django.DjangoTemplates", "APP_DIRS": True, "OPTIONS": { "context_processors": [ "django.template.context_processors.debug", "django.template.context_processors.request", "django.contrib.auth.context_processors.auth", "django.contrib.messages.context_processors.messages", "portal.context_processors.process_newsletter_form", ] }, } ] if os.environ.get('SELENIUM_HEADLESS', None): from pyvirtualdisplay import Display display = Display(visible=0, size=(1624, 1024)) display.start() import atexit atexit.register(lambda: display.stop()) INSTALLED_APPS = ["portal"] PIPELINE_ENABLED = False ROOT_URLCONF = "example_project.example_project.urls" STATIC_ROOT = "example_project/example_project/static" SECRET_KEY = "bad_test_secret" from django_autoconfig.autoconfig import configure_settings configure_settings(globals())
def check_lectures(): exec_start_time1 = time.time() email_message = '' driver = '' display = '' try: write_html('Time: ' + time.strftime("%H:%M") + ' Checking Cancelled Lectures \n') if check_website(lectures_url): cancelled = '' display = Display(visible=0, size=(1920, 1080)) display.start() driver = webdriver.Firefox() driver.get(lectures_url) driver.find_element_by_id('username').send_keys( 'MOODLE_USERNAME_HERE') driver.find_element_by_id('password').send_keys( base64.b64decode('MOODLE_Password_HERE').decode("utf-8")) driver.find_element_by_id('loginbtn').click() abs_lec = driver.find_element_by_xpath( '//*[@id="section-1"]/div[3]').text abs_lec_split = abs_lec.split('\n') today = (datetime.datetime.now()).strftime( "%A") # Define which subjects you have in which particular day if today == "Monday": todays_lec = ['database', 'project', 'networking security'] elif today == "Tuesday": todays_lec = ['project', 'advanced networking'] elif today == "Wednesday": todays_lec = ['project', 'advanced networking'] elif today == "Thursday": todays_lec = [ 'database', 'advanced networking', 'networking security' ] elif today == "Friday": todays_lec = ['project', 'networking security', 'database'] else: todays_lec = ['NO SCHOOL TODAY!'] for line in abs_lec_split: if current_class in line: for lectures in todays_lec: if lectures in line.lower(): cancelled = cancelled + line del lectures del line driver.quit() display.stop() write_html('Cancelled Lectures info Received \n') snd_message = check_notice(abs_lec.encode('utf-8')) if snd_message and cancelled != '': email_message = 'Below please find Cancelled lectures info:\n\n' + cancelled group_post(str(email_message), 'GROUP ID NO', "LEC") else: write_html( 'Cancelled Lectures still the same, E-mail not sent!') del cancelled del abs_lec del abs_lec_split del snd_message del driver del display del todays_lec del today else: write_html('Website Unreachable!') except Exception as err1: driver.quit() display.stop() status1, err_msg1 = update_log( str(err1), 'Error Origin: Cancelled Lectures Script') write_html(status1) del err1 del status1 del err_msg1 del driver del display time_took1 = time.time() - exec_start_time1 write_html('\nScript took ' + ("%.2f" % time_took1) + ' seconds to complete \n') del exec_start_time1 del email_message del time_took1
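# check_lectures() above initializes driver and display to empty strings and
# then calls driver.quit()/display.stop() in its except-branch, which itself
# raises if the failure happened before the browser started. A guarded
# teardown (a sketch, not part of the original script) avoids that:
def safe_teardown(driver=None, display=None):
    try:
        if driver is not None and hasattr(driver, 'quit'):
            driver.quit()
    finally:
        if display is not None and hasattr(display, 'stop'):
            display.stop()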
class WebAssay:
    """
    This is a base class built on top of a Selenium driver.
    Inherit from this class to
    1. parse web pages,
    2. calculate the area and position of elements, and
    3. stain the HTML page for parsed elements.

    It can be used as a base class for variants of WebAssay.
    You must implement a `run` function to use the base class.
    """
    def __init__(self,
                 user_agent: str,
                 window_size: tuple,
                 headless=False,
                 parser_functions: List = [],
                 color_palette: Dict = {},
                 warpped_height_px: int = 700,
                 reset_driver_after: int = 50):
        """
        `headless` should be set to True if you want a headless web browser.
        `color_palette` is a dictionary that maps from element category to a
        hex color.
        `parser_functions` is a list of parser functions, where a parser
        function takes bs4 and returns a list of dictionaries. Be sure that
        one of those keys is `category` if you're using a `color_palette` and
        want to stain images.
        `warpped_height_px` is the minimum y-distance in pixels to consider
        an element wrapped.
        """
        # functions that take bs4 and return a list of dicts.
        self.parser_functions = parser_functions
        if len(self.parser_functions) == 0:
            raise ValueError("Please assign parser_functions!")

        # browser params
        self.window_size = window_size
        self.width, self.height = window_size
        self.user_agent = user_agent
        self.headless = headless
        self._init_browser()

        # optional params
        self.color_palette = color_palette  # dictionary of category to color.
        self.warpped_height = warpped_height_px  # skip elements taller than this.

        # friends we make along the way
        self.error_files = []  # which files are not parsed correctly?
        self.element_metadata = pd.DataFrame()  # the most recent element metadata.
        self.driver_reset_counter = 0  # driver will reset at `reset_driver_after`.
        self.reset_driver_after = reset_driver_after

    def _init_browser(self):
        """
        Initializes a selenium browser with the proper `user_agent` and window
        `size`. Set `headless` to True to have a headless browser. Keep the
        default as False to help debug.
        """
        self.display = False
        if self.headless:
            self.display = Display(visible=0,
                                   size=(self.width + 10, self.height + 10))
            self.display.start()

        # Set up user agent
        profile = webdriver.FirefoxProfile()
        profile.set_preference("general.useragent.override", self.user_agent)
        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
        firefox_capabilities['marionette'] = True

        driver = webdriver.Firefox(profile, capabilities=firefox_capabilities)
        driver.set_window_size(*self.window_size)
        self.driver = driver

    def close_driver(self):
        """Closes the driver"""
        self.driver.quit()
        if not isinstance(self.display, bool):
            self.display.stop()

    def restart_driver(self):
        """Restarts the driver and display"""
        self.close_driver()
        self._init_browser()
        self.driver_reset_counter = 0
        time.sleep(2)

    def save_source(self, fn: str):
        """Saves the source code of a page."""
        with open(fn, 'w') as f:
            f.write(self.driver.page_source)

    def screenshot_full(self, fn: str):
        """
        Takes a full screenshot.
        There are other methods that work better with a headless browser
        (such as expanding the window). The screenshot is resized to the
        original dimensions. For whatever reason, I get higher-res images from
        the default screenshot. The standard size allows us to mark up the
        screenshot with the element metadata in
        `paint_abstract_representation`.
        """
        body = self.driver.find_element_by_tag_name('body')
        body.screenshot(fn)

        # resize image
        img = Image.open(fn)
        img.thumbnail((body.rect['width'], 1e6), Image.ANTIALIAS)
        img.save(fn)

    def identify_elements(
            self,
            body: Union[element.Tag, element.NavigableString]) -> List:
        """
        Runs every parser in `self.parser_functions` over the web page.
        The results are appended to the `data` output.
        """
        data = []
        for parser in self.parser_functions:
            results = parser(body)
            data.extend(results)
        return data

    def stain_element(self, xpath: str, category: str,
                      color: str = '#ffffff', opacity: float = 0.7) -> bool:
        """
        Alters the HTML of a page.
        Stains elements located at `xpath` with `color` by overwriting the
        style attribute. Also sets a new attribute markup_category=`category`.
        """
        try:
            elm = self.driver.find_element_by_xpath(xpath)
        except Exception:  # couldn't find element
            return False
        if not elm.is_displayed():
            return False
        style = elm.get_attribute('style')
        if elm.tag_name == 'img':
            custom_style = f"background-color: {color} !important; " \
                "transition: all 0.5s linear;" \
                "mix-blend-mode: multiply !important;"
            if style:
                style += '; ' + custom_style
            else:
                style = custom_style
            self.driver.execute_script(
                f"arguments[0].setAttribute('markup_category','{category}')",
                elm)
            self.driver.execute_script(
                f"arguments[0].setAttribute('style','{style}')", elm)

            parent = elm.find_element_by_xpath('ancestor::div[1]')
            style_parent = parent.get_attribute('style')
            custom_style = f"background-color: {color} !important; "
            if style_parent:
                style_parent += '; ' + custom_style
            else:
                style_parent = custom_style
            self.driver.execute_script(
                f"arguments[0].setAttribute('style','{style_parent}')", parent)
        else:
            self.driver.execute_script(
                f"arguments[0].setAttribute('markup_category','{category}')",
                elm)
            custom_style = f"background-color: {color} !important; " \
                "transition: all 0.5s linear;"
            if style:
                style += '; ' + custom_style
            else:
                style = custom_style
            self.driver.execute_script(
                f"arguments[0].setAttribute('style','{style}')", elm)

            all_images = elm.find_elements_by_tag_name('img')
            for img in all_images:
                if img.is_displayed():
                    style = img.get_attribute('style')
                    custom_style = f"background-color: {color} !important; " \
                        "mix-blend-mode: multiply !important; " \
                        "z-index:99 !important;"
                    if style:
                        style += '; ' + custom_style
                    else:
                        style = custom_style
                    self.driver.execute_script(
                        f"arguments[0].setAttribute('style','{style}')", img)

            all_videos = elm.find_elements_by_tag_name('video')
            for vid in all_videos:
                if vid.is_displayed():
                    style = vid.get_attribute('style')
                    custom_style = f"background-color: {color} !important; " \
                        "mix-blend-mode: multiply !important; " \
                        "z-index:99 !important;"
                    if style:
                        style += '; ' + custom_style
                    else:
                        style = custom_style
                    self.driver.execute_script(
                        f"arguments[0].setAttribute('style','{style}')", vid)

            if elm.tag_name == 'a':
                all_children_by_xpath = elm.find_elements_by_tag_name("div")
                for child in all_children_by_xpath:
                    if child.is_displayed():
                        style = elm.get_attribute('style')
                        custom_style = f"background-color: {color} !important; "
                        if style:
                            style += '; ' + custom_style
                        else:
                            style = custom_style
                        self.driver.execute_script(
                            f"arguments[0].setAttribute('style','{style}')",
                            child)
        return True

    def calculate_element_area(self, xpath: str) -> Dict:
        """
        Selenium will try to find an element based on the `xpath`.
        If it is found, calculate the area that element occupies on the
        first screen (`area`) and on the whole page (`area_page`).
        If the element is wrapped or empty, return an empty dict.
        """
        # get the element based on the xpath
        try:
            elm = self.driver.find_element_by_xpath(xpath)
        except Exception:  # couldn't find element
            return {}

        # get dimensions of element
        rect = elm.rect

        # skip warped elements
        if rect['height'] >= self.warpped_height:
            return {'is_warpped': True}

        # adjust the dimensions by clipping if necessary. "Area" is the first screen
        if elm.is_displayed():
            area = calc_area(rect, location=rect, width=self.width,
                             height_bottom=self.height)
            area_page = calc_area(rect, location=rect, width=self.width)
            meta = {
                'xpath': xpath,
                'dimensions': elm.size,
                'location': elm.location,
                'area': area,
                'area_page': area_page,
            }
            return meta
        return {}  # element exists but is not displayed

    def open_local_html(self, fn):
        """Opens a local HTML page in the emulator."""
        local_file = 'file://' + os.path.abspath(fn)
        if self.driver.current_url != local_file:
            self.driver.get(local_file)

    def run(self):
        """
        This function must be overwritten in the inherited class.
        It should contain the following steps:
        1. Read either the current page on the driver or a local HTML file
           `fn` into bs4...
        2. Identify elements by sending the contents of the HTML through each
           parser in `parser_functions`. Do this by calling
           `self.identify_elements()` on the page.
        3. For each element, `self.calculate_element_area()`, and optionally
           `self.stain_element()` if self.stain = True.
        4. Assign `self.element_metadata` with the latest element metadata.
        And then anything else is up to you.
        """
        raise NotImplementedError
def parse_page(self, response):
    try:
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(800, 800))
        display.start()

        firefox_options = Options()
        firefox_options.add_argument('-headless')
        firefox_options.add_argument('--disable-gpu')
        driver = webdriver.Firefox(firefox_options=firefox_options,
                                   executable_path=settings.FIREFOX_PATH)
        driver.get(response.url)
        driver.implicitly_wait(100)

        # fill in the warehouse login form
        elem_code = driver.find_elements_by_id('WarehouseCode')
        elem_acode = driver.find_elements_by_id('AccountCode')
        elem_name = driver.find_elements_by_id('UserName')
        elem_pass = driver.find_elements_by_id('Password')
        btn_login = driver.find_elements_by_css_selector('input[name="Login"]')
        if elem_code:
            elem_code[0].send_keys('03')
        if elem_acode:
            elem_acode[0].send_keys('001862')
        if elem_name:
            elem_name[0].send_keys('MAXLEAD')
        if elem_pass:
            elem_pass[0].send_keys('1202HXML')
        btn_login[0].click()
        driver.implicitly_wait(100)
        time.sleep(5)

        total_page = driver.find_elements_by_css_selector(
            '#navigationTR nobr')[0].text
        total_page = int(total_page.split(' ')[-1])
        for i in range(total_page):
            try:
                res = driver.find_elements_by_css_selector(
                    '#ViewManyListTable tr')
                elem = driver.find_element_by_id('MetaData')
                elem.click()
                res.pop(0)  # drop the header row
                for val in res:
                    td_re = val.find_elements_by_tag_name('td')
                    if td_re:
                        sku = td_re[0].text
                        warehouse = 'Hanover'
                        if td_re[3].text and not td_re[3].text == ' ':
                            qty = td_re[3].text
                            qty = qty.replace(',', '')
                        else:
                            qty = 0
                        qty_sql = ("select id from mmc_stocks where "
                                   "commodity_repertory_sku='%s' and "
                                   "warehouse='%s'" % (sku, warehouse))
                        self.db_cur.execute(qty_sql)
                        self.db_cur.fetchone()
                        qty_re = self.db_cur.rowcount
                        values = (qty, sku, warehouse)
                        if qty_re > 0:
                            sql = ("update mmc_stocks set qty=%s where "
                                   "commodity_repertory_sku=%s and warehouse=%s")
                        else:
                            sql = ("insert into mmc_stocks (qty, "
                                   "commodity_repertory_sku, warehouse) "
                                   "values (%s, %s, %s)")
                        self.db_cur.execute(sql, values)
                if i < total_page - 1:
                    elem_next_page = driver.find_elements_by_id('Next')
                    if elem_next_page:
                        elem_next_page[0].click()
                        driver.implicitly_wait(100)
            except Exception:
                continue
        self.conn.commit()
        sql = ("update mmc_spider_status set status=3, description='' "
               "where warehouse='Hanover'")
        self.db_cur.execute(sql)
        self.conn.commit()
    except Exception as e:
        values = (str(e),)
        sql = ("update mmc_spider_status set status=2, description=%s "
               "where warehouse='Hanover'")
        self.db_cur.execute(sql, values)
        self.conn.commit()
        try:
            driver.refresh()
            driver.switch_to.alert.accept()
            driver.implicitly_wait(100)
        except Exception:
            pass
    display.stop()
    driver.quit()
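# The loop above emulates an upsert with a SELECT plus a rowcount check. If
# the backend is MySQL with a unique key over (commodity_repertory_sku,
# warehouse) -- an assumption, not confirmed by the original -- the same
# effect is a single statement; db_cur stands in for the spider's cursor:
sql = ("INSERT INTO mmc_stocks (qty, commodity_repertory_sku, warehouse) "
       "VALUES (%s, %s, %s) "
       "ON DUPLICATE KEY UPDATE qty = VALUES(qty)")
db_cur.execute(sql, (qty, sku, warehouse))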
def create_thumbnails(documents, workingdir='.', skip=0, login=None, s3bucket=None): try: display = None from pyvirtualdisplay import Display display = Display(visible=0, size=(1280, 1024)) display.start() except: print 'No Xvfb!' workingdir = workingdir.rstrip('/') print 'workingdir:', workingdir if not os.path.exists(workingdir): parser.error("workingdir not exists") thumbnail_folder = workingdir + "/thumbnails/" if not os.path.exists(thumbnail_folder): os.mkdir(thumbnail_folder) thumbnail_doubts_folder = workingdir + "/thumbnails_doubts/" if not os.path.exists(thumbnail_doubts_folder): os.mkdir(thumbnail_doubts_folder) if skip: print 'skip:', skip log_file = codecs.open(workingdir + '/error_log.txt', 'a', 'utf-8') file(workingdir + '/running_status.txt', 'w').write("crawling started") if os.path.exists(workingdir + '/running_firefox_pid.txt'): os.remove(workingdir + '/running_firefox_pid.txt') driver, browser_pid = crawlutils.open_driver() try: if browser_pid: file(workingdir + '/running_firefox_pid.txt', 'w').write("%s" % (browser_pid)) #crawlutils.login(driver, login_url, 'loginname-id', 'password-id', 'alc_acc', 'n0thinghaschanged') if login: print 'login_url:', login['login_url'] crawlutils.login(driver, login['login_url'], login['username-control-id'], login['password-control-id'], login['username'], login['password']) print len(documents), 'to be processed' count = 0 for (document_id, document_url) in documents: count += 1 if skip > count: continue print 'count:', count try: file(workingdir + '/running_status.txt', 'w').write( "%s\t%s\t%s" % (count, document_id, document_url)) create_thumbnail(driver, workingdir, document_id, document_url, s3bucket) except: traceback.print_exc() if driver: try: driver.quit() except: pass if os.path.exists(workingdir + '/running_firefox_pid.txt'): os.remove(workingdir + '/running_firefox_pid.txt') driver, browser_pid = crawlutils.open_driver() if browser_pid: print 'browser_pid:', browser_pid file(workingdir + '/running_firefox_pid.txt', 'w').write("%s" % (browser_pid)) if login: print 'login_url:', login['login_url'] crawlutils.login(driver, login['login_url'], login['username-control-id'], login['password-control-id'], login['username'], login['password']) time.sleep(5) print 'completed' except: traceback.print_exc() finally: if os.path.exists(workingdir + '/running_firefox_pid.txt'): os.remove(workingdir + '/running_firefox_pid.txt') if os.path.exists(workingdir + '/running_status.txt'): os.remove(workingdir + '/running_status.txt') try: driver.quit() except: pass del driver if display: display.stop() log_file.close()
class GithubLogin(unittest.TestCase):
    # Relies on module-level globals defined elsewhere in the file:
    # github_account, github_passwd, PlatformName, Top_build, SelectVersion,
    # SelectVersionA, pgk, User_Input, and the accumulator lists used below.

    def setUp(self):
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()
        self.driver = webdriver.Firefox(executable_path='./geckodriver')
        self.driver.implicitly_wait(30)
        self.base_url = "https://www.fabric.io"
        self.verificationErrors = []
        self.accept_next_alert = True

    def JSonFile(self, file):
        '''Load a JSON file; the argument is the file to read.
        Example: ADJson = JSonFile('536_default.json')
        '''
        with open(file, 'r', encoding='utf-8') as f:  # Python 3.5
            ADJson = json.load(f, object_pairs_hook=OrderedDict)
        return ADJson

    def Platform(self, PlatformName):
        PlatformCss = self.driver.find_elements_by_css_selector('.js-app-view span')
        PlatformNumber = 0
        print("Selected platform: " + PlatformName)
        for i in PlatformCss:
            try:
                if i.text == PlatformName:
                    PlatformCss[PlatformNumber].click()
            except Exception:
                pass
            PlatformNumber += 1
        time.sleep(5)

    def ClickCrashlytics(self):
        self.driver.find_element_by_css_selector(".crashlytics i").click()
        time.sleep(5)

    def EnterVersion(self, Version):
        print("Selected versions:")
        for i in range(len(Version)):
            print(Version[i])
        for i in range(len(Version)):
            VersionCheck = self.driver.find_elements_by_css_selector(".Select-arrow-zone span")
            VersionCheck[0].click()
            self.driver.find_element_by_class_name('Select-control').send_keys(Version[i] + '\n')
            time.sleep(3)

    def ClearSelectIcon(self):
        # clear the default selection
        self.driver.find_element_by_class_name('Select-value-icon').click()
        time.sleep(5)

    def SelectAll(self):
        # ClickAll = self.driver.find_elements_by_css_selector("#state-group-all")
        self.driver.find_element_by_id("state-group-all").click()
        time.sleep(5)
        print("Clicked All")

    def ReadAllUserSessions(self):
        All = self.driver.find_elements_by_css_selector('.crash-free-percent .stat .value')
        Name = self.driver.find_elements_by_css_selector('.crash-free-percent .stat .name')
        for i in All:
            AllUserSessions.append(i.text)
        for i in Name:
            AllUserSessionsName.append(i.text)

    def MoveWeb(self):
        # scroll down; originally meant to stop once 150 entries are loaded
        print("Scrolling the page, please wait...")
        # AllNumber = self.driver.find_elements_by_css_selector(".events-stat span span")
        for i in range(5):
            # if int(len(AllNumber)) == 180:
            #     pass
            # else:
            js = "var q=document.documentElement.scrollTop=10000"
            self.driver.execute_script(js)
            time.sleep(5)

    def ReadUrl(self):
        # collect the issue URLs
        print("Start --> reading URLs")
        URLNumber = self.driver.find_elements_by_css_selector(".cell-title a")
        for i in URLNumber:
            URL.append(i.get_attribute("href"))
            URLTitle.append("URL")
        print("Done --> reading URLs")

    def ReadCrashNumber(self):
        # read the crash counts
        print("*" * 10)
        print("Start --> reading crash counts")
        CrashNumber = self.driver.find_elements_by_css_selector(".events-stat span span")
        for i in CrashNumber:
            if i.text in ('CRASHES', 'CRASH', ''):
                pass
            else:
                Crash.append(i.text)
                CrashTitle.append("Crash")
        print("Done --> reading crash counts")
        print("*" * 10)

    def ReadUserNumber(self):
        # read the user counts
        print("Start --> reading user counts")
        UserNumber = self.driver.find_elements_by_css_selector(".devices-stat span span")
        for i in UserNumber:
            if i.text in ('USERS', 'USER', ''):
                pass
            else:
                User.append(i.text)
                UserTitle.append("User")
        print("Done --> reading user counts")
        print("*" * 10)

    def ReadVersionNumber(self):
        # read the affected version numbers
        print("Start --> reading versions")
        VersionNumber = self.driver.find_elements_by_css_selector(".more-info")
        for i in VersionNumber:
            Version.append(i.text)
            VersionTitle.append("Version")
        print("Done --> reading versions")
        print("*" * 10)

    def ReadIssueNumber(self):
        # issue numbers
        print("Start --> reading issue numbers")
        IssueNumberTest = self.driver.find_elements_by_css_selector(".issue-number")
        for i in IssueNumberTest:
            IssueNumber.append(i.text)
            IssueNumberTitle.append("IssueNumber")
        print("Done --> reading issue numbers")
        print("*" * 10)

    def ReadIssueTitle(self):
        # issue titles
        print("Start --> reading issue titles")
        IssueTitleTest = self.driver.find_elements_by_css_selector(".issue-title")
        for i in IssueTitleTest:
            IssueTitle.append(i.text)
            IssueTitleTitle.append("IssueTitle")
        print("Done --> reading issue titles")
        print("*" * 10)

    def ReadIssueSubtitle(self):
        # issue subtitles
        print("Start --> reading issue subtitles")
        IssueSubtitleTest = self.driver.find_elements_by_css_selector(".issue-subtitle")
        for i in IssueSubtitleTest:
            IssueSubtitle.append(i.text)
            IssueSubtitleTitle.append("IssueSubtitle")
        print("Done --> reading issue subtitles")
        print("*" * 10)

    def ReadAllNumber(self):
        IssueAllNumber = len(IssueSubtitle)
        x = 1
        for i in range(IssueAllNumber):
            TestAll.append(x)
            TestAllTitle.append("Rank")
            x += 1

    def Get_RecentActivity(self):
        self.DefaultValue()
        RecentActivity = self.driver.find_elements_by_css_selector(".padding-left-15px tbody td")
        # print(len(RecentActivityOccurrences))
        # table cells alternate between version and occurrence columns
        x = 1
        for i in RecentActivity:
            if x % 2 == 0:
                RecentActivityOccurrences.append(i.text)
                RecentActivityOccurrencesTitle.append('Occurrences')
            else:
                RecentActivityVersion.append(i.text)
                RecentActivityVersionTitle.append('Version')
            x += 1

    def ListToJsonFile(self, FileName):
        print("Start --> converting data to JSON")
        for i in range(len(IssueNumber)):
            # copy the scraped values into the output lists
            IssueNumberA.append(IssueNumber[i])
            IssueTitleA.append(IssueTitle[i])
            IssueSubtitleA.append(IssueSubtitle[i])
            VersionA.append(Version[i])
            CrashA.append(Crash[i])
            UserA.append(User[i])
            URLA.append(URL[i])
            TestAllA.append(TestAll[i])
            # zip each title list with its value list into a dict
            TestAllDict = OrderedDict(zip(TestAllTitle, TestAllA))
            IssueNumberDict = OrderedDict(zip(IssueNumberTitle, IssueNumberA))
            IssueTitleDict = OrderedDict(zip(IssueTitleTitle, IssueTitleA))
            IssueSubtitleDict = OrderedDict(zip(IssueSubtitleTitle, IssueSubtitleA))
            VersionDict = OrderedDict(zip(VersionTitle, VersionA))
            CrashDict = OrderedDict(zip(CrashTitle, CrashA))
            UserDict = OrderedDict(zip(UserTitle, UserA))
            URLDict = OrderedDict(zip(URLTitle, URLA))
            # merge the per-field dicts into one record per update
            TestAllDict.update(IssueNumberDict)
            TestAllDict.update(IssueTitleDict)
            TestAllDict.update(IssueSubtitleDict)
            TestAllDict.update(VersionDict)
            TestAllDict.update(CrashDict)
            TestAllDict.update(UserDict)
            TestAllDict.update(URLDict)
            data.append(TestAllDict)
        dataDict['data'] = data
        # dump the dict to JSON
        with open(FileName, 'w') as f:
            json.dump(dataDict, f)
        print("Done --> converting data to JSON")
        print("*" * 10)
        print("See " + FileName)

    def ListToJsonFile_Crash(self, FileName):
        print("Start --> converting data to JSON")
        User_Input.Version.append('All Version')
        items = 0
        pair_count = len(AllUserSessions) // 2
        user_count = len(GetUserNumberTest)
        for i in range(pair_count):
            # append the crash-free figures; two values are scraped per version
            # (the else branch runs on the first pass, the i >= 1 branch afterwards)
            if i >= 1:
                items += 1
                AllUserSessionsA.append(AllUserSessions[i + items])
                AllUserSessionsNameA.append(AllUserSessionsName[i + items])
                AllUserSessionsA.append(AllUserSessions[i + items + 1])
                AllUserSessionsNameA.append(AllUserSessionsName[i + items + 1])
            else:
                AllUserSessionsA.append(AllUserSessions[i])
                AllUserSessionsNameA.append(AllUserSessionsName[i])
                AllUserSessionsA.append(AllUserSessions[i + 1])
                AllUserSessionsNameA.append(AllUserSessionsName[i + 1])
            # check the user count; beyond the scraped range nothing is appended
            if i > (user_count - 1):
                # AllUserSessionsA.append('Null')
                # AllUserSessionsNameA.append('User')
                pass
            else:
                AllUserSessionsA.append(GetUserNumberTest[i])
                AllUserSessionsNameA.append('User')
            Sessions = OrderedDict(zip(AllUserSessionsNameA, AllUserSessionsA))
            SessionsA.append(Sessions)
        Get_crash_free_session = OrderedDict(zip(User_Input.Version, SessionsA))
        # dump the dict to JSON
        with open(FileName, 'w') as f:
            json.dump(Get_crash_free_session, f)
        print("Done --> converting data to JSON")
        print("*" * 10)
        print("See " + FileName)

    def test_Read_Fabric(self):
        print('Top build version query raw data')
        driver = self.driver
        driver.get(self.base_url + "/login")
        driver.find_element_by_id("email").clear()
        driver.find_element_by_id("email").send_keys(github_account)
        driver.find_element_by_id("password").clear()
        driver.find_element_by_id("password").send_keys(github_passwd)
        driver.find_element_by_class_name("sign-in").click()
        time.sleep(5)
        # iOS or Android
        self.Platform(PlatformName)  # Sean
        self.ClickCrashlytics()
        self.EnterVersion(Top_build)  # Sean
        self.ClearSelectIcon()
        self.SelectAll()
        self.ReadAllUserSessions()
        self.MoveWeb()
        self.ReadUrl()
        self.ReadCrashNumber()
        self.ReadUserNumber()
        self.ReadVersionNumber()
        self.ReadIssueNumber()
        self.ReadIssueTitle()
        self.ReadIssueSubtitle()
        self.ReadAllNumber()
        self.ListToJsonFile('Top_build_Fabric.json')
        print("Get Recent Activity")
        time.sleep(2)
        ADJson = self.JSonFile('Top_build_Fabric.json')
        # driver.get(ADJson['data'][0]['URL'])
        # self.Get_RecentActivity()
        for i in range(len(ADJson['data'])):
            driver.get(ADJson['data'][i]['URL'])
            self.Get_RecentActivity()
            for j in range(len(RecentActivityOccurrences)):
                RecentActivityOccurrencesA.append(RecentActivityOccurrences[j])
                RecentActivityVersionA.append(RecentActivityVersion[j])
                # zip the title lists with the value lists into dicts
                RecentActivityOccurrencesDict = OrderedDict(
                    zip(RecentActivityOccurrencesTitle, RecentActivityOccurrencesA))
                RecentActivityVersionDict = OrderedDict(
                    zip(RecentActivityVersionTitle, RecentActivityVersionA))
                # merge into a single record per update
                RecentActivityVersionDict.update(RecentActivityOccurrencesDict)
                RecentActivity.append(RecentActivityVersionDict)
            RecentActivityDict['RecentActivity'] = RecentActivity
            ADJson['data'][i].update(RecentActivityDict)
            # RecentActivityDict = {}
        with open('Top_build_Fabric.json', 'w') as f:
            json.dump(ADJson, f)
        print("Done --> converting data to JSON")
        print("*" * 10)
        print("See " + 'Top_build_Fabric.json')

    def test_Crash_Top(self):
        print('Get crash-free session only')
        driver = self.driver
        driver.get(self.base_url + "/login")
        driver.find_element_by_id("email").clear()
        driver.find_element_by_id("email").send_keys(github_account)
        driver.find_element_by_id("password").clear()
        driver.find_element_by_id("password").send_keys(github_passwd)
        driver.find_element_by_class_name("sign-in").click()
        driver.save_screenshot('Mark.png')
        time.sleep(5)
        self.Platform(PlatformName)  # Sean
        self.ClickCrashlytics()
        for i in range(len(SelectVersion)):
            SelectVersionA.append(SelectVersion[i])
            self.EnterVersion(SelectVersionA)  # Sean
            self.ClearSelectIcon()
            self.ReadAllUserSessions()
            # self.GetGoodAdoptionURLfunction()
            SelectVersionA.pop()
        # read "All Version"
        print("Selected version:\nAll Version")
        self.ClearSelectIcon()
        self.ReadAllUserSessions()
        # GetUserNumberTest.append("Null")
        self.GetGoodAdoptionUserNumber()
        # self.GetGoodAdoptionURLfunction()
        #
        # for i in range(len(GetGoodAdoptionURLTest)):
        #
        #     if GetGoodAdoptionURLTest[i] is 'Null':
        #         GetUserNumberTest.append('Null')
        #     else:
        #         self.driver.get(GetGoodAdoptionURLTest[i])
        #         time.sleep(15)
        #         self.driver.save_screenshot(str([i]) + 'Mark.png')
        #         self.GetGoodAdoptionUserNumber()
        # query the crash status of the leading versions
        self.ListToJsonFile_Crash('Fabric.json')

    def GetGoodAdoptionUserNumber(self):
        UserURL = ("https://www.fabric.io/photogrid/android/apps/" + pgk +
                   "/dashboard/latest_release/launch_status?build=")
        UserURLAll = ("https://www.fabric.io/photogrid/android/apps/" + pgk +
                      "/dashboard/latest_release/launch_status?build=all")
        print("Selected versions:")
        for i in range(len(SelectVersion)):
            print(SelectVersion[i])
        for i in range(len(SelectVersion)):
            self.driver.get(UserURL + SelectVersion[i])
            time.sleep(5)
            GetUserNumber = self.driver.find_elements_by_css_selector(
                ".coverage-section .flex-1 .flex-1 .large")
            print('GetUserNumber : ' + str(GetUserNumber))
            x = 0
            for y in GetUserNumber:
                x += 1
                if x == 1:
                    GetUserNumberTest.append(str(y.text))
                    print("get user")
                    print(str(y.text))
            time.sleep(3)
        self.driver.get(UserURLAll)
        time.sleep(5)
        GetUserNumber = self.driver.find_elements_by_css_selector(
            ".coverage-section .flex-1 .flex-1 .large")
        x = 0
        for y in GetUserNumber:
            x += 1
            if x == 1:
                GetUserNumberTest.append(str(y.text))
                print("get user")
                print(str(y.text))

    def GetGoodAdoptionURLfunction(self):
        GetGoodAdoptionURL = self.driver.find_elements_by_css_selector('.flex-1 .answers-link')
        TestList = []
        # if there is no link, fall back to 'Null'
        if GetGoodAdoptionURL == TestList:
            GetGoodAdoptionURLTest.append('Null')
        for i in GetGoodAdoptionURL:
            GetGoodAdoptionURLTest.append(i.get_attribute("href"))
            print("get href")
            print(i.get_attribute("href"))

    def DefaultValue(self):
        global RecentActivityOccurrencesA, RecentActivityVersionA, \
            RecentActivityOccurrences, RecentActivityVersion, \
            RecentActivityDict, RecentActivity
        RecentActivityOccurrencesA = []
        RecentActivityVersionA = []
        RecentActivityOccurrences = []
        RecentActivityVersion = []
        RecentActivityDict = {}
        RecentActivity = []

    def tearDown(self):
        self.driver.quit()
        self.display.stop()
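# A minimal sketch of how the suite above might be driven; the unittest calls
# are standard, but the ordering (raw-data test before the crash-free test,
# since both lean on the shared module-level lists) is an assumption about
# intended usage, not something stated in the original.
import unittest

def run_fabric_suite():
    # run the Fabric scraper tests in a fixed order under the virtual display
    suite = unittest.TestSuite()
    suite.addTest(GithubLogin('test_Read_Fabric'))
    suite.addTest(GithubLogin('test_Crash_Top'))
    unittest.TextTestRunner(verbosity=2).run(suite)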
def check_assignments():
    write_html('Checking for new Assignments...')
    driver = ''
    display = ''
    try:
        display = Display(visible=0, size=(1920, 1080))
        display.start()
        # auto-logon via Firefox plugin
        profile = webdriver.FirefoxProfile(
            profile_directory=r"/home/Python_User/LogFiles/ATSNoticesAndCancelledLectures/RequiredFiles/SeleniumProfile/16ykebtq.Seleniumprofile"
        )
        profile.add_extension(
            r"/home/Python_User/LogFiles/ATSNoticesAndCancelledLectures/RequiredFiles/SeleniumProfile/seleniumDriver/autoauth-2.1-fx+fn.xpi"
        )
        driver = webdriver.Firefox(firefox_profile=profile)
        driver.get(ATS_Asgt_url)
        sleep(2)
        driver.find_element_by_xpath('//*[@id="cmbSemester"]/option[2]').click()
        sleep(2)
        current_assignments = []
        file = open(
            r"/home/Python_User/LogFiles/ATSNoticesAndCancelledLectures/RequiredFiles/ATS_assignments.txt",
            "a+")
        file.seek(0)
        for ln in file:
            current_assignments.append(ln.strip('\n'))
        del ln
        try:
            table_id = driver.find_element_by_id('dgMaterialVerification')
            rows = table_id.find_elements_by_tag_name('tr')
            for row in rows:
                unt = row.find_elements_by_tag_name("td")[3]
                col = row.find_elements_by_tag_name("td")[5]
                st_ty = row.find_elements_by_tag_name("td")[7]
                unit_name = unt.text
                ass_title = col.text
                ass_sit_type = st_ty.text
                assignment = ass_title + ' ' + ass_sit_type
                if assignment not in current_assignments:
                    file.write(assignment + '\n')
                    write_html('ATS Assignment Information Collected\n')
                    email_message = ('You have New Assignments on ATS!\n\t'
                                     + 'Assignment Title: ' + ass_title + ', '
                                     + ass_sit_type + ' (' + unit_name
                                     + ')\n\nATS link: ' + ATS_Asgt_url)
                    group_post(str(email_message), 'GROUP ID NO ', "ATS_Assignments")
                    del email_message
            del row, col, ass_title, table_id, rows
        except Exception:
            pass
        file.close()
        driver.quit()
        display.stop()
    except Exception as err2:
        # the driver/display may not have been created if setup itself failed
        if driver:
            driver.quit()
        if display:
            display.stop()
        write_html('\nError Occurred!!')
        status2, err_msg2 = update_log(str(err2), 'Error Origin: ATS Assignment Script')
        write_html(status2)
        del status2
        del err2
        del err_msg2
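# check_assignments() is written as a one-shot job; a minimal scheduling
# sketch, assuming it is driven by a simple polling loop rather than cron.
# The 15-minute interval is an assumed value, not taken from the original.
import time as _time

def poll_assignments(interval_seconds=900):
    # run the ATS check forever, sleeping between passes
    while True:
        check_assignments()
        _time.sleep(interval_seconds)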
def do_selenium(url, user_agent, domain, source):
    # start up the virtual display
    display = Display(visible=0, size=(1366, 768))
    display.start()
    # start up the browser with a spoofed user agent
    profile = webdriver.FirefoxProfile()
    profile.set_preference("general.useragent.override", user_agent)
    browser = webdriver.Firefox(firefox_profile=profile)
    browser.set_page_load_timeout(15)
    try:
        with Timeout(60):
            browser.get(url)
    except Timeout.Timeout:
        print bcolors.WARNING + " [-] " + url + " has timed out. :(" + bcolors.ENDC
        # clean up before bailing out
        browser.quit()
        display.stop()
        return False
    except Exception:
        e = sys.exc_info()[0]
        print bcolors.WARNING + " [-] " + url + " has errored: %s" % e + bcolors.ENDC
        browser.quit()
        display.stop()
        return False
    # accept a popup alert if one comes up
    try:
        alert = browser.switch_to.alert
        print "\n[+] Popup alert observed: %s\n" % alert.text
        if re.search(
                "(?:requesting your username|zeus|call microsoft|call apple|call support)",
                alert.text, re.IGNORECASE):
            print "\n [-] This looks like it might be a tech-support-scam user/password popup, leaving it alone."
        else:
            alert.accept()
            print "[+] Popup alert observed, bypassing..."
    except Exception:
        pass
    # grab the page source to skip parked pages and other content we don't care about
    try:
        pagesource = browser.page_source
    except Exception:
        browser.quit()
        display.stop()
        return False
    # take the screencap and sort it into known TP, known FP, or unknown
    try:
        pagetitle = browser.title.lower()
        shot_name = time.strftime("%Y%m%d-%H%M%S") + '-' + source + '-' + domain + '.png'
        try:
            browser.save_screenshot(shot_name)
            print " [+] Screencapped %s as %s" % (url, shot_name)
        except Exception:
            print bcolors.FAIL + " [-] Unable to screencap " + url + bcolors.ENDC
    except Exception:
        print bcolors.FAIL + " [-] An error occurred, unable to screencap " + url + bcolors.ENDC
    # screencaps.close()
    browser.quit()
    display.stop()
    return True
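# A minimal driver loop for do_selenium(), assuming a feed of (domain, url)
# pairs; the user-agent string and the 'osint' source label are hypothetical
# placeholders, not values from the original script.
def screencap_feed(pairs):
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # assumed UA
    for domain, url in pairs:
        ok = do_selenium(url, ua, domain, 'osint')
        if not ok:
            print " [-] skipped %s" % domain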
class LowLevelAPI(object):
    """low level api to interface with the service"""

    def __init__(self, brow="firefox"):
        self.brow_name = brow
        self.positions = []
        self.movements = []
        self.stocks = []
        # init globals
        Glob()

    def launch(self):
        """launch the virtual display and the browser; call this first"""
        try:
            # init the virtual display
            self.vbro = Display()
            self.vbro.start()
            logger.debug("virtual display launched")
        except Exception:
            raise exceptions.VBroException()
        try:
            self.browser = Browser(self.brow_name)
            logger.debug(f"browser {self.brow_name} launched")
        except Exception:
            raise exceptions.BrowserException(self.brow_name, "failed to launch")
        return True

    def css(self, css_path, dom=None):
        """css find function abbreviation"""
        if dom is None:
            dom = self.browser
        return expect(dom.find_by_css, args=[css_path])

    def css1(self, css_path, dom=None):
        """return the first value of self.css"""
        if dom is None:
            dom = self.browser

        def _css1(path, domm):
            """local helper"""
            return self.css(path, domm)[0]

        return expect(_css1, args=[css_path, dom])

    def search_name(self, name, dom=None):
        """name find function abbreviation"""
        if dom is None:
            dom = self.browser
        return expect(dom.find_by_name, args=[name])

    def xpath(self, xpath, dom=None):
        """xpath find function abbreviation"""
        if dom is None:
            dom = self.browser
        return expect(dom.find_by_xpath, args=[xpath])

    def elCss(self, css_path, dom=None):
        """check if element is present by css"""
        if dom is None:
            dom = self.browser
        return expect(dom.is_element_present_by_css, args=[css_path])

    def elXpath(self, xpath, dom=None):
        """check if element is present by xpath"""
        if dom is None:
            dom = self.browser
        return expect(dom.is_element_present_by_xpath, args=[xpath])

    def login(self, username, password, mode="demo"):
        """login function"""
        url = "https://trading212.com/it/login"
        try:
            logger.debug(f"visiting {url}")
            self.browser.visit(url)
            logger.debug(f"connected to {url}")
        except selenium.common.exceptions.WebDriverException:
            logger.critical("connection timed out")
            raise
        try:
            self.search_name("login[username]").fill(username)
            self.search_name("login[password]").fill(password)
            self.css1(path['log']).click()
            # define a timeout for logging in
            timeout = time.time() + 30
            while not self.elCss(path['logo']):
                if time.time() > timeout:
                    logger.critical("login failed")
                    raise CredentialsException(username)
                time.sleep(1)
            logger.info(f"logged in as {username}")
            # on weekends the demo site shows a trading alert-box: close it
            if mode == "demo" and datetime.now().isoweekday() in range(5, 8):
                timeout = time.time() + 10
                while not self.elCss(path['alert-box']):
                    if time.time() > timeout:
                        logger.warning("weekend trading alert-box not closed")
                        break
                if self.elCss(path['alert-box']):
                    self.css1(path['alert-box']).click()
                    logger.debug("weekend trading alert-box closed")
        except Exception as e:
            logger.critical("login failed")
            raise exceptions.BaseExc(e)
        return True

    def logout(self):
        """logout func (quit browser)"""
        try:
            self.browser.quit()
        except Exception:
            raise exceptions.BrowserException(self.brow_name, "not started")
        self.vbro.stop()
        logger.info("logged out")
        return True

    def get_bottom_info(self, info):
        accepted_values = {
            'free_funds': 'equity-free',
            'account_value': 'equity-total',
            'live_result': 'equity-ppl',
            'used_margin': 'equity-margin'
        }
        try:
            info_label = accepted_values[info]
            val = self.css1("div#%s span.equity-item-value" % info_label).text
            return num(val)
        except KeyError as e:
            raise exceptions.BaseExc(e)

    def get_price(self, name):
        soup = BeautifulSoup(
            self.css1("div.scrollable-area-content").html, "html.parser")
        for product in soup.select("div.tradebox"):
            fullname = product.select("span.instrument-name")[0].text.lower()
            if name.lower() in fullname:
                mark_closed_list = [
                    x for x in product.select("div.quantity-list-input-wrapper")
                    if x.select("div.placeholder")[0].text.lower().find("close") != -1
                ]
                if mark_closed_list:
                    sell_price = product.select("div.tradebox-price-sell")[0].text
                    return float(sell_price)
                else:
                    return False

    class MovementWindow(object):
        """add movement window"""

        def __init__(self, api, product):
            self.api = api
            self.product = product
            self.state = 'initialized'
            self.insfu = False

        def open(self, name_counter=None):
            """open the window"""
            if self.api.css1(path['add-mov']).visible:
                self.api.css1(path['add-mov']).click()
            else:
                self.api.css1('span.dataTable-no-data-action').click()
            logger.debug("opened window")
            self.api.css1(path['search-box']).fill(self.product)
            if self.get_result(0) is None:
                self.api.css1(path['close']).click()
                raise exceptions.ProductNotFound(self.product)
            result, product = self.search_res(self.product, name_counter)
            result.click()
            if self.api.elCss("div.widget_message"):
                self.decode(self.api.css1("div.widget_message"))
            self.product = product
            self.state = 'open'

        def _check_open(self):
            if self.state == 'open':
                return True
            else:
                raise exceptions.WindowException()

        def close(self):
            """close the movement window"""
            self._check_open()
            self.api.css1(path['close']).click()
            self.state = 'closed'
            logger.debug("closed window")

        def confirm(self):
            """confirm the movement"""
            self._check_open()
            self.get_price()
            self.api.css1(path['confirm-btn']).click()
            widg = self.api.css("div.widget_message")
            if widg:
                self.decode(widg[0])
                raise exceptions.WidgetException(widg)
            # was `all(x for x in [...] if hasattr(self, x))`, which is
            # trivially True when neither attribute exists
            if all(hasattr(self, x) for x in ['quantity', 'mode']):
                self.api.movements.append(
                    Movement(self.product, self.quantity, self.mode, self.price))
                logger.debug("%s movement appended to the list" % self.product)
            self.state = 'conclused'
            logger.debug("confirmed movement")

        def search_res(self, res, check_counter=None):
            """search the result list for res; return (result, name)"""
            logger.debug("searching result")
            result = self.get_result(0)
            name = self.get_research_name(result)
            x = 0
            while not self.check_name(res, name, counter=check_counter):
                x += 1
                result = self.get_result(x)
                name = self.get_research_name(result)
                if name is None:
                    self.api.css1(path['close']).click()
                    raise exceptions.ProductNotFound(res)
                logger.debug(name)
            logger.debug("found product at position %d" % (x + 1))
            # the original returned a bare result here in one branch, which
            # broke the `result, product = ...` unpacking in open()
            return result, name

        def check_name(self, name, string, counter=None):
            """check that name occurs in string; if counter is given and
            also occurs, reject the match"""
            name = name.lower()
            string = string.lower()
            if counter is None:
                return name in string
            counter = counter.lower()
            if name in string and counter in string:
                logger.debug("check_name: counter found in string")
                return False
            elif name in string and counter not in string:
                return True
            else:
                return False

        def get_research_name(self, res):
            """return the name of a result"""
            if res is None:
                return None
            return self.api.css1("span.instrument-name", res).text

        def get_result(self, pos):
            """get the result at pos, where 0 is the first"""
            evalxpath = path['res'] + f"[{pos + 1}]"
            try:
                res = self.api.xpath(evalxpath)[0]
                return res
            except Exception:
                return None

        def set_limit(self, category, mode, value):
            """set limit in movement window"""
            self._check_open()
            if (mode not in ["unit", "value"]
                    or category not in ["gain", "loss", "both"]):
                raise ValueError()
            if not hasattr(self, 'stop_limit'):
                self.stop_limit = {'gain': {}, 'loss': {}}
                logger.debug("initialized stop_limit")
            if category == 'gain':
                self.api.xpath(path['limit-gain-%s' % mode])[0].fill(str(value))
            elif category == 'loss':
                self.api.xpath(path['limit-loss-%s' % mode])[0].fill(str(value))
            if category != 'both':
                self.stop_limit[category]['mode'] = mode
                self.stop_limit[category]['value'] = value
            elif category == 'both':
                self.api.xpath(path['limit-gain-%s' % mode])[0].fill(str(value))
                self.api.xpath(path['limit-loss-%s' % mode])[0].fill(str(value))
                for cat in ['gain', 'loss']:
                    self.stop_limit[cat]['mode'] = mode
                    self.stop_limit[cat]['value'] = value
            logger.debug("set limit")

        def decode(self, message):
            """decode a pop-up message"""
            title = self.api.css1("div.title", message).text
            text = self.api.css1("div.text", message).text
            if title == "Insufficient Funds":
                self.insfu = True
            elif title == "Maximum Quantity Limit":
                raise exceptions.MaxQuantLimit(num(text))
            elif title == "Minimum Quantity Limit":
                raise exceptions.MinQuantLimit(num(text))
            logger.debug("decoded message")

        def decode_update(self, message, value, mult=0.1):
            """decode the pop-up and update the value accordingly"""
            # read the text first, so it is available in the except branch
            # (the original read it inside the try, risking a NameError)
            msg_text = self.api.css1("div.text", message).text
            try:
                # the message may simply carry the new value
                return num(msg_text)
            except Exception:
                if msg_text.lower().find("higher") != -1:
                    # "higher" hint: bump the value by mult
                    value += value * mult
                    return value
                else:
                    self.decode(message)
                    return None

        def get_mov_margin(self):
            """get the margin of the movement"""
            self._check_open()
            return num(self.api.css1("span.cfd-order-info-item-value").text)

        def set_mode(self, mode):
            """set mode (buy or sell)"""
            self._check_open()
            if mode not in ["buy", "sell"]:
                raise ValueError()
            self.api.css1(path[mode + '-btn']).click()
            self.mode = mode
            logger.debug("mode set")

        def get_quantity(self):
            """get the current quantity"""
            self._check_open()
            quant = int(num(self.api.css1(path['quantity']).value))
            self.quantity = quant
            return quant

        def set_quantity(self, quant):
            """set quantity"""
            self._check_open()
            self.api.css1(path['quantity']).fill(str(int(quant)))
            self.quantity = quant
            logger.debug("quantity set")

        def get_price(self, mode='buy'):
            """get current price"""
            if mode not in ['buy', 'sell']:
                raise ValueError()
            self._check_open()
            price = num(
                self.api.css1("div.orderdialog div.tradebox-price-%s" % mode).text)
            self.price = price
            return price

        def get_unit_value(self):
            """get the unit value of the stock based on margin, memoized"""
            # look it up in the collection first
            try:
                unit_value = Glob().theCollector.collection['unit_value']
                unit_value_res = unit_value[self.product]
                logger.debug("unit_value found in the collection")
                return unit_value_res
            except KeyError:
                logger.debug("unit_value not found in the collection")
            pip = get_pip(mov=self)
            quant = 1 / pip
            if hasattr(self, 'quantity'):
                old_quant = self.quantity  # was `==`, a no-op comparison
            self.set_quantity(quant)
            # give the site time to update
            time.sleep(0.5)
            margin = self.get_mov_margin()
            logger.debug(f"quant: {quant} - pip: {pip} - margin: {margin}")
            if 'old_quant' in locals():
                self.set_quantity(old_quant)
            unit_val = margin / quant
            self.unit_value = unit_val
            Glob().unit_valueHandler.add_val({self.product: unit_val})
            return unit_val

    def new_mov(self, name):
        """factory method: create a MovementWindow bound to this api"""
        return self.MovementWindow(self, name)

    class Position(PurePosition):
        """position object"""

        def __init__(self, api, html_div):
            """initialize from the html div of the positions table"""
            self.api = api
            if isinstance(html_div, str):
                self.soup_data = BeautifulSoup(html_div, 'html.parser')
            else:
                self.soup_data = html_div
            self.product = self.soup_data.select("td.name")[0].text
            self.quantity = num(self.soup_data.select("td.quantity")[0].text)
            if ("direction-label-buy" in
                    self.soup_data.select("td.direction")[0].span['class']):
                self.mode = 'buy'
            else:
                self.mode = 'sell'
            self.price = num(self.soup_data.select("td.averagePrice")[0].text)
            self.margin = num(self.soup_data.select("td.margin")[0].text)
            self.id = self.find_id()

        def update(self, soup):
            """update the soup"""
            self.soup_data = soup
            return soup

        def find_id(self):
            """find the position ID in the given data"""
            pos_id = self.soup_data['id']
            self.id = pos_id
            return pos_id

        @property
        def close_tag(self):
            """obtain the close tag"""
            return f"#{self.id} div.close-icon"

        def close(self):
            """close the position via its close tag"""
            self.api.css1(self.close_tag).click()
            try:
                self.api.xpath(path['ok_but'])[0].click()
            except selenium.common.exceptions.ElementNotInteractableException:
                if (self.api.css1('.widget_message div.title').text
                        == 'Market Closed'):
                    logger.error("market closed, position can't be closed")
                    raise exceptions.MarketClosed()
                raise exceptions.WidgetException(
                    self.api.css1('.widget_message div.text').text)
            # wait until the row disappears, with a timeout
            timeout = time.time() + 10
            while self.api.elCss(self.close_tag):
                time.sleep(0.1)
                if time.time() > timeout:
                    raise TimeoutError("failed to close pos %s" % self.id)
            logger.debug("closed pos %s" % self.id)

        def get_gain(self):
            """get the current profit"""
            gain = num(self.soup_data.select("td.ppl")[0].text)
            self.gain = gain
            return gain

        def bind_mov(self):
            """bind the corresponding movement"""
            logger = logging.getLogger("tradingAPI.low_level.bind_mov")
            mov_list = [
                x for x in self.api.movements
                if x.product == self.product and x.quantity == self.quantity
                and x.mode == self.mode
            ]
            if not mov_list:
                logger.debug("fail: mov not found")
                return None
            else:
                logger.debug("success: found movement")
            for x in mov_list:
                # accept a price within +/- 1% of the position price
                max_roof = self.price + self.price * 0.01
                min_roof = self.price - self.price * 0.01
                if min_roof < x.price < max_roof:
                    logger.debug("success: price corresponding")
                    # bind the movement
                    self.mov = x
                    return x
                else:
                    logger.debug("fail: price %f not corresponding to %f"
                                 % (self.price, x.price))
                    continue
            # if nothing matched, return None
            return None

    def new_pos(self, html_div):
        """factory method: create a Position and bind its movement"""
        pos = self.Position(self, html_div)
        pos.bind_mov()
        self.positions.append(pos)
        return pos