def setUpClass(cls):
    """Start one shared headless Chrome session for the whole test class."""
    options = ChromeOptions()
    options.headless = True
    # Flags needed for containerized/CI environments.
    for flag in ('--no-sandbox', '--disable-gpu', '--privileged',
                 '--window-size=1920,1080'):
        options.add_argument(flag)
    # The driver path can be overridden through the environment.
    chromedriver_executable = os.environ.get(
        'TEST_CHROMEDRIVER', '/usr/lib/chromium-browser/chromedriver')
    cls.selenium = webdriver.Chrome(
        executable_path=chromedriver_executable, chrome_options=options)
    cls.selenium.implicitly_wait(10)
    super(SeleniumTest, cls).setUpClass()
    cls.server_url = cls.live_server_url
def setUpClass(cls):
    """Create the class-level webdriver (Chrome preferred, Firefox fallback)
    and pick up newsserver credentials from the environment if present."""
    # We try Chrome, fallback to Firefox
    try:
        driver_options = ChromeOptions()
        if "CI" in os.environ:
            # Headless on Appveyor/Travis
            driver_options.add_argument("--headless")
            driver_options.add_argument("--no-sandbox")
        cls.driver = webdriver.Chrome(chrome_options=driver_options)
    except WebDriverException:
        driver_options = FirefoxOptions()
        if "CI" in os.environ:
            # Headless on Appveyor/Travis
            driver_options.headless = True
        cls.driver = webdriver.Firefox(firefox_options=driver_options)
    # Get the newsserver-info, if available
    if "SAB_NEWSSERVER_HOST" in os.environ:
        for attr, env_key in (
                ("newsserver_host", "SAB_NEWSSERVER_HOST"),
                ("newsserver_user", "SAB_NEWSSERVER_USER"),
                ("newsserver_password", "SAB_NEWSSERVER_PASSWORD")):
            setattr(cls, attr, os.environ[env_key])
def _prepare_webdriver():
    """Return a headless Chrome webdriver, downloading the driver binary on demand."""
    opts = Options()
    opts.headless = True
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(driver_path, chrome_options=opts)
def post(self):
    """Scrape a standings table from the URL given in the request body.

    Tries the NHL standings layout first, then a generic <thead>/<tbody>
    (NFL-style) table.  Returns a list of {stat: value, 'tag_key': name}
    dicts on success, or a {"message": ...} dict on failure.
    """
    args = Scraper.parser.parse_args()
    options = Options()
    options.headless = True
    try:
        # the chromedriver executable should be installed in the src folder
        driver = webdriver.Chrome('./chromedriver', chrome_options=options)
    except Exception:
        return {"message": "Error getting chrome driver, is it installed?"}
    try:
        driver.get(args['url'])
    except Exception:
        return {"message": "Invalid URL"}
    time.sleep(5)  # let the page render its JS-driven content
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): the driver is never quit; consider driver.quit() before
    # returning once behavior change is acceptable.

    # --- Attempt 1: NHL standings layout --------------------------------
    # Stat key for each data-col index (1-based) of the NHL table; this
    # replaces sixteen copy-pasted extraction blocks in the original.
    nhl_columns = ('games_played', 'wins', 'losses', 'OT_losses', 'points',
                   'points_percentage', 'regulation_wins',
                   'regulation_and_OT_wins', 'goals_for', 'goals_against',
                   'goal_differential', 'home', 'away', 'shoot_out',
                   'last10', 'streak')
    tags = []
    try:
        for i in soup.findAll('div', {'class': 'responsive-datatable__pinned'}):
            nhl_table = i.find("tbody")
            for t in nhl_table.findAll("tr"):
                team_name = t.find('span', {'class': 'team--name'}).text
                for col, stat in enumerate(nhl_columns, start=1):
                    cell = t.find('td', attrs={'data-col': str(col)})
                    tags.append({stat: cell.find('span').text,
                                 'tag_key': team_name})
        if tags:  # if teams is not empty, else try a different site
            return tags
    except Exception:
        pass

    # --- Attempt 2: generic NFL-style table -----------------------------
    tags = []
    try:
        head = soup.find('thead')
        labels = [c.text.strip() for c in head.findAll('th')]
        body = soup.find('tbody')
        for i in body.findAll('tr'):
            colIndex = 0
            name = ''
            for j in i.findAll('td'):
                if colIndex == 0:
                    # First column holds the club (or player) name.
                    name_tag = j.find('div', {'class': 'd3-o-club-fullname'})
                    if not name_tag:
                        name_tag = j.find(
                            'a',
                            {'class': 'd3-o-player-fullname nfl-o-cta--link'})
                    name = name_tag.text.strip()
                    colIndex += 1
                    continue
                tags.append({labels[colIndex]: j.text.strip(),
                             'tag_key': name})
                colIndex += 1
        if tags:  # if teams is not empty, else try a different site
            return tags
    except Exception:
        pass
    # The original had div/table dumping code *after* this return — it was
    # unreachable and has been removed.
    return {"message": "Could not scrape data from URL"}
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
import os
import requests
import sys

# First CLI argument selects the platform (--win/--mac/--linux).
plat = sys.argv[1:]

options = Options()
options.headless = True  # run Chrome without a visible window

try:
    # Pick the chromedriver executable by platform keyword.
    # NOTE(review): 'mac' and 'linux' are passed as executable *paths* —
    # presumably driver binaries named that way sit next to the script;
    # verify before relying on this.
    if 'win' in plat[0]:
        browser = webdriver.Chrome('chromedriver.exe', chrome_options=options)
    elif 'mac' in plat[0]:
        browser = webdriver.Chrome('mac', chrome_options=options)
    elif 'linux' in plat[0]:
        browser = webdriver.Chrome('linux', chrome_options=options)
    else:
        print('no OS argument provided. --win/--mac/--linux')
except:
    # NOTE(review): bare except also swallows the IndexError raised when no
    # argument is given (plat[0]); `browser` may be undefined afterwards.
    print('No argument provided or webdriver error')

# Input/output folders for the APK decompilation workflow.
apks_folder = "apks/"
code_folder = 'source_code/'

# Loop body continues beyond this excerpt.
for file in os.listdir(apks_folder):
### This code helps to scrape the headlines from the Economictimes website. ###
from selenium import webdriver
from bs4 import BeautifulSoup
import dynamic2, dynamic3
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd

# initiating the timer
t1 = datetime.now()

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.headless = True
# BUG FIX: the options object was built but never handed to the driver, so
# Chrome actually started with a visible (non-headless) window.
driver = webdriver.Chrome(
    "C:/Users/Mein Pc/Downloads/chromedriver_win32/chromedriver",
    options=chrome_options)

# Archive index page listing links for every month.
base_url = 'https://economictimes.indiatimes.com/archive.cms'
driver.get(base_url)
data = driver.page_source
soup = BeautifulSoup(data, 'html.parser')

# Each month link lives in a <span class="normtxt"> element.
span = soup.find_all('span', class_='normtxt')
all_month_link_dict = {}
def __init__(self):
    """Launch the Chrome browser; headless unless OPEN_BROWSER asks for a window."""
    chrome_opts = Options()
    if OPEN_BROWSER == 0:
        # 0 means "do not show a visible browser window".
        chrome_opts.headless = True
    self.browser = webdriver.Chrome(CHROMEDRIVER, options=chrome_opts)
def searching_information(self):
    """Look up every chemical in ``self.chemicals`` and write results to CSV.

    Reads the output column order from Columns_order.txt, drives a headless
    Chrome browser through the property pages, and appends one row per
    chemical to ``self.file_save``.
    """
    # Desired output column order: one column name per line of the file.
    columns_order = pd.read_csv(self._dir_path + '/Columns_order.txt',
                                header=None)
    columns_order = columns_order[0].tolist()
    options = Options()
    options.headless = True
    options.add_argument('--disable-notifications')
    options.add_argument('--no-sandbox')
    options.add_argument('--verbose')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-software-rasterizer')
    options.add_argument("--log-level=3")
    options.add_argument('--hide-scrollbars')
    self._browser = webdriver.Chrome(ChromeDriverManager().install(), \
        options = options)
    df = pd.DataFrame(columns=columns_order)
    for chemical in self.chemicals:
        try:
            headers, Name, Molecular_Weight = self._searching_headers(
                chemical)
            # Identity fields present on every output row.
            Properties = {
                'Name': Name,
                'Molecular Mass': Molecular_Weight,
                'Consulted Date': self._now,
                'Source': self._url,
                'TRI_CHEM_ID': chemical
            }
            if len(headers) == 0:
                # No property sections found: record identity info only.
                df_aux = pd.DataFrame(
                    {key: [value] for key, value in Properties.items()})
                df = pd.concat([df, df_aux],
                               ignore_index=True,
                               sort=True,
                               axis=0)
                self._browser.back()
            else:
                for header in headers:
                    Results = self._searching_properties(header, chemical)
                    for key, val in Results.items():
                        # val presumably is (value, units) — TODO confirm
                        # against _searching_properties.
                        Properties.update({
                            key: val[0],
                            key + ' - Units': val[1]
                        })
                    # Navigate back after each property page.
                    self._browser.back()
                df_aux = pd.DataFrame({
                    key: [value]
                    for key, value in Properties.items()
                })
                df = pd.concat([df, df_aux],
                               ignore_index=True,
                               sort=True,
                               axis=0)
        except NoSuchElementException:
            # Chemical not found on the site: skip it.
            continue
    df = df[columns_order]
    if self._existing:
        # Append without a header when the output file already exists.
        df.to_csv(self.file_save, index=False, mode='a', sep=',',
                  header=False)
    else:
        df.to_csv(self.file_save, index=False, sep=',')
    self._browser.close()
def openSiap(self):
    """Open the SIAP start page (``self.urlsiap``) in a fresh headless Chrome."""
    headless_opts = Options()
    headless_opts.headless = True
    self.driver = webdriver.Chrome(options=headless_opts)
    self.driver.get(self.urlsiap)
def Flipkart():
    """Scrape product names, prices and image links from Flipkart search results.

    Fetches the results page for the global ``Product_search`` term (falling
    back to a headless Chrome render when the plain request fails) and
    appends into the module-level ``Flipkart_Names`` / ``Flipkart_Prices`` /
    ``Flipkart_ImageLinks`` lists.
    """
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Sec-Fetch-User': '******',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'referer': 'https://www.flipkart.com/',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en;q=0.9,en-US;q=0.8,nl;q=0.7',
    }
    URL = "https://www.flipkart.com/search?q=" + str(
        Product_search
    ) + "&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"
    try:
        recieve = requests.get(URL, headers=headers)
        recieve = recieve.text
    except Exception:
        # Plain request failed/blocked: render the page with headless Chrome.
        options = Options_Chrome()
        options.headless = True
        browser = webdriver.Chrome(options=options)
        browser.get(URL)
        time.sleep(60)
        recieve = browser.page_source
        browser.close()
    soup = BeautifulSoup(recieve, 'lxml')

    def initial_viability_test():
        # Debug helper: dump the prettified page repeatedly.
        test_count = 0
        The_Whole_Page = soup.prettify()
        while test_count <= 100:
            print(The_Whole_Page)
            test_count += 1

    def name_scrape():
        time.sleep(time_variable)
        for outline in soup.findAll("div", {"class": "_1UoZlX"}):
            identify = outline.find("div", {"class": "_3wU53n"})
            Flipkart_Names.append(identify.text)
        # BUG FIX: the original tested ``len(...) is 0`` — identity, not
        # equality — which is fragile and a SyntaxWarning on 3.8+.
        if len(Flipkart_Names) == 0:
            # Grid layout fallback selectors.
            for outline_2 in soup.findAll("div", {"class": "_3liAhj"}):
                identify_2 = outline_2.find("a", {"class": "_2cLu-l"})
                Flipkart_Names.append(identify_2.text)
        # print(Flipkart_Names)

    def price_scrape():
        time.sleep(time_variable)
        for outline in soup.findAll("div", {"class": "_1UoZlX"}):
            identify = outline.find("div", {"class": "_1vC4OE _2rQ-NK"})
            Flipkart_Prices.append(identify.text)
        if len(Flipkart_Prices) == 0:  # was ``is 0`` — see name_scrape
            for outline_2 in soup.findAll("div", {"class": "_3liAhj"}):
                price_div = outline_2.find("div", {"class": "_1vC4OE"})
                Flipkart_Prices.append(price_div.text)
        # print(Flipkart_Prices)

    def image_scrape():
        time.sleep(time_variable)
        for outline in soup.findAll("div", {"class": "_1UoZlX"}):
            identify = outline.find("div", {"class": "_3BTv9X"})
            image = identify.find("img")
            Flipkart_ImageLinks.append(image['src'])
        if len(Flipkart_ImageLinks) == 0:  # was ``is 0`` — see name_scrape
            for outline_2 in soup.findAll("div", {"class": "_3liAhj"}):
                identify_2 = outline_2.find("div", {"class": "_3BTv9X"})
                image_2 = identify_2.find("img")
                Flipkart_ImageLinks.append(image_2['src'])
        # print(Flipkart_ImageLinks)

    name_scrape()
    price_scrape()
    image_scrape()
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import getpass
from fblib.fbfunct import facebook_login

# Chrome configuration: maximized, headless, no infobars/extensions.
option = Options()
option.add_argument("--disable-infobars")
option.add_argument("start-maximized")
option.add_argument("--disable-extensions")
option.headless = True
# Pass the argument 1 to allow and 2 to block
option.add_experimental_option("prefs", {
    "profile.default_content_setting_values.notifications": 1
})

# Prompt for the Facebook password without echoing it to the terminal.
p = getpass.getpass()
driver = webdriver.Chrome(options=option)
facebook_login(driver, '*****@*****.**', p)

# Profile slugs to visit, one page load each.
checking_list = ["evening.tkc", "chou.wang.39", "wang.leox"]
i = 0
while i < len(checking_list):
    driver.implicitly_wait(5)
    print('https://www.facebook.com/'+checking_list[i])
    driver.get('https://www.facebook.com/'+checking_list[i])
    i = i + 1
    # Scroll the profile twice so lazy content loads.
    # NOTE(review): loop nesting inferred from the mangled source — verify.
    scroll = 0
    for scroll in range(2):
        scroll = scroll + 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
def twitterSearch(query, maximum=10):
    """Collect at least ``maximum`` tweet texts for ``query`` from mobile Twitter.

    Returns a list of HTML-escaped tweet strings on success, or -1 on failure.
    """
    options = Options()
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-javascript")
    options.add_argument("--no-sandbox")
    options.add_experimental_option(
        "prefs", {'profile.managed_default_content_settings.javascript': 2})
    print('Options setted...')
    if os.environ.get("ENV") == "development":
        driver = webdriver.Chrome(
            ChromeDriverManager().install(), options=options)
    else:
        # Production (e.g. Heroku): binary and driver paths come from env.
        options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
        driver = webdriver.Chrome(executable_path=os.environ.get(
            "CHROMEDRIVER_PATH"), chrome_options=options)
    print('Driver mounted...')
    url = 'https://mobile.twitter.com/search?q='+query
    finalComments = []
    try:
        driver.get(url)
        time.sleep(5)
        print('Driver opened...')
        print('Twitter acessed...')
        while True:
            # Scroll to the bottom to trigger loading of more tweets.
            javaScript = "window.scrollBy(0, document.body.scrollHeight);"
            driver.execute_script(javaScript)
            element = driver.find_element_by_tag_name("body")
            commentsDiv = element.get_attribute("innerHTML")
            soup = BeautifulSoup(commentsDiv, 'html.parser')
            comments = soup.find_all('div', attrs={
                'class': 'css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0'})
            for comment in comments:
                # BUG FIX: the original compared the bs4 Tag against the list
                # of escaped *strings* (``if i in finalComments``), so the
                # duplicate check never matched and tweets were re-appended
                # on every scroll pass.  Compare the escaped text instead.
                text = html.escape(comment.text)
                if text not in finalComments:
                    finalComments.append(text)
            time.sleep(2)
            print(len(finalComments))
            if len(finalComments) >= maximum:
                break
        print('Twitter crawling done!')
        driver.quit()
        return finalComments
    except Exception as x:
        print(x)
        print("Error on loading comments")
        driver.quit()
        return -1
def setUp(self):
    """Open the semantic-ui login example page in a headless Chrome session."""
    headless_opts = Options()
    headless_opts.headless = True
    self.driver = webdriver.Chrome(options=headless_opts)
    self.driver.get(url="https://semantic-ui.com/examples/login.html")
# BUG FIX: ``platform`` was used below but never imported; the duplicate
# ``from selenium import webdriver`` has also been collapsed to one import.
import platform

from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# On Windows use the bundled chromedriver.exe with default options;
# elsewhere run the local chromedriver headless.
sistema = platform.system()
if sistema == 'Windows':
    driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
else:
    chromeOptions = Options()
    chromeOptions.headless = True
    driver = webdriver.Chrome(executable_path="./chromedriver",
                              options=chromeOptions)

import funciones
import apiTwitch
import creacionBot

# Chat popout URL pieces: linkBase + <streamer> + linkExtra.
linkBase = 'https://www.twitch.tv/popout/'
linkExtra = '/chat?popout='

diccionarioStreamer = apiTwitch.diccionarioStreamer
cantidadCanales = len(diccionarioStreamer)
inactivos = list()
streamersActivos = list()
from selenium import webdriver  # Chrome driver entry point
from selenium.webdriver.chrome.options import Options  # driver configuration
from githubConnexion import githubConnect

# Configure a headless, incognito, full-HD browser session.
options = Options()
options.headless = True  # no browser window will be shown
options.add_argument("--window-size=1920,1080")
options.add_argument("--incognito")  # private session

# Path to the chromedriver executable; replace with your own.
DRIVER_PATH = "your_driver_path"
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

# Log into GitHub with the given credentials, then shut the driver down.
githubConnect(driver, "your_github_username", "your_github_password", True)
driver.quit()
def authenticate(self):
    """
    Authenticates with umd using https://identity.umd.edu/mfaprofile as
    the base request. Authenticating with identity.umd.edu gives us the
    most access to other sites (except notably
    https://app.testudo.umd.edu/main/profile), which is why it was chosen.

    Notes
    -----
    Interestingly, websites under umd control seem to have a hierarchy of
    some kind. My best current approximation of this hierarchy is a
    partially ordered set. In the notation below, (a, b) means that "a
    grants access to b". So, the "higher up" in the hierarchy a website
    is, the more sites it grants access to.

    By "grants access" I mean that if you use CAS to log into website a,
    then you can freely access website b without needing to
    re-authenticate. Note that the opposite is not necessarily true.

    1 = https://identity.umd.edu/mfaprofile
    2 = https://app.testudo.umd.edu/main/profile
    3 = http://umd.instructure.com/
    4 = https://return.umd.edu/covid/returnstatus
    5 = https://dsonline2.umd.edu/dpms/cas.do
    6 = https://www.myuhc.umd.edu/home.aspx

    (1, 3) (1, 4) (1, 5) (1, 6)
    (2, 3) (2, 4) (2, 5) (2, 6)
    (3, 5) (3, 6)
    (4, 5) (4, 6)
    (5, 4) (5, 6)

    Sites 1 and 2 are equally high up in the hierarchy, but neither
    grants access to the other. So we have to pick one to authenticate
    with. I chose 1. In the future we may authenticate with both (or only
    as necessary for 2) for maximum coverage.

    Warnings
    --------
    We're making more than a few requests in this method, so this could
    take multiple seconds to complete (around 5-6 seconds for me).
    """
    generate_codes_after = False
    if len(self.codes) == 0:
        raise ValueError(
            "Need at least one authentication code to log in.")
    if len(self.codes) == 1:
        # we're down to our last code - authenticate and then generate
        # another set.
        print("down to our last code, generating more after this "
              "authentication")
        generate_codes_after = True

    # use up the first code available (starting from the front of the list)
    code = self.codes.pop(0)
    print(f"authenticating with code {code}")

    # A useful reference: "Detailed Trace of a Shibboleth Login".
    # https://docs.shib.ncsu.edu/docs/shiblogindetails.html
    r = requests.get("https://identity.umd.edu/mfaprofile")
    # The JSESSIONID cookie is set on the third hop of the redirect chain.
    jsession_id = r.history[2].cookies["JSESSIONID"]
    cookies = {"JSESSIONID": jsession_id}

    data = {
        "j_username": self.username,
        "j_password": self.password,
        "_eventId_proceed": ""
    }
    r = requests.post(
        "https://shib.idm.umd.edu/shibboleth-idp/profile/cas"
        "/login?execution=e1s1",
        data=data,
        cookies=cookies)

    # sanity check to ensure our request / jsession id was accepted
    assert ("Please complete your multi-factor authentication "
            "using Duo.") in r.text

    umd_shib_url = r.url

    # There's an iframe on this page (the duo mobile 2fa element) which
    # makes some requests for us. We need to get the source code of that
    # iframe in order to replicate the requests by hand. Duo has a js
    # library that sets the iframe source based on some parameters in the
    # source code of this page, so we replicate that js code here to create
    # the iframe url and retrieve its source.
    #
    # The duo js code is minified on the umd page, but an unminified version
    # (that seems to be accurate as far as I can tell) can be found here:
    # http://shibboleth.net/pipermail/commits/2017-September/031081.html.
    soup = BeautifulSoup(r.text, features="lxml")
    duo_iframe = soup.find(id="duo_iframe")
    duo_host = duo_iframe.get("data-host")
    duo_sig_request = duo_iframe.get("data-sig-request")
    duo_sig = duo_sig_request.split(":")[0]
    app_sig = duo_sig_request.split(":")[1]

    # Apparently javascript's encodeURIComponent function (which we are
    # replicating here) replaces "/" as well, so we pass `safe=""`` to
    # emulate this.
    current_url_encoded = urllib.parse.quote(umd_shib_url, safe="")
    duo_iframe_source_url = (
        f"https://{duo_host}/frame/web/v1/auth?tx="
        f"{duo_sig}&parent={current_url_encoded}&v=2.6")

    # Drive a headless browser through the duo iframe page so its js runs.
    options = Options()
    options.headless = True
    driver = webdriver.Chrome(Path(__file__).parent / "chromedriver",
                              options=options)
    driver.get(duo_iframe_source_url)
    # TODO this errors with "list index out of range" randomly - race
    # condition somewhere? I just retry whenever I hit that error currently.
    sid = driver.current_url.split("sid=")[1]
    sid = urllib.parse.unquote(sid)

    # Submit our passcode to duo's prompt endpoint, then poll status twice
    # to retrieve the signed auth cookie.
    data = {
        "sid": sid,
        "device": "phone1",
        "factor": "Passcode",
        "passcode": f"{code}",
        "out_of_date": "False",
        "days_out_of_date": "0",
        "days_to_block": "None"
    }
    r = requests.post(f"https://{duo_host}/frame/prompt", data=data)
    txid = json.loads(r.content)["response"]["txid"]

    data = {"sid": sid, "txid": txid}
    r = requests.post(f"https://{duo_host}/frame/status", data=data)

    data = {"sid": sid}
    r = requests.post(f"https://{duo_host}/frame/status/{txid}", data=data)
    auth_sig = json.loads(r.content)["response"]["cookie"]

    sig_response = f"{auth_sig}:{app_sig}"
    data = {"_eventId": "proceed", "sig_response": sig_response}

    session = requests.Session()
    add_dict_to_cookiejar(session.cookies, cookies)
    r = session.post(umd_shib_url, data=data, cookies=cookies)

    # ``len(history)`` used to be 2, but umd recently introduced a screen
    # which would show if you haven't completed the daily symptom survey.
    # The ``if``` branch deals with this scenario. the ``else`` branch deals
    # with the 'normal' scenario of having your daily symptom survey
    # completed (which the login process will presumably go back to by
    # default when the survey is no longer a thing).
    assert len(r.history) != 0
    print(f"len(r.history): {len(r.history)}")
    if len(r.history) == 1:
        shib_idp_session = (r.history[0].headers["set-cookie"].split(
            "shib_idp_session=")[1].split(";")[0])
        umd_shib_url_ = umd_shib_url[:-1] + "3&_eventId_proceed=1"
        r = session.get(umd_shib_url_)
        cookie = r.history[1].headers["set-cookie"]
        cookie = cookie.split("JSESSIONID=")[1]
        cookie = cookie.split(";")[0]
        self.identity_jsession_id = cookie
    else:
        shib_idp_session = (r.history[0].headers["set-cookie"].split(
            "shib_idp_session=")[1].split(";")[0])
        # we're actually issued a *new* JSESSIONID just for the identity.umd.edu
        # site. If we attempt to make requests with our first (and still valid)
        # JSESSSIONID, they will be rejected, so store this new JSESSIONID for
        # later use (if we want to make requests to identity.umd.edu later).
        #
        # As far as I can tell, this doesn't occur for other websites. They will
        # still accept the original JSESSIONID and don't issue a new one to us.
        identity_jsession_id = (r.history[1].headers["set-cookie"].split(
            "JSESSIONID=")[1].split(";")[0])
        self.identity_jsession_id = identity_jsession_id

    # With these two cookies, we are basically a god. We can make a request
    # to any umd website with full authentication permissions.
    cookies = {
        "JSESSIONID": jsession_id,
        "shib_idp_session": shib_idp_session
    }
    self.auth_cookies = cookies
    print("Authenticated. Creds: ", self.auth_cookies,
          self.identity_jsession_id)

    # we popped a code off our codes list at the beginning of this method,
    # so we need to remove it from our codes file as well.
    self._write_codes()
    if generate_codes_after:
        self.generate_new_codes()
def set_up_driver(chromedriver_path):
    """Create and return a Chrome driver (visible window) from the given path."""
    opts = Options()
    opts.headless = False  # keep the browser window visible
    return webdriver.Chrome(chromedriver_path, options=opts)
# -*- coding: utf-8 -*- # Harvest theses from Western Australia # JH: 2022-03-19 from time import sleep from selenium import webdriver from datetime import datetime from selenium.webdriver.chrome.options import Options from bs4 import BeautifulSoup import os import codecs import ejlmod2 import re driver_options = Options() driver_options.headless = True driver = webdriver.Chrome(options=driver_options) xmldir = '/afs/desy.de/user/l/library/inspire/ejl' retfiles_path = "/afs/desy.de/user/l/library/proc/retinspire/retfiles" #+'_special' pages = 1 now = datetime.now() stamp_of_today = '%4d-%02d-%02d' % (now.year, now.month, now.day) publisher = 'Western Australia U.' jnlfilename = 'THESES-WESTERN-AUSTRALIA-%s' % stamp_of_today recs = []
    'Welcome to GTPylenium!\nWould you like this test to be headless (Y/n)? ')
# NOTE(review): the line above closes an input(...) call whose opening lies
# outside this excerpt; H presumably holds its answer — verify upstream.
uname = os.system('uname')
# NOTE(review): os.system returns the command's exit status (an int), so the
# comparison with the string 'Darwin' below is always False and win_envs is
# always chosen — platform.system() was probably intended; confirm.
envs = mac_envs if uname == 'Darwin' else win_envs
for path in envs:
    #Set path to driver
    engine_path = path
    print("PATH set: ", path)
    # Import the Options class matching the chosen browser.
    if browser[i] == 'chrome':
        from selenium.webdriver.chrome.options import Options
    else:
        from selenium.webdriver.firefox.options import Options
    #Enable headless option (i.e. no browser window will open) - to diable this feature, change options.headless = True to options.headless = False
    options = Options()
    if H == 'Y':
        options.headless = True
    else:
        options.headless = False
    print('Driver Status: Building Webdriver')
    if browser[i] == 'chrome':
        driver = webdriver.Chrome(path, options=options)
    else:
        driver = webdriver.Firefox(options=options, executable_path=path)
        path = os.path.join(path, 'firefox')
    #Visit website
    driver.get(website)
    print('Driver Status: Visiting website -', website)
    #Find and click the Schedule of Classes link
import time
import random
import os

''' SET YOUR DETAILS HERE '''
NRIC = 'S1234567Z'  # TODO KEY YOUR IC HERE
EMAIL = '*****@*****.**'  # TODO KEY YOUR EMAIL HERE
WORKDAY = [0, 1, 3, 4]  # TODO Key your workdays, Mon = 0, Tues = 1 and so on...
# Weekday index of Sunday (Mon = 0 convention).
sun = 6

''' Start Selenium settings '''
# NOTE(review): Options and webdriver are not imported in this excerpt;
# presumably they are imported further up the file — verify.
options = Options()
options.headless = True  # TODO you can set this to False, to see the outcome before submission
cwd = os.getcwd()
driverpath = os.path.join(cwd, "chromedriver.exe")
driver = webdriver.Chrome(driverpath, options=options, port=8080)
driver.get("https://form.gov.sg/#!/5e37870c73a1e90011942e50")


# create a function to key value in the element by id
def key_value_into_element_by_id(id, value):
    elem = driver.find_element_by_id(id)
    elem.send_keys(value)

#scroll
def igbombing():
    """Interactively log into Instagram and flood a victim's DMs.

    Prompts for credentials, the victim's username and a mode (repeat one
    text N times, or send a lyrics file word by word), then drives a
    headless Chrome session through the Instagram web UI.
    """
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.options import Options
    import time
    import os
    import sys
    ig_username = input(' |-$ Your Username > ')
    ig_password = input(' |-$ Your Password > ')
    ig_victim = input(" |-$ Victim's Username > ")
    mode = input('''  |
  |-PRESS------------------|
  | 1] Repetitive Mode     |
  | 2] Script/Lyrical Mode |
  | Facing Problem ?       |
  | Check out README.MD    |
  |------------------------|
  |-> ''')
    if mode.lower() == '1' or mode.lower() == 'repetitive mode':
        reptxt = input(
            ' |-$ Word/Sentence that you want to send Multiple Times > ')
        repcount = int(input(' |-$ How many times ? > '))
    elif mode.lower() == '2' or mode.lower() == 'script/lyrical mode':
        # NOTE(review): the file handle is never closed — consider a
        # ``with`` statement.
        lyrics = open("lyrics.txt", "r+")
        splitedlyrics = (lyrics.read().split())
    else:
        print(' |-} invalid input !')
        return
    print(' |-} Logging in...')
    options = Options()
    options.headless = True
    options.add_argument("--log-level=3")
    browser = webdriver.Chrome("chromedriver.exe", chrome_options=options)
    os.system('cls')
    # Banner + replay of the answers already given.  NOTE(review): exact
    # whitespace of this ASCII art was lost in transit — confirm layout
    # against the upstream repository.
    print(f'''
                All Bombs away Sir
      \\          Goodbye Dullsville!
     __|__
    /     .'(\\   .-.   /)'.
+-====(*)===:  "  :===(o)=====-+
      \\).  '-'  .(/

 +=
 +=
 +=   █▄▄ █▀█ █▀▄▀█ █▄▄ █▀▀ █▀█ ▀█▀ █ █ █▀█ █▄ █
 +=   █▄█ █▄█ █ ▀ █ █▄█ ██▄ █▀▄  █  █▀█ █▄█ █ ▀█
 +=
 +=  ###########################################
 +=  #  Version      >>> v1.0 (Beta)           #
 +=  #  Last Update  >>> 11th July 2020        #
 +=  #  Coded by b3!ngD3v (Pramurta Sinha)     #
 +=  #  Named by Ritik Gupta                   #
 +=  #  GitHub >>> https://github.com/b31ngd3v #
 +=  ###########################################

  |-PRESS---------------|
  | 1] Call Bomber      |
  | 2] SMS Bomber       |
  | 3] Instagram Bomber |
  | 4] WhatsApp Bomber  |
  | 5] About            |
  | 6] Exit Script      |
  |---------------------|
  |-> 3
 |-$ Your Username > {ig_username}
 |-$ Your Password > {ig_password}
 |-$ Victim's Username > {ig_victim}
  |
  |-PRESS------------------|
  | 1] Repetitive Mode     |
  | 2] Script/Lyrical Mode |
  | Facing Problem ?       |
  | Check out README.MD    |
  |------------------------|
  |-> {mode}''')
    if mode.lower() == '1' or mode.lower() == 'repetitive mode':
        print(
            f''' |-$ Word/Sentence that you want to send Multiple Times > {reptxt}
 |-$ How many times ? > {repcount}''')
    print(' |-} Logging in...')
    browser.get('https://www.instagram.com/accounts/login')
    time.sleep(2)
    username_bar = browser.find_element_by_name('username')
    username_bar.send_keys(ig_username)
    password_bar = browser.find_element_by_name('password')
    password_bar.send_keys(ig_password + Keys.ENTER)
    time.sleep(7)
    # Successful login redirects to the Instagram home page.
    if browser.current_url == 'https://www.instagram.com/':
        pass
    else:
        try:
            confirm = WebDriverWait(browser, 20).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, "coreSpriteKeyhole")))
        except:
            # NOTE(review): bare except — should be narrowed to
            # TimeoutException.
            print(' |-} Log in Failed !')
            return
        finally:
            pass
    print(' |-} Logged in Successfully !')
    browser.get('https://www.instagram.com/direct/new/')
    '''try:
        confirm = WebDriverWait(browser, 20).until(EC.presence_of_element_located(
            (By.CLASS_NAME, "mt3GC")))
    finally:
        pass
    browser.find_element_by_class_name('mt3GC').click()'''
    # Wait for the recipient search box, type the victim and select them.
    try:
        confirm = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.NAME, "queryBox")))
    finally:
        pass
    browser.find_element_by_name('queryBox').send_keys(ig_victim)
    try:
        confirm = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, "dCJp8")))
    finally:
        pass
    browser.find_element_by_class_name('dCJp8').click()
    time.sleep(1)
    browser.find_element_by_class_name('rIacr').click()
    try:
        confirm = WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.XPATH, "//textarea")))
    finally:
        pass
    # Verify the opened thread really belongs to the requested victim.
    if ig_victim == browser.find_element_by_class_name(
            '_7UhW9.vy6Bb.qyrsm.KV-D4.fDxYl').text:
        pass
    else:
        print(' |-} no such user named' + ig_victim)
        return
    # Send the messages: either the same text repcount times, or the lyrics
    # file one word per message.
    if mode.lower() == '1' or mode.lower() == 'repetitive mode':
        for i in range(repcount):
            browser.find_element_by_xpath("//textarea").send_keys(reptxt +
                                                                  Keys.ENTER)
    elif mode.lower() == '2' or mode.lower() == 'script/lyrical mode':
        for words in splitedlyrics:
            browser.find_element_by_xpath("//textarea").send_keys(words +
                                                                  Keys.ENTER)
    print('''  |-} Done !
  |-----------------------------------------------------------''')
    browser.quit()
def searching_information(self):
    """Scrape property data for each chemical in ``self.chemicals`` from the
    EPA CompTox dashboard and stream the results to ``self.file_save``.

    Relies on instance state set up elsewhere: ``self._dir_path``,
    ``self._url``, ``self._now``, ``self._existing``, ``self._queries``,
    ``self.file_save`` and the helper methods ``_visit``, ``_dynamic_wait``,
    ``_searching_details``, ``_searching_properties`` and
    ``_opening_dsstox_identifiers_and_casrn``.
    """
    # Output CSV column layout comes from a plain text file, one name per row.
    columns_order = pd.read_csv(self._dir_path + '/Columns_order.txt', header=None)
    columns_order = columns_order[0].tolist()
    # Headless Chrome tuned for unattended scraping (no GPU, no notifications,
    # minimal logging).
    options = Options()
    options.headless = True
    options.add_argument('--disable-notifications')
    options.add_argument('--no-sandbox')
    options.add_argument('--verbose')
    options.add_argument('--disable-gpu')
    options.add_argument('--disable-software-rasterizer')
    options.add_argument("--log-level=3")
    options.add_argument('--hide-scrollbars')
    self._browser = webdriver.Chrome(ChromeDriverManager().install(),
                                     options=options)
    # Wrap the CAS list in a DataFrame; the helper below presumably adds the
    # 'DSSTOX ID' / 'PREFERRED NAME' columns read later — TODO confirm in its
    # definition.
    self.chemicals = pd.DataFrame({'CAS NUMBER': self.chemicals})
    self._opening_dsstox_identifiers_and_casrn()
    df = pd.DataFrame(columns=columns_order)
    n_searches = 0
    # Normalize NaN to None so `if not dsstox_substance_id` works below.
    self.chemicals = self.chemicals.where(pd.notnull(self.chemicals), None)
    n_rows = self.chemicals.shape[0]
    for idx, row in self.chemicals.iterrows():
        dsstox_substance_id = row['DSSTOX ID']
        cas = row['CAS NUMBER']
        preferred_name = row['PREFERRED NAME']
        n_searches = n_searches + 1
        try:
            if not dsstox_substance_id:
                # Unknown substance: record only the CAS number and date.
                df_aux = pd.DataFrame({
                    'CAS NUMBER': [cas],
                    'Consulted Date': [self._now]
                })
            else:
                Properties = {
                    'CAS NUMBER': [cas],
                    'Data Source': [
                        '{}/dsstoxdb/results?search={}'.format(
                            self._url, dsstox_substance_id)
                    ],
                    'Consulted Date': [self._now],
                    'PREFERRED NAME': [preferred_name],
                    'DSSTOX ID': [dsstox_substance_id]
                }
                # Visit each dashboard tab and merge whatever it reports.
                list_tabs = ['properties', 'env-fate-transport', 'details']
                for tab in list_tabs:
                    self._visit(dsstox_substance_id, tab)
                    self._dynamic_wait(self._queries['dialog_window'],
                                       action='dialog')
                    if tab == 'details':
                        Properties.update(self._searching_details())
                    else:
                        Properties.update(self._searching_properties())
                    self._browser.back()
                    time.sleep(2)  # let the previous page settle
                df_aux = pd.DataFrame(Properties)
            df = pd.concat([df, df_aux], ignore_index=True, sort=True, axis=0)
            # Flush to disk every 20 rows (and at the very end) so a crash
            # mid-run does not lose everything accumulated so far.
            if (n_searches % 20 == 0) or (n_searches == n_rows):
                df = df[columns_order]
                if self._existing:
                    # Append without re-writing the header.
                    df.to_csv(self.file_save, index=False, mode='a',
                              sep=',', header=False)
                else:
                    df.to_csv(self.file_save, index=False, sep=',')
                    self._existing = True
                df = pd.DataFrame(columns=columns_order)
        except TimeoutException:
            # Skip substances whose pages never load; move to the next row.
            continue
    # NOTE(review): close() is not in a finally block — the browser leaks if
    # an unexpected exception escapes the loop; consider try/finally.
    self._browser.close()
def get_driver(headles=True):
    """Create a Chrome WebDriver using the chromedriver bundled at ROOT_DIR.

    Note: parameter name ("headles") is kept as-is for caller compatibility.
    """
    chrome_opts = Options()
    chrome_opts.headless = headles
    return webdriver.Chrome(path.join(ROOT_DIR, "chromedriver"),
                            options=chrome_opts)
def make_driver(driver_path: Path):
    """Build a headless Chrome WebDriver from the given chromedriver path."""
    chrome_options = Options()
    chrome_options.headless = True
    # chrome_options.add_argument("--window-size=1920,1200")
    return webdriver.Chrome(executable_path=driver_path,
                            options=chrome_options)
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
import json
import os
import pandas as pd

cwd = os.getcwd()

# Allows easy option to display / hide the browser window.
# (Name kept as-is — "brower" is a typo — in case later, unseen code reads it.)
display_brower_window = False

if not display_brower_window:  # idiomatic truthiness test (was `== False`)
    opts = Options()
    opts.headless = True
    # Removed the redundant `assert opts.headless`: it re-checked the value
    # assigned on the previous line and could never fail.
    browser = Chrome(f'{cwd}/chromedriver', options=opts)
else:
    browser = Chrome(f'{cwd}/chromedriver')

URL = 'https://www.pearsonham.com/'

# Navigates to the URL
browser.get(URL)
time.sleep(0.5)

# Finds and clicks hamburger menu to open menu
browser.find_element_by_id('hamburger').click()
time.sleep(0.1)

# Finds and clicks the team link via the xpath
browser.find_element_by_xpath('//*[@id="menu-item-52"]/a').click()

# Instantiate empty employee dictionary
employee_dict = {}
def get_currencies(currencies, start, end, export_csv=False):
    """Scrape the USD-to-currency historical-data table from investing.com.

    :param currencies: iterable of currency codes (e.g. ``["EUR", "GBP"]``)
    :param start: start date string typed into the site's date picker
    :param end: end date string typed into the site's date picker
    :param export_csv: when True, each scraped table is also written to
        ``currency.csv`` (overwritten per currency, as in the original)
    :return: list of DataFrames, one per successfully scraped currency
    """
    frames = []  # store data for each currency

    # Get the historic data between USD and other currencies
    for currency in currencies:
        while True:
            driver = None
            try:
                # Open the URL and maximize the window
                my_url = f"https://investing.com/currencies/usd-{currency.lower()}-historical-data"
                option = Options()
                option.add_experimental_option("excludeSwitches",
                                               ["enable-logging"])
                option.headless = False  # Make the actions visible
                driver = webdriver.Chrome(options=option)
                driver.get(my_url)
                print("Got the URL.")
                driver.maximize_window()
                print("Maximized window.")
                sleep(5)

                # Accept the cookies, otherwise the prompt does not go away...
                cookies_button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.ID, "onetrust-accept-btn-handler")))
                cookies_button.click()
                print("Accepted the cookies.")
                sleep(5)

                # Click on the date button to change the range
                date_button = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.ID, "flatDatePickerCanvasHol")))
                date_button.click()
                print("Clicked the date button.")
                sleep(5)

                # Select Start and End Date, clear their contents and input
                # our own
                start_bar = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "/html/body/div[7]/div[1]/input[1]")))
                start_bar.clear()
                start_bar.send_keys(start)
                print("Entered the start date.")
                sleep(5)

                end_bar = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "/html/body/div[7]/div[1]/input[2]")))
                end_bar.clear()
                end_bar.send_keys(end)
                print("Entered the end date.")
                sleep(5)

                # Click the apply button and wait a bit
                apply_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.XPATH, "/html/body/div[7]/div[5]/a")))
                apply_button.click()
                print("Clicked 'Apply'.")
                sleep(5)

                # Get the source code of the page that appeared with pandas
                # (default read_html flavor, lxml).
                dataframes = pd.read_html(driver.page_source)

                # Keep only the table containing the historical data.
                df = None
                for dataframe in dataframes:
                    if dataframe.columns.tolist() == [
                            "Date", "Price", "Open", "High", "Low", "Change %"]:
                        df = dataframe
                        break
                if df is None:
                    # No matching table: treat as a failed attempt and retry.
                    raise ValueError("historical data table not found")

                # BUG FIX: the original appended the table twice — once inside
                # the search loop and once after it — duplicating every result.
                frames.append(df)

                # Export to csv if asked by function argument
                if export_csv:
                    df.to_csv("currency.csv", index=False)
                    print(f"{currency}.csv exported")

                driver.quit()
                print(f"{currency} scraped.")
                break
            except Exception:
                # Narrowed from a bare `except:` (which also swallowed
                # KeyboardInterrupt/SystemExit) and guarded against `driver`
                # never having been created before the failure.
                if driver is not None:
                    driver.quit()
                print(
                    f"Failed to scrape {currency}. Trying again in 10 seconds."
                )
                sleep(10)
                continue

    return frames
def _local_browser_class(browser_name): """ Returns class, kwargs, and args needed to instantiate the local browser. """ # Log name of local browser LOGGER.info(u"Using local browser: %s [Default is firefox]", browser_name) # Get class of local browser based on name browser_class = BROWSERS.get(browser_name) headless = os.environ.get('BOKCHOY_HEADLESS', 'false').lower() == 'true' if browser_class is None: raise BrowserConfigError( u"Invalid browser name {name}. Options are: {options}".format( name=browser_name, options=", ".join(list(BROWSERS.keys())))) else: if browser_name == 'firefox': # Remove geckodriver log data from previous test cases log_path = os.path.join(os.getcwd(), 'geckodriver.log') if os.path.exists(log_path): os.remove(log_path) firefox_options = FirefoxOptions() firefox_options.log.level = 'trace' if headless: firefox_options.headless = True browser_args = [] browser_kwargs = { 'firefox_profile': _firefox_profile(), 'options': firefox_options, } firefox_path = os.environ.get('SELENIUM_FIREFOX_PATH') firefox_log = os.environ.get('SELENIUM_FIREFOX_LOG') if firefox_path and firefox_log: browser_kwargs.update({ 'firefox_binary': FirefoxBinary( firefox_path=firefox_path, log_file=firefox_log) }) elif firefox_path: browser_kwargs.update({ 'firefox_binary': FirefoxBinary(firefox_path=firefox_path) }) elif firefox_log: browser_kwargs.update({ 'firefox_binary': FirefoxBinary(log_file=firefox_log) }) elif browser_name == 'chrome': chrome_options = ChromeOptions() if headless: chrome_options.headless = True # Emulate webcam and microphone for testing purposes chrome_options.add_argument('--use-fake-device-for-media-stream') # Bypasses the security prompt displayed by the browser when it attempts to # access a media device (e.g., a webcam) chrome_options.add_argument('--use-fake-ui-for-media-stream') browser_args = [] browser_kwargs = { 'options': chrome_options, } else: browser_args, browser_kwargs = [], {} return browser_class, browser_args, browser_kwargs
def link_assembler(file_path, name):
    """Append a 'Grade' marker row to *file_path*, then page through the AWS
    partner directory collecting partner profile links.

    :param file_path: CSV file the grade row is appended to
    :param name: value written in the 'Grade' column
    """
    fieldnames = ['Grade']
    # BUG FIX: the csv module on Python 3 requires the file in TEXT mode with
    # newline=''; the original opened it in binary ("ab"), which makes
    # DictWriter.writerow() raise TypeError.
    with open(file_path, "a", newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # writer.writeheader()
        writer.writerow({
            "Grade": name,
        })
    options = Options()
    options.headless = False
    # PROXY = "36.67.23.117:8888"  # IP:PORT or HOST:PORT
    # options.add_argument('--proxy-server=%s' % PROXY)
    driver = selenium.webdriver.Chrome(chrome_options=options)
    driver.set_page_load_timeout(10000)
    page_size = 100
    offset = 7900
    for _ in range(80):
        offset += page_size
        driver.get(
            "https://aws.amazon.com/partners/find/results/?size=100&start={}&sort=Relevance&view=Grid".format(offset))
        sleep(20)  # give the results grid time to render
        results_wrapper = driver.find_element_by_xpath(
            "//*[@id='psf-search-results-da-wrapper']/div[2]/div[3]/div[1]")
        # Renamed the inner loop variable: the original reused `i`, shadowing
        # the outer page counter.
        for partner in results_wrapper.find_elements_by_class_name(
                "psf-partner-name"):
            anchor = partner.find_element_by_tag_name("a")
            link = anchor.get_attribute("href")
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import requests

# false to watch it work
HEADLESS = True

options = Options()
options.headless = HEADLESS
options.add_argument("--window-size=1920,1200")
driver = webdriver.Chrome(options=options)

# Search Project Gutenberg for "Heart of Darkness" through the site's own
# search box (typing + Enter, rather than building a query URL).
driver.get("https://www.gutenberg.org/")
search_box = driver.find_element_by_class_name("searchInput")
search_box.send_keys("Heart of Darkness")
search_box.send_keys(Keys.RETURN)

# Each result card ("booklink") contains a "content" node holding the title,
# the author ("subtitle") and a download count ("extra").
book_results = driver.find_elements_by_class_name("booklink")
print(f"Found {len(book_results)} results for 'Heart of Darkness':")
result_array = []
for result in book_results:
    book_element = result.find_element_by_class_name("content")
    book_title = book_element.find_element_by_class_name("title").text
    book_author = book_element.find_element_by_class_name("subtitle").text
    book_downloads = book_element.find_element_by_class_name("extra").text
    print(f"{book_title}, {book_author}, downloads: {book_downloads}")
    # NOTE(review): the snippet is truncated here in this file — the dict
    # literal below is unterminated in the original source.
    result_array.append({
        "link element": result,
        "title": book_title,
def init_webdriver(proxy_dict=None, account_credentials=None):
    """
    Configure and initialize the Chrome webdriver, optionally routing it
    through a proxy and performing a form-based login.

    :param proxy_dict: proxy dictionary with 'httpProxy'/'ftpProxy'/'sslProxy'
        keys (used only when config.UseProxies is set)
    :param account_credentials: "username:password" string (used only when
        config.RequireLogin is set)
    :return: a ready-to-use selenium webdriver instance
    """
    # Webdriver's load strategy: "none" skips JS/script execution and is much
    # faster, but apkpure.com (and any JS-dependent site) needs "normal",
    # which is also the default when nothing is set.
    # More: https://stackoverflow.com/a/44771628/6558550
    # webdriver.DesiredCapabilities.CHROME['pageLoadStrategy'] = "none"
    # webdriver.DesiredCapabilities.CHROME['pageLoadStrategy'] = "normal"
    if config.UseProxies:
        # Register the proxy globally on the Chrome capabilities.
        webdriver.DesiredCapabilities.CHROME['proxy'] = {
            "httpProxy": proxy_dict.get('httpProxy', ''),
            "ftpProxy": proxy_dict.get('ftpProxy', ''),
            "sslProxy": proxy_dict.get('sslProxy', ''),
            "noProxy": None,
            "proxyType": "MANUAL",
            "class": "org.openqa.selenium.Proxy",
            "autodetect": False
        }
    # declare an options object
    chrome_options = Options()
    # setting the browser visibility mode
    chrome_options.headless = not config.ShowBrowserWindows
    # Webdriver preferences (experimental but known-working):
    #  - download.default_directory: where downloads land; duplicate names
    #    get a " (1)"-style counter appended by the browser.
    #  - safebrowsing.enabled: disables Chrome's "file may be harmful" popup,
    #    which otherwise blocks the download from starting.
    #  - profile.managed_default_content_settings.images = 2: skip image
    #    loading for a large speedup.
    #  - disk-cache-size: persist browser cache on disk between runs.
    #  - automatic_downloads / prompt_for_download (commented out): would
    #    allow downloads without a save dialog; not needed for apkpure.com
    #    so far.
    preferences = {
        'download.default_directory':
            config.SaveDirectory if config.SaveDirectory else config.DefaultSaveDirectory,
        'safebrowsing.enabled': True,
        'profile.managed_default_content_settings.images': 2,
        'disk-cache-size': 4096
        # For now, I did not use this
        # 'profile.default_content_setting_values.automatic_downloads': 1,
        # 'download.prompt_for_download': False
    }
    # Nearly the same effect as safebrowsing.enabled above.
    chrome_options.add_argument('--safebrowsing-disable-download-protection')
    # and then we register the preference to the options
    chrome_options.add_experimental_option('prefs', preferences)
    # create the webdriver according to its path and options, then the browser will appears
    driver = webdriver.Chrome(config.WebDriverPath, chrome_options=chrome_options)
    # for firefox (gecko driver) you will run this
    # driver = webdriver.Firefox(config.WebDriverPath)
    if config.TestMode:
        # for testing, it will download a notepad++ installer and then exit
        driver.get(
            'https://notepad-plus-plus.org/repository/7.x/7.6.5/npp.7.6.5.bin.minimalist.7z'
        )
        input()
        exit()
    # Clearing cookies is unnecessary: each freshly instantiated driver
    # starts with a clean profile. Uncomment if that ever changes.
    # driver.delete_all_cookies()
    # If login is required, perform the form-based login flow below.
    if config.RequireLogin:
        # get the credentials
        if ':' not in account_credentials:
            print("Credentials is not valid")
            exit()
        username, password = account_credentials.split(':')
        # get loading page
        print('Loading Login Page ...')
        driver.get(config.LoginPage)
        # Form filling: locate the username and password inputs by XPath and
        # type the credentials into them.
        try:
            driver.find_element_by_xpath(
                config.XPathFormUserOrEmail).send_keys(username)
            driver.find_element_by_xpath(
                config.XPathFormPassword).send_keys(password)
        except NoSuchElementException as e:
            print(
                'either username or password input form is not found by xpath. Please check the XPath '
                'for the username and password form in configuration.')
            if config.DebugMode:
                print(e)
            exit()
        # ReCaptcha: if a captcha widget is present, pause so it can be
        # solved.
        # NOTE(review): find_element_by_xpath raises NoSuchElementException
        # when the element is absent — it never returns a falsy value — so
        # the `if recaptcha_element:` guard below looks suspect; presumably
        # find_elements_by_xpath (plural) was intended. TODO confirm.
        recaptcha_element = driver.find_element_by_xpath(config.XPathRecaptcha)
        if recaptcha_element:
            # Per config.ReCaptchaOption:
            #  1. wait a fixed time for the user to solve the captcha
            #  2. block on CLI input (waits forever) until the user confirms
            #  3. placeholder for an anti-captcha API service (not available;
            #     its API documentation is still needed)
            if config.ReCaptchaOption == 1:
                # option 1
                time.sleep(
                    60
                )  # wait 1 minute / any given time for user to solve captcha
            elif config.ReCaptchaOption == 2:
                # option 2
                input(
                    "Captcha found. Please solve it on the browser then press any key here to continue.."
                )
            elif config.ReCaptchaOption == 3:
                # todo: or anti-captcha.com services will be called here
                # requirements: what anti-captcha services are? and We should need the documentation for the API
                pass
        print('Clicking on Login Button ...')
        try:
            driver.find_element_by_xpath(config.XPathLoginButton).click()
        except NoSuchElementException as e:
            print(
                'Login button not found by Xpath. Please check the XPath for the login button in confugiration.',
                e)
            exit()
        # Verify the login by waiting for an element that only appears after
        # a successful login. HTML-form logins give no status code to check,
        # so presence of config.LoggedInXPath is the best available signal.
        is_successfully_login = False
        print('Waiting until login is succeed ...')
        try:
            login_timeout = config.LoginTimeout
            WebDriverWait(driver, login_timeout).until(
                EC.presence_of_element_located(
                    (By.XPATH, config.LoggedInXPath)))
            is_successfully_login = True
        except TimeoutException:
            print(
                "Login takes too much time. Code can not tell if browser is logged in or not."
                "This is not an error statement but a warning, as in some case this may lead to an error."
                "This is because the code did not know if browser is successfully logged or not"
            )
        if not is_successfully_login:
            # todo: need variety of error case to be coded more specifically
            print('Login/get sessions seems to be failed')
            exit()
        return driver
    else:
        return driver
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
# from selenium.common.exceptions import StaleElementReferenceException
import time

# C:\Users\spi59\Documents\Drivers
opts = Options()
opts.headless = False
# Removed `assert opts.set_headless`: it asserted the *bound method* object,
# which is always truthy, so the check could never fail and verified nothing.
browser = Chrome(
    executable_path='C:/Users/spi59/Documents/Drivers/chromedriver.exe',
    options=opts)

# Search ecosia.org for "ads" and click the first ad result.
browser.get('https://www.ecosia.org')
input_field = browser.find_element_by_tag_name('input')
input_field.send_keys('ads')
input_field.submit()
time.sleep(3)
ads = browser.find_elements_by_class_name('result-title-ad')
ads[0].click()
time.sleep(3)
browser.quit()
# Recognized phrases (Portuguese greetings/questions) the assistant matches
# spoken input against.
l = [
    'Oi', 'Ola', 'Oi Siri', 'Ola Siri', 'Tudo e com você', 'Tudo e com voce',
    'Estou bem obrigado', 'Siri', 'Qual seu nome', 'Quantos anos voce tem',
    'Oque e voce', 'Tudo', 'Olá', 'Qual o seu nome'
]
dialogo = ''
# Path to the bundled Windows chromedriver binary.
path = './WebDriver/chromedriver.exe'
option = Options()
# Pre-grant microphone access so Chrome never shows a permission prompt.
option.add_experimental_option("prefs", {
    "profile.default_content_setting_values.media_stream_mic": 1
})
option.headless = False
driver = webdriver.Chrome(path, 0, option)
# Keep the browser window small and tucked into the top-left corner.
driver.set_window_position(0, 0)
driver.set_window_size(200, 200)


def falar(dialogo, ):
    # Speak `dialogo` out loud via the Windows SAPI5 text-to-speech engine.
    speak = pyttsx3.init('sapi5')
    speak.say(dialogo)
    speak.runAndWait()


def respostanula(res, ):
    # NOTE(review): these `!=` comparisons look inverted — as written, the
    # fallback is spoken whenever `res` is NOT the greeting; presumably `==`
    # was intended. TODO confirm against the full script (truncated here).
    if res != l[0]:
        falar('Não entendi')
    elif res != l[1]:
def test_on_linux(self):
    """Run the shared driver test suite against headless Chrome on Linux.

    Expects the CHROMEWEBDRIVER environment variable to point at the
    directory containing the chromedriver binary.
    """
    chromedriver = os.path.join(os.getenv("CHROMEWEBDRIVER"), "chromedriver")
    chrome_opts = ChromeOptions()
    chrome_opts.headless = True
    run_for_driver(
        webdriver.Chrome(executable_path=chromedriver, options=chrome_opts),
        self)