# for tor browser use: -x 185.130.105.66:11084
args = parser.parse_args()
login = args.login
password = args.password

options = webdriver.ChromeOptions()
# socks5://user:pass@host:port
proxies = None
if args.proxy is not None:
    proxies = dict(http='socks5://' + args.proxy,
                   https='socks5://' + args.proxy)
    options.add_argument("--proxy-server=socks5://" + args.proxy)
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(),
                          chrome_options=options)


def normalizeDirFileName(name: str) -> str:
    name = re.sub('[^0-9a-zA-Zа-яА-Я]+', '_', name)
    name = re.sub('[_]+', ' ', name)
    name = name.strip()
    if len(name) > 254:
        name = name[0:254]
    return name


def fileLink(id: str, page: int) -> str:
    return f"http://eais.tatar.ru/Pages/ImageFilePart.ashx?Crop=False&Id={id}&Page={page}&Zoom=1"
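The snippet above reads `args` from a parser that is not shown. A minimal sketch of the argparse setup it appears to assume; the flag names and help text are guesses based on the attributes used (`args.login`, `args.password`, `args.proxy`) and the `-x` hint in the Tor comment:

import argparse
import re

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# Hypothetical parser matching the attributes referenced above.
parser = argparse.ArgumentParser(description="Fetch scanned pages from eais.tatar.ru")
parser.add_argument("-l", "--login", required=True)
parser.add_argument("-p", "--password", required=True)
parser.add_argument("-x", "--proxy", default=None,
                    help="SOCKS5 proxy as host:port, e.g. 185.130.105.66:11084")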
def browser():
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.set_window_size(1920, 1080)
    yield driver
    driver.quit()
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import time

url = 'https://www.sharesansar.com/company/shl'
cdm = ChromeDriverManager().install()
driver = webdriver.Chrome(cdm)
driver.maximize_window()
driver.get(url)
time.sleep(10)

data = []
driver.find_element_by_link_text('Price History').click()
time.sleep(3)
select = Select(
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@name="myTableCPriceHistory_length"]'))))
select.select_by_visible_text("50")
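The snippet stops after switching the page-length dropdown to 50 rows; `data` and `BeautifulSoup` are declared but never used. A hedged sketch of how the price-history rows might then be collected, assuming (based on the `myTableCPriceHistory_length` selector) that the table itself has the id `myTableCPriceHistory`:

# Assumption: the price-history table id is "myTableCPriceHistory"; adjust if it differs.
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find('table', id='myTableCPriceHistory')
for row in table.find_all('tr')[1:]:
    cells = [td.get_text(strip=True) for td in row.find_all('td')]
    if cells:
        data.append(cells)
print(len(data), 'rows collected')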
def test_can_get_chromium_for_win(os_type):
    path = ChromeDriverManager(version="83.0.4103.39",
                               os_type=os_type,
                               chrome_type=ChromeType.CHROMIUM).install()
    assert os.path.exists(path)
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ActionChains
import time

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(3)
driver.get("http://swisnl.github.io/jQuery-contextMenu/demo.html")

right_click = driver.find_element(By.CSS_SELECTOR, 'p span')
act_chain = ActionChains(driver)
act_chain.context_click(right_click).perform()
time.sleep(2)

menu_options = driver.find_elements(By.CSS_SELECTOR, 'ul span')
for options in menu_options:
    print(options.text)
    if options.text == 'Copy':
        options.click()
        break
time.sleep(2)
driver.quit()
def test_chrome_manager_with_selenium():
    driver_path = ChromeDriverManager().install()
    driver = webdriver.Chrome(driver_path)
    driver.get("http://automation-remarks.com")
    driver.close()
def test_chrome_manager_with_specific_version():
    bin = ChromeDriverManager("2.26").install()
    assert os.path.exists(bin)
def scrape():
    # ========================================== NASA Mars News ==================================================
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    time.sleep(10)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_list = soup.find('ul', class_='item_list')

    # ******** Results to put into dictionary: nasa_title & para
    nasa_title = soup.find('ul', class_='item_list').find("li").find(
        'div', class_='content_title').find('a').text
    para = soup.find('ul', class_='item_list').find("li").find(
        'div', class_='article_teaser_body').text
    browser.quit()

    # ================================= JPL Mars Space Images - Featured Image =====================================
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"
    browser.visit(url)
    time.sleep(10)

    browser.links.find_by_partial_text('FULL IMAGE').click()
    time.sleep(10)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    img = soup.find(
        "div",
        class_="fancybox-wrap fancybox-desktop fancybox-type-image fancybox-opened"
    ).find('img', class_="fancybox-image")["src"]
    url_1 = url.split("index.html")[0]

    # ******** Result to put into dictionary: feature_img_url
    feature_img_url = url_1 + img
    browser.quit()

    # ===================================================== Mars Facts Table =============================
    # Browser to open the website
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://space-facts.com/mars/"
    browser.visit(url)
    time.sleep(10)

    tables = pd.read_html(url)
    mars_df = tables[0]
    mars_df.columns = ["Mars Details", "Measurements"]

    # ======== Result to put into dictionary: html_table1
    html_table = mars_df.to_html()
    html_table1 = html_table.replace("\n", "")
    browser.quit()

    # ================================ Mars Hemispheres =========================================
    # Browser to open the website
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    html_mars_main = browser.html
    soup = BeautifulSoup(html_mars_main, 'html.parser')
    all_div = soup.find_all("div", class_="description")

    # Create a list for the headlines and append them.
    title = []
    for x in all_div:
        title.append(x.find("h3").text)

    url_2 = url.split("/search")[0]

    # Loop through the headline list and visit each link to get url details
    img_url = []
    for x in title:
        browser.links.find_by_partial_text(x).click()
        html_mars = browser.html
        soup = BeautifulSoup(html_mars, 'html.parser')
        img_url1 = soup.find("img", class_="wide-image")["src"]
        # Combine the two variables to get the right img_url
        # ===================== Result to add to dictionary: img_url (list)
        img_url.append(url_2 + img_url1)
        # Ask the browser to go back to the main page
        browser.back()

    # Add title and img_url to a list of dictionaries
    hemisphere_image_urls = []
    for x in range(0, 4):
        hemisphere_image_urls.append({
            "title": title[x],
            "img_url": img_url[x]
        })
    browser.quit()

    listings = {}
    listings["Nasa_News_Title"] = nasa_title
    listings["Nasa_News_Para"] = para
    listings["Featured_Image"] = feature_img_url
    listings["Mars_Information"] = html_table1
    listings["Mars_Img_Url"] = hemisphere_image_urls
    return listings
def scrape():
    # NASA Mars News
    mars_data = {}

    # URL of page to be scraped
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://redplanetscience.com/"
    browser.visit(url)

    # Retrieve and parse the page
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find('div', class_='content_title')
    news_p = soup.find('div', class_='article_teaser_body')
    print(news_title.text)
    print(news_p.text)

    # JPL Mars Space Images - Featured Image
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://spaceimages-mars.com'
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_="floating_text_area")
    link = div.find('a')
    href = link['href']
    featured_image_url = f"{url}/{href}"
    print(featured_image_url)

    # Mars Facts
    # Visit the Mars Facts webpage and use Pandas to scrape the table containing
    # facts about the planet including diameter, mass, etc.
    # Use Pandas to convert the data to an HTML table string.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://galaxyfacts-mars.com/'
    browser.visit(url)

    tables = pd.read_html(url)
    mars_facts_df = tables[0]
    mars_facts_df.columns = [['Category', 'Mars Value', 'Earth Value']]
    del mars_facts_df['Earth Value']
    html_table = mars_facts_df.to_html()
    html_table_string = html_table.replace('\n', '')

    # Mars Hemispheres
    # Visit the astrogeology site to obtain high-resolution images for each of Mars's hemispheres.
    # Click each of the hemisphere links to find the image url of the full-resolution image.
    # Save both the image url string and the hemisphere title in a dictionary using the keys
    # img_url and title; append one such dictionary per hemisphere to a list.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://marshemispheres.com/'
    browser.visit(url)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    browser.links.find_by_partial_text('Hemisphere').click()

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = soup.find_all('div', class_='collapsible results')
    items = soup.find_all('div', class_='item')
    h_titles = []
    url_things = []
    # hemisphere_image_urls = []
    for item in items:
        # h_titles = item.find('h3').text
        h3 = item.find('h3').text
        h_titles.append(h3)
        print(h3)
        try:
            browser.links.find_by_partial_text(h3).click()
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            div = soup.find('div', class_="downloads")
            image = div.find('a')
            href = image['href']
            image_url = f"{url}{href}"
            url_things.append(image_url)
            print(image_url)
            # hemisphere_image_urls.append({'title': h_title, 'img_url': image_url})
            browser.back()
        except:
            print('nope')
def setUp(self):
    if readExcel('../Data/data.xlsx', 'Browser_Conf', 'A2') == "Yes":
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
    else:
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
def scrape_company_classification(tickers=None):
    '''
    TODO: Need to do this for all companies ever listed, not only current.
    :return:
    '''
    init_df = pd.read_csv(
        'https://www.ishares.com/us/products/239724/ishares-core-sp-total-us-stock-market-etf/1467271812596.ajax?fileType=csv&fileName=ITOT_holdings&dataType=fund',
        skiprows=9, index_col=0)
    # tickers = init_df.index.tolist()
    from matilda.data_pipeline.db_crud import companies_in_classification
    if tickers is None:
        tickers = companies_in_classification(class_=config.MarketIndices.DOW_JONES)

    driver = webdriver.Chrome(ChromeDriverManager().install())

    sic_codes_division = {
        (1, 9 + 1): 'Agriculture, Forestry, and Fishing',
        (10, 14 + 1): 'Mining',
        (15, 17 + 1): 'Construction',
        (20, 39 + 1): 'Manufacturing',
        (40, 49 + 1): 'Transportation, Communications, Electric, Gas, And Sanitary Services',
        (50, 51 + 1): 'Wholesale Trade',
        (52, 59 + 1): 'Retail Trade',
        (60, 67 + 1): 'Finance, Insurance, and Real Estate',
        (70, 89 + 1): 'Services',
        (90, 99 + 1): 'Public Administration'
    }

    exchanges_dict = {
        exchange: list(
            pd.read_csv(os.path.join(config.MARKET_EXCHANGES_DIR_PATH, f'{exchange}.txt'),
                        sep='\t')['Symbol'])
        for exchange in ['AMEX', 'NYSE', 'NASDAQ']
    }

    path = os.path.join(config.DATA_DIR_PATH, "market_data/country_codes_dictio.pkl")
    if not os.path.exists(path):
        save_country_codes()
    with open(path, 'rb') as f:
        country_codes = pickle.load(f)

    edgar_dict = {}
    for ticker in tickers:
        edgar_dict[ticker] = {}
        try:
            for i in range(2):  # just try again if it didn't work the first time, an advertisement might have shown up
                try:
                    button = driver.find_element_by_xpath(
                        "//a[@class='acsCloseButton acsAbandonButton ']")
                    button.click()
                    sleep(1)
                except:
                    pass

                # if nasdaq_df['ETF'].loc[ticker] == 'Y':
                #     driver.get('https://www.sec.gov/edgar/searchedgar/mutualsearch.html')
                #     field = driver.find_element_by_xpath("//input[@id='gen_input']")
                #     field.send_keys(ticker)  # TODO might split ticker from the '$' or '.' (classes)
                #     sleep(1)
                #     field.send_keys(Keys.ENTER)
                #     sleep(1)
                #     if 'No records matched your query' not in driver.page_source:
                #         for t in driver.find_elements_by_xpath("//b[@class='blue']"):  # TODO
                #             if t.text == ticker:
                #                 cik = driver.find_element_by_xpath('').text
                #                 security_type = driver.find_element_by_xpath('').text
                #                 break  # still should go to the 'finally' block

                base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}'.format(ticker)
                resp = requests.get(base_url).text
                if 'No matching Ticker Symbol' in resp or 'No records matched your query' in resp:
                    driver.get('https://www.sec.gov/edgar/searchedgar/companysearch.html')
                    # html = driver.page_source  TODO for new 10-K forms maybe works?
                    input_box = driver.find_element_by_xpath("//input[@id='company']")
                    input_box.send_keys(ticker)
                    html = driver.page_source
                    # wait until the autofill box loads
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((
                            By.XPATH,
                            "//tr[@class='smart-search-hint smart-search-selected-hint']")))
                    element = driver.find_element_by_xpath(
                        "//tr[@class='smart-search-hint smart-search-selected-hint']")
                    if not re.search(r'(\(|[^A-Z]){}([^A-Z]|\))'.format(ticker), element.text):
                        break
                    sleep(1)
                    input_box.send_keys(Keys.ENTER)
                    # wait until company page loads
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "seriesDiv")))
                    resp = requests.get(driver.current_url).text

                soup = BeautifulSoup(resp, 'html.parser')
                # name = soup.find('span', class_='companyName').text.split(' CIK')[0]
                edgar_dict[ticker]['Company Name'] = titlecase(
                    re.compile(r'(.*) CIK#').findall(soup.text)[0])
                edgar_dict[ticker]['CIK'] = re.compile(
                    r'.*CIK#: (\d{10}).*').findall(soup.text)[0]

                ident_info = soup.find('p', class_="identInfo")
                edgar_dict[ticker]['SIC Industry'] = str(
                    ident_info.find('br').previousSibling.split('- ')[-1]).title()
                sic_code = re.search(r'(\d{4})', ident_info.text).group()

                country_code = re.compile(r'.*State location: (..)').findall(soup.text)[0]
                for type, code_dict in country_codes.items():
                    if country_code in code_dict.keys():
                        edgar_dict[ticker]['Location'] = type + '/' + code_dict[country_code]
                        break

                for exchange, tickers in exchanges_dict.items():
                    if ticker in tickers:
                        if 'Exchange' in edgar_dict[ticker].keys():
                            edgar_dict[ticker]['Exchange'] += '|' + exchange
                        else:
                            edgar_dict[ticker]['Exchange'] = exchange

                for key, value in sic_codes_division.items():
                    if int(sic_code[0]) == 0:
                        if int(sic_code[1]) in range(key[0], key[1]):
                            edgar_dict[ticker]['SIC Sector'] = value
                            break
                    elif int(sic_code[:2]) in range(key[0], key[1]):
                        edgar_dict[ticker]['SIC Sector'] = value
                        break

                break

        # except TimeoutException or ElementNotInteractableException:
        except:
            driver.get('https://www.sec.gov/edgar/searchedgar/companysearch.html')

    edgar_df = pd.DataFrame.from_dict(edgar_dict, orient='index')
    init_df.rename(columns={'Sector': 'GICS Sector'}, inplace=True)
    init_df = init_df[['GICS Sector', 'Asset Class']]
    df = edgar_df.join(init_df)
    df = df[[
        'Company Name', 'SIC Industry', 'SIC Sector', 'GICS Sector',
        'Location', 'CIK', 'Exchange', 'Asset Class'
    ]]
    # df = pd.concat([edgar_df, init_df], axis=1)
    path = os.path.join(config.MARKET_DATA_DIR_PATH, 'Company Classification')
    df.to_excel(path + '.xlsx', engine='xlsxwriter')
    df.to_pickle(path=path + '.pkl')
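For reference, the `sic_codes_division` lookup above maps the major group (first two digits) of a four-digit SIC code to its division via (low, high) range keys. A small self-contained sketch of the same scheme, collapsing the leading-zero branch and the two-digit branch into one check:

def sic_division(sic_code: str, divisions: dict) -> str:
    """Map a 4-digit SIC code to its division using (low, high) range keys."""
    major_group = int(sic_code[:2])  # e.g. '3571' -> 35; '0100' -> 1
    for (low, high), name in divisions.items():
        if major_group in range(low, high):
            return name
    return 'Unknown'

# Example: 3571 falls in the 20-39 range, so it maps to 'Manufacturing'.
# print(sic_division('3571', sic_codes_division))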
def setup_module(module):
    global driver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.implicitly_wait(5)
    driver.delete_all_cookies()
    driver.get("https://www.google.com")
def init_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--user-data-dir=chromeProfile")
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.implicitly_wait(5)
    return driver
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import datetime as dt

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': ChromeDriverManager().install()}


def scrape_all():
    browser = Browser('chrome', **executable_path, headless=True)
    news_title, news_paragraph = mars_news(browser)

    # Run all scraping functions and store results in dictionary
    data = {
        'news_title': news_title,
        'news_paragraph': news_paragraph,
        'featured_image': featured_image(browser),
        'facts': mars_facts(),
        'last_modified': dt.datetime.now()
    }

    # Stop webdriver and return data
    browser.quit()
    return data


def mars_news(browser):
    # Scrape Mars News
    # Visit the mars nasa news site
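A minimal usage sketch, assuming the helpers referenced above (`featured_image`, `mars_facts`, and the rest of `mars_news`) are defined further down in the original module:

if __name__ == "__main__":
    # When run as a script, print the scraped data dictionary to the console.
    print(scrape_all())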
def init_browser(self):
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    return browser
def generate_browser(self):
    self.driver = webdriver.Chrome(ChromeDriverManager().install())
    self.driver.get(self.MALT_URL)
    time.sleep(5)
def driver():
    driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
    # Resize the window to the screen width/height
    # driver.set_window_size(1366, 768)
    yield driver
    driver.quit()
def init_browser():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=False)
def test_can_get_chrome_for_win(os_type):
    path = ChromeDriverManager(os_type=os_type).install()
    assert os.path.exists(path)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
from bs4 import BeautifulSoup
import requests
import time

f = open("settings.txt", "r")
email, password, zipcode = f.readline().rstrip('\n'), f.readline().rstrip('\n'), f.readline().rstrip('\n')

driver = webdriver.Chrome(
    ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
# Maximize the browser window since Udemy has a responsive design and the code
# only works in the maximized layout
driver.maximize_window()


def getUdemyLink(url):
    response = requests.get(url=url)
    soup = BeautifulSoup(response.content, 'html.parser')
    linkForUdemy = soup.find('span', class_="rh_button_wrapper").find('a').get('href')
    return linkForUdemy
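The snippet above expects a `settings.txt` with one value per line (email, password, zipcode) and leaves the file handle open. A hedged sketch of the expected layout and an equivalent read that closes the file; the coupon-page URL in the usage line is a placeholder:

# settings.txt (three lines, in this order):
# user@example.com
# my-password
# 90210

# Equivalent read that closes the file automatically:
with open("settings.txt", "r") as settings:
    email, password, zipcode = (settings.readline().rstrip('\n') for _ in range(3))

# Hypothetical usage: pass a page whose markup wraps the Udemy link in span.rh_button_wrapper.
# udemy_url = getUdemyLink("https://example.com/some-coupon-page")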
def test_chromium_manager_with_wrong_version():
    with pytest.raises(ValueError) as ex:
        ChromeDriverManager("0.2", chrome_type=ChromeType.CHROMIUM).install()
    assert "There is no such driver by url" in ex.value.args[0]
def __init__(self, urls):
    self.browser = webdriver.Chrome(ChromeDriverManager().install())
    self.browser.maximize_window()
    self.urls = urls
    self.sizes = [320, 480, 960, 1366, 1920]
def test_chromium_manager_with_specific_version():
    bin_path = ChromeDriverManager("2.27", chrome_type=ChromeType.CHROMIUM).install()
    assert os.path.exists(bin_path)
def webscr(url, start_time):
    options = Options()
    options.headless = False
    if options.headless == True:
        print("Headless Chrome Initialized on Linux")
        options.add_argument('--disable-gpu')
    else:
        pass
    print('headless')
    print(time.time() - start_time)

    # chrome_driver = r"C:\Users\User\Documents\virtual\webscsrape_indianoil\chromedriver.exe"
    # Opens url, options is for headless chrome
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    # driver = webdriver.Chrome(chrome_driver, options=options)
    driver.get(url)
    print('open url')
    print(time.time() - start_time)

    # Wait until all is loaded
    # cur = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'current-value')))
    print('webdriverwait')
    print(time.time() - start_time)

    # Grab the contents
    content = driver.page_source
    soup = BeautifulSoup(content, features="html.parser")
    # print(soup.prettify())
    print('soup')
    print(time.time() - start_time)

    # Close the browser
    driver.close()
    print('driver close')
    print(time.time() - start_time)

    # Initialisation
    datey = date.today().strftime("%d/%b/%Y")  # Date of request
    print(datey)
    print(time.time() - start_time)

    # Scraping
    product_name = soup.find('h1', class_="module-pdp-title").get_text()
    print(product_name)
    print(time.time() - start_time)

    competitor_price = soup.find_all(class_="pre-inquiry-price")[-1].get_text()
    print(competitor_price)
    print(time.time() - start_time)

    # currency = soup.find_all('label', class_="ellipsis")[-1].get_text().split(" - ")[-1]
    currency = 'USD'
    print(currency)
    print(time.time() - start_time)

    uom = soup.find_all('span', class_="ma-quantity-range")[-1].get_text().split(" ")[-1]
    print(uom)

    comp_name = soup.find('a', class_="company-name company-name-lite-vb").get_text()
    print(comp_name)

    for item in soup.find_all('dl', class_="do-entry-item"):
        # print(item.get_text())
        if 'Place of Origin:' in item.get_text():
            country = item.get_text().split(':')[-1].split(", ")[-1]
            print(country)
        elif 'Model Number:' in item.get_text():
            grade = item.get_text().split(':')[-1].replace(' grade', '')
            print(grade)
        elif 'Package:' in item.get_text():
            package = item.get_text().split(':')[-1].split('/')[0]
            print(package)
        elif 'Payment term:' in item.get_text():
            payment_term = item.get_text().split(':')[-1]
            print(payment_term)
        else:
            pass

    # Save to dataframe and excel
    # df = pd.DataFrame({
    #     'Product Name': [product_name],
    #     'Product Grade': [grade],
    #     'Company Name': [comp_name],
    #     'Country': [country],
    #     'Competitor price': [competitor_price],
    #     'Currency': [currency],
    #     'UOM': [uom],
    #     'Packaging': [package]
    # })
    # df.to_excel('products1.xls', index=False, encoding='utf-8')
    # os.system("start EXCEL.EXE products1.xls")
    # df_json = json.loads(df.to_json(orient='records'))
    df_json = {
        'Product Name': [product_name],
        'Product Grade': [grade],
        'Company Name': [comp_name],
        'Country': [country],
        'Competitor price': [competitor_price],
        'Currency': [currency],
        'UOM': [uom],
        'Packaging': [package]
    }
    return df_json
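A hedged usage sketch: the product URL below is a placeholder, since `webscr` only assumes a product-detail page whose markup contains the `module-pdp-title`, `pre-inquiry-price`, and `do-entry-item` classes:

import time

start_time = time.time()
# Placeholder URL: substitute a real product-detail page with the expected markup.
result = webscr("https://example.com/product-detail/12345.html", start_time)
print(result)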
def main(url):
    # with open(filename, 'w', encoding='UTF-8') as f:
    #     f.close()
    cnt = 0
    friend_list = []
    friend_link_list = []
    try:
        with open(f'My LinkedIn Friends.txt', 'r', encoding='UTF-8') as f:
            temp_list = f.read().splitlines()
    except:
        with open(f'My LinkedIn Friends.txt', 'r') as f:
            temp_list = f.read().splitlines()
    # print(temp_list)
    for item in temp_list:
        # print(item)
        # print(re.split(r'\t+', item))
        friend_list.append(striplist(re.split(r'\t+', item)))
        # print(friend_list)
        # friend_list.append(item.split(r'\t'))
        link = friend_list[-1][1]
        friend_link_list.append(".com" + link[link.find('/in/'):])
        # print(friend_link_list)
    # friend_list = striplist(friend_list)
    # friend_link_list = striplist(friend_link_list)
    # print(friend_link_list)

    df = pd.DataFrame(columns=['Person', 'Link', 'Relation'])

    options = webdriver.ChromeOptions()
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # options.add_experimental_option("prefs", prefs)
    options.add_argument('--ignore-certificate-errors')  # removes SSL errors from terminal
    options.add_experimental_option("excludeSwitches", ["enable-logging"])  # removes device adapter errors from terminal
    options.add_argument('--disable-web-security')  # removes SSL errors from terminal
    options.add_argument("--log-level=3")
    options.add_argument("--user-data-dir=chrome-data")
    # options.add_argument('--headless')
    # options.add_argument('--no-sandbox')

    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.maximize_window()
    driver.get(url)
    try:
        # driver.maximize_window()
        SCROLL_PAUSE_TIME = 3
        while True:
            try:
                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="ember21"]')))
                break
            except:
                pass
        # eleclick = WebDriverWait(driver, 60).until(
        #     EC.presence_of_element_located((By.XPATH, '//a[contains(string(), " connections")]')))
        # driver.execute_script("arguments[0].click()", eleclick)
        # WebDriverWait(driver, 60).until(
        #     EC.invisibility_of_element_located((By.XPATH, '//a[contains(string(), " connections")]')))
        eleclick = WebDriverWait(driver, 60).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'a.app-aware-link.pv-highlight-entity__card-action-link')))

        count = ""
        try:
            count = driver.find_elements(
                By.CSS_SELECTOR,
                "a.app-aware-link.pv-highlight-entity__card-action-link"
            )[1].find_element(By.TAG_NAME, "h3").text.replace(" mutual connections", "")
            print("Shared connections:" + count)
            # print(driver.find_elements(By.CSS_SELECTOR, "a.app-aware-link.pv-highlight-entity__card-action-link")[1].find_element(By.TAG_NAME, "h3").text.replace("mutual connections", ""))
        except:
            count = "0"

        url = driver.current_url
        filename = url.split('/')[-2] + ".txt"
        if url.split('/')[-1] != '':
            filename = url.split('/')[-1] + ".txt"
        path = os.path.abspath(os.getcwd()) + "\\output"
        if os.path.exists(path) == False:
            os.mkdir(path)
        filename = path + "\\" + filename
        # print(filename)

        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        profile = ""
        profile_atract = ""
        try:
            profile_ele = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div.pv-profile-section-pager.ember-view')))
            profile_eles = driver.find_elements(By.TAG_NAME, "section")
            for ele in profile_eles:
                attr = ele.get_attribute("class")
                if "pv-profile-section" in attr:
                    if "pv-interests-section" in attr:
                        continue
                    profile += ele.text
            profile_atract = keyword_contain(profile)
            # if "Harvard" in profile:
            #     profile_atract += "Harvard "
            # if "Stanford" in profile:
            #     profile_atract += "Stanford "
            # if "MIT " in profile:
            #     profile_atract += "MIT "
            # if "Brown University" in profile:
            #     profile_atract += "Brown University "
            # if "MIT Sloan" in profile:
            #     profile_atract += "MIT Sloan "
        except:
            pass
        print("keywords:" + profile_atract)

        driver.execute_script("arguments[0].click()", eleclick)
        WebDriverWait(driver, 60).until(
            EC.invisibility_of_element_located(
                (By.CSS_SELECTOR, 'a.app-aware-link.pv-highlight-entity__card-action-link')))

        while True:
            time.sleep(2)
            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Calculate new scroll height and compare with last scroll height
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            items_div = driver.find_elements_by_class_name("entity-result__item")
            for item in items_div:
                href_str = item.find_element_by_tag_name('a').get_attribute('href') + "/"
                # print(href_str)
                href_str = ".com" + href_str[href_str.find('/in/'):]
                if href_str in friend_link_list:
                    index = friend_link_list.index(href_str)
                    print(index)
                    df.loc[cnt] = friend_list[index]
                    print("matching---")
                    # print(friend_list[index])
                    cnt += 1

            df = df.sort_values('Relation')
            # print(df)
            save_text(df, filename)
            # df.to_csv(filename, header=None, index=None, sep=' ', mode='w')
            print(cnt)

            append_write = ""
            try:
                nextclick = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR,
                         'button.artdeco-pagination__button.artdeco-pagination__button--next.artdeco-button.artdeco-button--muted.artdeco-button--icon-right.artdeco-button--1.artdeco-button--tertiary.ember-view')))
            except:
                break
            try:
                if nextclick.get_attribute("disabled") == None:
                    nextclick.click()
                else:
                    break
            except:
                break

        if os.path.exists(filename):
            append_write = 'a'  # append if already exists
        else:
            append_write = 'w'  # make a new file if not
        with open(filename, append_write, encoding='UTF-8') as f:
            f.write("\nShared connections:" + count)
            f.write("\nKeywords:" + profile_atract)
        with open(filename, 'r', encoding='UTF-8') as f:
            print(f.read())
        driver.quit()
    except Exception as e:
        print(e)
        # print("Error!")
        driver.quit()
    print("Finish!")
def chrome():
    return webdriver.Chrome(ChromeDriverManager('latest').install())
# # print("Real Browser Launching")
# browser = webdriver.Chrome(ChromeDriverManager().install())
# # print("Real Browser has Launched")
"""
The headless browsing option greatly reduces the amount of time it takes for
the scraper to run.
"""
print("Headless Browser Running")
options = Options()
options.add_argument("--headless")  # Runs Chrome in headless mode.
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('--disable-gpu')  # applicable to windows os only
options.add_argument('start-maximized')
# options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
browser = webdriver.Chrome(chrome_options=options,
                           executable_path=ChromeDriverManager().install())
print("Headless Browser has Launched")


def login_into_dash(json_target_file):
    """
    Takes the login information from a JSON file and passes the data to the login form.
    The parameter json_target_file needs to be equal to the file's location.
    Contents of the file must be organized as follows [Note: don't forget the curly braces]:
    {
        "username": "******",
        "password": "******"
    }
    """
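The body of `login_into_dash` is cut off above. A minimal sketch of what it might do, assuming the dashboard login form exposes `username` and `password` fields and a submit button; all selectors here are hypothetical:

import json
from selenium.webdriver.common.by import By

def login_into_dash_sketch(json_target_file):
    # Read credentials from the JSON file described in the docstring above.
    with open(json_target_file) as f:
        creds = json.load(f)
    # Hypothetical selectors: replace with the dashboard's real field names.
    browser.find_element(By.NAME, "username").send_keys(creds["username"])
    browser.find_element(By.NAME, "password").send_keys(creds["password"])
    browser.find_element(By.CSS_SELECTOR, "button[type='submit']").click()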
def scrape_all():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path)

    # ### Visit the NASA Mars News Site
    url = 'https://data-class-mars.s3.amazonaws.com/Mars/index.html'
    browser.visit(url)
    # Optional delay for loading the page
    time.sleep(5)

    # Convert the browser html to a soup object
    html = browser.html
    news_soup = soup(html, 'html.parser')
    print(news_soup)
    slide_elem = news_soup.select_one('div.list_text')

    # Use the parent element to find the first a tag and save it as `news_title`
    news_title = slide_elem.find('div', class_='content_title').get_text()
    # Use the parent element to find the paragraph text
    news_p = slide_elem.find('div', class_='article_teaser_body').get_text()

    # ### JPL Space Images Featured Image
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)

    # Find and click the full image button
    full_image_elem = browser.find_by_tag('button')[1]
    full_image_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = soup(html, 'html.parser')

    # Find the relative image url
    img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
    # Use the base url to create an absolute url
    img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'

    # ### Mars Facts
    df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]
    df.columns = ['Description', 'Mars', 'Earth']
    df.set_index('Description', inplace=True)
    mars_facts = df.to_html()

    # D1: Scrape High-Resolution Mars' Hemisphere Images and Titles
    # ### Hemispheres
    # 1. Use browser to visit the URL
    base_url = 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/'
    url = base_url + "index.html"
    browser.visit(url)

    # 2. Create a list to hold the images and titles.
    hemisphere_image_urls = []

    # 3. Write code to retrieve the image urls and titles for each hemisphere.
    html = browser.html
    hemisphere_soup = soup(html, 'html.parser')
    hemi_findall = hemisphere_soup.findAll("div", class_="description")
    # hemi = hemi_findall[0]
    pre_hemi_urls = []
    for hemi in hemi_findall:
        hemifind = hemi.find("a")["href"]
        hemi_url = base_url + hemifind
        pre_hemi_urls.append(hemi_url)

    # hemi_url = pre_hemi_urls[0]
    for hemi_url in pre_hemi_urls:
        browser.visit(hemi_url)
        html = browser.html
        hemisphere_soup = soup(html, 'html.parser')
        hurl = base_url + hemisphere_soup.find("div", class_="downloads").find("a")["href"]
        hemisphere_image_urls.append({
            "img_url": hurl,
            "title": hemisphere_soup.find("h2", class_="title").text,
        })

    # 4. Quit the browser.
    browser.quit()

    # Merge all scraped data into one dictionary
    mars_data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": img_url,
        "facts": mars_facts,
        "hemispheres": hemisphere_image_urls,
    }
    print(mars_data)
    return mars_data
def bot():
    # lists initialized for storing data of employees
    names = []
    abouts = []
    expers = []
    locs = []
    titles = []

    # set paths for webdriver + initialize
    options = Options()
    options.add_argument('--incognito')
    options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.maximize_window()

    # load into linkedin site
    driver.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')
    time.sleep(1)

    # enter login info
    user = driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[1]/input')
    user.send_keys('EMAIL')  # REPLACE WITH USER EMAIL
    pswd = driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[2]/input')
    pswd.send_keys('PASSWORD')  # REPLACE WITH USER PASSWORD

    # submit form, try catch because it was having issues finding the button by a single absolute path
    try:
        driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[4]').click()
    except Exception:
        try:
            driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[3]/button').click()
        except Exception:
            driver.find_element_by_xpath('//*[@id="app__container"]/main/div[2]/form/div[4]/button').click()
    try:
        driver.find_element_by_link_text('Sign in').click()
    except Exception:
        quit()
    time.sleep(1)

    # try catch for mobile authentication, seems unnecessary as of now
    try:
        driver.find_element_by_xpath('/html/body/div/div[1]/section/div[2]/div/article/footer/div/div/button').click()
    except Exception:
        print('No auth needed')

    # keep track of what page script is on
    page_tracker = 1

    # boolean li query, must first search in google then copy search address
    site = 'QUERY LINK AFTER SEARCHED'
    driver.get(site)

    # main loop for handling bot
    while True:
        # set the main window and initialize a random user agent to avoid captchas
        main_window = driver.current_window_handle
        link_counter = 0
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)
        time.sleep(1)

        # find all clickable links, then iterate through them
        data = driver.find_elements_by_partial_link_text('linkedin.com')
        for data[link_counter] in data:
            data = driver.find_elements_by_partial_link_text('linkedin.com')
            data[link_counter].send_keys(Keys.CONTROL + Keys.RETURN)  # open in new tab
            driver.switch_to.window(driver.window_handles[1])  # switch to new tab
            time.sleep(2)

            # Block to figure out their name: if both attempts fail, go to essentially end of task loop and resume
            # process or move on to next page
            try:
                con_name = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]')
            except Exception:
                try:
                    con_name = driver.find_element_by_xpath(
                        '/html/body/div[8]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]')
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    continue

            # block to find location - if not specified or undistinguishable, leave empty and move on
            try:
                location = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[2]/li[1]').text
            except:
                location = ""

            # block to make sure this is a real person account and create an anchor for later (to a specific point on their profile page)
            try:
                head = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/header/h2')
            except Exception:
                try:
                    head = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/header/h2')
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    break

            # block to expand the about section
            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[3]/span/a').click()
            except Exception:
                print('about expanded')

            # now, grab about text
            try:
                about = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[1]')
                about = str(about.text)
            except Exception:
                about = "EMPTY"

            # move to head anchor
            actions = ActionChains(driver)
            actions.move_to_element(head).perform()

            # block to grab their title
            try:
                title = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/a/div[2]/h3')
                title = title.text
            except Exception:
                try:
                    title = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/ul/li[1]/section/ul/li[1]/div/div/div/div/div/div/h3/span[2]')
                    title = title.text
                except Exception:
                    print('Now what...')

            # block to expand experience tab
            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p/span/button').click()
            except Exception:
                print('No see more button')

            # now, try to grab the experience text
            try:
                experience = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p')
                experience = experience.text
            except Exception:
                print('No experience for this chum')
                experience = ""

            # convert to string for data normalization
            con_name = str(con_name.text)

            # print fields for testing, then append them to the lists
            print(con_name)
            print(location)
            print(title)
            print(about)
            print(experience)
            expers.append(experience)
            abouts.append(about)
            names.append(con_name)
            locs.append(location)
            titles.append(title)

            # initialize new user agent for anonymity, then go back to main window and close any extra tabs
            ua = UserAgent()
            userAgent = ua.random
            print(userAgent)
            driver.close()
            driver.switch_to.window(main_window)
            link_counter += 1
            time.sleep(1)

        # block to continue to next page on google search, < num pages to go through.
        # Default cap is 25, change here if less is desired
        if page_tracker < 25:
            time.sleep(.75 * 60)  # sleep time, helps avoid google captchas
            driver.find_element_by_xpath(
                '/html/body/div[8]/div[2]/div[10]/div[1]/div[2]/div/div[5]/div[2]/span[1]/div/table/tbody/tr/td[12]/a/span[2]').click()
            page_tracker += 1
            time.sleep(1)
            main_window = driver.current_window_handle
        else:
            driver.quit()
            break

    # convert lists to pandas series so they can be placed in a dataframe for storage
    names_ser = pd.Series(names)
    exp_ser = pd.Series(expers)
    about_ser = pd.Series(abouts)
    locs_ser = pd.Series(locs)
    titles_ser = pd.Series(titles)
    frame = {
        'Name': names_ser,
        'Description/Bio': about_ser,
        'Location': locs_ser,
        'Title': titles_ser,
        'Expertise (subj matter)': exp_ser
    }
    final = pd.DataFrame(frame)
    final.to_csv('DESIRED NAME.csv')  # specify file name here
    time.sleep(10)  # sleep for 10 seconds then continue on to next batch of people.
    # If no other queries are necessary, comment out the next blocks of code.
    names = []
    abouts = []
    expers = []
    locs = []
    titles = []

    # set paths for webdriver + initialize
    options = Options()
    options.add_argument('--incognito')
    options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
    driver_path = "C:/Users/Matt Turi/Downloads/chromedriver_win32/chromedriver.exe"
    driver = webdriver.Chrome(options=options, executable_path=driver_path)
    driver.maximize_window()
    driver.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')
    time.sleep(1)

    # enter login info
    user = driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[1]/input')
    user.send_keys('EMAIL')
    pswd = driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[2]/input')
    pswd.send_keys('PASSWORD')

    # submit form, try catch because it was having issues finding the button by a single absolute path
    try:
        driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[4]').click()
    except Exception:
        try:
            driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[3]/button').click()
        except Exception:
            driver.find_element_by_xpath('//*[@id="app__container"]/main/div[2]/form/div[4]/button').click()
    try:
        driver.find_element_by_link_text('Sign in').click()
    except Exception:
        quit()
    time.sleep(1)

    # try catch for mobile authentication, seems unnecessary as of now
    try:
        driver.find_element_by_xpath('/html/body/div/div[1]/section/div[2]/div/article/footer/div/div/button').click()
    except Exception:
        print('No auth needed')

    page_tracker = 1
    # boolean li query, must first search in google then copy search address
    site = 'GOOGLE QUERY'
    driver.get(site)

    # main loop for handling bot
    while True:
        main_window = driver.current_window_handle
        link_counter = 0
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)
        time.sleep(1)
        data = driver.find_elements_by_partial_link_text('linkedin.com')
        for data[link_counter] in data:
            data = driver.find_elements_by_partial_link_text('linkedin.com')
            data[link_counter].send_keys(Keys.CONTROL + Keys.RETURN)
            driver.switch_to.window(driver.window_handles[1])
            time.sleep(2)

            # Block to figure out their name: if both attempts fail, go to essentially end of task loop and resume
            # process or move on to next page
            try:
                con_name = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]')
            except Exception:
                try:
                    con_name = driver.find_element_by_xpath(
                        '/html/body/div[8]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]')
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    continue

            try:
                location = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[2]/li[1]').text
            except:
                location = ""

            try:
                head = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/header/h2')
            except Exception:
                # driver.execute_script("window.scrollTo(0, 400)")
                try:
                    head = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/header/h2')
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    break

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[3]/span/a').click()
            except Exception:
                print('about expanded')

            try:
                about = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[1]')
                about = str(about.text)
            except Exception:
                about = "EMPTY"

            actions = ActionChains(driver)
            actions.move_to_element(head).perform()

            try:
                title = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/a/div[2]/h3')
                title = title.text
            except Exception:
                try:
                    title = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/ul/li[1]/section/ul/li[1]/div/div/div/div/div/div/h3/span[2]')
                    title = title.text
                except Exception:
                    print('Now what...')

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p/span/button').click()
            except Exception:
                print('No see more button')

            try:
                experience = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p')
                experience = experience.text
            except Exception:
                print('No experience for this chum')
                experience = ""

            con_name = str(con_name.text)
            print(experience)
            print(about)
            print(con_name)
            expers.append(experience)
            abouts.append(about)
            names.append(con_name)
            locs.append(location)
            titles.append(title)

            ua = UserAgent()
            userAgent = ua.random
            print(userAgent)
            driver.close()
            driver.switch_to.window(main_window)
            link_counter += 1
            time.sleep(1)

        # block to continue to next page on google search, < num pages to go through.
        # Default cap is 25, change here if less is desired
        if page_tracker < 25:
            time.sleep(.75 * 60)
            driver.find_element_by_xpath(
                '/html/body/div[8]/div[2]/div[10]/div[1]/div[2]/div/div[5]/div[2]/span[1]/div/table/tbody/tr/td[12]/a/span[2]').click()
            page_tracker += 1
            time.sleep(1)
            main_window = driver.current_window_handle
        else:
            driver.quit()
            break

    names_ser = pd.Series(names)
    exp_ser = pd.Series(expers)
    about_ser = pd.Series(abouts)
    locs_ser = pd.Series(locs)
    titles_ser = pd.Series(titles)
    frame = {
        'Name': names_ser,
        'Description/Bio': about_ser,
        'Location': locs_ser,
        'Title': titles_ser,
        'Expertise (subj matter)': exp_ser
    }
    final = pd.DataFrame(frame)
    final.to_csv('NAME HERE.csv')
    time.sleep(10)

    names = []
    abouts = []
    expers = []
    locs = []
    titles = []

    # set paths for webdriver + initialize
    options = Options()
    options.add_argument('--incognito')
    options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
    driver_path = "C:/Users/Matt Turi/Downloads/chromedriver_win32/chromedriver.exe"
    driver = webdriver.Chrome(options=options, executable_path=driver_path)
    driver.maximize_window()
    driver.get('https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin')
    time.sleep(1)

    # enter login info
    user = driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[1]/input')
    user.send_keys('EMAIL')
    pswd = driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[2]/input')
    pswd.send_keys('PASSWORD')

    # submit form, try catch because it was having issues finding the button by a single absolute path
    try:
        driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[4]').click()
    except Exception:
        try:
            driver.find_element_by_xpath('/html/body/div/main/div[2]/form/div[3]/button').click()
        except Exception:
            driver.find_element_by_xpath('//*[@id="app__container"]/main/div[2]/form/div[4]/button').click()
    try:
        driver.find_element_by_link_text('Sign in').click()
    except Exception:
        quit()
    time.sleep(1)

    # try catch for mobile authentication, seems unnecessary as of now
    try:
        driver.find_element_by_xpath('/html/body/div/div[1]/section/div[2]/div/article/footer/div/div/button').click()
    except Exception:
        print('No auth needed')

    page_tracker = 1
    # boolean li query, must first search in google then copy search address
    site = ''
    driver.get(site)

    # main loop for handling bot
    while True:
        main_window = driver.current_window_handle
        link_counter = 0
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)
        time.sleep(1)
        data = driver.find_elements_by_partial_link_text('linkedin.com')
        for data[link_counter] in data:
            data = driver.find_elements_by_partial_link_text('linkedin.com')
            data[link_counter].send_keys(Keys.CONTROL + Keys.RETURN)
            driver.switch_to.window(driver.window_handles[1])
            time.sleep(2)

            # Block to figure out their name: if both attempts fail, go to essentially end of task loop and resume
            # process or move on to next page
            try:
                con_name = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]')
            except Exception:
                try:
                    con_name = driver.find_element_by_xpath(
                        '/html/body/div[8]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]')
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    continue

            try:
                location = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[2]/li[1]').text
            except:
                location = ""

            try:
                head = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/header/h2')
            except Exception:
                # driver.execute_script("window.scrollTo(0, 400)")
                try:
                    head = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/header/h2')
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    break

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[3]/span/a').click()
            except Exception:
                print('about expanded')

            try:
                about = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[1]')
                about = str(about.text)
            except Exception:
                about = "EMPTY"

            actions = ActionChains(driver)
            actions.move_to_element(head).perform()

            try:
                title = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/a/div[2]/h3')
                title = title.text
            except Exception:
                try:
                    title = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/ul/li[1]/section/ul/li[1]/div/div/div/div/div/div/h3/span[2]')
                    title = title.text
                except Exception:
                    print('Now what...')

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p/span/button').click()
            except Exception:
                print('No see more button')

            try:
                experience = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p')
                experience = experience.text
            except Exception:
                print('No experience for this chum')
                experience = ""

            con_name = str(con_name.text)
            print(experience)
            print(about)
            print(con_name)
            expers.append(experience)
            abouts.append(about)
            names.append(con_name)
            locs.append(location)
            titles.append(title)

            ua = UserAgent()
            userAgent = ua.random
            print(userAgent)
            driver.close()
            driver.switch_to.window(main_window)
            link_counter += 1
            time.sleep(1)

        # block to continue to next page on google search, < num pages to go through.
        # Default cap is 25, change here if less is desired
        if page_tracker < 3:  # page_stop: # 30:
            time.sleep(.75 * 60)
            driver.find_element_by_xpath(
                '/html/body/div[8]/div[2]/div[10]/div[1]/div[2]/div/div[5]/div[2]/span[1]/div/table/tbody/tr/td[5]/a/span[2]').click()
            page_tracker += 1
            time.sleep(1)
            main_window = driver.current_window_handle
        else:
            driver.quit()
            break

    names_ser = pd.Series(names)
    exp_ser = pd.Series(expers)
    about_ser = pd.Series(abouts)
    locs_ser = pd.Series(locs)
    titles_ser = pd.Series(titles)
    frame = {
        'Name': names_ser,
        'Description/Bio': about_ser,
        'Location': locs_ser,
        'Title': titles_ser,
        'Expertise (subj matter)': exp_ser
    }
    final = pd.DataFrame(frame)
    final.to_csv('FILE NAME HERE.csv')
def setUpClass(cls):
    cls.driver = webdriver.Chrome(ChromeDriverManager().install())
    cls.driver.implicitly_wait(10)
    cls.driver.maximize_window()