Example #1
#for tor browser use: -x 185.130.105.66:11084

args = parser.parse_args()
login = args.login
password = args.password

options = webdriver.ChromeOptions()

# socks5://user:pass@host:port
proxies = None
if args.proxy is not None:
    proxies = dict(http='socks5://' + args.proxy,
                   https='socks5://' + args.proxy)
    options.add_argument("--proxy-server=socks5://" + args.proxy)

driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(),
                          chrome_options=options)


def normalizeDirFileName(name: str) -> str:
    name = re.sub('[^0-9a-zA-Zа-яА-Я]+', '_', name)
    name = re.sub('[_]+', ' ', name)
    name = name.strip()
    if len(name) > 254:
        name = name[0:254]
    return name


def fileLink(id: str, page: int) -> str:
    return f"http://eais.tatar.ru/Pages/ImageFilePart.ashx?Crop=False&Id={id}&Page={page}&Zoom=1"
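A hedged usage sketch for the helpers above: downloading one scanned page through the optional SOCKS5 proxy with requests. The helper name, the .jpg extension and the output layout are assumptions rather than part of the original, and requests needs the requests[socks] extra installed for socks5:// proxies.

import os
import requests


def downloadPage(id: str, page: int, case_title: str) -> str:
    # hypothetical helper; e.g. normalizeDirFileName("Фонд №1 / Опись 2") -> "Фонд 1 Опись 2"
    out_dir = normalizeDirFileName(case_title)
    os.makedirs(out_dir, exist_ok=True)
    # proxies is None when no --proxy was given, which requests treats as "no proxy"
    resp = requests.get(fileLink(id, page), proxies=proxies, timeout=60)
    resp.raise_for_status()
    out_path = os.path.join(out_dir, f"{page}.jpg")  # assumed file name/extension
    with open(out_path, "wb") as f:
        f.write(resp.content)
    return out_path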
Example #2
def browser():
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.set_window_size(1920, 1080)
    yield driver
    driver.quit()
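A minimal usage sketch for this fixture, assuming it is registered with @pytest.fixture (for example in conftest.py); the URL and the assertion are illustrative only.

def test_opens_example_page(browser):
    browser.get("https://example.com")
    assert "Example Domain" in browser.title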
Example #3
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import time

url = 'https://www.sharesansar.com/company/shl'

cdm = ChromeDriverManager().install()
driver = webdriver.Chrome(cdm)

driver.maximize_window()
driver.get(url)
time.sleep(10)

data = []

driver.find_element_by_link_text('Price History').click()
time.sleep(3)

select = Select(
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@name="myTableCPriceHistory_length"]'))))
select.select_by_visible_text("50")
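One possible continuation of the snippet above: reading the enlarged price-history table into the data list with BeautifulSoup. The table id 'myTableCPriceHistory' is inferred from the name of the length selector (a DataTables convention), so treat it as an assumption.

time.sleep(3)
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find('table', id='myTableCPriceHistory')  # assumed table id
if table is not None:
    for row in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        if cells:  # skip header rows
            data.append(cells)
driver.quit()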
Example #4
def test_can_get_chromium_for_win(os_type):
    path = ChromeDriverManager(version="83.0.4103.39",
                               os_type=os_type,
                               chrome_type=ChromeType.CHROMIUM).install()
    assert os.path.exists(path)
Example #5
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ActionChains
import time

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.implicitly_wait(3)
driver.get("http://swisnl.github.io/jQuery-contextMenu/demo.html")

right_click = driver.find_element(By.CSS_SELECTOR, 'p span')
act_chain = ActionChains(driver)
act_chain.context_click(right_click).perform()

time.sleep(2)

menu_options = driver.find_elements(By.CSS_SELECTOR, 'ul span')
for options in menu_options:
    print(options.text)
    if options.text == 'Copy':
        options.click()
        break

time.sleep(2)

driver.quit()
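For comparison, a hedged variant of the menu handling above that waits for the 'Copy' entry explicitly instead of iterating the items and sleeping; it assumes the same demo page and an already opened context menu.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

copy_item = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//ul//span[text()='Copy']")))
copy_item.click()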
Example #6
def test_chrome_manager_with_selenium():
    driver_path = ChromeDriverManager().install()
    driver = webdriver.Chrome(driver_path)
    driver.get("http://automation-remarks.com")
    driver.close()
Example #7
def test_chrome_manager_with_specific_version():
    bin = ChromeDriverManager("2.26").install()
    assert os.path.exists(bin)
Example #8
def scrape():

    #==========================================NASA Mars News==================================================
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    time.sleep(10)

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_list = soup.find('ul', class_='item_list')

    #********Result to put into Dictionary: title1 & para
    nasa_title = soup.find('ul', class_='item_list').find("li").find(
        'div', class_='content_title').find('a').text
    para = soup.find('ul', class_='item_list').find("li").find(
        'div', class_='article_teaser_body').text

    browser.quit()

    #=================================JPL Mars Space Images - Featured Image=====================================
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"
    browser.visit(url)
    time.sleep(10)

    browser.links.find_by_partial_text('FULL IMAGE').click()
    time.sleep(10)

    html = browser.html

    soup = BeautifulSoup(html, 'html.parser')

    img = soup.find(
        "div",
        class_=
        "fancybox-wrap fancybox-desktop fancybox-type-image fancybox-opened"
    ).find('img', class_="fancybox-image")["src"]

    url_1 = url.split("index.html")[0]

    #********Result to put into Dictionary: img_url
    feature_img_url = url_1 + img

    browser.quit()

    #=====================================================Mars Facts Table=============================

    #Browser to open the website
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://space-facts.com/mars/"
    browser.visit(url)
    time.sleep(10)
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)

    mars_df = tables[0]
    mars_df.columns = ["Mars Details", "Measurements"]

    #=========Result to put into dictionary: html_table
    html_table = mars_df.to_html()
    html_table1 = html_table.replace("\n", "")

    browser.quit()

    #================================ Mars Hemispheres=========================================
    #Browser to open the website
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)

    html_mars_main = browser.html
    soup = BeautifulSoup(html_mars_main, 'html.parser')
    all_div = soup.find_all("div", class_="description")

    #Create a list for the headlines and append them.
    title = []
    for x in all_div:
        title.append(x.find("h3").text)

    url_2 = url.split("/search")[0]
    url_2

    #Loop through the headline list and visit each link to get url details
    img_url = []
    for x in title:
        browser.links.find_by_partial_text(x).click()
        html_mars = browser.html
        soup = BeautifulSoup(html_mars, 'html.parser')
        img_url1 = soup.find("img", class_="wide-image")["src"]

        #combine 2 variables to get the right img_url
        #=====================Result to add to dictionary: img_url (list)
        img_url.append(url_2 + img_url1)

        #ask the browser to go back to the main page
        browser.back()

    #add title and img_url to a list of dictionaries
    hemisphere_image_urls = []
    for x in range(0, 4):
        hemisphere_image_urls.append({
            "title": title[x],
            "img_url": img_url[x]
        })

    browser.quit()

    listings = {}
    listings["Nasa_News_Title"] = nasa_title
    listings["Nasa_News_Para"] = para
    listings["Featured_Image"] = feature_img_url
    listings["Mars_Information"] = html_table1
    listings["Mars_Img_Url"] = hemisphere_image_urls

    return listings
Example #9
def scrape():
    # # NASA Mars News
    mars_data = {}
    # In[164]:

    # URL of page to be scraped
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # In[165]:

    url = "https://redplanetscience.com/"
    browser.visit(url)

    # In[166]:

    # Retrieve page with the requests module

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    news_title = soup.find('div', class_='content_title')
    news_p = soup.find('div', class_='article_teaser_body')

    print(news_title.text)
    print(news_p.text)

    # # JPL Mars Space Images - Featured Image

    # In[28]:

    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://spaceimages-mars.com'
    browser.visit(url)

    # In[6]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    div = soup.find('div', class_="floating_text_area")

    link = div.find('a')
    href = link['href']

    featured_image_url = f"{url}/{href}"
    print(featured_image_url)

    # # Mars Facts
    #
    # Visit the Mars Facts webpage here and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    #
    # Use Pandas to convert the data to a HTML table string.

    # In[7]:

    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://galaxyfacts-mars.com/'
    browser.visit(url)

    # In[8]:

    tables = pd.read_html(url)

    # In[18]:

    mars_facts_df = tables[0]
    mars_facts_df.columns = ['Category', 'Mars Value', 'Earth Value']
    del mars_facts_df['Earth Value']
    mars_facts_df

    # In[19]:

    html_table = mars_facts_df.to_html()
    html_table

    # In[156]:

    html_table_string = html_table.replace('\n', '')

    # # Mars Hemispheres
    #
    # Visit the astrogeology site here to obtain high resolution images for each of Mars' hemispheres.
    #
    # You will need to click each of the links to the hemispheres in order to find the image url to the full resolution image.
    #
    # Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title.
    #
    # Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

    # In[167]:

    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://marshemispheres.com/'
    browser.visit(url)

    # In[80]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    browser.links.find_by_partial_text('Hemisphere').click()

    # In[180]:

    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = soup.find_all('div', class_='collapsible results')
    items = soup.find_all('div', class_='item')
    h_titles = []
    url_things = []
    #hemisphere_image_urls = []

    for item in items:
        #h_titles=item.find('h3').text
        h3 = item.find('h3').text
        h_titles.append(h3)
        print(h3)
        try:
            browser.links.find_by_partial_text(h3).click()
            html = browser.html
            soup = BeautifulSoup(html, 'html.parser')
            div = soup.find('div', class_="downloads")
            image = div.find('a')
            href = image['href']
            image_url = f"{url}{href}"
            url_things.append(image_url)
            print(image_url)
            #hemisphere_image_urls.append({'title': h_title, 'img_url': image_url})
            browser.back()
        except:
            print('nope')
Example #10
    def setUp(self):

        if readExcel('../Data/data.xlsx', 'Browser_Conf', 'A2') == "Yes":
            self.driver = webdriver.Chrome(ChromeDriverManager().install())
        else:
            self.driver = webdriver.Chrome(ChromeDriverManager().install())
Example #11
def scrape_company_classification(tickers=None):
    '''
    TODO: Need to do this for all companies ever listed, not only current.
    :return:
    '''

    init_df = pd.read_csv(
        'https://www.ishares.com/us/products/239724/ishares-core-sp-total-us-stock-market-etf/1467271812596.ajax?fileType=csv&fileName=ITOT_holdings&dataType=fund',
        skiprows=9,
        index_col=0)
    # tickers = init_df.index.tolist()
    from matilda.data_pipeline.db_crud import companies_in_classification

    if tickers is None:
        tickers = companies_in_classification(
            class_=config.MarketIndices.DOW_JONES)
    driver = webdriver.Chrome(ChromeDriverManager().install())

    sic_codes_division = {
        (1, 9 + 1): 'Agriculture, Forestry, and Fishing',
        (10, 14 + 1): 'Mining',
        (15, 17 + 1): 'Construction',
        (20, 39 + 1): 'Manufacturing',
        (40, 49 + 1):
        'Transportation, Communications, Electric, Gas, And Sanitary Services',
        (50, 51 + 1): 'Wholesale Trade',
        (52, 59 + 1): 'Retail Trade',
        (60, 67 + 1): 'Finance, Insurance, and Real Estate',
        (70, 89 + 1): 'Services',
        (90, 99 + 1): 'Public Administration'
    }

    exchanges_dict = {
        exchange: list(
            pd.read_csv(os.path.join(config.MARKET_EXCHANGES_DIR_PATH,
                                     f'{exchange}.txt'),
                        sep='\t')['Symbol'])
        for exchange in ['AMEX', 'NYSE', 'NASDAQ']
    }

    path = os.path.join(config.DATA_DIR_PATH,
                        "market_data/country_codes_dictio.pkl")
    if not os.path.exists(path):
        save_country_codes()

    with open(path, 'rb') as f:
        country_codes = pickle.load(f)

    edgar_dict = {}
    for ticker in tickers:
        edgar_dict[ticker] = {}
        try:
            for i in range(2):  # retry once; an advertisement overlay may have appeared the first time
                try:
                    button = driver.find_element_by_xpath(
                        "//a[@class='acsCloseButton acsAbandonButton ']")
                    button.click()
                    sleep(1)
                except:
                    pass
                # if nasdaq_df['ETF'].loc[ticker] == 'Y':
                #     driver.get('https://www.sec.gov/edgar/searchedgar/mutualsearch.html')
                #     field = driver.find_element_by_xpath("//input[@id='gen_input']")
                #     field.send_keys(ticker)  # TODO might split ticker from the '$' or '.' (classes)
                #     sleep(1)
                #     field.send_keys(Keys.ENTER)
                #     sleep(1)
                #     if 'No records matched your query' not in driver.page_source:
                #         for t in driver.find_elements_by_xpath("//b[@class='blue']"):  # TODO
                #             if t.text == ticker:
                #                 cik = driver.find_element_by_xpath('').text
                #                 security_type = driver.find_element_by_xpath('').text
                #     break  # still should go to the 'finally' block

                base_url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}'.format(
                    ticker)
                resp = requests.get(base_url).text

                if 'No matching Ticker Symbol' in resp or 'No records matched your query' in resp:
                    driver.get(
                        'https://www.sec.gov/edgar/searchedgar/companysearch.html'
                    )
                    # html = driver.page_source TODO for new 10-K forms maybe works?
                    input_box = driver.find_element_by_xpath(
                        "//input[@id='company']")
                    input_box.send_keys(ticker)
                    html = driver.page_source
                    # wait until the autofill box loads
                    WebDriverWait(driver, 10).until(
                        EC.visibility_of_element_located((
                            By.XPATH,
                            "//tr[@class='smart-search-hint smart-search-selected-hint']"
                        )))
                    element = driver.find_element_by_xpath(
                        "//tr[@class='smart-search-hint smart-search-selected-hint']"
                    )
                    if not re.search(
                            r'(\(|[^A-Z]){}([^A-Z]|\))'.format(ticker),
                            element.text):
                        break
                    sleep(1)
                    input_box.send_keys(Keys.ENTER)
                    # wait until company page loads
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "seriesDiv")))
                    resp = requests.get(driver.current_url).text

                soup = BeautifulSoup(resp, 'html.parser')
                # name = soup.find('span', class_='companyName').text.split(' CIK')[0]
                edgar_dict[ticker]['Company Name'] = titlecase(
                    re.compile(r'(.*) CIK#').findall(soup.text)[0])
                edgar_dict[ticker]['CIK'] = re.compile(
                    r'.*CIK#: (\d{10}).*').findall(soup.text)[0]

                ident_info = soup.find('p', class_="identInfo")
                edgar_dict[ticker]['SIC Industry'] = str(
                    ident_info.find('br').previousSibling.split('- ')
                    [-1]).title()
                sic_code = re.search(r'(\d{4})', ident_info.text).group()
                country_code = re.compile(r'.*State location: (..)').findall(
                    soup.text)[0]
                for type, code_dict in country_codes.items():
                    if country_code in code_dict.keys():
                        edgar_dict[ticker][
                            'Location'] = type + '/' + code_dict[country_code]
                        break

                for exchange, exchange_tickers in exchanges_dict.items():
                    if ticker in exchange_tickers:
                        if 'Exchange' in edgar_dict[ticker].keys():
                            edgar_dict[ticker]['Exchange'] += '|' + exchange
                        else:
                            edgar_dict[ticker]['Exchange'] = exchange

                for key, value in sic_codes_division.items():
                    if int(sic_code[0]) == 0:
                        if int(sic_code[1]) in range(key[0], key[1]):
                            edgar_dict[ticker]['SIC Sector'] = value
                            break
                    elif int(sic_code[:2]) in range(key[0], key[1]):
                        edgar_dict[ticker]['SIC Sector'] = value
                        break

                break

            # except TimeoutException or ElementNotInteractableException:
        except:
            driver.get(
                'https://www.sec.gov/edgar/searchedgar/companysearch.html')

    edgar_df = pd.DataFrame.from_dict(edgar_dict, orient='index')
    init_df.rename(columns={'Sector': 'GICS Sector'}, inplace=True)
    init_df = init_df[['GICS Sector', 'Asset Class']]
    df = edgar_df.join(init_df)
    df = df[[
        'Company Name', 'SIC Industry', 'SIC Sector', 'GICS Sector',
        'Location', 'CIK', 'Exchange', 'Asset Class'
    ]]
    # df = pd.concat([edgar_df, init_df], axis=1)
    path = os.path.join(config.MARKET_DATA_DIR_PATH, 'Company Classification')
    df.to_excel(path + '.xlsx', engine='xlsxwriter')
    df.to_pickle(path=path + '.pkl')
Example #12
def setup_module(module):
    global driver
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.implicitly_wait(5)
    driver.delete_all_cookies()
    driver.get("https://www.google.com")
Example #13
def init_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--user-data-dir=chromeProfile")
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.implicitly_wait(5)
    return driver
Example #14
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import datetime as dt

# Set the executable path and initialize the chrome browser in splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
def scrape_all():
    browser = Browser('chrome', **executable_path, headless=True)

    news_title, news_paragraph = mars_news(browser)

    # Run all scraping functions and store results in dictionary
    data = {
        'news_title': news_title,
        'news_paragraph': news_paragraph,
        'featured_image': featured_image(browser),
        'facts': mars_facts(),
        'last_modified': dt.datetime.now()
    }

    # Stop webdriver and return data
    browser.quit()
    return data

def mars_news(browser):
        
    # Scrape Mars News
    # Visit the mars nasa news site
Example #15
    def init_browser(self):
        # @NOTE: Replace the path with your actual path to the chromedriver
        executable_path = {'executable_path': ChromeDriverManager().install()}
        browser = Browser('chrome', **executable_path, headless=False)
        return browser
Example #16
    def generate_browser(self):
        self.driver = webdriver.Chrome(ChromeDriverManager().install())
        self.driver.get(self.MALT_URL)
        time.sleep(5)
Example #17
def driver():
    driver = webdriver.Chrome(executable_path=ChromeDriverManager().install())
    # Resize the window to the screen width/height
    # driver.set_window_size(1366, 768)
    yield driver
    driver.quit()
Example #18
def init_browser():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=False)
Example #19
def test_can_get_chrome_for_win(os_type):
    path = ChromeDriverManager(os_type=os_type).install()
    assert os.path.exists(path)
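The os_type tests above receive the platform as a parameter; the original parametrization is not shown in these excerpts, but under that assumption it might look roughly like this (the values are platform keys webdriver_manager understands).

import pytest


@pytest.fixture(params=["win32", "win64", "linux64", "mac64"])
def os_type(request):
    return request.param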
Example #20
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time

f = open("settings.txt", "r")

email = f.readline().rstrip('\n')
password = f.readline().rstrip('\n')
zipcode = f.readline().rstrip('\n')

driver = webdriver.Chrome(
    ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())

driver.maximize_window()  # Maximizes the browser window since Udemy has a responsive design and the code only works in the maximized layout


def getUdemyLink(url):
    response = requests.get(url=url)

    soup = BeautifulSoup(response.content, 'html.parser')

    linkForUdemy = soup.find('span',
                             class_="rh_button_wrapper").find('a').get('href')

    return linkForUdemy
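Illustrative only: how getUdemyLink might feed the Selenium session opened above. The coupon-page URL is a placeholder, and the enrolment steps that would follow are omitted.

udemy_link = getUdemyLink("https://example.com/free-course-coupon")  # placeholder URL
driver.get(udemy_link)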
Example #21
def test_chromium_manager_with_wrong_version():
    with pytest.raises(ValueError) as ex:
        ChromeDriverManager("0.2", chrome_type=ChromeType.CHROMIUM).install()
    assert "There is no such driver by url" in ex.value.args[0]
Example #22
    def __init__(self, urls):
        self.browser = webdriver.Chrome(ChromeDriverManager().install())
        self.browser.maximize_window()
        self.urls = urls
        self.sizes = [320, 480, 960, 1366, 1920]
Example #23
def test_chromium_manager_with_specific_version():
    bin_path = ChromeDriverManager("2.27",
                                   chrome_type=ChromeType.CHROMIUM).install()
    assert os.path.exists(bin_path)
Example #24
def webscr(url, start_time):
    options = Options()
    options.headless = False

    if options.headless:
        print("Headless Chrome Initialized on Linux")
        options.add_argument('--disable-gpu')
    print('headless')
    print(time.time() - start_time)

    # chrome_driver = r"C:\Users\User\Documents\virtual\webscsrape_indianoil\chromedriver.exe"

    # Opens url, options is for headless chrome
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    # driver = webdriver.Chrome(chrome_driver, options=options)
    driver.get(url)
    print('open url')
    print(time.time() - start_time)

    # Wait until all is loaded
    # cur = WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CLASS_NAME,'current-value')))

    print('webdriverwait')
    print(time.time() - start_time)
    # Grab the Contents
    content = driver.page_source
    soup = BeautifulSoup(content, features="html.parser")
    # print(soup.prettify())
    print('soup')
    print(time.time() - start_time)

    # Close the browser
    driver.close()
    print('driver close')
    print(time.time() - start_time)

    # Initialisation
    datey = date.today().strftime("%d/%b/%Y")  # Date of request
    print(datey)
    print(time.time() - start_time)

    # Scraping
    product_name = soup.find('h1', class_="module-pdp-title").get_text()
    print(product_name)
    print(time.time() - start_time)

    competitor_price = soup.find_all(class_="pre-inquiry-price")[-1].get_text()
    print(competitor_price)
    print(time.time() - start_time)

    # currency = soup.find_all('label',class_= "ellipsis")[-1].get_text().split(" - ")[-1]
    currency = 'USD'
    print(currency)
    print(time.time() - start_time)

    uom = soup.find_all(
        'span', class_="ma-quantity-range")[-1].get_text().split(" ")[-1]
    print(uom)

    comp_name = soup.find(
        'a', class_="company-name company-name-lite-vb").get_text()
    print(comp_name)

    for item in soup.find_all('dl', class_="do-entry-item"):
        #print(item.get_text())

        if 'Place of Origin:' in item.get_text():
            country = item.get_text().split(':')[-1].split(", ")[-1]
            print(country)

        elif 'Model Number:' in item.get_text():
            grade = item.get_text().split(':')[-1].replace(' grade', '')
            print(grade)

        elif 'Package:' in item.get_text():
            package = item.get_text().split(':')[-1].split('/')[0]
            print(package)

        elif 'Payment term:' in item.get_text():
            payment_term = item.get_text().split(':')[-1]
            print(payment_term)

        else:
            pass

    # Save to dataframe and excel
    # df = pd.DataFrame({
    #     'Product Name': [product_name],
    #     'Product Grade': [grade],
    #     'Company Name': [comp_name],
    #     'Country': [country],
    #     'Competitor price': [competitor_price],
    #     'Currency': [currency],
    #     'UOM': [uom],
    #     'Packaging': [package]
    #     })

    # df.to_excel('products1.xls', index=False, encoding='utf-8')
    # os.system("start EXCEL.EXE products1.xls")
    # df_json = json.loads(df.to_json(orient='records'))
    df_json = {
        'Product Name': [product_name],
        'Product Grade': [grade],
        'Company Name': [comp_name],
        'Country': [country],
        'Competitor price': [competitor_price],
        'Currency': [currency],
        'UOM': [uom],
        'Packaging': [package]
    }
    return df_json
Example #25
def main(url):

    # with open(filename, 'w', encoding = 'UTF-8') as f:
    #     f.close()
    cnt = 0
    friend_list = []
    friend_link_list = []
    try:
        with open(f'My LinkedIn Friends.txt', 'r', encoding='UTF-8') as f:
            temp_list = f.read().splitlines()
    except:
        with open(f'My LinkedIn Friends.txt', 'r') as f:
            temp_list = f.read().splitlines()

    # print(temp_list)
    for item in temp_list:
        # print(item)
        # print(re.split(r'\t+', item))
        friend_list.append(striplist(re.split(r'\t+', item)))
        # print(friend_list)
        # friend_list.append(item.split(r'\t'))
        link = friend_list[-1][1]
        friend_link_list.append(".com" + link[link.find('/in/'):])
        # print(friend_link_list)
    # friend_list = striplist(friend_list)
    # friend_link_list = striplist(friend_link_list)
    # print(friend_link_list)
    df = pd.DataFrame(columns=['Person', 'Link', 'Relation'])
    options = webdriver.ChromeOptions()
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # options.add_experimental_option("prefs", prefs)
    options.add_argument(
        '--ignore-certificate-errors')  #removes SSL errors from terminal
    options.add_experimental_option(
        "excludeSwitches",
        ["enable-logging"])  #removes device adapter errors from terminal
    options.add_argument(
        '--disable-web-security')  #removes SSL errors from terminal
    options.add_argument("--log-level=3")
    options.add_argument("--user-data-dir=chrome-data")
    # options.add_argument('--headless')
    # options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    driver.maximize_window()
    driver.get(url)
    try:
        # driver.maximize_window()
        SCROLL_PAUSE_TIME = 3
        while True:
            try:
                WebDriverWait(driver, 60).until( \
                        EC.presence_of_element_located((By.XPATH, '//*[@id="ember21"]')) \
                        )
                break
            except:
                pass
        # eleclick=WebDriverWait(driver, 60).until( \
        #         EC.presence_of_element_located((By.XPATH, '//a[contains(string(), " connections")]')) \
        #         )
        # driver.execute_script("arguments[0].click()",eleclick)
        # WebDriverWait(driver, 60).until( \
        #         EC.invisibility_of_element_located((By.XPATH, '//a[contains(string(), " connections")]')) \
        #         )

        eleclick=WebDriverWait(driver, 60).until( \
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a.app-aware-link.pv-highlight-entity__card-action-link')) \
                )
        count = ""
        try:
            count = driver.find_elements(
                By.CSS_SELECTOR,
                "a.app-aware-link.pv-highlight-entity__card-action-link"
            )[1].find_element(By.TAG_NAME,
                              "h3").text.replace(" mutual connections", "")
            print("Shared connections:" + count)
        # print(driver.find_elements(By.CSS_SELECTOR, "a.app-aware-link.pv-highlight-entity__card-action-link")[1].find_element(By.TAG_NAME,"h3").text.replace("mutual connections",""))
        except:
            count = "0"

        url = driver.current_url
        filename = url.split('/')[-2] + ".txt"
        if url.split('/')[-1] != '':
            filename = url.split('/')[-1] + ".txt"

        path = os.path.abspath(os.getcwd()) + "\\output"

        if os.path.exists(path) == False:
            os.mkdir(path)
        filename = path + "\\" + filename
        # print(filename)

        last_height = driver.execute_script(
            "return document.body.scrollHeight")

        while True:
            # Scroll down to bottom
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height
            new_height = driver.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        profile = ""
        profile_atract = ""
        try:
            profile_ele=WebDriverWait(driver, 15).until( \
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div.pv-profile-section-pager.ember-view')) \
                )
            profile_eles = driver.find_elements(By.TAG_NAME, "section")
            for ele in profile_eles:
                attr = ele.get_attribute("class")
                if "pv-profile-section" in attr:
                    if "pv-interests-section" in attr:
                        continue
                    profile += ele.text

            profile_atract = keyword_contain(profile)

            # if  "Harvard"  in profile:
            #     profile_atract +="Harvard "
            # if  "Stanford"  in profile:
            #     profile_atract +="Stanford "
            # if  "MIT "  in profile:
            #     profile_atract +="MIT "
            # if  "Brown University"  in profile:
            #     profile_atract +="Brown University "
            # if  "MIT Sloan"  in profile:
            #     profile_atract +="MIT Sloan "

        except:
            pass
        print("keywords:" + profile_atract)

        driver.execute_script("arguments[0].click()", eleclick)
        WebDriverWait(driver, 60).until( \
                EC.invisibility_of_element_located((By.CSS_SELECTOR, 'a.app-aware-link.pv-highlight-entity__card-action-link')) \
                )

        while True:

            time.sleep(2)
            # Get scroll height
            last_height = driver.execute_script(
                "return document.body.scrollHeight")

            while True:
                # Scroll down to bottom
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")

                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)

                # Calculate new scroll height and compare with last scroll height
                new_height = driver.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
            items_div = driver.find_elements_by_class_name(
                "entity-result__item")
            for item in items_div:
                href_str = item.find_element_by_tag_name('a').get_attribute(
                    'href') + "/"
                # print(href_str)
                href_str = ".com" + href_str[href_str.find('/in/'):]
                if href_str in friend_link_list:
                    index = friend_link_list.index(href_str)
                    print(index)
                    df.loc[cnt] = friend_list[index]
                    print("matching---")
                    # print(friend_list[index])
                    cnt += 1
                    df = df.sort_values('Relation')
                    # print(df)
                    save_text(df, filename)
                    # df.to_csv(filename, header=None, index=None, sep=' ', mode='w')
                    print(cnt)
                    append_write = ""

            try:
                nextclick = WebDriverWait(driver, 10).until( \
                        EC.presence_of_element_located((By.CSS_SELECTOR, 'button.artdeco-pagination__button.artdeco-pagination__button--next.artdeco-button.artdeco-button--muted.artdeco-button--icon-right.artdeco-button--1.artdeco-button--tertiary.ember-view')) \
                        )
            except:
                break
            try:
                if nextclick.get_attribute("disabled") == None:
                    nextclick.click()
                else:
                    break
            except:
                break
        if os.path.exists(filename):
            append_write = 'a'  # append if already exists
        else:
            append_write = 'w'  # make a new file if not
        with open(filename, append_write, encoding='UTF-8') as f:
            f.write("\nShared connections:" + count)
            f.write("\nKeywords:" + profile_atract)
        with open(filename, 'r', encoding='UTF-8') as f:
            print(f.read())
        driver.quit()
    except Exception as e:
        print(e)
        # print("Error!")
        driver.quit()

    print("Finish!")
Example #26
def chrome():
    return webdriver.Chrome(ChromeDriverManager('latest').install())
Example #27
# # print("Real Browser Launching")
# browser = webdriver.Chrome(ChromeDriverManager().install())
# # print("Real Browser has Launched")
"""
The Headless browsing option greatly reduces the amount of time it takes for the scraper to run.
"""
print("Headless Browser Running")
options = Options()
options.add_argument("--headless")  # Runs Chrome in headless mode.
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('--disable-gpu')  # applicable to windows os only
options.add_argument('start-maximized')  #
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
browser = webdriver.Chrome(chrome_options=options,
                           executable_path=ChromeDriverManager().install())
print("Headless Browser has Launched")


def login_into_dash(json_target_file):
    """
    Takes the login information from JSON file and passes data to login form.

    Parameter json_target_file needs to be equal to the file's location.

    Contents of the file must be organized as follows [Note: don't forget the curly braces]:
    
    {
    "username": "******",
    "password": "******"
    }
Example #28
def scrape_all():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path)

    # ### Visit the NASA Mars News Site

    # Visit the mars nasa news site
    url = 'https://data-class-mars.s3.amazonaws.com/Mars/index.html'
    browser.visit(url)

    # Optional delay for loading the page
    time.sleep(5)

    # Convert the browser html to a soup object and then quit the browser
    html = browser.html
    news_soup = soup(html, 'html.parser')
    print(news_soup)
    slide_elem = news_soup.select_one('div.list_text')

    # Use the parent element to find the first a tag and save it as `news_title`
    news_title = slide_elem.find('div', class_='content_title').get_text()

    # Use the parent element to find the paragraph text
    news_p = slide_elem.find('div', class_='article_teaser_body').get_text()

    # ### JPL Space Images Featured Image

    # Visit URL
    url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
    browser.visit(url)

    # Find and click the full image button
    full_image_elem = browser.find_by_tag('button')[1]
    full_image_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = soup(html, 'html.parser')
    
    # find the relative image url
    img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
    
    # Use the base url to create an absolute url
    img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'

    # ### Mars Facts

    df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]

    df.columns=['Description', 'Mars', 'Earth']
    df.set_index('Description', inplace=True)

    mars_facts = df.to_html()

    # # D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

    # ### Hemispheres

    # 1. Use browser to visit the URL 
    base_url = 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/'

    url = base_url + "index.html"
    browser.visit(url)

    # 2. Create a list to hold the images and titles.
    hemisphere_image_urls = []

    # 3. Write code to retrieve the image urls and titles for each hemisphere.
    html = browser.html
    hemisphere_soup = soup(html, 'html.parser')
    hemi_findall = hemisphere_soup.findAll("div",class_="description")

    #hemi = hemi_findall[0]
    pre_hemi_urls=[]

    for hemi in hemi_findall:
        hemifind = hemi.find("a")["href"]
        hemi_url = base_url + hemifind
        pre_hemi_urls.append(hemi_url)

    #hemi_url = pre_hemi_urls[0]
    for hemi_url in pre_hemi_urls:

        browser.visit(hemi_url)
        html = browser.html
        hemisphere_soup = soup(html, 'html.parser')
        hurl = base_url + hemisphere_soup.find("div", class_="downloads").find("a")["href"]
        hemisphere_image_urls.append({
            "img_url":hurl,
            "title": hemisphere_soup.find("h2", class_="title").text,
        })

    # 4. Quit the browser.
    browser.quit()

    # # merge all scraped data into one dictionary

    mars_data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image":img_url,
        "facts": mars_facts,
        "hemispheres": hemisphere_image_urls,
    }
    print(mars_data)
    return mars_data
Example #29
def bot():
    # lists initialized for storing data of employees
    names = []
    abouts = []
    expers = []
    locs = []
    titles = []

    # set paths for webdriver + initialize
    options = Options()
    options.add_argument('--incognito')
    options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    driver.maximize_window()

    # load into linkedin site
    driver.get(
        'https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin'
    )
    time.sleep(1)

    # enter login info
    user = driver.find_element_by_xpath(
        '/html/body/div/main/div[2]/form/div[1]/input')
    user.send_keys('EMAIL')  # REPLACE WITH USER EMAIL
    pswd = driver.find_element_by_xpath(
        '/html/body/div/main/div[2]/form/div[2]/input')
    pswd.send_keys('PASSWORD')  # REPLACE WITH USER PASSWORD

    # submit form, try catch because it was having issues finding the button by a single absolute path
    try:
        driver.find_element_by_xpath(
            '/html/body/div/main/div[2]/form/div[4]').click()
    except Exception:
        try:
            driver.find_element_by_xpath(
                '/html/body/div/main/div[2]/form/div[3]/button').click()
        except Exception:
            driver.find_element_by_xpath(
                '//*[@id="app__container"]/main/div[2]/form/div[4]/button'
            ).click()
            try:
                driver.find_element_by_link_text('Sign in').click()
            except Exception:
                quit()

    time.sleep(1)
    # try catch for mobile authentication, seems unnecessary as of now
    try:
        driver.find_element_by_xpath(
            '/html/body/div/div[1]/section/div[2]/div/article/footer/div/div/button'
        ).click()
    except Exception:
        print('No auth needed')

    # keep track of what page script is on
    page_tracker = 1

    # boolean li query, must first search in google then copy search address
    site = 'QUERY LINK AFTER SEARCHED'
    driver.get(site)

    # main loop for handling bot
    while True:
        # set the main window and initialize a random user agent to avoid captchas
        main_window = driver.current_window_handle
        link_counter = 0
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)

        time.sleep(1)

        # find all clickable links, the iterate though them
        data = driver.find_elements_by_partial_link_text('linkedin.com')

        for data[link_counter] in data:
            data = driver.find_elements_by_partial_link_text('linkedin.com')
            data[link_counter].send_keys(Keys.CONTROL +
                                         Keys.RETURN)  # open in new tab

            driver.switch_to.window(
                driver.window_handles[1])  # switch to new tab

            time.sleep(2)

            # Block to figure out their name: if both attempts fail, go to essentially end of task loop and resume
            # process or move on to next page
            try:
                con_name = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]'
                )
            except Exception:
                try:
                    con_name = driver.find_element_by_xpath(
                        '/html/body/div[8]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]'
                    )
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    continue

            # block to find location - if not specified or indistinguishable, leave empty and move on
            try:
                location = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[2]/li[1]'
                ).text
            except:
                location = ""

            # block to make sure this is a real person account and create an anchor for later (to a specific point on their profile page)
            try:
                head = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/header/h2'
                )
            except Exception:
                try:
                    head = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/header/h2'
                    )
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    break

            # block to expand the about section
            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[3]/span/a'
                ).click()
            except Exception:
                print('about expanded')

            # now, grab about text
            try:
                about = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[1]'
                )
                about = str(about.text)
            except Exception:
                about = "EMPTY"

            # move to head anchor
            actions = ActionChains(driver)
            actions.move_to_element(head).perform()

            # block to grab their title
            try:
                title = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/a/div[2]/h3'
                )
                title = title.text
            except Exception:
                try:
                    title = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/ul/li[1]/section/ul/li[1]/div/div/div/div/div/div/h3/span[2]'
                    )
                    title = title.text
                except Exception:
                    print('Now what...')

            # block to expand experience tab
            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p/span/button'
                ).click()
            except Exception:
                print('No see more button')

            # now, try to grab the experience text
            try:
                experience = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p'
                )
                experience = experience.text
            except Exception:
                print('No experience for this chum')
                experience = ""

            # convert to string for data normalization
            con_name = str(con_name.text)

            # print fields for testing, then append them to the lists
            print(con_name)
            print(location)
            print(title)
            print(about)
            print(experience)

            expers.append(experience)
            abouts.append(about)
            names.append(con_name)
            locs.append(location)
            titles.append(title)

            # initialize new user agent for anonymity, then go back to main window and close any extra tabs
            ua = UserAgent()
            userAgent = ua.random
            print(userAgent)
            driver.close()
            driver.switch_to.window(main_window)
            link_counter += 1
            time.sleep(1)

        # block to continue to next page on google search, < num pages to go through. Default cap is 25, change here if less is desired
        if page_tracker < 25:
            time.sleep(.75 * 60)  # sleep time, helps avoid google captchas
            driver.find_element_by_xpath(
                '/html/body/div[8]/div[2]/div[10]/div[1]/div[2]/div/div[5]/div[2]/span[1]/div/table/tbody/tr/td[12]/a/span[2]'
            ).click()
            page_tracker += 1
            time.sleep(1)
            main_window = driver.current_window_handle
        else:
            driver.quit()
            break

    # convert lists to pandas series so they can be placed in a dataframe for storage
    names_ser = pd.Series(names)
    exp_ser = pd.Series(expers)
    about_ser = pd.Series(abouts)
    locs_ser = pd.Series(locs)
    titles_ser = pd.Series(titles)

    frame = {
        'Name': names_ser,
        'Description/Bio': about_ser,
        'Location': locs_ser,
        'Title': titles_ser,
        'Expertise (subj matter)': exp_ser
    }
    final = pd.DataFrame(frame)
    final.to_csv('DESIRED NAME.csv')  # specify file name here

    time.sleep(10)

    # sleep for 10 seconds then continue on to next batch of people. If no other queries are necessary, comment out the next blocks of code.

    names = []
    abouts = []
    expers = []
    locs = []
    titles = []
    # set paths for webdriver + initialize
    options = Options()
    options.add_argument('--incognito')
    options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
    driver_path = "C:/Users/Matt Turi/Downloads/chromedriver_win32/chromedriver.exe"
    driver = webdriver.Chrome(options=options, executable_path=driver_path)

    driver.maximize_window()

    driver.get(
        'https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin'
    )
    time.sleep(1)

    # enter login info
    user = driver.find_element_by_xpath(
        '/html/body/div/main/div[2]/form/div[1]/input')
    user.send_keys('EMAIL')
    pswd = driver.find_element_by_xpath(
        '/html/body/div/main/div[2]/form/div[2]/input')
    pswd.send_keys('PASSWORD')

    # submit form, try catch because it was having issues finding the button by a single absolute path
    try:
        driver.find_element_by_xpath(
            '/html/body/div/main/div[2]/form/div[4]').click()
    except Exception:
        try:
            driver.find_element_by_xpath(
                '/html/body/div/main/div[2]/form/div[3]/button').click()
        except Exception:
            driver.find_element_by_xpath(
                '//*[@id="app__container"]/main/div[2]/form/div[4]/button'
            ).click()
            try:
                driver.find_element_by_link_text('Sign in').click()
            except Exception:
                quit()

    time.sleep(1)
    # try catch for mobile authentication, seems unnecessary as of now
    try:
        driver.find_element_by_xpath(
            '/html/body/div/div[1]/section/div[2]/div/article/footer/div/div/button'
        ).click()
    except Exception:
        print('No auth needed')

    page_tracker = 1

    # boolean li query, must first search in google then copy search address
    site = 'GOOGLE QUERY'
    driver.get(site)

    # main loop for handling bot
    while True:
        main_window = driver.current_window_handle
        link_counter = 0
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)

        time.sleep(1)

        data = driver.find_elements_by_partial_link_text('linkedin.com')

        for data[link_counter] in data:
            data = driver.find_elements_by_partial_link_text('linkedin.com')
            data[link_counter].send_keys(Keys.CONTROL + Keys.RETURN)

            driver.switch_to.window(driver.window_handles[1])

            time.sleep(2)

            # Block to figure out their name: if both attempts fail, go to essentially end of task loop and resume
            # process or move on to next page
            try:
                con_name = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]'
                )
            except Exception:
                try:
                    con_name = driver.find_element_by_xpath(
                        '/html/body/div[8]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]'
                    )
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    continue

            try:
                location = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[2]/li[1]'
                ).text
            except:
                location = ""

            try:
                head = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/header/h2'
                )
            except Exception:
                # driver.execute_script("window.scrollTo(0, 400)")
                try:
                    head = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/header/h2'
                    )
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    break

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[3]/span/a'
                ).click()
            except Exception:
                print('about expanded')
            try:
                about = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[1]'
                )
                about = str(about.text)
            except Exception:
                about = "EMPTY"

            actions = ActionChains(driver)
            actions.move_to_element(head).perform()

            try:
                title = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/a/div[2]/h3'
                )
                title = title.text
            except Exception:
                try:
                    title = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/ul/li[1]/section/ul/li[1]/div/div/div/div/div/div/h3/span[2]'
                    )
                    title = title.text
                except Exception:
                    print('Now what...')

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p/span/button'
                ).click()
            except Exception:
                print('No see more button')
            try:
                experience = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p'
                )
                experience = experience.text
            except Exception:
                print('No experience for this chum')
                experience = ""

            con_name = str(con_name.text)

            print(experience)
            print(about)
            print(con_name)

            expers.append(experience)
            abouts.append(about)
            names.append(con_name)
            locs.append(location)
            titles.append(title)

            ua = UserAgent()
            userAgent = ua.random
            print(userAgent)
            driver.close()
            driver.switch_to.window(main_window)
            link_counter += 1
            time.sleep(1)

        # Block to advance to the next page of Google search results; the comparison
        # below caps how many pages are crawled. The cap here is 25; lower it if fewer pages are desired.
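        # Sketch of a less brittle way to reach the "Next" control than the long
        # absolute XPath below (an assumption about Google's results page, not taken
        # from this script): the next-page link usually carries the id "pnnext".
        # driver.find_element_by_css_selector('#pnnext').click()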
        if page_tracker < 25:
            time.sleep(.75 * 60)
            driver.find_element_by_xpath(
                '/html/body/div[8]/div[2]/div[10]/div[1]/div[2]/div/div[5]/div[2]/span[1]/div/table/tbody/tr/td[12]/a/span[2]'
            ).click()
            page_tracker += 1
            time.sleep(1)
            main_window = driver.current_window_handle
        else:
            driver.quit()
            break

    names_ser = pd.Series(names)
    exp_ser = pd.Series(expers)
    about_ser = pd.Series(abouts)
    locs_ser = pd.Series(locs)
    titles_ser = pd.Series(titles)

    frame = {
        'Name': names_ser,
        'Description/Bio': about_ser,
        'Location': locs_ser,
        'Title': titles_ser,
        'Expertise (subj matter)': exp_ser
    }
    final = pd.DataFrame(frame)
    final.to_csv('NAME HERE.csv')

    time.sleep(10)
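    # The loops in this script resolve every field by nesting try/except blocks
    # over alternate absolute XPaths. A minimal sketch of a helper that tries a
    # list of XPaths and returns the first match, or None (the name
    # find_first_by_xpath is illustrative, not part of the original script):
    def find_first_by_xpath(drv, xpaths):
        for xp in xpaths:
            try:
                return drv.find_element_by_xpath(xp)
            except Exception:
                continue
        return None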

    names = []
    abouts = []
    expers = []
    locs = []
    titles = []
    # set paths for webdriver + initialize
    options = Options()
    options.add_argument('--incognito')
    options.binary_location = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
    driver_path = "C:/Users/Matt Turi/Downloads/chromedriver_win32/chromedriver.exe"
    driver = webdriver.Chrome(options=options, executable_path=driver_path)
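    # Hedged alternative to the hard-coded driver_path above (assumes the
    # webdriver_manager package is installed and ChromeDriverManager is imported,
    # as in the other examples): let it resolve a matching chromedriver binary.
    # driver = webdriver.Chrome(options=options,
    #                           executable_path=ChromeDriverManager().install())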

    driver.maximize_window()

    driver.get(
        'https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin'
    )
    time.sleep(1)

    # enter login info
    user = driver.find_element_by_xpath(
        '/html/body/div/main/div[2]/form/div[1]/input')
    user.send_keys('EMAIL')
    pswd = driver.find_element_by_xpath(
        '/html/body/div/main/div[2]/form/div[2]/input')
    pswd.send_keys('PASSWORD')

    # Submit the login form; wrapped in try/except because the button could not be
    # located reliably through a single absolute XPath.
    try:
        driver.find_element_by_xpath(
            '/html/body/div/main/div[2]/form/div[4]').click()
    except Exception:
        try:
            driver.find_element_by_xpath(
                '/html/body/div/main/div[2]/form/div[3]/button').click()
        except Exception:
            driver.find_element_by_xpath(
                '//*[@id="app__container"]/main/div[2]/form/div[4]/button'
            ).click()
            try:
                driver.find_element_by_link_text('Sign in').click()
            except Exception:
                quit()

    time.sleep(1)
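    # Sketch of a sturdier wait than the fixed sleep above (assumes WebDriverWait,
    # expected_conditions as EC, and By are imported, as in the other examples;
    # the "global-nav" id is an assumption about LinkedIn's post-login page,
    # not taken from this script):
    # WebDriverWait(driver, 15).until(
    #     EC.presence_of_element_located((By.ID, 'global-nav')))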
    # try/except for the mobile-authentication prompt; it appears to be unnecessary at the moment
    try:
        driver.find_element_by_xpath(
            '/html/body/div/div[1]/section/div[2]/div/article/footer/div/div/button'
        ).click()
    except Exception:
        print('No auth needed')

    page_tracker = 1

    # Boolean Google query for LinkedIn profiles: run the search in Google first,
    # then paste the results-page URL into site below.
    site = ''
    driver.get(site)

    # main loop for handling bot
    while True:
        main_window = driver.current_window_handle
        link_counter = 0
        ua = UserAgent()
        userAgent = ua.random
        print(userAgent)
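        # Note: the random user agent is only printed; Selenium does not apply it
        # automatically. A sketch of how it could be applied, before the driver is
        # created, via ChromeOptions (shown only as an illustration):
        # options.add_argument(f'--user-agent={userAgent}')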

        time.sleep(1)

        data = driver.find_elements_by_partial_link_text('linkedin.com')

        # Unusual loop target: each iteration rebinds data[link_counter], but data
        # is immediately re-fetched below and indexed by link_counter, which is
        # incremented at the end of every pass; in effect the loop simply runs once
        # per result link found on the page.
        for data[link_counter] in data:
            data = driver.find_elements_by_partial_link_text('linkedin.com')
            data[link_counter].send_keys(Keys.CONTROL + Keys.RETURN)

            driver.switch_to.window(driver.window_handles[1])

            time.sleep(2)

            # Block to extract the contact's name: if both XPath attempts fail,
            # close the tab and continue with the next link (or the next page).
            try:
                con_name = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]'
                )
            except Exception:
                try:
                    con_name = driver.find_element_by_xpath(
                        '/html/body/div[8]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[1]/li[1]'
                    )
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    continue

            try:
                location = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[1]/section/div[2]/div[2]/div[1]/ul[2]/li[1]'
                ).text
            except Exception:
                location = ""

            try:
                head = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/header/h2'
                )
            except Exception:
                # driver.execute_script("window.scrollTo(0, 400)")
                try:
                    head = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/header/h2'
                    )
                except Exception:
                    ua = UserAgent()
                    userAgent = ua.random
                    print(userAgent)
                    driver.close()
                    driver.switch_to.window(main_window)
                    link_counter += 1
                    time.sleep(1)
                    break

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[3]/span/a'
                ).click()
            except Exception:
                print('about expanded')
            try:
                about = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[3]/section/p/span[1]'
                )
                about = str(about.text)
            except Exception:
                about = "EMPTY"

            actions = ActionChains(driver)
            actions.move_to_element(head).perform()

            try:
                title = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/a/div[2]/h3'
                )
                title = title.text
            except Exception:
                try:
                    title = driver.find_element_by_xpath(
                        '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[6]/span/div/section/div[1]/section/ul/li[1]/section/ul/li[1]/div/div/div/div/div/div/h3/span[2]'
                    )
                    title = title.text
                except Exception:
                    print('Now what...')
                    # avoid a stale or undefined title if both lookups fail
                    title = ""

            try:
                driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p/span/button'
                ).click()
            except Exception:
                print('No see more button')
            try:
                experience = driver.find_element_by_xpath(
                    '/html/body/div[7]/div[3]/div/div/div/div/div[2]/main/div[2]/div[5]/span/div/section/div[1]/section/ul/li[1]/section/div/div/div/p'
                )
                experience = experience.text
            except Exception:
                print('No experience for this chum')
                experience = ""

            con_name = str(con_name.text)

            print(experience)
            print(about)
            print(con_name)

            expers.append(experience)
            abouts.append(about)
            names.append(con_name)
            locs.append(location)
            titles.append(title)

            ua = UserAgent()
            userAgent = ua.random
            print(userAgent)
            driver.close()
            driver.switch_to.window(main_window)
            link_counter += 1
            time.sleep(1)

        # Block to advance to the next page of Google search results; the comparison
        # below caps how many pages are crawled. The cap here is 3; raise or lower it as needed.
        if page_tracker < 3:
            time.sleep(.75 * 60)
            driver.find_element_by_xpath(
                '/html/body/div[8]/div[2]/div[10]/div[1]/div[2]/div/div[5]/div[2]/span[1]/div/table/tbody/tr/td[5]/a/span[2]'
            ).click()
            page_tracker += 1
            time.sleep(1)
            main_window = driver.current_window_handle
        else:
            driver.quit()
            break

    names_ser = pd.Series(names)
    exp_ser = pd.Series(expers)
    about_ser = pd.Series(abouts)
    locs_ser = pd.Series(locs)
    titles_ser = pd.Series(titles)

    frame = {
        'Name': names_ser,
        'Description/Bio': about_ser,
        'Location': locs_ser,
        'Title': titles_ser,
        'Expertise (subj matter)': exp_ser
    }
    final = pd.DataFrame(frame)
    final.to_csv('FILE NAME HERE.csv')
    def setUpClass(cls):
        cls.driver = webdriver.Chrome(ChromeDriverManager().install())
        cls.driver.implicitly_wait(10)
        cls.driver.maximize_window()