import datetime
import fnmatch
import os
import re
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

from scrapy.util import wait_element


def scrapy_books(url):
    # Option 1) Install the Chrome driver for Selenium.
    # Option 1 does not work on some computers.
    # You will know it worked if a blank Chrome window appears.
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # Option 2) Use the Firefox driver, geckodriver.
    # To install it on Windows:
    # 1) Download the Windows file (32 or 64 bits) from https://github.com/mozilla/geckodriver/releases
    # 2) Unzip the file;
    # 3) Add the folder containing the executable to the PATH environment variable.
    #    See how to do that at https://knowledge.autodesk.com/pt-br/support/navisworks-products/troubleshooting/caas/sfdcarticles/sfdcarticles/PTB/Adding-folder-path-to-Windows-PATH-environment-variable.html
    # driver = webdriver.Firefox()
    driver.get(url)
    whole_dataset = []
    while True:
        # Scrape the current listing page, then follow the "next" link
        # until it no longer exists (last page).
        current_items = get_page_data(driver)
        whole_dataset.extend(current_items)
        try:
            next_button = driver.find_element_by_xpath('//li[@class="next"]/a')
        except NoSuchElementException:
            break
        next_button.click()
        wait_element(driver, '//img[@class="thumbnail"]', by=By.XPATH)
    driver.close()
    return whole_dataset
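# `wait_element` (imported from scrapy.util) is used throughout but its body
# is not shown in this excerpt. A minimal sketch of what it is assumed to do,
# built on Selenium's explicit waits; the `to_sleep` parameter is inferred
# from how the helper is called in scrapy_students below.
from time import sleep

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_element(driver, locator, by=By.XPATH, timeout=30, to_sleep=0):
    # Block until at least one element matching the locator is present,
    # then optionally give dynamic content a few extra seconds to render.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((by, locator)))
    if to_sleep:
        sleep(to_sleep)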
# slack_channel_url, path_to_firefoxdriver and path_to_chromedriver are
# expected to be defined at module level (local configuration).
def scrapy_students(user, password, url=slack_channel_url, exclude_list=None,
                    no_window=True):
    import os.path
    if exclude_list is None:
        exclude_list = []
    # Prefer Firefox if geckodriver is available, otherwise fall back to Chrome.
    if os.path.isfile(path_to_firefoxdriver):
        options = webdriver.FirefoxOptions()
        if no_window:
            options.add_argument('-headless')
        driver = webdriver.Firefox(firefox_options=options,
                                   executable_path=path_to_firefoxdriver)
    elif os.path.isfile(path_to_chromedriver):
        options = webdriver.ChromeOptions()
        if no_window:
            options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=options,
                                  executable_path=path_to_chromedriver)
    else:
        print("error: no driver found")
        return None
    driver.get(url)
    # Log in to Slack with the given credentials.
    email_input = driver.find_element_by_xpath('//input[@id="email"]')
    email_input.send_keys(user)
    password_input = driver.find_element_by_xpath('//input[@id="password"]')
    password_input.send_keys(password)
    button = driver.find_element_by_xpath('//button[@id="signin_btn"]')
    button.click()
    wait_element(driver, '//a[@data-qa-channel-sidebar-channel-type="im"]',
                 by=By.XPATH, to_sleep=5)
    tree_items = driver.find_elements_by_xpath(
        '//a[@data-qa-channel-sidebar-channel-type="im"]')
    students = []
    # Note: this is actually tomorrow's date, used as the column label.
    today = (datetime.datetime.today()
             + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    for i in tree_items:
        name = i.text.replace('(you)', '')
        # The second <i> icon in each direct-message entry carries the
        # presence status in its title attribute.
        if i.find_elements_by_tag_name('i')[1].get_attribute(
                'title') == "Active":
            students.append({'nome': name, today: 'Ativo'})
        else:
            students.append({'nome': name, today: 'Inativo'})
    students = [s for s in students if s['nome'] not in exclude_list]
    driver.close()
    return students
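# A hedged usage sketch for scrapy_students: the credentials and exclude list
# below are placeholders, and pandas is just one convenient way to persist
# the per-day activity records.
if __name__ == '__main__':
    import pandas as pd

    students = scrapy_students('me@example.com', 'my_password',
                               exclude_list=['Slackbot'])
    pd.DataFrame(students).to_csv('students_activity.csv', index=False)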
# Firefox-only variant of scrapy_books above, without the driver-setup notes.
def scrapy_books(url):
    driver = webdriver.Firefox()
    driver.get(url)
    whole_dataset = []
    while True:
        current_items = get_page_data(driver)
        whole_dataset.extend(current_items)
        try:
            next_button = driver.find_element_by_xpath('//li[@class="next"]/a')
        except NoSuchElementException:
            break
        next_button.click()
        wait_element(driver, '//img[@class="thumbnail"]', by=By.XPATH)
    driver.close()
    return whole_dataset
def get_page_data(driver):
    # Collect the detail-page link of every book listed on the current page.
    a_tags = driver.find_elements_by_xpath('//article/h3/a')
    dataset = []
    # A second browser visits each detail page so the listing page keeps
    # its state in the main driver.
    current_page_driver = webdriver.Firefox()
    for a in a_tags:
        href = a.get_attribute('href')
        current_page_driver.get(href)
        wait_element(current_page_driver, '//tr/td', by=By.XPATH)
        product_main = current_page_driver.find_element_by_xpath(
            '//div[contains(@class,"product_main")]')
        product_main_text = product_main.text.split('\n')
        title = product_main_text[0]
        price = product_main_text[1]
        stock = re.findall(r'\d+', product_main_text[2])
        product_main_ps = product_main.find_elements_by_tag_name('p')
        # Filled rating stars are painted with this RGB color, so counting
        # matching <i> elements gives the star rating.
        stars_colors = [
            x.value_of_css_property("color")
            for x in product_main_ps[2].find_elements_by_tag_name('i')
        ]
        stars = stars_colors.count('rgb(230, 206, 49)')
        description = current_page_driver.find_element_by_xpath(
            '//article/p').text
        # The product information table rows: UPC, type, prices, tax,
        # availability and number of reviews, in that order.
        tds = current_page_driver.find_elements_by_xpath('//tr/td')
        upc = tds[0].text
        book_type = tds[1].text
        price_exc_tax = tds[2].text
        price_inc_tax = tds[3].text
        tax = tds[4].text
        nreviews = tds[6].text
        record = {
            'title': title,
            'price': price,
            'stars': stars,
            'description': description,
            'stock': stock,
            'upc': upc,
            'type': book_type,
            'price_exc_tax': price_exc_tax,
            'price_inc_tax': price_inc_tax,
            'tax': tax,
            'nreviews': nreviews
        }
        dataset.append(record)
    current_page_driver.close()
    return dataset
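# Each record returned by get_page_data is a flat dict, so a full scrape loads
# straight into tabular form. A sketch: the URL below is the site these
# selectors match (books.toscrape.com), and pandas is assumed available.
import pandas as pd


def save_books(path='books.csv'):
    dataset = scrapy_books('http://books.toscrape.com')
    pd.DataFrame(dataset).to_csv(path, index=False)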
def scrapy_datasus(value, download_path):
    url = 'http://tabnet.datasus.gov.br/cgi/deftohtm.exe?sih/cnv/qiuf.def'
    # Firefox alternative (the download directory would have to be set
    # through profile preferences):
    # driver = webdriver.Firefox()
    options = webdriver.ChromeOptions()
    # Set the download path through Chrome preferences; passing it as a
    # command-line argument does not work.
    options.add_experimental_option(
        'prefs', {'download.default_directory': download_path})
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    # Pick the third "Linha" option.
    driver.find_elements_by_xpath("//select[@id='L']/option")[2].click()
    # Select all "Conteúdo" options.
    options_I = driver.find_elements_by_xpath("//select[@id='I']/option")
    options_I[0].click()
    for o in options_I:
        o.click()
    # Expand the S4 filter and select the requested value.
    driver.find_element_by_xpath("//label[@for='S4']").find_element_by_xpath(
        '../img').click()
    driver.find_element_by_xpath(
        "//select[@id='S4']/option[@value='{0}']".format(value)).click()
    options_A = driver.find_elements_by_xpath("//select[@id='A']/option")
    n_months = len(options_A)
    options_A[0].click()
    # Download one CSV per available period, returning to the form after each
    # download and re-reading the (now stale) period options.
    for n in range(n_months):
        if n > 0:
            options_A[n - 1].click()  # deselect the previous period
        options_A[n].click()
        # Choose CSV output and submit the form.
        driver.find_elements_by_xpath("//input[@id='F']")[1].click()
        driver.find_element_by_xpath("//input[@type='submit']").click()
        wait_element(driver, '//tr/td/a', by=By.XPATH)
        sleep(2)
        buttons = driver.find_elements_by_xpath("//tr/td/a")
        current_nfiles = len(
            fnmatch.filter(os.listdir(download_path), '*.csv'))
        # The first link downloads the CSV; the last one goes back to the form.
        buttons[0].click()
        wait_download(download_path, 'csv', current_nfiles)
        sleep(1)
        buttons[-1].click()
        wait_element(driver, "//select[@id='A']/option", by=By.XPATH)
        options_A = driver.find_elements_by_xpath("//select[@id='A']/option")
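# `wait_download` is called above but not defined in this excerpt. A minimal
# sketch, assuming it polls the download directory until the number of files
# with the given extension grows past the count taken before the click:
import fnmatch
import os
from time import sleep


def wait_download(download_path, extension, previous_nfiles, timeout=60):
    # Poll once per second until a new *.extension file appears or we time out.
    for _ in range(timeout):
        nfiles = len(fnmatch.filter(os.listdir(download_path),
                                    '*.{0}'.format(extension)))
        if nfiles > previous_nfiles:
            return True
        sleep(1)
    return False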
def scrapy_forbes(url):
    driver = webdriver.Firefox()
    driver.get(url)
    # Wait for the page to load, using the ad overlay's ID as the signal.
    wait_element(driver, 'piano-wrapper', by=By.ID)
    # Remove the ad element overlaid on the page.
    remove_element(driver, driver.find_element_by_id('piano-wrapper'))
    # Change the pagination to 100 rows per page.
    wait_element(driver, '//option[@value="100"]', by=By.XPATH)
    o100 = driver.find_element_by_xpath('//option[@value="100"]')
    o100.click()
    whole_dataset = []
    while True:
        current_items = get_page_data(driver)
        whole_dataset.extend(current_items)
        try:
            # A disabled "next" button means the last page was reached.
            driver.find_element_by_xpath(
                '//div[@class="-next"]/button[@disabled]')
            break
        except NoSuchElementException:
            pass
        next_button = driver.find_element_by_xpath(
            '//div[@class="-next"]/button')
        next_button.click()
        wait_element(driver, '//div[@role="row"]', by=By.XPATH)
    driver.close()
    return whole_dataset
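# `remove_element` is used above to drop the overlay ad; Selenium has no
# native node-removal API, so the helper presumably goes through JavaScript.
# A minimal sketch of that assumption:
def remove_element(driver, element):
    # Detach the element from the DOM so it no longer intercepts clicks.
    driver.execute_script(
        "arguments[0].parentNode.removeChild(arguments[0]);", element)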
import datetime
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

from scrapy.auth_data import user_github, password_github
from scrapy.util import wait_element

url = 'https://jupyter.enap.gov.br/'
driver = webdriver.Firefox()
driver.get(url)
# Sign in to JupyterHub through the GitHub login form.
driver.find_element_by_xpath('//div[@class="service-login"]').click()
driver.find_element_by_xpath('//input[@id="login_field"]').send_keys(
    user_github)
driver.find_element_by_xpath('//input[@id="password"]').send_keys(
    password_github)
button_signin = driver.find_element_by_xpath('//input[@type="submit"]')
button_signin.click()

url_aula1 = 'https://jupyter.enap.gov.br/user/alexlopespereira/notebooks/bootcamp/Aula2/Aula2_Exercicios.ipynb'
wait_element(driver, '//input[@type="checkbox"]', by=By.XPATH)
driver.get(url_aula1)
wait_element(driver, '//div[@class="input_area"]', by=By.XPATH)
div_area = driver.find_elements_by_xpath('//div[@class="prompt_container"]')[0]
try:
    # Re-run the first cell every five minutes to keep the session alive.
    while True:
        div_area.click()
        driver.find_element_by_xpath('//button[@title="Run"]').click()
        sleep(5 * 60)
        print('running now at {0}'.format(datetime.datetime.now()))
        with open("./log.txt", "a") as file_object:
            file_object.write('running now at {0}\n'.format(
                datetime.datetime.now()))
finally:
    # The loop only ends on an error or interrupt; close the browser then.
    driver.close()
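# To run this keep-alive script unattended on a server, the same flow should
# work with a headless Firefox; a sketch of that assumption (Selenium 3 era
# keyword arguments, matching the style used in scrapy_students):
# options = webdriver.FirefoxOptions()
# options.add_argument('-headless')
# driver = webdriver.Firefox(firefox_options=options)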