def makecafe24Modify():
    options = webdriver.ChromeOptions()
    # options.add_argument('headless')
    driver = webdriver.Chrome(chromedriver_path, chrome_options=options)
    # label.config(text="btn2, Clicked!")
    href = "https://eclogin.cafe24.com/Shop/"
    # chromedriver_path = 'C:/python-program/webcrawling_1230-r2/chromedriver'
    # driver = webdriver.Chrome(chromedriver_path)
    driver.get(href)
    time.sleep(2)
    # Enter the Cafe24 ID / password
    driver.find_element_by_id('mall_id').send_keys('kkwjkd')
    driver.find_element_by_id('userpasswd').send_keys('zmffldh123')
    driver.find_element_by_class_name('btnSubmit').click()
    time.sleep(7)
    yield "data:10\n\n"
    # Dismiss the popup
    try:
        driver.find_element_by_xpath(
            '//*[@id="admngLayerWrapper29"]/form/div/button').click()
        time.sleep(2)
    except:
        pass
    # Click the product-management button
    driver.find_element_by_xpath('//*[@id="QA_Gnb_product2"]').click()
    time.sleep(2)
    # Click the inventory-management buttons
    driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2049"]').click()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2050"]').click()
    time.sleep(2)
    # Set the category to cosmetics
    driver.find_element_by_xpath('//*[@id="eCategory1"]/option[2]').click()
    time.sleep(2)
    # Check "include subcategories" in the search
    driver.find_element_by_xpath(
        '//*[@id="submitSearchBox"]/table/tbody/tr[2]/td/div/span/label[1]/input'
    ).click()
    time.sleep(1)
    # Open the detailed search panel
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[4]/div/span/button').click()
    time.sleep(1)
    # Check "inventory management not used"
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[3]/input'
    ).click()
    time.sleep(1)
    # Select "view 100 per page"
    driver.find_element_by_xpath(
        '//*[@id="QA_list2"]/div[2]/div[2]/select[2]/option[5]').click()
    time.sleep(2)
    # Click the search button
    driver.find_element_by_xpath('//*[@id="eBtnSearch"]/span').click()
    time.sleep(2)
    yield "data:30\n\n"
    # Loop: switch "not used" to "used" and enable the sold-out option
    i = 1
    while True:
        try:
            # paging = driver.find_element_by_xpath('//*[@id="QA_list2"]/div[6]/ol/li['+str(i)+']')
            # print(paging.text)
            # Tick the select-all checkbox
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[5]/input'
            ).click()
            # Click bulk inventory settings
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[3]/div[1]/a[1]').click()
            # In the popup that opens, select "use"
            driver.find_element_by_xpath(
                '//*[@id="eManageStockBatchForm"]/table/tbody/tr[1]/td[1]/select/option[1]'
            ).click()
            # Check "allow sold-out"
            driver.find_element_by_xpath(
                '//*[@id="eManageStockBatchForm"]/table/tbody/tr[1]/td[6]/input'
            ).click()
            # Click the "selection complete" button
            driver.find_element_by_xpath(
                '//*[@id="layerBatchSet"]/div[2]/a[1]').click()
            time.sleep(2)
            # Accept the confirmation alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            # Check the alert text
            # poptext = driver.switch_to_alert().text
            # if poptext == "처리할 품목이 없습니다.":
            #     break
            # Accept the completion alert
            driver.switch_to_alert().accept()
            time.sleep(3)
            continue
            # i += 1
            # if i == 11:
            #     driver.find_element_by_css_selector('#QA_list2 > div.mPaginate > a.next').click()
            #     i = 1
            #     continue
        except:
            break
    # After switching everything to "used", click close
    driver.find_element_by_xpath(
        '//*[@id="layerBatchSet"]/div[2]/a[2]/span').click()
    time.sleep(3)
    # In the detailed search panel, check "inventory used"
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[2]/input'
    ).click()
    time.sleep(1)
    # Stock range: from 0
    driver.find_element_by_class_name('fText.right.eSearchText').send_keys('0')
    time.sleep(1)
    # Stock range: up to 0
    driver.find_element_by_css_selector(
        '#eSearchFormStock > li > input:nth-child(3)').send_keys('0')
    time.sleep(1)
    # Check sales status "for sale"
    driver.find_element_by_xpath(
        '//*[@id="submitSearchBox"]/table/tbody/tr[4]/td[2]/label[2]/input'
    ).click()
    time.sleep(1)
    # Check display status "displayed"
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[3]/table/tbody/tr[5]/td[1]/label[2]/input'
    ).click()
    time.sleep(1)
    # Click the search button
    driver.find_element_by_xpath('//*[@id="eBtnSearch"]').click()
    yield "data:60\n\n"
    # Loop: set zero-stock products to "inventory management not used"
    while True:
        try:
            # Tick the select-all checkbox
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[5]/input'
            ).click()
            time.sleep(2)
            # Click bulk inventory settings
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[3]/div[1]/a[1]').click()
            time.sleep(2)
            # Select "inventory management not used"
            driver.find_element_by_xpath(
                '//*[@id="eManageStockBatchForm"]/table/tbody/tr[1]/td[1]/select/option[2]'
            ).click()
            time.sleep(2)
            # Check the display toggle
            driver.find_element_by_xpath(
                '//*[@id="eManageStockBatchForm"]/table/tbody/tr[2]/td/div/table/tbody/tr[1]/th/label/input'
            ).click()
            time.sleep(1)
            # Check the sales toggle
            driver.find_element_by_xpath(
                '//*[@id="eManageStockBatchForm"]/table/tbody/tr[2]/td/div/table/tbody/tr[2]/th/label/input'
            ).click()
            time.sleep(1)
            # Click the confirm button
            driver.find_element_by_xpath(
                '//*[@id="layerBatchSet"]/div[2]/a[1]/span').click()
            time.sleep(2)
            # Accept the confirmation alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            # Accept the completion alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            time.sleep(2)
            continue
        except:
            break
    # After switching everything to "not used", click close
    driver.find_element_by_xpath(
        '//*[@id="layerBatchSet"]/div[2]/a[2]/span').click()
    time.sleep(3)
    try:
        # Dismiss the popup
        driver.find_element_by_xpath(
            '//*[@id="layerBatchSet"]/div[2]/a[2]').click()
        time.sleep(1)
    except:
        pass
    # Mark all zero-stock products as sold out
    yield "data:80\n\n"
    # Click the product-management menu
    driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2036"]').click()
    time.sleep(1)
    # Click the product-list menu
    driver.find_element_by_xpath('//*[@id="QA_Lnb_Menu2037"]').click()
    time.sleep(1)
    # Click detailed search
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[4]/div/span/button').click()
    time.sleep(2)
    # Set the category to cosmetics
    driver.find_element_by_xpath('//*[@id="eCategory1"]/option[2]').click()
    time.sleep(2)
    # Check "include subcategories" in the search
    driver.find_element_by_xpath(
        '//*[@id="submitSearchBox"]/table/tbody/tr[3]/td/div/span/label[1]/input'
    ).click()
    time.sleep(1)
    # Check "inventory management not used"
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[3]/input'
    ).click()
    time.sleep(2)
    # Check sales status "for sale"
    driver.find_element_by_xpath(
        '//*[@id="submitSearchBox"]/table/tbody/tr[5]/td[2]/label[2]/input'
    ).click()
    time.sleep(2)
    # Show 100 products per page
    driver.find_element_by_xpath(
        '//*[@id="QA_list2"]/div[2]/div[2]/select[2]/option[5]').click()
    time.sleep(2)
    # Click the search button
    driver.find_element_by_xpath('//*[@id="eBtnSearch"]/span').click()
    time.sleep(2)
    # Set every product marked "inventory not used" and "for sale" to "not for sale"
    while True:
        try:
            # Tick the select-all checkbox
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[1]/input'
            ).click()
            time.sleep(2)
            # Click the "not for sale" button
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[3]/div[1]/a[4]/span').click()
            time.sleep(2)
            # Accept the confirmation alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            # Accept the completion alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            time.sleep(2)
            continue
        except:
            break
    time.sleep(3)
    try:
        driver.switch_to_alert().accept()
        time.sleep(2)
    except:
        pass
    # Set restocked products back to "for sale"
    # Select "all" for inventory management
    driver.find_element_by_xpath(
        '//*[@id="QA_list1"]/div[3]/table/tbody/tr[1]/td/label[1]/input'
    ).click()
    time.sleep(2)
    # Select sales status "not for sale"
    driver.find_element_by_xpath(
        '//*[@id="submitSearchBox"]/table/tbody/tr[5]/td[2]/label[3]/input'
    ).click()
    time.sleep(2)
    # Enter stock 1 in the inventory search
    driver.find_element_by_css_selector(
        '#eSearchFormStock > li > input:nth-child(2)').send_keys('1')
    time.sleep(2)
    # Click the search button
    driver.find_element_by_xpath('//*[@id="eBtnSearch"]/span').click()
    time.sleep(4)
    yield "data:90\n\n"
    # Set every "not for sale" product back to "for sale"
    while True:
        try:
            # Tick the select-all checkbox
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[4]/table/thead/tr/th[1]/input'
            ).click()
            time.sleep(2)
            # Click the "for sale" button
            driver.find_element_by_xpath(
                '//*[@id="QA_list2"]/div[3]/div[1]/a[3]/span').click()
            time.sleep(2)
            # Accept the confirmation alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            # Accept the completion alert
            driver.switch_to_alert().accept()
            time.sleep(2)
            time.sleep(2)
            continue
        except:
            break
    driver.close()
    yield "data:100\n\n"
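# Hedged sketch (not part of the original script): the function above chains
# dozens of find_element_by_xpath(...).click() calls separated by fixed
# time.sleep() pauses. A small helper built on WebDriverWait, as below, would
# make those steps less brittle; the 10-second timeout is an assumption.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_and_click(driver, xpath, timeout=10):
    """Wait until the element at `xpath` is clickable, then click it."""
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))).click()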
def setUpClass(cls):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('headless')
    chrome_options.add_argument('window-size=1920x1080')
    cls.driver = webdriver.Chrome(options=chrome_options)
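# A minimal companion tearDownClass, assuming the usual unittest pattern
# around the setUpClass above (not shown in the original); it quits the
# shared driver so headless Chrome processes do not leak between runs.
@classmethod
def tearDownClass(cls):
    cls.driver.quit()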
def execute(args):
    """Run Javascript unit tests.

    Here are the steps:
    1. Execute the HTML with chromedriver.
    2. Read the test result from the HTML.
    """
    test_filepath = os.path.join('src', 'appengine', 'private', 'test.html')
    print('Running chromedriver on %s' % test_filepath)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--allow-file-access-from-files')

    is_ci = os.getenv('TEST_BOT_ENVIRONMENT')
    if is_ci:
        # Turn off sandbox since running under root, with trusted tests.
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--headless')

    driver = webdriver.Chrome(
        executable_path=common.get_chromedriver_path(),
        chrome_options=chrome_options)
    try:
        driver.get('file://%s' % os.path.abspath(test_filepath))

        # Wait for tests to be completed.
        while True:
            success_count = driver.execute_script(
                'return WCT._reporter.stats.passes;')
            failure_count = driver.execute_script(
                'return WCT._reporter.stats.failures;')
            sys.stdout.write(
                '\rSuccess: %d, Failure: %d' % (success_count, failure_count))
            sys.stdout.flush()

            is_complete = driver.execute_script('return WCT._reporter.complete;')
            if is_complete:
                break
            time.sleep(0.1)

        sys.stdout.write('\r' + (' ' * 70))
        sys.stdout.flush()

        success_count = int(
            driver.find_element_by_css_selector('#mocha-stats .passes em').text)
        failure_count = int(
            driver.find_element_by_css_selector('#mocha-stats .failures em').text)

        error_report = _parse_error_report(driver)
        if error_report:
            print(error_report)

        print()
        print(_SUITE_SEPARATOR)
        print('Test results:')
        print('| Success: %d' % success_count)
        print('| Failure: %d' % failure_count)
        print(_SUITE_SEPARATOR)
        print()

        if args.persist:
            # pylint: disable=eval-used
            eval(
                input('--persist is used. Leave the browser open.'
                      ' Press ENTER to close it:'))
    finally:
        driver.quit()

    if failure_count > 0:
        sys.exit(1)
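# Hedged usage sketch: execute() only reads args.persist, so a minimal
# argparse wrapper like this one (the wrapper itself is an assumption, not
# part of the original module) is enough to run the JS test suite from the
# command line.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run Javascript unit tests.')
    parser.add_argument('--persist', action='store_true',
                        help='leave the browser open after the run')
    execute(parser.parse_args())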
def test_reactbank(timeout_sec=2.0):
    """This runs the test."""
    driver = None
    try:
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-extensions")
        driver = webdriver.Chrome(options=options)

        # get to the landing page
        driver.get(landingPageUrl)

        # login
        logger.info("[RUNNER] doing login")
        login_to_reactbank(driver, timeout_sec)
        logger.info("[RUNNER] login done")

        # other tests go here

        # balance check
        logger.info("[RUNNER] doing balance check")
        find_balance_on_userpage(driver, timeout_sec)
        logger.info("[RUNNER] balance check done")

        # help form fill-in
        logger.info("[RUNNER] doing help form")
        fill_out_help_form(driver, timeout_sec)
        logger.info("[RUNNER] help form done")

        # logout
        logger.info("[RUNNER] doing logout")
        logout_from_reactbank(driver, timeout_sec)
        logger.info("[RUNNER] logout done")
    except Exception:
        logger.exception("Ran into exception when running test.")
        raise
    finally:
        if driver is not None:
            logger.info("[RUNNER] waiting to close the window")
            time.sleep(2.5)
            driver.close()
            logger.info("[RUNNER] window close done, waiting for driver quit")
            driver.quit()
            logger.info("[RUNNER] driver quit done")
    logger.info("[RUNNER] test run complete")
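# Hypothetical sketch of one of the helpers called above (the real
# login_to_reactbank is defined elsewhere and not shown); the element ids
# and credentials here are assumptions and may not match the actual page.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def login_to_reactbank_sketch(driver, timeout_sec):
    """Fill in the login form and submit, waiting for the fields to render."""
    wait = WebDriverWait(driver, timeout_sec)
    wait.until(EC.presence_of_element_located(
        (By.ID, 'username'))).send_keys('demo')          # assumed field id
    driver.find_element(By.ID, 'password').send_keys('demo')  # assumed field id
    driver.find_element(By.ID, 'login-button').click()        # assumed button id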
def daily_task():
    global DATE
    DATE = str(datetime.date.today())

    chromeOptions = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    # chromeOptions.add_argument("--disable-javascript")
    chromeOptions.add_experimental_option("prefs", prefs)
    chromeOptions.add_argument("--headless")
    chromeOptions.add_argument("start-maximized")
    chromeOptions.add_argument("disable-infobars")
    chromeOptions.add_argument("--disable-extensions")
    chromeOptions.add_argument("--no-sandbox")
    chromeOptions.add_argument("--disable-dev-shm-usage")

    browser2 = webdriver.Chrome(chrome_options=chromeOptions,
                                executable_path=CHROME_DRIVER_PATH)
    browser2.set_window_position(100, 40)
    browser2.set_window_size(1300, 1024)
    wait2 = ui.WebDriverWait(browser2, 30)

    browser = webdriver.Chrome(chrome_options=chromeOptions,
                               executable_path=CHROME_DRIVER_PATH)
    browser.set_window_position(400, 40)
    browser.set_window_size(1300, 1024)
    wait = ui.WebDriverWait(browser, 30)

    browser.get(BASE_URL)
    urls = []
    titles = []
    wait.until(lambda browser: browser.find_element_by_xpath(
        '/html/body/div[2]/nav/div/div[3]'))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    category_list = soup.find('nav', class_='white').find(
        'div', class_='top-cate').find_all('a')

    c = 0
    for item in category_list:
        if c == 0:
            c += 1
            continue
        href = BASE_URL + item.get('href')
        title = item.text.strip()
        if href not in urls:
            urls.append(href)
            titles.append(title)
        c += 1
    # print(len(category_list))
    # print(category_list)
    # print(len(urls))
    # print(urls)
    write_html(browser.page_source, "All_cat_")

    j = 0
    while j < len(urls):
        sys.stdout.write('Scraping ' + urls[j] + ' ...' + ' ' * 10)
        browser.get(urls[j])
        wait.until(lambda browser: browser.find_element_by_xpath(
            '//*[@id="list-page"]/div[2]/div[33]'))
        soup = BeautifulSoup(browser.page_source, 'lxml')
        category = titles[j]

        i = 0
        pagination = True
        while pagination:
            soup = BeautifulSoup(browser.page_source, 'lxml')
            if i != 0:
                try:
                    wait.until(lambda browser: browser.find_element_by_xpath(
                        '//*[@id="list-page"]/div[2]/div[33]'))
                    element = browser.find_element_by_css_selector(
                        '#list-page > div.container-list-restaurant.clearfix.active-view-column > div.pagation.clearfix > a.ico-page.ico-page-next.ng-scope')
                    if element.is_displayed():
                        browser.execute_script("arguments[0].click();", element)
                        time.sleep(3)
                    else:
                        pagination = False
                    wait.until(lambda browser: browser.find_element_by_xpath(
                        '//*[@id="list-page"]/div[2]/div[33]'))
                    soup = BeautifulSoup(browser.page_source, 'lxml')
                    listings = soup.find('div', id='list-page').find_all(
                        'div', class_='view-column-list')
                except NoSuchElementException:
                    pagination = False
                except TimeoutException:
                    pagination = False
                except:
                    pagination = False
            if i == 0:
                soup = BeautifulSoup(browser.page_source, 'lxml')
                listings = soup.find('div', id='list-page').find_all(
                    'div', class_='view-column-list')
            if pagination == False:
                break
            # print(len(listings))
            # print(i+1)
            file_name = str(j + 1) + "_" + str(i + 1) + "_"
            write_html(browser.page_source, file_name)

            for item in listings:
                # if item.find('div', class_='ct_title') != None:
                #     title = item.find('div', class_='ct_title').text.strip()
                # else:
                #     title = None
                try:
                    href = BASE_URL + item.find('a').get('href')
                    browser2.get(href)
                    # wait.until(lambda browser: browser.find_element_by_xpath('//*[@id="right"]/div[1]'))
                    soup = BeautifulSoup(browser2.page_source, 'lxml')
                except TimeoutException:
                    continue
                except:
                    continue
                try:
                    if soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant') != None:
                        food_category = soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant').text.strip()
                        if soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant').find('a') != None:
                            txt = soup.find('div', class_='info-basic-hot-restaurant').find('h2', class_='kind-restaurant').find('a').text.strip()
                            food_category = food_category.replace(txt, '')
                            food_category = food_category.strip()
                    else:
                        food_category = None
                except:
                    food_category = None
                try:
                    if soup.find('div', class_='info-basic-hot-restaurant').find('h1', class_='name-hot-restaurant') != None:
                        seller = soup.find('div', class_='info-basic-hot-restaurant').find('h1', class_='name-hot-restaurant').text.strip()
                    else:
                        seller = None
                except:
                    seller = None
                try:
                    if soup.find('div', class_='info-basic-hot-restaurant').find('p', itemprop='description') != None:
                        location = soup.find('div', class_='info-basic-hot-restaurant').find('p', itemprop='description').text.strip()
                    else:
                        location = None
                except:
                    location = None
                try:
                    if soup.find('div', class_='slick-list').find('span', class_='font14') != None:
                        delivery_fee = soup.find('div', class_='slick-list').find('span', class_='font14').text.strip()
                        delivery_fee = delivery_fee.replace('[?]', '')
                    else:
                        delivery_fee = None
                except:
                    delivery_fee = None

                # fields collected per row: location, seller, delivery fee,
                # food name, food price, food old_price (previous price if
                # exists), food orders, food type, food category, category
                # (name of category), current date
                try:
                    products_types = soup.find('div', class_='detail-menu-kind').find_all('div', class_='scrollspy')
                except:
                    continue
                # print(products_types)
                for products_type in products_types:
                    # print(products_type)
                    food_type = products_type.find('h2', class_='title-kind-food').text.strip()
                    products = products_type.find_all('div', class_='box-menu-detail')
                    for product in products:
                        try:
                            food_name = product.find('h3').text.strip()
                        except:
                            continue
                        try:
                            food_orders = product.find('div', class_='name-food-detail').find('p', class_='light-grey').text.strip()
                        except:
                            continue
                        try:
                            food_price = product.find('div', class_='product-price').find('p', class_='current-price').text.strip()
                        except:
                            continue
                        try:
                            old_price = product.find('div', class_='product-price').find('p', class_='old-price').text.strip()
                        except:
                            old_price = None
                        data = {'category': category,
                                'food_category': food_category,
                                'location': location,
                                'seller': seller,
                                'delivery_fee': delivery_fee,
                                'food_type': food_type,
                                'food_name': food_name,
                                'food_orders': food_orders,
                                'food_price': food_price,
                                'old_price': old_price,
                                'date': DATE}
                        write_csv(data)
            i += 1
        j += 1

    # Close browser
    browser.close()
    browser.service.process.send_signal(signal.SIGTERM)
    browser.quit()
    compress_data()
def get_data(url, write_file='data_temp.json', write_type='a'):
    data_label = '城市等级划分'
    data_city_nums = 404
    data_city_per_page = 50
    data_sub_label1 = '设备类型分布'
    data_sub_label2 = '场景分布'
    cities_data = []

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    user_data_path = get_usr_data_dir()
    options.add_argument(user_data_path)
    driver = webdriver.Chrome(options=options)

    for index in range(400, 404):
        page_index = int(index / data_city_per_page) + 1
        index_in_pages = (index % data_city_per_page) + 1
        city = City()
        driver.get(url)
        time.sleep(1)
        if index_in_pages == 1:
            time.sleep(3)
        goto_basic_page(driver, label=data_label, page_number=page_index)
        time.sleep(1)
        city.get_basic_info(driver, current_line=index_in_pages)
        time.sleep(1)
        print('{}/{}'.format(index + 1, data_city_nums), page_index,
              city.city_rank, city.city_name, city.city_point,
              city.city_screens)
        goto_device_types_page(driver, current_line=index_in_pages,
                               label=data_sub_label1)
        time.sleep(1)
        city.get_device_types_distribution(driver)
        time.sleep(1)
        driver.get(url)
        goto_basic_page(driver, label=data_label, page_number=page_index)
        time.sleep(1)
        goto_device_scenes_page(driver, current_line=index_in_pages,
                                label=data_sub_label2)
        city.get_device_scenes_distribution(driver)
        time.sleep(1)
        city_dict = {'城市': city.city_name,
                     '城市分类': city.city_rank,
                     '省份': city.city_province,
                     '所属地区': city.city_location,
                     '点位数量': city.city_point,
                     '屏幕数量': city.city_screens,
                     '类型分布': city.city_device_types_distribution,
                     '场景分布': city.city_device_scenes_distribution}
        cities_data.append(city_dict)
        write_data(write_file, city_dict, write_type)
    return cities_data
def crawl_daum_comments(url):
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome('/home/apostcto/ITDA/chromedriver',
                              chrome_options=chrome_options)
    driver.get(url)

    try:
        elements = driver.find_element_by_class_name('alex_more')
        # Click the 'more' button until the last comment has loaded.
        # More than 100 clicks means something is wrong, so break once the
        # count passes the limit.
        count = 0
        while elements:
            try:
                elements.click()
                time.sleep(0.1)
                count += 1
                if count > 100:
                    break
            except:
                break
    # If there is no 'more' button, collect comments from the current page.
    except:
        pass

    comment_lists = []
    # Find the comment elements on the fully expanded page.
    try:
        comment_elements = driver.find_element_by_css_selector('.cmt_news').text
        count_elements = driver.find_element_by_css_selector(
            '.cmt_news .alex_single .cmt_count').text
        count_elements = int(count_elements[3:])
        while len(comment_elements) > 30:
            # Locate the next comment's text.
            comment_first_index = comment_elements.find('시간전')
            comment_last_index = comment_elements.find('답글')
            comment = comment_elements[
                comment_first_index + 4:comment_last_index - 1].replace('\n', '')
            if '댓글로그인' in comment:
                comment = ''
            comment_lists.append(comment)
            # Break when there are no more comments.
            if '새로고침' in comment:
                break
            # Slice off the processed comment to find the next one.
            comment_elements = comment_elements[comment_last_index + 3:]
        return count_elements, comment_lists, url
    except:
        return 0, [], []
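# Hedged usage sketch: crawl_daum_comments() takes a Daum news article URL
# (this sample URL is an assumption, not a real article) and returns the
# reported comment count, the scraped comments, and the source URL.
if __name__ == '__main__':
    count, comments, source_url = crawl_daum_comments(
        'https://news.v.daum.net/v/20200101000000000')  # hypothetical article
    print(count, len(comments), source_url)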
def main():
    if input('c or d: ') == 'd':
        target_dl_folder_path = r"D:\Temp\H"  # target download folder path
    else:
        target_dl_folder_path = r"C:\Temp\H"
    timeout = 10

    # chrome_options = Options()
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-gpu")
    prefs = {"download.default_directory": target_dl_folder_path}
    chrome_options.add_experimental_option('prefs', prefs)
    # chrome_options.add_experimental_option("detach", True)  # keep browser open
    chrome_options.add_argument("--headless")  # keep browser closed
    driver = webdriver.Chrome(options=chrome_options)
    driver.implicitly_wait(10)
    # driver.maximize_window()

    while True:
        urlList = get_urls()
        # START FOR each url in urlList
        for url in urlList:
            # if the url fails validation, continue to the next url
            if not url_validtion(driver, url):
                continue
            # skip if no download button is found
            if not find_dl_btn(driver, "dl-button"):
                continue
            print(Fore.CYAN + "Downloading", url + " ... ")
            time.sleep(1)
            if progressbar_timeout(driver, "progressbar", "aria-valuenow",
                                   timeout):
                continue

            # START while progressbar
            # TODO: if the progressbar stalls, stop downloading and retry
            progressbar_value = '0.000'  # init progressbar
            while progressbar_value != "100":
                progressbar_value = driver.find_element_by_id(
                    "progressbar").get_attribute("aria-valuenow")
                sys.stdout.write("\r{0}".format(
                    str(progressbar_value)[:5] + " %"))
                sys.stdout.flush()
                time.sleep(1)
            print()
            time.sleep(1)
            # END while progressbar

            # old file name
            url_path = urlparse(url).path
            try:
                old_file_name = url_path.split("/")[3]
            except IndexError:
                print(Fore.RED + "GET OLD FILE NAME FROM URL ERROR: ", url)
                continue  # go to next url

            # new file name
            potential_filenames = driver.find_elements_by_class_name(
                "alert-success")
            driver.implicitly_wait(10)
            new_file_name = get_correct_file_name(
                potential_filenames, old_file_name)

            # check whether the file exists in the local folder;
            # if NOT, the 3rd arg represents timeout second*2
            if file_timeout(target_dl_folder_path, old_file_name, timeout):
                print(Fore.RED + "DOWNLOAD FAIL", new_file_name)
                print(Fore.CYAN + "RE-DOWNLOAD ...")
                retry_task(url, urlList)
            else:
                # file exists, so rename it
                try:
                    os.rename(target_dl_folder_path + "\\" + old_file_name + ".zip",
                              target_dl_folder_path + "\\" + new_file_name + ".zip")
                except os.error:
                    print(Fore.YELLOW + "CAN NOT RENAME ",
                          old_file_name + " -> " + new_file_name)
                finally:
                    print(Fore.GREEN + 'DOWNLOAD COMPLETE',
                          new_file_name + ".zip")
            # TODO unzip
        # END FOR LOOP urlList
    # END WHILE True
    driver.quit()
def lambda_handler(event, context):
    try:
        options = webdriver.ChromeOptions()
        options.binary_location = "./bin/headless-chromium"
        options.add_argument('--headless')
        options.add_argument("--no-sandbox")
        options.add_argument("--single-process")
        browser = webdriver.Chrome("./bin/chromedriver", chrome_options=options)

        # Get the prefecture list via the HeartRails Geo API (free geographic
        # data such as postal codes / addresses / coordinates,
        # http://geoapi.heartrails.com/).
        # Scraping every prefecture is too heavy, so limit it to Kanto or Tokyo.
        areaParams = {}
        areaParams['area'] = '関東'
        json_areaParam = json.dumps(areaParams).encode('utf-8')
        prefectures_response = requests.get(
            'http://geoapi.heartrails.com/api/json?method=getPrefectures',
            data=json_areaParam,
            headers={'Content-Type': 'application/json'})
        prefectures = prefectures_response.json()['response']['prefecture']
        # Limit to Tokyo.
        prefectures = [
            prefecture for prefecture in prefectures_response.json()
            ['response']['prefecture'] if prefecture == '東京都'
        ]

        towns = []
        for prefecture in prefectures:
            # Fetch town data for each prefecture in the list.
            params = {}
            params['prefecture'] = prefecture
            json_param = json.dumps(params).encode('utf-8')
            towns_response = requests.get(
                'http://geoapi.heartrails.com/api/json?method=getTowns',
                data=json_param,
                headers={'Content-Type': 'application/json'})
            cities = towns_response.json()['response']['location']
            cities = [
                testCity
                for testCity in towns_response.json()['response']['location']
                if testCity['city'] == '新宿区'
            ]
            # Append each prefecture's town dict to the towns list (0-46).
            towns.append(cities)

        # townsWeathers = []
        townsWeathers = scriping_weather(browser, towns)

        # S3 bucket settings.
        bucket = 'jdmc2019-weather'
        key = 'weather_' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.txt'
        # Save the fetched weather data as JSON.
        files = json.dumps(townsWeathers, indent=4, sort_keys=True,
                           separators=(',', ': '))

        # Create the DynamoDB table instance (sequence table).
        seqtable = dynamodb.Table('sequence')
        # Batch-write the fetched weather data to DynamoDB.
        tablename = "weather"
        table = dynamodb.Table(tablename)
        with table.batch_writer() as batch:
            for weather in townsWeathers:
                batch.put_item(
                    Item={
                        'id': next_seq(seqtable, 'weather'),
                        'prefuctureName': weather['prefuctureName'],
                        'cityName': weather['cityName'],
                        'townName': weather['townName'],
                        'longitude': weather['longitude'],
                        'latitude': weather['latitude'],
                        'postalCode': weather['postalCode'],
                        'date': weather['date'],
                        'hour': weather['hour'],
                        'weather': weather['weather'],
                        'temperature': weather['temperature'],
                        'probPrecip': weather['probPrecip'],
                        'precipitation': weather['precipitation'],
                        'humidity': weather['humidity'],
                        'windBlow': weather['windBlow'],
                        'windSpeed': weather['windSpeed']
                    })
        obj = s3.Object(bucket, key)
        obj.put(Body=files)

        # Clean up.
        browser.close()
        browser.quit()
        return
    except Exception as error:
        LOGGER.error(error)
        raise error
def crawl_trend(self):
    TEST_URL1 = 'https://www.melon.com/chart/'
    TEST_URL2 = 'https://www.melon.com/chart/#params%5Bidx%5D=51'

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # Swap in a custom User-Agent value.
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    )

    driver_1_to_50 = None
    driver_51_to_100 = None
    melon_chart_artist_title = {}
    try:
        driver_1_to_50 = webdriver.Chrome(
            chromedriver_binary.chromedriver_filename, chrome_options=options)
        driver_1_to_50.get(TEST_URL1)
        title_1_to_50 = WebDriverWait(driver_1_to_50, 3).until(
            EC.presence_of_all_elements_located((
                By.CSS_SELECTOR,
                "#lst50 > td:nth-child(6) > div > div > div.ellipsis.rank01"
            )))
        artist_1_to_50 = WebDriverWait(driver_1_to_50, 3).until(
            EC.presence_of_all_elements_located((
                By.CSS_SELECTOR,
                "#lst50 > td:nth-child(6) > div > div > div.ellipsis.rank02"
            )))
        link_1_to_50 = WebDriverWait(driver_1_to_50, 3).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR,
                 "#lst50 > td:nth-child(9) > div > button")))

        driver_51_to_100 = webdriver.Chrome(
            chromedriver_binary.chromedriver_filename, chrome_options=options)
        driver_51_to_100.get(TEST_URL2)
        title_51_to_100 = WebDriverWait(driver_51_to_100, 3).until(
            EC.presence_of_all_elements_located((
                By.CSS_SELECTOR,
                "#lst100 > td:nth-child(6) > div > div > div.ellipsis.rank01"
            )))
        artist_51_to_100 = WebDriverWait(driver_51_to_100, 3).until(
            EC.presence_of_all_elements_located((
                By.CSS_SELECTOR,
                "#lst100 > td:nth-child(6) > div > div > div.ellipsis.rank02"
            )))
        link_51_to_100 = WebDriverWait(driver_51_to_100, 3).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR,
                 "#lst100 > td:nth-child(9) > div > button")))

        title_webelement = title_1_to_50 + title_51_to_100
        artist_webelement = artist_1_to_50 + artist_51_to_100
        link_webelement = link_1_to_50 + link_51_to_100

        artist = {}
        title = {}
        link = {}
        for idx in range(0, len(artist_webelement)):
            artist[idx] = artist_webelement[idx].text
            title[idx] = title_webelement[idx].text
            link[idx] = link_webelement[idx].get_attribute("onclick")
        melon_chart_artist_title[0] = artist
        melon_chart_artist_title[1] = title
        melon_chart_artist_title[2] = link
    except (WebDriverException, TimeoutException, NoSuchElementException):
        logging.error(traceback.format_exc())
    finally:
        if driver_1_to_50 is not None:
            driver_1_to_50.quit()
        if driver_51_to_100 is not None:
            driver_51_to_100.quit()
    return melon_chart_artist_title
def setUp(self):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('headless')
    self.browser = webdriver.Chrome(chrome_options=chrome_options)
    self.browser.implicitly_wait(3)
def getPlayerStats():
    chrome_options = webdriver.ChromeOptions()
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(chrome_options=chrome_options)

    # connect webdriver
    url = "https://www.fifaindex.com/players/1/?league=13&order=desc"
    driver.get(url)
    # wait for the page data to load
    time.sleep(3)
    count = 1

    # click through each player to collect their data
    relevant_features = ['Name', 'Ball Skills', 'Defence', 'Mental', 'Passing',
                         'Physical', 'Shooting', 'Goalkeeper', 'Traits']
    # print(relevant_features[0])
    while True:
        players = driver.find_elements_by_css_selector("a.link-player")
        # print(len(players))
        flag = 0
        footballers = dict()
        for player in players:
            if not player.text:
                continue
            match_url = player.get_attribute("href")
            man = player.text
            driver.execute_script(
                "window.open('" + match_url + "', 'new_window')")
            time.sleep(5)
            driver.switch_to.window(driver.window_handles[-1])
            elements = driver.find_elements_by_css_selector(".card.mb-5")
            # man = str(player.text)
            print(len(elements))
            footballers[man] = {}
            nm = 0
            footballers[man]['Name'] = list()
            footballers[man]['Name'].append(man)
            footballers[man]['Traits'] = list()
            for element in elements:
                card_name = element.find_elements_by_css_selector(
                    ".card-header")[0].text
                print(card_name, nm)
                nm = nm + 1
                if card_name not in relevant_features:
                    continue
                if card_name not in ['Traits']:
                    footballers[man][card_name] = list()
                card_values = element.find_elements_by_xpath(
                    ".//div[@class='card-body']/p")
                for values in card_values:
                    temp = str(values.text).split('\n')
                    if len(temp) == 1:
                        footballers[man]['Traits'].append(temp[0])
                    else:
                        footballers[man][card_name].append(temp[1])
            csv_file = "PL_Fifa_Data.csv"
            with open(csv_file, 'a') as f:
                w = csv.DictWriter(f, relevant_features)
                w.writerow(footballers[man])
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
            # driver.execute_script("window.history.go(-1)")
        count = count + 1
        url = ("https://www.fifaindex.com/players/" + str(count)
               + "/?league=13&order=desc")
        driver.get(url)
        time.sleep(3)
    driver.close()
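# Hedged sketch: the loop above appends rows with csv.DictWriter but never
# writes a header row, so the first run produces a header-less file. A small
# helper like this (names are illustrative, not from the original) would
# write the header exactly once when the file is first created.
import csv
import os

def append_player_row(csv_file, fieldnames, row):
    """Append one dict row to csv_file, writing the header on first use."""
    new_file = not os.path.exists(csv_file)
    with open(csv_file, 'a', newline='') as f:
        w = csv.DictWriter(f, fieldnames)
        if new_file:
            w.writeheader()
        w.writerow(row)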
def __init__(self):
    self.mensagem = "Olá estou testando o meu bot que fiz com pythom!"
    self.grupos = ["Grupo de teste"]
    options = webdriver.ChromeOptions()
    options.add_argument('lang=pt-br')
    self.driver = webdriver.Chrome(executable_path=r'./chromedriver.exe',
                                   options=options)
def index():
    if request.method == 'POST':
        # Input Constants
        orgName = request.form['content']
        urlName = orgName.replace(' ', '+')
        urlNameNews = orgName.replace(' ', '%20')

        # Functionality Constants
        options = webdriver.ChromeOptions()
        ua = UserAgent(verify_ssl=False)
        userAgent = ua.random
        options.add_argument(f'user-agent={userAgent}')
        options.add_argument("start-maximized")
        options.headless = True
        driver = webdriver.Chrome(ChromeDriverManager().install(),
                                  options=options)

        # Setting up the IRS headless browsing page
        driver.get("https://apps.irs.gov/app/eos/")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located(
            (By.XPATH, """//*[@id="eos-search-by-select"]""")))
        driver.find_element_by_xpath(
            "//select[@name='searchBy']/option[text()='Organization Name']").click()
        textBox = driver.find_element_by_id('names')
        textBox.send_keys(orgName)
        driver.find_element_by_xpath("""//*[@id="s"]""").click()
        time.sleep(1)
        irsHbp = driver.window_handles[0]

        # Setting up the other two headless browsing pages
        driver.execute_script("window.open('');")
        charNavHbp = driver.window_handles[1]
        driver.execute_script("window.open('');")
        newsHbp = driver.window_handles[2]

        # General Constants
        credsToI = "According to the IRS website..."
        credsToII = "According to charitynavigator.org..."
        symbolsAlert = "Search terms in the Name field can only include letters, numbers, @, &, %, (), *, hyphens, slashes spaces, apostrophes, periods, commas, and quotation marks."
        meaning = "What does this mean?"
        readFiveOh = "Read more about 501(c)3 Organizations"
        ccSummary = "Charity Checker's \"" + str(orgName) + "\" summary:"
        c1Explain = "\"" + str(orgName) + "\" is not a certified or valid nonprofit and it does not exist on charitynavigator.org, please make sure you are entering the correct name."
        c2Explain = "\"" + str(orgName) + "\" is a certified and valid nonprofit, but it does not exist on charitynavigator.org so its stats are unclear."
        c3Explain = "\"" + str(orgName) + "\" is not a certified or valid nonprofit, but it does exist on charitynavigator.org"
        c4Explain = "\"" + str(orgName) + "\" is not a certified and valid nonprofit, but it has great ratings."
        c5Explain = "\"" + str(orgName) + "\" is a certified and valid nonprofit, but does not have any ratings yet."
        c6Explain = "\"" + str(orgName) + "\" is a certified and valid nonprofit with great ratings."

        # Circumstances 1, 3, 4: does not exist on the IRS site
        badOrgNotice = "\"" + str(orgName) + "\"" + " is not listed as a 501(c)3 by the IRS."
        badExplain = "Essentially, the organization you entered is not an actual established nonprofit organization and is not likely to be exempt from federal tax income."

        # Circumstances 2, 5, 6: does exist on the IRS site
        goodOrgNotice = "\"" + str(orgName) + "\"" + " is listed as a 501(c)3 by the IRS."
        goodExplain = "Essentially, the organization you entered is an actual established nonprofit organization and can be exempt from federal tax income."
        goodReadData = "Read more about the Tax Return Copies, Pub 78 Data, Auto-Revocation Lists, Determination Letters, or e-Postcards of the organization you entered"

        # Circumstances 1 & 2: does not exist on charitynavigator.org
        noRatingsYetII = "\"" + str(orgName) + "\" did not share any stats yet."

        # Circumstances 3 & 5: no full info on charitynavigator.org
        notEnoughInfoII = "The organization you entered has not provided needed information for a complete rating."
        badCharNavUrlClickMe = "However, you can still see some of the organization's information here"

        # Circumstances 4 & 6: full info on charitynavigator.org
        goodCharNavUrlClickMe = "View more of your organization's stats"

        # Getting news links
        driver.switch_to_window(newsHbp)
        news01 = 'https://news.google.com/search?q=' + urlNameNews + '&hl=en-US&gl=US&ceid=US%3Aen'
        driver.get(news01)
        newsOffer = "Here is a recent news article about \"" + str(orgName) + "\":"
        newsMore = "View more"
        noNews = "No recent news article about \"" + str(orgName) + "\" was found"
        newsLook = "But you can search for older news articles about your organization here"
        if len(driver.find_elements_by_xpath('//*[@id="yDmH0d"]/c-wiz/div/div[2]/div[2]/div/main/c-wiz/div[1]/div[1]/div/article/h3/a')) >= 1:
            global newsTitleA, newsLinkA
            newsTitleA = driver.find_element_by_xpath('//*[@id="yDmH0d"]/c-wiz/div/div[2]/div[2]/div/main/c-wiz/div[1]/div[1]/div/article/h3/a').text
            newsLinkA = driver.find_element_by_xpath('//*[@id="yDmH0d"]/c-wiz/div/div[2]/div[2]/div/main/c-wiz/div[1]/div[1]/div/article/h3/a').get_attribute('href')

        # Checking for status on the IRS site
        driver.switch_to_window(irsHbp)
        if len(driver.find_elements_by_xpath("//*[contains(text(), 'Your search did not return any results. Please try again.')]")) == 1:
            # Does not exist on the IRS site
            # Checking for status on charitynavigator.org
            driver.switch_to_window(charNavHbp)
            driver.get('https://www.charitynavigator.org/index.cfm?keyword_list=' + urlName + '&bay=search.results')
            time.sleep(3)
            if len(driver.find_elements_by_xpath("//*[contains(text(), 'Advanced Search')]")) > 1:
                # Does not exist on charitynavigator.org
                # Circumstance 1: doesn't exist on the IRS site or charitynavigator.org; now determining news links
                driver.switch_to_window(newsHbp)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                    return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, noRatingsYetII=noRatingsYetII, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c1Explain=c1Explain)
                else:
                    return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, noRatingsYetII=noRatingsYetII, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c1Explain=c1Explain)
            else:
                # Does exist on charitynavigator.org
                driver.find_element_by_xpath('//*[@id="searchresults"]/table[1]/tbody/tr[1]/td[1]/div/h3/a').click()
                time.sleep(1)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'our old design')]")) > 1:
                    # Exists on charitynavigator.org, does not have full info
                    badCharNavUrl = driver.current_url
                    # Circumstance 3: doesn't exist on the IRS site but has partial info on charitynavigator.org; now determining news links
                    driver.switch_to_window(newsHbp)
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c3Explain=c3Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c3Explain=c3Explain)
                else:
                    # Exists on charitynavigator.org, does have full info
                    overallRating1 = "Overall Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[2]/td[2]").text
                    financialRating1 = "Financial Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[3]/td[2]").text
                    antRating1 = "Accountability & Transparency Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[4]/td[2]").text
                    programExpenses1 = "Percent of Charity's total expenses spent on the programs/services it delivers: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[1]/td[3]").text
                    adminExpenses1 = "Administrative Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[2]/td[3]").text
                    fundraisingExpenses1 = "Fundraising Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[3]/td[3]").text
                    goodCharNavUrl = driver.current_url
                    # Circumstance 4: doesn't exist on the IRS site but has full info on charitynavigator.org; now determining news links
                    driver.switch_to_window(newsHbp)
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c4Explain=c4Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, badOrgNotice=badOrgNotice, meaning=meaning, badExplain=badExplain, readFiveOh=readFiveOh, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c4Explain=c4Explain)
        # Invalid characters in the IRS name field
        elif len(driver.find_elements_by_xpath("//*[contains(text(), 'You have entered invalid characters in the Name field.')]")) == 1:
            return render_template('index.html', symbolsAlert=symbolsAlert)
        # Checking for status on the IRS site
        else:
            # Does show up on the IRS site
            driver.find_element_by_xpath("""/html/body/div[2]/div[2]/div/div/div[1]/div/div[2]/div/ul/li/h3/a""").click()
            goodIrsInfoUrl = driver.current_url
            # Checking for status on charitynavigator.org
            driver.switch_to_window(charNavHbp)
            driver.get('https://www.charitynavigator.org/index.cfm?keyword_list=' + urlName + '&bay=search.results')
            time.sleep(3)
            if len(driver.find_elements_by_xpath("//*[contains(text(), 'Advanced Search')]")) > 1:
                # Does not exist on charitynavigator.org
                # Circumstance 2: does exist on the IRS site but not charitynavigator.org; now determining news links
                driver.switch_to_window(newsHbp)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                    return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, noRatingsYetII=noRatingsYetII, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c2Explain=c2Explain)
                else:
                    return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, noRatingsYetII=noRatingsYetII, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c2Explain=c2Explain)
            else:
                # Does exist on charitynavigator.org
                driver.find_element_by_xpath('//*[@id="searchresults"]/table[1]/tbody/tr[1]/td[1]/div/h3/a').click()
                time.sleep(1)
                if len(driver.find_elements_by_xpath("//*[contains(text(), 'our old design')]")) > 1:
                    # Exists on charitynavigator.org, does not have full info
                    badCharNavUrl = driver.current_url
                    # Circumstance 5: does exist on the IRS site but has partial info on charitynavigator.org; now determining news links
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c5Explain=c5Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, notEnoughInfoII=notEnoughInfoII, badCharNavUrl=badCharNavUrl, badCharNavUrlClickMe=badCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c5Explain=c5Explain)
                else:
                    # Exists on charitynavigator.org, does have full info
                    overallRating1 = "Overall Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[2]/td[2]").text
                    financialRating1 = "Financial Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[3]/td[2]").text
                    antRating1 = "Accountability & Transparency Rating (out of 100): " + driver.find_element_by_xpath("//*[@id='overall']/div[1]/table/tbody/tr/td/div/table/tbody/tr[4]/td[2]").text
                    programExpenses1 = "Percent of Charity's total expenses spent on the programs/services it delivers: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[1]/td[3]").text
                    adminExpenses1 = "Administrative Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[2]/td[3]").text
                    fundraisingExpenses1 = "Fundraising Expenses: " + driver.find_element_by_xpath("//*[@id='overall']/div[10]/div/table/tbody/tr[3]/td[3]").text
                    goodCharNavUrl = driver.current_url
                    # Circumstance 6: does exist on the IRS site and has full info on charitynavigator.org; now determining news links
                    if len(driver.find_elements_by_xpath("//*[contains(text(), 'No results found.')]")) == 1:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, noNews=noNews, newsLook=newsLook, ccSummary=ccSummary, c6Explain=c6Explain)
                    else:
                        return render_template('index.html', credsToI=credsToI, goodOrgNotice=goodOrgNotice, meaning=meaning, goodExplain=goodExplain, readFiveOh=readFiveOh, goodIrsInfoUrl=goodIrsInfoUrl, goodReadData=goodReadData, credsToII=credsToII, overallRating1=overallRating1, financialRating1=financialRating1, antRating1=antRating1, programExpenses1=programExpenses1, adminExpenses1=adminExpenses1, fundraisingExpenses1=fundraisingExpenses1, goodCharNavUrl=goodCharNavUrl, goodCharNavUrlClickMe=goodCharNavUrlClickMe, newsOffer=newsOffer, newsLinkA=newsLinkA, newsTitleA=newsTitleA, news01=news01, newsMore=newsMore, ccSummary=ccSummary, c6Explain=c6Explain)
    else:
        return render_template('index.html')
def sendwhatmsg_with_selenium(phone_no, message, time_hour, time_min,
                              print_messages=True):
    """Same as the sendwhatmsg() function, but this will not open Chrome.

    Most of the process will be hidden; only a console will open.

    If this is the first time, you must call pywhatkit.load_QRcode() and
    pywhatkit.add_driver_path(path) before calling this function, or you
    will get an error.

    Make sure WhatsApp Web is not already opened or you might get your
    number banned.
    """
    global sleeptm, path, headless_mode, curpth
    if "+" not in phone_no:
        raise CountryCodeException("Country code missing from phone_no")
    timehr = time_hour

    with open("pywhatkit_dbs.txt") as file:
        for lines in file:
            if "selpath" in lines:
                path = lines.replace("selpath : ", "")
                path = path.strip()

    if time_hour not in range(0, 25) or time_min not in range(0, 60):
        print("Invalid time format")
    if time_hour == 0:
        time_hour = 24
    callsec = (time_hour * 3600) + (time_min * 60)
    curr = time.localtime()
    currhr = curr.tm_hour
    currmin = curr.tm_min
    currsec = curr.tm_sec
    currtotsec = (currhr * 3600) + (currmin * 60) + (currsec)
    lefttm = callsec - currtotsec
    if lefttm <= 0:
        lefttm = 86400 + lefttm
    if lefttm < 60:
        raise CallTimeException(
            "Call time must be greater than one minute as web.whatsapp.com takes some time to load"
        )

    date = "%s:%s:%s" % (curr.tm_mday, curr.tm_mon, curr.tm_year)
    time_write = "%s:%s" % (timehr, time_min)
    file = open("pywhatkit_dbs.txt", "a")
    file.write("Date: %s\nTime: %s\nPhone number: %s\nMessage: %s" %
               (date, time_write, phone_no, message))
    file.write("\n--------------------\n")
    file.close()

    sleeptm = lefttm - 60
    if print_messages:
        print(f"In {prnt_sleeptm()+60} seconds message will be delivered")
    time.sleep(sleeptm)

    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1920x1080")
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    )
    options.add_argument('--user-data-dir=%s/pywhatkit_data' % curpth)
    if headless_mode:
        options.add_argument("--headless")
    driver = webdriver.Chrome(path, options=options)

    url = 'https://web.whatsapp.com/send?phone=' + phone_no
    driver.get(url)
    time.sleep(45)
    msg_box = driver.find_element_by_xpath(
        '//div[@contenteditable="true"][@data-tab="1"]')
    time.sleep(14)
    msg_box.send_keys(message + "\n")
    if print_messages:
        print("Message sent\nYou may close the console window now")
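# Hedged sketch: the block above converts a target HH:MM into "seconds to
# sleep" by hand. The same arithmetic, isolated for clarity (the helper name
# is an assumption, not part of the original module); it wraps past midnight
# by adding 86400 seconds when the target time has already passed today.
import time

def seconds_until(hour, minute):
    """Return seconds from now until the next occurrence of hour:minute."""
    now = time.localtime()
    target = hour * 3600 + minute * 60
    current = now.tm_hour * 3600 + now.tm_min * 60 + now.tm_sec
    left = target - current
    return left + 86400 if left <= 0 else left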
def crawl_image_urls(keywords, engine="Google", max_number=10000,
                     face_only=False, safe_mode=False, proxy=None,
                     proxy_type="http", quiet=False, browser="phantomjs",
                     image_type=None, color=None):
    """
    Scrape image urls of keywords from Google Image Search
    :param keywords: keywords you want to search
    :param engine: search engine used to search images
    :param max_number: limit the max number of image urls the function
        outputs; equal or less than 0 for unlimited
    :param face_only: image type set to face only, provided by Google
    :param safe_mode: switch for safe mode of Google Search
    :param proxy: proxy address, example: socks5 127.0.0.1:1080
    :param proxy_type: socks5, http
    :param browser: browser to use when crawling image urls from Google & Bing
    :return: list of scraped image urls
    """
    my_print("\nScraping From {0} Image Search ...\n".format(engine), quiet)
    my_print("Keywords: " + keywords, quiet)
    if max_number <= 0:
        my_print("Number: No limit", quiet)
        max_number = 10000
    else:
        my_print("Number: {}".format(max_number), quiet)
    my_print("Face Only: {}".format(str(face_only)), quiet)
    my_print("Safe Mode: {}".format(str(safe_mode)), quiet)

    if engine == "Google":
        query_url = google_gen_query_url(keywords, face_only, safe_mode,
                                         image_type, color)
    elif engine == "Bing":
        query_url = bing_gen_query_url(keywords, face_only, safe_mode,
                                       image_type, color)
    elif engine == "Baidu":
        query_url = baidu_gen_query_url(keywords, face_only, safe_mode, color)
    else:
        return

    my_print("Query URL: " + query_url, quiet)

    if engine != "Baidu":
        browser = str.lower(browser)
        if "chrome" in browser:
            chrome_path = shutil.which("chromedriver")
            if platform.system() == 'Darwin':
                chrome_path = "./macos/bin/chromedriver-mac"
            chrome_options = webdriver.ChromeOptions()
            if "headless" in browser:
                chrome_options.add_argument("headless")
            if proxy is not None and proxy_type is not None:
                chrome_options.add_argument(
                    "--proxy-server={}://{}".format(proxy_type, proxy))
            driver = webdriver.Chrome(chrome_path,
                                      chrome_options=chrome_options)
        else:
            phantomjs_path = shutil.which("phantomjs")
            phantomjs_path = "./bin/phantomjs" if phantomjs_path is None else phantomjs_path
            phantomjs_args = []
            if proxy is not None and proxy_type is not None:
                phantomjs_args += [
                    "--proxy=" + proxy,
                    "--proxy-type=" + proxy_type,
                ]
            driver = webdriver.PhantomJS(executable_path=phantomjs_path,
                                         service_args=phantomjs_args,
                                         desired_capabilities=dcap)

    if engine == "Google":
        driver.set_window_size(1920, 1080)
        driver.get(query_url)
        image_urls = google_image_url_from_webpage(driver, max_number, quiet)
    elif engine == "Bing":
        driver.set_window_size(1920, 1080)
        driver.get(query_url)
        image_urls = bing_image_url_from_webpage(driver)
    else:  # Baidu
        # driver.set_window_size(10000, 7500)
        # driver.get(query_url)
        # image_urls = baidu_image_url_from_webpage(driver)
        image_urls = baidu_get_image_url_using_api(keywords,
                                                   max_number=max_number,
                                                   face_only=face_only,
                                                   proxy=proxy,
                                                   proxy_type=proxy_type)
    if engine != "Baidu":
        driver.close()

    if max_number > len(image_urls):
        output_num = len(image_urls)
    else:
        output_num = max_number

    my_print("\n== {0} out of {1} crawled images urls will be used.\n".format(
        output_num, len(image_urls)), quiet)

    return image_urls[0:output_num]
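# Hedged usage sketch: a minimal call to crawl_image_urls() using the
# headless Chrome backend; the keyword and limit are arbitrary examples.
# A browser string containing both "chrome" and "headless" selects headless
# Chrome in the function above.
if __name__ == '__main__':
    urls = crawl_image_urls('golden retriever', engine='Google',
                            max_number=20, browser='chrome_headless')
    for u in urls:
        print(u)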
def send_file(phone_no, path_to_file, time_hour, time_min, print_messages=True):
    """Send file of any format (png, mp3, txt etc)"""
    global sleeptm, path, headless_mode, curpth
    if "+" not in phone_no:
        raise CountryCodeException("Country code missing from phone_no")
    timehr = time_hour
    if not os.path.exists(path_to_file):
        raise FilePathException("No file found at %s" % path_to_file)

    with open("pywhatkit_dbs.txt") as file:
        for lines in file:
            if "selpath" in lines:
                chrpath = lines.replace("selpath : ", "")
                chrpath = chrpath.strip()

    if time_hour not in range(0, 25) or time_min not in range(0, 60):
        print("Invalid time format")
    if time_hour == 0:
        time_hour = 24
    callsec = (time_hour * 3600) + (time_min * 60)
    curr = time.localtime()
    currhr = curr.tm_hour
    currmin = curr.tm_min
    currsec = curr.tm_sec
    currtotsec = (currhr * 3600) + (currmin * 60) + (currsec)
    lefttm = callsec - currtotsec
    if lefttm <= 0:
        lefttm = 86400 + lefttm
    if lefttm < 60:
        raise CallTimeException(
            "Call time must be greater than one minute as web.whatsapp.com takes some time to load"
        )

    date = "%s:%s:%s" % (curr.tm_mday, curr.tm_mon, curr.tm_year)
    time_write = "%s:%s" % (timehr, time_min)
    file = open("pywhatkit_dbs.txt", "a")
    file.write("Date: %s\nTime: %s\nPhone number: %s\nAttachment: %s" %
               (date, time_write, phone_no, path_to_file))
    file.write("\n--------------------\n")
    file.close()

    sleeptm = lefttm - 60
    if print_messages:
        print(f"In {prnt_sleeptm()+60} seconds message will be delivered")
    time.sleep(sleeptm)

    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1920x1080")
    options.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    )
    options.add_argument('--user-data-dir=%s/pywhatkit_data' % curpth)
    if headless_mode:
        options.add_argument("--headless")

    url = 'https://web.whatsapp.com/send?phone=' + phone_no
    driver = webdriver.Chrome(chrpath, options=options)
    driver.get(url)
    time.sleep(40)
    driver.find_element_by_xpath('//span[@data-icon="clip"]').click()
    time.sleep(1)
    attch = driver.find_element_by_xpath(
        '//input[@accept="image/*,video/mp4,video/3gpp,video/quicktime"]')
    attch.send_keys(path_to_file)
    time.sleep(10)
    snd = driver.find_element_by_xpath('//span[@data-icon="send"]')
    time.sleep(4)
    snd.click()
    if print_messages:
        print(
            "Message sent\nIf it is a big file, it might take longer time to be delivered\nClose console only after message gets delivered."
        )
def upload(self, file_list, link):
    filename = 'engine/bilibili.cookie'
    # title_ = self.r_title
    videopath = self.assemble_videopath(file_list)
    # service_log_path = "{}/chromedriver.log".format('/home')

    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=engine.chromedrive_path,
                                   chrome_options=options)
    # service_log_path=service_log_path)
    try:
        self.driver.get("https://www.bilibili.com")
        # driver.delete_all_cookies()
        if os.path.isfile(filename):
            with open(filename) as f:
                new_cookie = json.load(f)
                for cookie in new_cookie:
                    # print(cookie)
                    if isinstance(cookie.get("expiry"), float):
                        cookie["expiry"] = int(cookie["expiry"])
                    self.driver.add_cookie(cookie)
        self.driver.get("https://member.bilibili.com/video/upload.html")
        # print(driver.title)
        self.add_videos(videopath)
        # js = "var q=document.getElementsByClassName('content-header-right')[0].scrollIntoView();"
        # driver.execute_script(js)
        cookie = self.driver.get_cookies()
        with open(filename, "w") as f:
            json.dump(cookie, f)
        self.add_information(link)
        self.driver.find_element_by_xpath(
            '//*[@class="upload-v2-container"]/div[2]/div[3]/div[5]/span[1]'
        ).click()
        # screen_shot = driver.save_screenshot('bin/1.png')
        time.sleep(3)
        upload_success = self.driver.find_element_by_xpath(
            r'//*[@id="app"]/div/div[3]/h3').text
        if upload_success == '':
            self.driver.save_screenshot('err.png')
            logger.info('Submission failed; screenshot saved')
            return
        else:
            logger.info(upload_success)
            # print('Submission complete!')
            # logger.info('%s submitted!' % title_)
        self.remove_filelist(file_list)
    except selenium.common.exceptions.NoSuchElementException:
        logger.exception('An error occurred')
    # except selenium.common.exceptions.TimeoutException:
    #     logger.exception('Timed out')
    except selenium.common.exceptions.TimeoutException:
        self.login(filename)
    finally:
        self.driver.quit()
        logger.info('Browser driver quit')
def __init__(self):
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    # options.add_argument('--headless')
    self.driver = webdriver.Chrome(chrome_options=options)
def open_chrome(self, executable_path="../driver/chromedriver"):
    chrome_options = webdriver.ChromeOptions()
    # refer: https://stackoverflow.com/questions/43008622/python-linux-selenium-chrome-not-reachable
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    self.browser = webdriver.Chrome(executable_path=executable_path,
                                    chrome_options=chrome_options)
    self.browser.get(self.init_url)
def get_npc_all(newtai_url_list):
    driver_path = "./chromedriver"
    option = webdriver.ChromeOptions()
    option.add_argument('--window-size=1280,1024')
    browser = webdriver.Chrome(executable_path=driver_path,
                               chrome_options=option)
    browser.get('https://rent.591.com.tw/?kind=0&region=1')
    browser.implicitly_wait(1)
    browser.find_element_by_xpath("//dd[@data-id=3]").click()
    while True:
        html_source = browser.page_source
        soup = bs4.BeautifulSoup(html_source, 'html.parser')
        next_page = soup.find_all("a", class_="last")
        print(len(next_page))
        if len(next_page) == 0:
            # not on the last page yet: collect this page's listings
            h3_list = soup.find_all("h3")
            title_list = []
            for i in h3_list:
                tmp = i.find_all("a")
                title_list.append(tmp[0])
            for j in title_list:
                tmp_str = re.findall("rent.591\\S+html", str(j))[0]
                # print(tmp_str)
                newtai_url_list.append("https://%s" % tmp_str)
            # retry the "next page" click up to three times
            try:
                browser.find_element_by_class_name("pageNext").click()
            except Exception as ex:
                try:
                    browser.find_element_by_class_name("pageNext").click()
                except Exception as ex:
                    browser.find_element_by_class_name("pageNext").click()
        elif len(next_page) == 1:
            # last page: collect the remaining listings and stop
            h3_list = soup.find_all("h3")
            title_list = []
            for i in h3_list:
                tmp = i.find_all("a")
                title_list.append(tmp[0])
            for j in title_list:
                tmp_str = re.findall("rent.591\\S+html", str(j))[0]
                # print(tmp_str)
                newtai_url_list.append("https://%s" % tmp_str)
            break
    # newtai_url_list = list(set(newtai_url_list))
    # print(newtai_url_list)
    # print(len(newtai_url_list))
    browser.close()
    browser.quit()
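# The triple-nested retry around pageNext above can be written as an explicit
# wait: block until the control is clickable, then click once. A minimal
# sketch reusing the snippet's selector; the 10-second timeout is an assumption.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_next_page(browser, timeout=10):
    # Raises TimeoutException if the control never becomes clickable
    WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "pageNext"))).click()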
class TumblrReg():
    def tf_get_proxy():
        # Scrape a list of "ip:port" strings from free-proxy-list.net
        # proxies = set()
        proxy_list = list()
        try:
            url = "https://free-proxy-list.net/"
            adapter = HTTPAdapter(max_retries=2)
            request_session = requests.Session()
            request_session.mount(url, adapter)  # note: requests.get below bypasses this session
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
            }
            r = requests.get(url, headers=headers, verify=False, timeout=5)
            soup = BeautifulSoup(r.content, 'html.parser')
            proxy_data = soup.select('td:nth-child(2) , td:nth-child(1)')
            for i in range(0, 20, 2):
                proxy_list.append(
                    str(proxy_data[i].text) + ':' + str(proxy_data[i + 1].text))
                # proxy = str(proxy_data[i].text) + ':' + str(proxy_data[i + 1].text)
                # proxies.add(proxy)
            return proxy_list
        except Exception as e:
            print(" --->", e)
            pass

    ALL_PROXIES = tf_get_proxy()

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--window-position=0,0')
    options.add_argument('--disable-infobars')
    options.add_argument('--window-size=1920,1080')

    def tf_proxy_driver(PROXIES, options=options):
        pxy = ''
        # if PROXIES:
        #     pxy = PROXIES[-1]
        # else:
        #     print("--- Proxies used up (%s)" % len(PROXIES))
        # options.add_argument('--proxy-server=%s' % pxy)
        opts = ChromeOptions()
        opts.add_experimental_option("detach", True)
        # driver = Chrome(chrome_options=opts)
        Chromedriver = webdriver.Chrome(Chrome_path, chrome_options=opts)
        Chromedriver.implicitly_wait(Wait_3)
        print("proxy is - ", pxy)
        return Chromedriver

    def tf_use_same_session(ChromeDriver):
        # Reattach to an existing browser session via its executor URL and session id
        executor_url = ChromeDriver.command_executor._url  # e.g. "http://127.0.0.1:60622/hub"
        session_id = ChromeDriver.session_id  # e.g. '4e167f26-dc1d-4f51-a207-f761eaf73c31'
        print(session_id)
        driver_temp = webdriver.Remote(command_executor=executor_url,
                                       desired_capabilities={})
        driver_temp.close()
        driver_temp.session_id = session_id
        return driver_temp

    def tf_check_folder_path(new_folder_create=""):
        path = os.path.dirname(os.getcwd())
        path = path + "/" + "source_page_screen_shot_media"
        new_folder = date.today()
        directory = path + "/" + str(new_folder)
        if not os.path.exists(directory):
            os.makedirs(directory)
        if new_folder_create != "":
            directory = directory + "/" + str(new_folder_create)
            if not os.path.exists(directory):
                os.makedirs(directory)
            return directory
        else:
            return directory

    def tf_check_and_rename(file, add=0):
        # Append "_<n>" before the extension until the name is free
        original_file = file
        if add != 0:
            split = file.split(".")
            part_1 = split[0] + "_" + str(add)
            file = ".".join([part_1, split[1]])
        if not os.path.isfile(file):
            os.rename(original_file, file)
        else:
            add += 1
            TumblrReg.tf_check_and_rename(original_file, add)

    def tf_screen_shots(driver, scroll_delay=0.3):
        path = TumblrReg.tf_check_folder_path("screenshot")
        title = driver.title
        if title != "":
            title_length = len(str(title))
            if title_length > 26:
                title = title.replace("@", "").replace("/", "").replace(
                    "$", "").replace(".", "").replace(":", "").replace("|", "")
                title = str(title)[0:25]
        else:
            title = driver.current_url
            title = title.replace("@", "").replace("/", "").replace(
                "$", "").replace(".", "").replace(":", "").replace("|", "")
        file_name = path + "/" + title + ".png"
        if os.path.exists(file_name):
            TumblrReg.tf_check_and_rename(file_name)
        device_pixel_ratio = driver.execute_script(
            'return window.devicePixelRatio')
        total_height = driver.execute_script(
            'return document.body.parentNode.scrollHeight')
        viewport_height = driver.execute_script('return window.innerHeight')
        total_width = driver.execute_script('return document.body.offsetWidth')
        viewport_width = driver.execute_script(
            "return document.body.clientWidth")
        # this implementation assumes viewport_width == total_width
        assert (viewport_width == total_width)
        # scroll the page, take screenshots and save screenshots to slices
        offset = 0  # height
        slices = {}
        while offset < total_height:
            if offset + viewport_height > total_height:
                offset = total_height - viewport_height
            driver.execute_script('window.scrollTo({0}, {1})'.format(0, offset))
            time.sleep(scroll_delay)
            img = Image.open(BytesIO(driver.get_screenshot_as_png()))
            slices[offset] = img
            offset = offset + viewport_height
            if total_height < 10000:
                # lazy-loading pages may grow while scrolling; refresh the total
                update_total_height = driver.execute_script(
                    'return document.body.parentNode.scrollHeight')
                if total_height != update_total_height:
                    total_height = update_total_height
        # combine image slices
        stitched_image = Image.new('RGB',
                                   (total_width * device_pixel_ratio,
                                    total_height * device_pixel_ratio))
        for offset, image in slices.items():
            stitched_image.paste(image, (0, offset * device_pixel_ratio))
        stitched_image.save(file_name)
        driver.execute_script('window.scrollTo({0}, {1})'.format(0, 0))

    def tf_source_code(driver):
        path = TumblrReg.tf_check_folder_path("sourcepage")
        title = driver.title
        if title != "":
            title_length = len(str(title))
            if title_length > 26:
                title = title.replace("@", "").replace("/", "").replace(
                    "$", "").replace(".", "").replace(":", "").replace("|", "")
                title = str(title)[0:25]
            else:
                title = title.replace("@", "").replace("/", "").replace(
                    "$", "").replace(".", "").replace(":", "").replace("|", "")
        else:
            title = driver.current_url
            title = title.replace("@", "").replace("/", "").replace(
                "$", "").replace(".", "").replace(":", "").replace("|", "")
        TumblrReg.tf_File_name = path + "/" + title + ".html"
        if os.path.exists(TumblrReg.tf_File_name):
            TumblrReg.tf_check_and_rename(TumblrReg.tf_File_name)
        pagesource = driver.page_source.encode('ascii', 'ignore')
        soup = BeautifulSoup(pagesource, 'html.parser')
        # Create text file, then write page source to the file
        fh = open(TumblrReg.tf_File_name, 'w')
        fh.write(str(soup.prettify()))
        fh.close()

    def tf_Type_driver_scroller(Chromedriver):
        total_height = Chromedriver.execute_script(
            'return document.body.parentNode.scrollHeight')
        viewport_height = Chromedriver.execute_script(
            'return window.innerHeight')
        total_width = Chromedriver.execute_script(
            'return document.body.offsetWidth')
        viewport_width = Chromedriver.execute_script(
            "return document.body.clientWidth")
        # this implementation assumes viewport_width == total_width
        assert (viewport_width == total_width)
        # scroll the page in viewport-sized steps
        offset = 0  # height
        while offset < total_height:
            if offset + viewport_height > total_height:
                offset = total_height - viewport_height
            Chromedriver.execute_script('window.scrollTo({0}, {1})'.format(
                0, offset))
            time.sleep(Wait_1)
            offset = offset + viewport_height
            if total_height < 10000:
                update_total_height = Chromedriver.execute_script(
                    'return document.body.parentNode.scrollHeight')
                if total_height != update_total_height:
                    total_height = update_total_height
        Chromedriver.execute_script('window.scrollTo({0}, {1})'.format(0, 0))

    # To Url
    def tf_To_url():
        url = 'https://www.tumblr.com/'
        return url

    # Find UI item: locating a UI element and clicking it must happen in the
    # same function. Caching the located element in a separate function can
    # fail in some situations (the reference may go stale after a page update).

    # Click my account
    def tf_Click_Sign_up(Chromedriver):
        try:
            Sign_up = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[1]')
            Sign_up.click()
            time.sleep(Wait_2)
        except:
            pass

    def tf_Type_email(Chromedriver):
        try:
            email = Chromedriver.find_element_by_id('signup_email')
            email.clear()
            email.send_keys("*****@*****.**")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_password(Chromedriver):
        try:
            password = Chromedriver.find_element_by_id('signup_password')
            password.clear()
            password.send_keys("Xysbsg@1238#76Bd")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_user_name(Chromedriver):
        try:
            username = Chromedriver.find_element_by_id('signup_username')
            username.clear()
            username.send_keys("johnbradman2019")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_user_name_suggest(Chromedriver):
        try:
            username_suggest = Select(
                Chromedriver.find_element_by_id('suggested_usernames'))
            username_suggest.select_by_index(0)
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_do_signup(Chromedriver):
        try:
            do_signup = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[3]')
            do_signup.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_Age_Confirm(Chromedriver):
        try:
            age_input = Chromedriver.find_element_by_id("signup_age")
            age_input.clear()
            age_input.send_keys("29")
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_User_TandC(Chromedriver):
        try:
            t_and_c = Chromedriver.find_element_by_id("signup_tos")
            t_and_c.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_do_signup_done(Chromedriver):
        try:
            do_signup = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[4]')
            do_signup.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_recaptch_click(Chromedriver):
        try:
            recaptch = Chromedriver.find_element_by_id("recaptcha-anchor")
            recaptch.click()
            time.sleep(Wait_2)
        except:
            pass

    def tf_do_almost_done(Chromedriver):
        try:
            almost_done = Chromedriver.find_element_by_xpath(
                '//*[@id="signup_forms_submit"]/span[5]/span')
            almost_done.click()
            time.sleep(Wait_1)
        except:
            pass

    def tf_Type_skip_1(Chromedriver):
        try:
            skip_1 = Chromedriver.find_element_by_xpath(
                '//*[@id="onboarding_actions_index"]/div[2]/div[3]/div[2]/button[1]'
            )
            skip_1.click()
            time.sleep(Wait_1)
        except:
            pass
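# The "find and click in the same function" note above is about stale element
# references: a cached WebElement dies once the DOM re-renders. A minimal
# sketch of the re-locate-on-failure pattern in the same Selenium 3 style;
# click_fresh is a hypothetical helper, not part of the class above.
from selenium.common.exceptions import StaleElementReferenceException

def click_fresh(driver, xpath, retries=3):
    for _ in range(retries):
        try:
            # locate and click in one step, never reusing an old reference
            driver.find_element_by_xpath(xpath).click()
            return True
        except StaleElementReferenceException:
            continue  # DOM re-rendered between locate and click; try again
    return False

# e.g. click_fresh(Chromedriver, '//*[@id="signup_forms_submit"]/span[1]')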
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver_option = webdriver.ChromeOptions()
driver_option.add_argument("--incognito")

chromedriver_path = '/home/larri/Downloads/chromedriver'


def create_webdriver():
    return webdriver.Chrome(executable_path=chromedriver_path,
                            chrome_options=driver_option)
# -*- coding: utf-8 -*-
from selenium import webdriver
import time

chrome_opt = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}  # do not load images
chrome_opt.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(
    executable_path="/Users/pengtuo/code/Python/ArticleSpider/chromedriver",
    chrome_options=chrome_opt)

# Simulate a Zhihu login with selenium
browser.get('https://www.zhihu.com/#signin')
browser.find_element_by_css_selector(
    '.view-signin input[name="account"]').send_keys('xxx')
browser.find_element_by_css_selector(
    '.view-signin input[name="password"]').send_keys('xxx')
browser.find_element_by_css_selector('.view-signin button.sign-button').click()

# Simulate a Weibo login with selenium
browser.get('http://weibo.com/')
time.sleep(5)  # wait for the page to load before locating elements
browser.find_element_by_css_selector('input[id="loginname"]').send_keys('xxx')
browser.find_element_by_css_selector(
    '.info_list.password input[node-type="password"]').send_keys('xxx')
browser.find_element_by_css_selector(
    '.info_list.login_btn a[node-type="submitBtn"]').click()
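# The fixed time.sleep(5) before the Weibo login can be an explicit wait that
# returns as soon as the field exists. A minimal sketch reusing the selector
# above; the 10-second ceiling is an assumption.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

login_field = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'input[id="loginname"]')))
login_field.send_keys('xxx')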
def __init__(self, username, password):
    self.browserProfile = webdriver.ChromeOptions()
    # note: the Chrome preference key is 'intl.accept_languages'
    self.browserProfile.add_experimental_option(
        'prefs', {'intl.accept_languages': 'en,en_US'})
    self.browser = webdriver.Chrome('chromedriver.exe',
                                    chrome_options=self.browserProfile)
    self.username = username
    self.password = password
def setUp(self):
    self.options = webdriver.ChromeOptions()
    self.options.add_argument('--incognito')
    self.options.add_argument('--start-maximized')
    self.drivers = []
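# A setUp that collects drivers in self.drivers usually pairs with a tearDown
# that quits them all, so no Chrome process outlives a failed test. A minimal
# sketch assuming the unittest conventions the setUp above implies;
# ChromeTestCase is a hypothetical class name.
import unittest

class ChromeTestCase(unittest.TestCase):
    def setUp(self):
        self.drivers = []

    def tearDown(self):
        # quit every driver opened during the test, even if assertions failed
        for driver in self.drivers:
            driver.quit()
        self.drivers = []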
import pandas as pd
from selenium import webdriver
import time
import os
import chromedriver_binary  # no need to add chromedriver to PATH
import shutil

opciones = webdriver.ChromeOptions()
prefs = {
    'download.default_directory':
    'C:\\Users\\luisb\\Code-Font\\prueba\\dataset\\'
}
opciones.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(options=opciones)
browser.get(
    "https://www.worldbank.org/en/projects-operations/procurement/debarred-firms#"
)
time.sleep(20)  # wait before looking for the button

# check whether the file already exists
if os.path.isfile(
        'C:/Users/luisb/Code-Font/prueba/dataset/Sanctioned individuals and firms.xlsx'
):
    print("Removing previous file...")
    os.remove(
        'C:/Users/luisb/Code-Font/prueba/dataset/Sanctioned individuals and firms.xlsx'
    )
# remove the old copy so the next download is not renamed or corrupted
browser.find_element_by_class_name("k-grid-excel").click()
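# Clicking k-grid-excel only starts the download; instead of guessing with a
# fixed sleep, one can poll the download directory until the file lands. A
# minimal sketch; wait_for_download is a hypothetical helper, and .crdownload
# is assumed as Chrome's suffix for in-progress downloads.
import os
import time

def wait_for_download(path, timeout=60):
    deadline = time.time() + timeout
    while time.time() < deadline:
        # done when the final file exists and no partial .crdownload remains
        if os.path.isfile(path) and not os.path.exists(path + '.crdownload'):
            return True
        time.sleep(1)
    return False

# wait_for_download('C:/Users/luisb/Code-Font/prueba/dataset/Sanctioned individuals and firms.xlsx')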
def Process(self, filepath, name):
    options = webdriver.ChromeOptions()
    profile = {
        "plugins.plugins_list": [{
            "enabled": False,
            "name": "Chrome PDF Viewer"
        }],  # Disable Chrome's PDF Viewer
        "download.default_directory": filepath,
        "download.extensions_to_open": "applications/pdf"
    }
    options.add_experimental_option("prefs", profile)
    # Optional argument; if not specified, Chrome() will search the PATH.
    self._driver = webdriver.Chrome(chrome_options=options)
    wait = WebDriverWait(self._driver, 60)
    self._driver.get('https://www.au10tixportalusa.com/VanillaRest/')
    wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div/article/div[2]/form/button')))

    # log in
    self._driver.find_element_by_name('j_username').send_keys(self._username)
    self._driver.find_element_by_name('j_password').send_keys(self._password)
    self._driver.find_element_by_xpath(
        '/html/body/div/article/div[2]/form/button').click()

    # upload
    wait.until(
        EC.element_to_be_clickable(
            (By.XPATH,
             '/html/body/div/article/ng-view/div/section/div/section/a')))
    self._driver.find_element_by_xpath(
        '/html/body/div/article/ng-view/div/section/div/section/a').click()
    wait.until(
        EC.presence_of_element_located((
            By.XPATH,
            '//*[@id="addFilesOneSide"]/div/div/div[2]/div/div[1]/div/div/div[2]/div/input'
        )))
    self._driver.find_element_by_xpath(
        '//*[@id="addFilesOneSide"]/div/div/div[2]/div/div[1]/div/div/div[2]/div/input'
    ).send_keys(filepath + name)
    wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="addFilesOneSide"]/div/div/div[3]/button[2]')))
    self._driver.find_element_by_xpath(
        '//*[@id="addFilesOneSide"]/div/div/div[3]/button[2]').click()
    wait.until(
        EC.element_to_be_clickable(
            (By.XPATH,
             '//*[@id="mainTable"]/table/tbody/tr[1]/td[1]/div/label')))
    self._driver.find_element_by_xpath(
        '//*[@id="mainTable"]/table/tbody/tr/td[3]').click()

    # the result opens in a new window; switch to it
    handles = self._driver.window_handles
    self._driver.switch_to.window(handles[1])
    wait.until(
        EC.presence_of_element_located(
            (By.XPATH,
             '/html/body/div/article/ng-view/div/section[1]/div/span')))
    self._this_page = self._driver.page_source
    time.sleep(1)
    # wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div/article/ng-view/div/section[2]/div[2]')))
    try:
        self._driver.find_element_by_xpath(
            '/html/body/div/article/ng-view/div/section[2]/div[2]').click()
        # WebDriverWait(self._driver, 60).until(lambda x: x.find_element_by_xpath('/html/body/div/article/ng-view/div/section[2]/div[2]')).click()
        self._driver.find_element_by_xpath(
            '/html/body/div/article/ng-view/div/section[2]/div[2]/div/ul/li/a'
        ).click()
        self._result = BeautifulSoup(self._this_page,
                                     'lxml').find('span',
                                                  class_='ng-binding').text
        wait.until(
            EC.presence_of_element_located((By.XPATH, "//*[text()='PDF']")))
        # print(str(BeautifulSoup(self._this_page,'lxml').find(text='PDF').parent))
        self._downloadname = re.findall(
            r"\d/(.+).pdf",
            str(BeautifulSoup(self._this_page,
                              'lxml').find(text='PDF').parent))[0]
        # self._downloadname = re.findall(r"\d/(.+).pdf", pdf_url)[0]
        # old = max([f for f in os.listdir(filepath)], key=os.path.getctime)
        old = filepath + self._downloadname + '.pdf'
        # print(old)
        while not os.path.exists(old):
            time.sleep(1)
        newfilepath = filepath + os.path.basename(
            os.path.dirname(filepath)) + '-Au10tix.pdf'
        if not os.path.exists(newfilepath):
            os.rename(old, newfilepath)
        else:
            print('File ' + os.path.basename(os.path.dirname(filepath)) +
                  '-Au10tix.pdf' + ' exists.')
    except:
        self._result = 'aborted'
        print('Processing Request Rejected')
    self._driver.quit()
# import web driver
from selenium import webdriver
from parsel import Selector
import urllib
import os
import sched
import time
from selenium.webdriver.common.keys import Keys

OUTPUT_FOLDER = 'real_captcha_dataset'

# specifies the options for chromedriver
options = webdriver.ChromeOptions()
# options.add_argument('--headless')

url = 'https://www.tis.bizfile.gov.sg'
driver = webdriver.Chrome(
    '/Users/merlinegalite/Desktop/octobot/Scraping/LinkedInScraping/chromedriver',
    options=options)
driver.get(
    'https://www.bizfile.gov.sg/ngbbizfileinternet/faces/oracle/webcenter/portalapp/pages/BizfileHomepage.jspx?_afrWindowId=null&_afrLoop=11499874782621942&_afrWindowMode=0&_adf.ctrl-state=10irrn140w_4#%40%3F_afrWindowId%3Dnull%26_afrLoop%3D11499874782621942%26_afrWindowMode%3D0%26_adf.ctrl-state%3D2w324sfb3_4'
)
query_button = driver.find_element_by_xpath(
    '//*[@class="search_Icon2 af_commandImageLink p_AFTextOnly"]')
query_button.click()
time.sleep(4)
sel = Selector(text=driver.page_source)
def __init__(self):
    scrapy.Spider.__init__(self)
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    self.browser = webdriver.Chrome('chromedriver', chrome_options=options)