def deleteEmail(email, selection, system):
    # `index_url` and `password` are assumed to be module-level globals.
    driver = Chrome("chromedriver")
    driver.get(index_url)
    driver.find_element_by_id(selection).click()
    time.sleep(1)
    driver.find_element_by_id("email").send_keys(email)
    driver.find_element_by_id("password").send_keys(password)
    if system is not None:
        driver.find_element_by_id(system).click()
    driver.find_element_by_id("signInButton").click()
    time.sleep(4)
    driver.switch_to.alert.accept()
    time.sleep(1)
    driver.switch_to.alert.accept()
    emailNum = driver.find_elements_by_class_name("deleteKey")
    for i in range(len(emailNum)):
        # After each deletion the remaining rows shift up, so the first
        # delete button is always "deleteKey0".
        delete = driver.find_element_by_id("deleteKey0")
        delete.click()
        driver.switch_to.alert.accept()
        time.sleep(4)
        driver.switch_to.alert.accept()
    emailNum = driver.find_elements_by_class_name("deleteKey")
    assert len(emailNum) == 0, "Not all emails deleted"
    time.sleep(1)
    driver.close()
def __init__(self):
    self.list_urls = []
    for area in neighbourhoods:
        try:
            chromedriver_path = '/Users/RitaFigueiredo/Documents/chromedriver'
            driver = Chrome(chromedriver_path)
            url = 'https://rightmove.co.uk'
            driver.get(url)
            search = WebDriverWait(driver, DELAY).until(
                EC.presence_of_element_located((By.ID, 'searchLocation')))
            search.send_keys(area)
            rent = driver.find_element_by_id('rent')
            rent.click()
            time.sleep(1)
            if driver.find_element_by_id('locationIdentifier').is_displayed():
                select_location = Select(
                    driver.find_element_by_id('locationIdentifier'))
                select_location.select_by_index(0)
            select_min_bedrooms = Select(
                driver.find_element_by_id('minBedrooms'))
            select_min_bedrooms.select_by_value('0')
            select_max_bedrooms = Select(
                driver.find_element_by_id('maxBedrooms'))
            select_max_bedrooms.select_by_value('2')
            select_added_to_site = Select(
                driver.find_element_by_id('maxDaysSinceAdded'))
            select_added_to_site.select_by_value('1')
            rent = driver.find_element_by_id('submit')
            rent.click()
            time.sleep(2)
            home_links = driver.find_elements_by_class_name('propertyCard-link')
            for link in home_links:
                self.list_urls.append(link.get_attribute('href'))
            time.sleep(2)
            while True:
                next_button = driver.find_element_by_class_name(
                    'pagination-button.pagination-direction.pagination-direction--next')
                if not next_button.is_enabled():
                    break
                next_button.click()
                time.sleep(3)
                home_links = driver.find_elements_by_class_name('propertyCard-link')
                for link in home_links:
                    self.list_urls.append(link.get_attribute('href'))
                time.sleep(3)
        except TimeoutException:
            print("Loading took too much time!")
            time.sleep(2)
class RecipeScrape:
    def __init__(self, recipe_link):
        self.recipe_link = recipe_link
        options = webdriver.ChromeOptions()
        if IS_HEADLESS_BROWSER:
            options.add_argument('--headless')
        # Skip image loading to speed up page loads.
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        options.add_argument("--disable-gpu")
        # options.add_argument("--no-sandbox")
        # options.add_argument("start-maximized")
        # options.add_argument("enable-automation")
        options.add_argument("--disable-infobars")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-popup-blocking")
        if IS_LOCAL_CHROMEDRIVER:
            self.driver = Chrome(WEBDRIVER_FILE, options=options)
        else:
            self.driver = Chrome(options=options)

    def scrape(self):
        try:
            self.driver.get(self.recipe_link)
            print("URL opened: {}".format(self.recipe_link))
            if "<body></body>" in self.driver.page_source:
                raise InstanceIPBlacklistedException()
            self.determine_scrape_version()
            self.scrape_version.scrape(self.driver)
        finally:
            self.driver.quit()

    def determine_scrape_version(self):
        # We use some arbitrary indicator on the page to determine the version,
        # which may not be entirely robust; may want to find a better indicator later.
        if len(self.driver.find_elements_by_class_name("author-block")) > 0:
            self.scrape_version = ScrapeRecipeV1(MAX_REVIEW_SCRAPE_PER_RECIPE)
        elif len(self.driver.find_elements_by_class_name("recipe-container-outer")) > 0:
            self.scrape_version = ScrapeRecipeV2(MAX_REVIEW_SCRAPE_PER_RECIPE)
        else:
            raise Exception("Invalid or unexpected version of recipe page")
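# A minimal usage sketch for RecipeScrape. The config constants and the custom
# exception below are assumptions; the class above only references them.
IS_HEADLESS_BROWSER = True
IS_LOCAL_CHROMEDRIVER = False
WEBDRIVER_FILE = "chromedriver"  # hypothetical local driver path
MAX_REVIEW_SCRAPE_PER_RECIPE = 50

class InstanceIPBlacklistedException(Exception):
    """Raised when the site returns an empty body, suggesting this IP is blocked."""

# scraper = RecipeScrape("https://www.allrecipes.com/recipe/12345/")  # hypothetical URL
# scraper.scrape()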
def _get_product_image(chrome_driver: webdriver.Chrome) -> str:
    """
    Retrieve product image URL

    :param chrome_driver: chrome web driver instance
    :return: product image url
    """
    product_photos = chrome_driver.find_elements_by_class_name(
        config["PRODUCT_PHOTOS"])
    item = None
    if len(product_photos) != 1:
        product_photos.reverse()
    while product_photos and not item:
        for product_photo in product_photos:
            hover_to_photos(chrome_driver, product_photo)
            soup = BeautifulSoup(chrome_driver.page_source, "html.parser")
            item = soup.find(class_=config["PRODUCT_DETAIL_PHOTO"])
            if item:
                break
        else:
            # No detail photo appeared for any thumbnail; drop one and retry.
            product_photos.pop()
    item_image = item.get("style") if item else None
    if item_image:
        # The style looks like 'background-image: url("...")'; take the quoted URL.
        item_image = item_image.split(" ")[1].split('"')[1]
        return item_image
    return ""
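# _get_product_image relies on a hover_to_photos helper that is not shown.
# A minimal sketch using Selenium's ActionChains, assuming hovering a
# thumbnail is what makes the large detail photo render:
from selenium.webdriver.common.action_chains import ActionChains

def hover_to_photos(chrome_driver, photo_element):
    # Move the mouse over the thumbnail so the site swaps in the detail photo.
    ActionChains(chrome_driver).move_to_element(photo_element).perform()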
def get_ad_info_from_link(url: str, browser: webdriver.Chrome):
    """
    With the link to an ad's page on Google Transparency Report (from the csv),
    get the assets, visible URL and description for the ad.

    :param url: link to the ad's page on the Transparency Report
    :param browser: Chrome driver instance to load the page with
    :return: (assets, url, desc) tuple, or None if the page has no ad container
    """
    browser.get(url)
    time.sleep(1.5)
    el_ad_ctnr = browser.find_elements_by_class_name('ad-container')
    if not el_ad_ctnr:
        return None
    soup_ad_ctnr = BeautifulSoup(el_ad_ctnr[0].get_attribute("innerHTML"),
                                 'html.parser')
    _all_divs = soup_ad_ctnr.find_all("div")
    if len(_all_divs) != 4:
        print("Malformed Webpage: {}".format(url))
        return None
    assets = _all_divs[0].get_text()
    url = _all_divs[1].get_text()
    url = url.split(" ")[1]
    desc = _all_divs[3].get_text()
    return assets, url, desc
def push_replay_button(driver: webdriver.Chrome):
    # Dismiss the prize overlay if it is showing.
    try:
        close_button = WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((
                By.CSS_SELECTOR,
                "div.prize-overlay div.popup-box div.popup-header button.close-button",
            )))
        button_click(driver, close_button)
    except TimeoutException:
        pass
    # Accept the checkbox and press the main button if present.
    try:
        checkbox_element = WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "span.mc-checkmark")))
        button_click(driver, checkbox_element)
        main_button = driver.find_elements_by_class_name("main-button")[0]
        button_click(driver, main_button)
    except TimeoutException:
        pass
    # Finally, press the last custom button (the replay action).
    try:
        custom_buttons = WebDriverWait(driver, 5).until(
            EC.visibility_of_all_elements_located(
                (By.CSS_SELECTOR, "button.custom-button")))
        custom_button = custom_buttons[-1]
        button_click(driver, custom_button)
    except TimeoutException:
        pass
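# push_replay_button and start_quiz both call a button_click helper that is
# not shown. A minimal sketch, assuming a JavaScript click is used to avoid
# "element not interactable" issues with overlays:
def button_click(driver, element):
    # Scroll the element into view, then click it via JavaScript.
    driver.execute_script("arguments[0].scrollIntoView(true);", element)
    driver.execute_script("arguments[0].click();", element)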
def _sanity_checks(driver: webdriver.Chrome):
    elems = driver.find_elements_by_class_name("igc-legend-label")
    labels = [e.text for e in elems]
    # Legend labels are in Danish: "received 1st vaccine" / "received 2nd vaccine".
    if len(labels) < 2 or labels[0] != "Fået 1. vaccine" or labels[1] != "Modtaget 2. vaccine*":
        raise Exception(
            "First graph structure has changed. Consider manually checking "
            "the axis labels in the browser.")
def lookup_game_price(driver: webdriver.Chrome, game_name: str):
    driver.get("https://www.g2a.com/")
    wait_set_time_plus_random_time(2)
    cookie_clicker(driver)
    driver.get(
        f"https://www.g2a.com/search?query={game_name.replace(' ', '%2B')}")
    WebDriverWait(driver, 800).until(
        expected_conditions.visibility_of_element_located(
            (By.CLASS_NAME, "offers-list")))
    print("found offer site")
    offer_list_elements = driver.find_elements_by_class_name(
        "offers-list__element")
    for list_element in offer_list_elements:
        transactions = int(
            list_element.find_element_by_class_name(
                "seller-info__transactions").text)
        print(transactions)
        # Only trust offers from sellers with enough completed transactions.
        if transactions > min_transactions:
            offer_price = list_element.find_element_by_class_name(
                "offer__price").text
            print(offer_price)
            return offer_price
    wait_set_time_plus_random_time(10)
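# lookup_game_price assumes two helpers that are not shown. A plausible
# sketch: a jittered sleep to look less bot-like, and a cookie-banner
# dismisser (the selector here is a guess, not the real one):
import random
import time

from selenium.common.exceptions import NoSuchElementException

def wait_set_time_plus_random_time(seconds: float):
    # Sleep for the given time plus up to one extra second of jitter.
    time.sleep(seconds + random.random())

def cookie_clicker(driver):
    # Dismiss the cookie consent banner if one is present.
    try:
        driver.find_element_by_css_selector("button[aria-label='Accept']").click()  # hypothetical selector
    except NoSuchElementException:
        pass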
def on_button_clicked():
    browser = Chrome(options=opts)
    for word in sentence.split():
        data['word'] = []
        browser.get('https://www.megalobiz.com/lrc/maker/download-music-lyrics-lrc-generated-files')
        search_form = browser.find_element_by_id('lrc_search_phrase')
        search_form.send_keys(word)
        time.sleep(2)
        # results = browser.find_elements_by_class_name("lyrics_member_box")
        lyricsList = browser.find_elements_by_class_name("more_content")
        for lyrics in lyricsList:
            lyrics.click()
        # print(results[0].text)
        download_button = browser.find_elements_by_class_name("lyrics_button")
        download_button[0].click()
        time.sleep(1)
        listofFiles = glob.glob(os.path.join(lyricPath, "*"))
        latestFile = max(listofFiles, key=os.path.getctime)
        # Keep only the file name from the Windows-style path.
        latestFile = latestFile.rsplit('\\', 1)[-1]
        print(latestFile)
def get_games_id(comp):
    dates = [d for d in date_range(START_DATE, END_DATE)]
    games_id = []
    chrome_options = Options()
    chrome_options.add_argument('--dns-prefetch-disable')
    driver = Chrome(chrome_options=chrome_options)
    for day in dates:
        driver.get(
            'http://www.espn.com.ar/futbol/resultados/_/liga/{}/fecha/{}'.format(
                comp, day))
        game_link_driver = driver.find_elements_by_class_name(
            'mobileScoreboardLink')
        for game_driver in game_link_driver:
            # The game id is a fixed-width slice of the link URL.
            game_id = game_driver.get_attribute('href')[46:53]
            games_id.append((game_id, day))
    driver.quit()
    # print(games_id)
    return games_id
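# get_games_id iterates over a date_range helper that is not shown. A minimal
# sketch, assuming the ESPN URL expects dates formatted as YYYYMMDD:
from datetime import date, timedelta

def date_range(start: date, end: date):
    # Yield every day from start to end (inclusive) as a YYYYMMDD string.
    for offset in range((end - start).days + 1):
        yield (start + timedelta(days=offset)).strftime('%Y%m%d')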
def insert_jd_target_sku(browser: Chrome):
    elements = browser.find_elements_by_class_name('gl-item')
    print(f'The current page has {len(elements)} products')
    for element in elements:
        # Get the SKU number of the current product.
        current_sku: str = element.get_attribute('data-sku')
        TargetSku.get_or_create(source='京东', sku=current_sku)
def scroll(driver: Chrome, wait: WebDriverWait):
    scroll_div = wait.until(
        EC.visibility_of_element_located((By.CLASS_NAME, SCROLLBOX)))
    reviews_count = len(driver.find_elements_by_class_name(REVIEW))
    timeout = time.time() + SCROLL_TIMEOUT
    while time.time() < timeout:
        driver.execute_script(SCROLLING_SCRIPT, scroll_div)
        time.sleep(1)
        scrolled_reviews_count = len(
            driver.find_elements_by_class_name(REVIEW))
        if reviews_count < scrolled_reviews_count:
            # New reviews loaded; extend the deadline and keep scrolling.
            reviews_count = scrolled_reviews_count
            timeout = time.time() + SCROLL_TIMEOUT
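# scroll references module-level constants that are not shown. Plausible
# values (the class names and script below are assumptions, not the real ones):
SCROLLBOX = 'section-scrollbox'   # hypothetical class of the scrollable pane
REVIEW = 'section-review'         # hypothetical class of one review entry
SCROLL_TIMEOUT = 10               # seconds without new reviews before giving up
# Scroll the given element to its bottom to trigger lazy loading.
SCROLLING_SCRIPT = 'arguments[0].scrollTop = arguments[0].scrollHeight;'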
def start_quiz(driver: webdriver.Chrome, status: int):
    try:
        checkbox_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "span.mc-checkmark")))
        button_click(driver, checkbox_element)
        main_button = driver.find_elements_by_class_name("main-button")[0]
        button_click(driver, main_button)
        name_input = WebDriverWait(driver, 2).until(
            EC.visibility_of_element_located(
                (By.CSS_SELECTOR, "input.name-input")))
        button_click(driver, name_input)
        name_input.send_keys(get_random_word())
    except (NoSuchElementException, TimeoutException):
        pass
    if status > -3:
        try:
            custom_button = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located(
                    (By.CSS_SELECTOR, "button.custom-button")))
            button_click(driver, custom_button)
        except TimeoutException:
            pass
def test_item_has_add_to_cart_button(browser: webdriver.Chrome):
    browser.get(
        "http://selenium1py.pythonanywhere.com/catalogue/coders-at-work_207/")
    time.sleep(30)
    count_basket_btn = len(
        browser.find_elements_by_class_name("btn-add-to-basket"))
    assert count_basket_btn == 1, f"Expected 1 basket button, got {count_basket_btn}"
def set_bundle_data(driver: webdriver.Chrome, bundle: Bundle) -> None:
    driver.get(bundle.url)
    try:
        bundle.name = driver.find_element_by_class_name('hero-title').text
        bundle.site = 'HumbleBundle'
        game_name_containers = driver.find_elements_by_class_name('dd-image-box-text')
        opts = Main.get_chromedriver_options()
        with managed_chromedriver(opts) as nested_driver:
            nested_driver.set_window_size(1800, 1070)
            nested_driver.implicitly_wait(2)
            for container in game_name_containers:
                game = Game()
                game.site = 'HumbleBundle'
                game.name = container.text
                game.review_count = SteamHandler.get_game_review_number(game.name, nested_driver)
                game.g2a_price = G2AHandler.get_price_of(game, nested_driver)
                bundle.games.append(game)
                print(f'G2A price of {game.name}: {str(game.g2a_price)}')
        bundle.sale_price = _get_total_sale_price(driver)
        for game in bundle.games:
            bundle.g2a_price += game.g2a_price
            bundle.after_commission_price += Utility.calculate_net_price(game.g2a_price)
        Product.set_bundle_meta_data(bundle)
    except NoSuchElementException:
        logging.getLogger().info(
            f"Can't find games in bundle with url {bundle.url}, it might be a Humble Monthly")
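# set_bundle_data uses a managed_chromedriver context manager that is not
# shown. A minimal sketch, assuming it simply owns the driver's lifetime:
from contextlib import contextmanager

from selenium.webdriver import Chrome

@contextmanager
def managed_chromedriver(options):
    # Start a Chrome instance and guarantee it is torn down afterwards.
    driver = Chrome(options=options)
    try:
        yield driver
    finally:
        driver.quit()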
class LoginTest(TestCase):
    def setUp(self):
        self.driver = Chrome()

    def tearDown(self):
        self.driver.close()

    def test_log_in_as_admin(self):
        self.driver.get('http://localhost:8000/admin/')
        # Credentials are site data for a Russian-locale admin ("Administrator" / "Admin_123").
        self.driver.find_element_by_name('username').send_keys('Администратор')
        password_input = self.driver.find_element_by_name('password')
        password_input.send_keys('Админ_123')
        password_input.send_keys(Keys.ENTER)
        assert self.driver.current_url == 'http://localhost:8000/admin/'

    def test_login_error(self):
        self.driver.get('http://localhost:8000/admin/')
        self.driver.find_element_by_name('username').send_keys('А')
        password_input = self.driver.find_element_by_name('password')
        password_input.send_keys('А')
        password_input.send_keys(Keys.ENTER)
        assert self.driver.current_url == 'http://localhost:8000/admin/login/?next=/admin/'
        error = self.driver.find_elements_by_class_name('errornote')[0]
        # Expected text: "Please enter the correct username and password for a
        # staff account. Note that both fields may be case-sensitive."
        assert error.text == "Пожалуйста, введите корректные имя пользователя и" \
                             " пароль учётной записи. Оба поля могут быть чувствительны к регистру."
def search_1(a, b, c):
    """
    :param a: movie name
    :param b: empty list that will be filled with the scraped values
    :param c: empty dict mapping the movie name to that list
    :return: c, the filled dict
    """
    browser = Chrome(options=opts)
    browser.get('https://duckduckgo.com')
    search_form = browser.find_element_by_id('search_form_input_homepage')
    # "кинопоиск" (Kinopoisk) narrows the search to the Russian movie database.
    search_form.send_keys(a + ' кинопоиск')
    search_form.submit()
    browser.find_element_by_id('r1-0').click()
    browser.find_element_by_class_name('styles_link__21QPj').click()
    results = browser.find_elements_by_class_name('gray')
    for r in results:
        r = r.text.strip().lower()
        if r == '' or r in b:
            continue
        b.append(r)
    browser.close()
    c[a] = b
    return c
def _parse_boosters(self, driver: webdriver.Chrome) -> int:
    elems = driver.find_elements_by_class_name("number_revac_info")
    # Pick the element whose parent mentions "Всего" ("Total").
    elem = [
        e for e in elems if "Всего" in e.find_element_by_xpath("..").text
    ][0]
    total_boosters = clean_count(elem.text)
    return total_boosters
def crawl(driver: webdriver.Chrome) -> List[Game]:
    driver.get('https://www.humblebundle.com/store/search?sort=bestselling&filter=onsale')
    game_list = []
    game_boxes = driver.find_elements_by_class_name('entity-link')
    game_list.append(_get_filled_game(game_boxes))
    counter = 0
    while True:
        counter += 1
        try:
            driver.get(f'https://www.humblebundle.com/store/search?sort=bestselling&filter=onsale&page={counter}')
            time.sleep(2)
            # Raises NoSuchElementException once we run past the last page.
            driver.find_element_by_class_name('entity-link')
            game_boxes = driver.find_elements_by_class_name('entity-link')
            game_list.append(_get_filled_game(game_boxes))
        except NoSuchElementException:
            logging.getLogger().info(
                f'Found {len(game_list)} games on {counter} pages on HumbleBundle')
            return game_list
def _get_total_sale_price(driver: webdriver.Chrome) -> Decimal:
    headlines = driver.find_elements_by_class_name('dd-header-headline')
    prices = []
    for headline in headlines:
        # Match euro amounts such as "€1" or "€1.99" in the headline text.
        matches = re.findall('€\\d.?\\d?\\d?', headline.text)
        if len(matches) > 0:
            prices.append(Decimal(matches[0].replace('€', '')))
    return max(prices)
def scrape_sites():
    websites = pd.read_csv("lsa.csv")
    lsa_websites = []
    lsa_dict = {
        'website': [],
        'low': [],
        'medium': [],
        'high': [],
        'score': []
    }
    for website in websites["LSA Sites"]:
        lsa_websites.append(website)
        URL = f"https://www.accessi.org/?{website}"
        options = Options()
        options.headless = True
        driver = Chrome(chrome_options=options)
        driver.get(URL)
        sleep(15)
        high_score = 0
        high = 0
        medium_score = 0
        medium = 0
        low_score = 0
        low = 0
        cats = driver.find_elements_by_class_name("test-result-block")
        for i in cats:
            if i.get_attribute("style") == "":
                header = i.find_element_by_class_name("test-result-header")
                num_impacts = header.find_element_by_class_name("test-result-count")
                num_impacts = str(num_impacts.text)
                # Weight impacts: high counts triple, medium double, low single.
                if "high" in num_impacts:
                    number = num_impacts.split(" ")[0]
                    high += int(number)
                    high_score += 3 * int(number)
                elif "medium" in num_impacts:
                    number = num_impacts.split(" ")[0]
                    medium += int(number)
                    medium_score += 2 * int(number)
                elif "low" in num_impacts:
                    number = num_impacts.split(" ")[0]
                    low += int(number)
                    low_score += int(number)
        lsa_dict["website"].append(website)
        lsa_dict["low"].append(low)
        lsa_dict["medium"].append(medium)
        lsa_dict["high"].append(high)
        lsa_dict["score"].append(medium_score + high_score)
        print(website)
        print("low:", low)
        print("medium:", medium)
        print("high:", high)
        print("final score:", medium_score + high_score)
        print()
        driver.close()
    df = pd.DataFrame(lsa_dict)
    df.to_csv("output.csv")
def insert_sn_target_sku(browser: Chrome):
    elements = browser.find_elements_by_class_name('item-wrap')
    print(f'The current page has {len(elements)} products')
    for element in elements:
        # The element id encodes "<shop_code>-<sku>" for the current product.
        content = element.get_attribute('id').split('-')
        shop_code = content[0]
        sku = content[1]
        SNTargetSku.get_or_create(shop_code=shop_code, sku=sku)
def __init__(self, driver: webdriver.Chrome):
    print("")
    # Find pizza images by class name.
    imagesTest = driver.find_elements_by_class_name("pizzaPictures")
    # Loop through all the items in the div and print their source.
    for item in imagesTest:
        print(item.get_attribute("src"))
def main():
    webdriver_path = os.path.join(r"drive", "chromedriver")
    driver = Chrome(webdriver_path)
    url = "https://www.waytostay.com/paris-apartments/"
    driver.get(url)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    info = driver.find_elements_by_class_name("tile")
    prices = []
    details = []
    for j in range(len(info)):
        prices.append(driver.find_elements_by_class_name('price-person')[j].text)
        details.append(driver.find_elements_by_class_name('concise-details')[j].text)
    print(prices, details)
    driver.quit()
def _find_right_game(game: Game, driver: webdriver.Chrome, search_query):
    time.sleep(2)
    product_grids = driver.find_elements_by_class_name('products-grid__item')
    for product_grid in product_grids:
        if _find_proper_card(product_grid, game.name, search_query):
            time.sleep(1)
            offers = driver.find_elements_by_class_name('offer')
            for offer in offers:
                if 'Official developer' not in offer.get_attribute('innerHTML'):
                    try:
                        rating_count = _get_g2a_rating_count(offer)
                    except ValueError:
                        continue
                    # Accept any seller for niche games; otherwise require
                    # a well-rated seller.
                    if game.review_count < 500:
                        return _get_price(offer)
                    if rating_count > 1000:
                        return _get_price(offer)
    return Decimal(0)
def parse_friends(driver: Chrome, user_link: str) -> List[User]:
    sleep(delay())
    driver.get(f"{user_link}/friends")
    driver.find_element_by_tag_name("body").send_keys(Keys.ESCAPE)
    scroll_page(driver, driver.execute_script("return document.body.scrollHeight"))
    friend_items = driver.find_elements_by_class_name("_698")
    friend_list = [User(friend_item) for friend_item in friend_items]
    return friend_list
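# parse_friends and parse_jobs rely on delay and scroll_page helpers that are
# not shown. A plausible sketch: a randomized pause, and a loop that keeps
# scrolling until the page height stops growing.
import random
from time import sleep

def delay() -> float:
    # Random pause between requests to look less bot-like.
    return random.uniform(1.0, 3.0)

def scroll_page(driver, last_height: int):
    # Scroll to the bottom until no new content is loaded.
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(delay())
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height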
def fetch():
    options = ChromeOptions()
    options.add_argument('headless')
    driver = Chrome(
        executable_path="/Users/tuanthanhtran/Desktop/training/bug-bounty/hackerone-reports/chromedriver",
        options=options)
    reports = []
    with open('data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            reports.append(dict(row))
    first_report_link = reports[0]['link']
    driver.get(hacktivity_url)
    driver.implicitly_wait(page_loading_timeout)
    counter = 0
    page = 0
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(page_loading_timeout)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            counter += 1
            if counter > 1:
                break
        else:
            counter = 0
        last_height = new_height
        raw_reports = driver.find_elements_by_class_name('fade')
        new_reports = extract_reports(raw_reports)
        # Stop once we reach the newest report we already have on disk.
        found = False
        for i in range(len(new_reports)):
            if new_reports[i]['link'] == first_report_link:
                reports = new_reports[:i] + reports
                found = True
                break
        if found:
            break
        page += 1
        print('Page:', page)
    driver.close()
    with open('data.csv', 'w', newline='', encoding='utf-8') as file:
        keys = reports[0].keys()
        writer = csv.DictWriter(file, fieldnames=keys)
        writer.writeheader()
        writer.writerows(reports)
def test_one(self):
    browser = Chrome('/usr/local/bin/chromedriver')
    browser.get('https://duckduckgo.com')
    search_form = browser.find_element_by_id('search_form_input_homepage')
    search_form.send_keys('real python')
    search_form.submit()
    results = browser.find_elements_by_class_name('result')
    assert 'https://realpython.com' in results[0].text
    browser.close()
def parse_jobs(driver: Chrome, user_link: str) -> List[str]:
    sleep(delay())
    driver.get(user_link)
    # Close the pop-up offering notifications.
    driver.find_element_by_tag_name("body").send_keys(Keys.ESCAPE)
    driver.find_element(By.XPATH, "//a[@data-tab-key='about']").click()
    sleep(delay())
    driver.find_element(By.XPATH, "//a[@data-testid='nav_edu_work']").click()
    sleep(delay())
    experience_list = driver.find_elements_by_class_name("experience")
    jobs = [parse_job(item) for item in experience_list]
    return jobs
def test_search():
    browser = Chrome()
    browser.get('https://duckduckgo.com')
    search_form = browser.find_element_by_id('search_form_input_homepage')
    search_form.send_keys('real python')
    search_form.submit()
    results = browser.find_elements_by_class_name('result')
    first_result_text = results[0].text
    print(first_result_text)
    browser.close()
    # Brittle: asserts the exact snippet text of the first result.
    assert first_result_text == (
        'Python Tutorials - Real Python\n'
        'Python "while" Loops (Indefinite Iteration) In this tutorial, you\'ll learn '
        'about indefinite iteration using the Python while loop. You\'ll be able to '
        'construct basic and complex while loops, interrupt loop execution with break '
        'and continue, use the else clause with a while loop, and deal with infinite '
        'loops.\nhttps://realpython.com')
def extract_publication(profile_url, verbose=verbose_citation_list):
    """
    Crawl the publication list from a Google Scholar profile.

    @param[in] profile_url the link of the Google Scholar profile you want to crawl
    @param[in] verbose the level of information you want to scrape. By default,
               we scrape the detailed citation list for each publication.
    @return the publications as a dict, where each entry is a dictionary
    """
    # Scholar's article list.
    browser = Browser()
    browser.get(profile_url)
    publication = {}
    while True:
        publication_list = browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title = publication_item.find_element_by_class_name('gsc_a_at').text
            print(title)
            author = publication_item.find_elements_by_class_name('gs_gray')[0].text.split(', ')
            vendor = publication_item.find_elements_by_class_name('gs_gray')[1].text
            try:
                citation = int(publication_item.find_element_by_class_name('gsc_a_ac').text)
                link = publication_item.find_element_by_class_name('gsc_a_ac').get_attribute('href')
            except Exception:
                citation = 0
                link = None
            try:
                year = int(publication_item.find_element_by_class_name('gsc_a_h').text)
            except Exception:
                year = None
            # To get the citation list for every paper, but this gets detected as a robot:
            # if citation > 0 and verbose >= verbose_citation_list:
            #     print('and its citation list')
            #     # time.sleep(2)  # tried to dodge the anti-crawl check, did not work
            #     cited_by = extract_citation_for_publication(link)
            # else:
            #     cited_by = None
            # print('finished')
            publication[title] = {'link': link, 'author': author, 'vendor': vendor,
                                  'citation': citation, 'cited by': citation, 'year': year}
        if not next_page_new(browser):
            break
    browser.close()
    return publication
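# extract_publication pages through results with a next_page_new helper that
# is not shown. A minimal sketch, assuming Scholar's "show more" button
# (id gsc_bpf_more) is what advances the list:
import time

def next_page_new(browser) -> bool:
    # Click the "show more" button; report whether another page was loaded.
    button = browser.find_element_by_id('gsc_bpf_more')
    if not button.is_enabled():
        return False
    button.click()
    time.sleep(1)
    return True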
def extract_movies(max_page_num=5):
    browser = Browser()
    browser.get(URL)
    movies = {}
    while True:
        movie_list = browser.find_elements_by_class_name("item")
        for movie in movie_list:
            title = movie.find_element_by_tag_name("p").text.strip()
            rating = movie.find_element_by_tag_name("strong").text.strip()
            movies[title] = rating
        if max_page_num > 0:
            max_page_num -= 1
            if not have_more(browser):
                break
        else:
            break
    browser.close()
    return movies
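# extract_movies advances pages with a have_more helper that is not shown.
# A plausible sketch, assuming a "load more" control (the class name here is
# a guess, not the real one):
import time

def have_more(browser) -> bool:
    # Click the "load more" control if it exists; report whether it did.
    more_links = browser.find_elements_by_class_name("more")  # hypothetical class
    if not more_links or not more_links[0].is_displayed():
        return False
    more_links[0].click()
    time.sleep(1)
    return True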
def extract_hongren(max_page_num=5):
    suffix = "hongren"
    # Normally it is enough to add the chromedriver directory to the PATH
    # environment variable, but for some reason that did not work here,
    # so the driver simply lives next to the code.
    browser = Browser('chromedriver.exe')
    browser.get(BASE_URL + suffix)
    items = {}
    while True:
        item_list = browser.find_elements_by_class_name('wall_item')
        for item in item_list:
            href = item.find_element(By.CSS_SELECTOR, ".pic_box.pic").get_attribute("href")
            desc = item.find_elements_by_class_name("desc")[0].text.strip()
            items[href] = desc
        if max_page_num > 0:
            max_page_num -= 1
            if not scroll_to_next(browser):
                break
        else:
            break
    browser.close()
    return items