def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1):
    """Search Google Images for *query* and collect up to *max_links_to_fetch* image URLs.

    :param query: search term to look up
    :param max_links_to_fetch: stop once this many distinct URLs are collected
    :param wd: an already-started selenium webdriver
    :param sleep_between_interactions: seconds to wait after each scroll/click
    :return: set of image source URLs
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()  # a set de-duplicates URLs seen across passes
    image_count = 0
    results_start = 0  # index of the first thumbnail not yet processed
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        # NOTE(review): "img.rg_ic" / ".ksb" are old Google Images class names
        # (sibling functions use "img.Q4LuWd" / ".mye4qd"); verify they still
        # match the current page layout.
        thumbnail_results = wd.find_elements_by_css_selector("img.rg_ic")
        number_results = len(thumbnail_results)

        print(
            f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}"
        )

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls of the opened full-size image
            actual_images = wd.find_elements_by_css_selector('img.irc_mi')
            for actual_image in actual_images:
                if actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # for-else: runs only when the for-loop finished without `break`,
            # i.e. this pass did not reach the target count — load more results.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(1)
            # NOTE(review): find_element_* raises NoSuchElementException when the
            # button is absent — the `if` guard never sees a falsy value.
            load_more_button = wd.find_element_by_css_selector(".ksb")
            if load_more_button:
                wd.execute_script("document.querySelector('.ksb').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1): #search for query wd.get('https://www.google.ae/imghp?hl=en&ogbl') search_box = wd.find_element_by_css_selector('input.gLFyf') search_box.send_keys(query + ' company logo') search_box.submit() image_urls = set() image_count = 0 results_start = 0 while image_count < max_links_to_fetch: # get all image thumbnail results thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd") number_results = len(thumbnail_results) print( f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}" ) for img in thumbnail_results[results_start:number_results]: # try to click every thumbnail such that we can get the real image behind it try: time.sleep(sleep_between_interactions) img.click() time.sleep(sleep_between_interactions) except Exception: continue # extract image urls actual_images = wd.find_elements_by_css_selector('img.n3VNCb') for actual_image in actual_images: if actual_image.get_attribute( 'src') and 'http' in actual_image.get_attribute('src'): image_urls.add(actual_image.get_attribute('src')) image_count = len(image_urls) if len(image_urls) >= max_links_to_fetch: print(f"Found: {len(image_urls)} image links, done!") break else: print("Found:", len(image_urls), "image links, looking for more ...") time.sleep(30) return load_more_button = wd.find_element_by_css_selector(".mye4qd") if load_more_button: wd.execute_script("document.querySelector('.mye4qd').click();") # move the result startpoint further down results_start = len(thumbnail_results) return image_urls
def get_full_address(driver: webdriver) -> []: address_list = [] addresses_1 = driver.find_elements_by_css_selector('.bfg-gallery-address') addresses_2 = driver.find_elements_by_css_selector('.bfg-gallery-address2') for add_1, add_2 in zip(addresses_1, addresses_2): address_list.append(add_1.text + add_2.text) return address_list
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1): def scroll_to_end(wd): wd.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(sleep_between_interactions) # build the google query search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img" # manual google image url # {q} - search string we wanted to searvj # https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q=dog&oq=dog&gs_l=img # load the page - this give you the dog images wd.get(search_url.format(q=query)) # this function opens up the browser with the searched string image_urls = set() # we want the unique url and not the duplicate ones ie why set is used. image_count = 0 results_start = 0 while image_count < max_links_to_fetch: scroll_to_end(wd) # makes you browser scroll down # get all image thumbnail results thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd") # binary format of the results number_results = len(thumbnail_results) # length of the results print(f"Found: {number_results} search results. 
Extracting links from {results_start}:{number_results}") for img in thumbnail_results[results_start:number_results]: # try to click every thumbnail such that we can get the real image behind it try: img.click() # clicks the image time.sleep(sleep_between_interactions) # wait for some time so image will be loaded except Exception: continue # extract image urls actual_images = wd.find_elements_by_css_selector('img.n3VNCb') # after clicking fetch the image for actual_image in actual_images: if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'): # if it is holding a valid url then add image_urls.add(actual_image.get_attribute('src')) image_count = len(image_urls) if len(image_urls) >= max_links_to_fetch: print(f"Found: {len(image_urls)} image links, done!") break else: print("Found:", len(image_urls), "image links, looking for more ...") time.sleep(30) return load_more_button = wd.find_element_by_css_selector(".mye4qd") if load_more_button: wd.execute_script("document.querySelector('.mye4qd').click();") # move the result startpoint further down results_start = len(thumbnail_results) return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1): def scroll_to_end(wd): wd.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(sleep_between_interactions) search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img" wd.get(search_url.format(q=query)) image_urls = set() image_count = 0 results_start = 0 while image_count < max_links_to_fetch: scroll_to_end(wd) thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd") number_results = len(thumbnail_results) print( f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}" ) for img in thumbnail_results[results_start:number_results]: try: img.click() time.sleep(sleep_between_interactions) except Exception: continue actual_images = wd.find_elements_by_css_selector('img.n3VNCb') for actual_image in actual_images: if actual_image.get_attribute( 'src') and 'http' in actual_image.get_attribute('src'): image_urls.add(actual_image.get_attribute('src')) image_count = len(image_urls) if len(image_urls) >= max_links_to_fetch: print(f"Found: {len(image_urls)} image links, done!") break else: print("Found:", len(image_urls), "image links, looking for more ...") time.sleep(30) return load_more_button = wd.find_element_by_css_selector(".mye4qd") if load_more_button: wd.execute_script("document.querySelector('.mye4qd').click();") results_start = len(thumbnail_results) return image_urls
def fetch_image_urls(query:str,max_links_to_fetch:int,wd:webdriver,sleep_between_interaction: int =1): def scroll_to_end(wd): wd.execute_script("window.scrollTo(0,document.body.scrollHeight);") time.sleep(sleep_between_interaction) search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img" #load the page wd.get(search_url.format(q=query)) image_urls=set() image_count= 0 result_start=0 while image_count <max_links_to_fetch: scroll_to_end(wd) #get all the images thumbnails results thumbnails_result= wd.find_elements_by_css_selector("img.Q4LuWd") #print(thumbnails_result) num_results= len(thumbnails_result) print(f"Found: {num_results} search results. Extracting links from {result_start} :{num_results}") for img in thumbnails_result[result_start:num_results]: #try to click every thumbnail and get the image page behind it. try: img.click() time.sleep(sleep_between_interaction) except Exception: continue #extract images urls actual_images=wd.find_elements_by_css_selector('img.n3VNCb') for actual_image in actual_images: if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'): image_urls.add(actual_image.get_attribute('src')) image_count=len(image_urls) if len(image_urls) >= max_links_to_fetch: print(f"Found: {len(image_urls)} image links ,Done!!") break else: print(f"Found {len(image_urls)} images links ,Looking for more") time.sleep(30) return load_more_button=wd.find_element_by_css_selector('.mye4qd') if load_more_button: wd.execute_script("document.querySelector('.mye4qd').click();") #move the result startpoint further down result_start=len(thumbnails_result) return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1):
    """Allows the webdriver to look for a query in Google Image and fetches a
    number of image links corresponding to the query.

    :param query: search term to look up
    :param max_links_to_fetch: stop once this many distinct URLs are collected
    :param wd: an already-started selenium webdriver
    :param sleep_between_interactions: seconds to wait after each click
    :return: set of image source URLs
    """
    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        # module-level helper; scrolls the page so more thumbnails lazy-load
        _scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                # CLEANUP: the truthiness test already rules out None, so the
                # redundant nested `is not None` check and dead `else: pass`
                # branch were removed.
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # for-else: this pass finished without reaching the target count.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)

            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result start point further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: float = 1.0):
    """Search Google Images for *query* and collect up to *max_links_to_fetch* image URLs.

    Progress is reported through the module-level ``logger``.

    :param query: search term to look up
    :param max_links_to_fetch: stop once this many distinct URLs are collected
    :param wd: an already-started selenium webdriver
    :param sleep_between_interactions: seconds to wait after each scroll/click
    :return: set of image source URLs
    """

    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazy-loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()  # set de-duplicates URLs found across passes
    image_count = 0
    results_start = 0  # index of the first thumbnail not yet processed
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        logger.info("Found: %i search results. Extracting links from %i:%i" % (number_results, results_start, number_results))

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls of the opened full-size image
            actual_images = wd.find_elements_by_css_selector("img.n3VNCb")
            for actual_image in actual_images:
                if actual_image.get_attribute("src") and "http" in actual_image.get_attribute("src"):
                    image_urls.add(actual_image.get_attribute("src"))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                logger.info("Found: %i image links, done!" % len(image_urls))
                break
        else:
            # for-else: this pass finished without reaching the target count.
            logger.info("Found: %i image links, looking for more ..." % len(image_urls))
            time.sleep(30)
            # NOTE(review): find_element_* raises NoSuchElementException when
            # the button is absent, so the `if` guard never sees a falsy value.
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls
def fetch_image_urls(search_term: str, n_links: int, web_driver: webdriver, sleep_between_interactions: int = 1): def scroll_to_end(web_driver): web_driver.execute_script( "window.scrollTo(0, document.body.scrollHeight);") time.sleep(sleep_between_interactions) search_url = f"https://www.google.com/search?tbm=isch&q={'+'.join(search_term.split())}" web_driver.get(search_url) image_counter = 0 res_start = 0 image_urls = set() while(image_counter < n_links): thumbnail_results = web_driver.find_elements_by_css_selector( 'img.Q4LuWd') number_results = len(thumbnail_results) scroll_to_end(web_driver) print( f'Found {number_results} search results. Extracting links {res_start} to {number_results}') for img in thumbnail_results[res_start:number_results]: try: img.click() time.sleep(sleep_between_interactions) except: continue actual_image = web_driver.find_elements_by_css_selector( 'img.n3VNCb') for img in actual_image: src = img.get_attribute('src') if src and 'http' in src: image_urls.add(src) image_counter = len(image_urls) if image_counter >= n_links: print(f"Found {n_links} image links.") break else: print("Found:", len(image_urls), "image links, looking for more ...") time.sleep(3) load_more_button = web_driver.find_element_by_css_selector( ".mye4qd") if load_more_button: web_driver.execute_script( "document.querySelector('.mye4qd').click();") res_start = len(thumbnail_results) return image_urls
def fetch_image_urls(search_keyword: str, download_number: int, wd: webdriver):
    """Collect up to *download_number* full-size image URLs from Google Images.

    Two phases: (1) scroll/load until enough thumbnails are on the page,
    (2) click each thumbnail and keep the full-size URL whose ``alt`` text
    matches the thumbnail's, filtering out preview/base64 entries.

    :param search_keyword: unused here — the driver is assumed to already be
        on the results page for this keyword (NOTE(review): confirm caller).
    :param download_number: number of image URLs to collect
    :param wd: an already-started selenium webdriver
    :return: list of image source URLs
    """
    print('------------------------------------------------')
    print('Start getting thumnails')
    fetch_thumbnail_count = 0
    # NOTE(review): stays None if download_number <= 0, which would make the
    # slice below raise TypeError — confirm callers pass a positive count.
    thumbnails = None
    # fetch thumbnails up to number of images to download
    while fetch_thumbnail_count < download_number:
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        thumbnails = wd.find_elements_by_css_selector('img.Q4LuWd')
        fetch_thumbnail_count = len(thumbnails)
        # break if fetched thumbnails number exceed number of images to download
        if len(thumbnails) >= download_number:
            print('Success: Fetched thumbnails count', download_number)
            break
        else:
            # load more thumbnails when load_more_button appears
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
            # break when end_text appears ( this is the limit of thumbnails that can be fetched )
            end_text = wd.find_element_by_class_name('OuJzKb')
            if end_text and end_text.text == 'Looks like you\'ve reached the end':
                print('Success: Fetched maximum thumbnails count', len(thumbnails))
                break

    print('Start getting image urls')
    image_urls = []
    # extract the image url from the elements displayed by clicking the thumbnails
    for thumbnail in thumbnails[:download_number]:
        try:
            thumbnail.click()
            time.sleep(1)
        except Exception:
            continue
        # extract only the original image url because there are some urls
        # (previews, related images) shown in the same side panel
        thumbnail_alt = thumbnail.get_attribute('alt')
        images = wd.find_elements_by_css_selector('img.n3VNCb')
        for image in images:
            image_alt = image.get_attribute('alt')
            # matching alt text identifies the full-size copy of the clicked thumbnail
            if thumbnail_alt == image_alt and 'http' in image.get_attribute('src'):
                image_urls.append(image.get_attribute('src'))

    print('Success: Fetched image urls count', len(image_urls))
    return image_urls
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 3):
    """Scroll a Google Images results page and harvest every <img> src on it.

    Unlike the sibling scrapers this variant does not click thumbnails; it
    collects the src of every <img> tag seen while scrolling.

    :param query: search term appended to the hard-coded results URL
    :param max_links_to_fetch: unused here — kept for signature compatibility
        with the sibling fetchers
    :param wd: an already-started selenium webdriver
    :param sleep_between_interactions: seconds to wait after each scroll
    :return: set of image source URLs
    """

    def scroll_to_end(wd, scroll_point):
        # scroll to an absolute y-offset rather than the page bottom
        wd.execute_script(f"window.scrollTo(0, {scroll_point});")
        time.sleep(sleep_between_interactions)

    # to build out python package use input function in search_url variable
    # build the google query
    search_url = f"https://www.google.com/search?q=google+images+{query}&rlz=1C1CHBF_enUS830US830&sxsrf=ALeKk03SgL8-qRAfeZd1QDzweydJ4MlDgg:1628187781073&source=lnms&tbm=isch&sa=X&ved=2ahUKEwi61sWSwJryAhXNTDABHYhPDpsQ_AUoAXoECAEQAw&biw=1536&bih=722&dpr=1.25"

    # load the page
    wd.get(search_url)
    time.sleep(sleep_between_interactions)

    image_urls = set()
    image_count = 0
    number_results = 0
    for i in range(1, 20):
        scroll_to_end(wd, i * 1000)
        time.sleep(5)
        thumb = wd.find_elements_by_css_selector("img")
        time.sleep(5)
        for img in thumb:
            print(img)
            src = img.get_attribute('src')
            print(src)
            # BUG FIX: <img> tags without a src produced None entries in the
            # returned set (breaking downstream downloads); skip them.
            if src:
                image_urls.add(src)
            image_count = len(image_urls)
            number_results = image_count
            time.sleep(.5)

    print(
        f"Found: {number_results} search results. Extracting links...")

    return image_urls
def get_niche_grade(driver: webdriver, property_id: int): neighborhood = 'WASHINGTON' # getNeighborhood(property_id) result = [] niche_base_url = 'https://www.niche.com/places-to-live/' url = niche_base_url + 'washington-dc-district-of-columbia-dc' driver.get(url) grades = driver.find_elements_by_css_selector( '.profile-grade--two .niche__grade') keys = [ 'public_school', 'safety', 'jobs', 'nightlife', 'cost_of_living', 'housing' ] values = [] for i in grades: text_list = i.text.split() if len(text_list) > 0: values.append(text_list[1]) result = {} for k, v in zip(keys, values): result[k] = [v] result['neighborhood'] = [neighborhood] df = pd.DataFrame.from_dict(result) return result
def fetch_image_unsplash(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 3):
    """Scroll an Unsplash search results page and harvest thumbnail image URLs.

    :param query: search term inserted into the Unsplash photos URL
    :param max_links_to_fetch: unused here — kept for signature compatibility
        with the sibling fetchers
    :param wd: an already-started selenium webdriver
    :param sleep_between_interactions: seconds to wait after each scroll
    :return: set of image source URLs
    """

    def scroll_to_end(wd, scroll_point):
        # scroll to an absolute y-offset rather than the page bottom
        wd.execute_script(f"window.scrollTo(0, {scroll_point});")
        time.sleep(sleep_between_interactions)

    # build the unsplash query
    search_url = f"https://unsplash.com/s/photos/{query}"

    # load the page
    wd.get(search_url)
    time.sleep(sleep_between_interactions)

    image_urls = set()
    image_count = 0
    number_results = 0
    for i in range(1, 20):
        scroll_to_end(wd, i * 1000)
        time.sleep(5)
        thumb = wd.find_elements_by_css_selector("img._2UpQX")
        time.sleep(5)
        for img in thumb:
            src = img.get_attribute('src')
            # BUG FIX: <img> tags without a src produced None entries in the
            # returned set (breaking downstream downloads); skip them.
            if src:
                image_urls.add(src)
            image_count = len(image_urls)
            number_results = image_count
            time.sleep(.5)

    print(f"Found: {number_results} search results. Extracting links...")

    return image_urls
def launch_twitter(driver: webdriver):
    """Search Twitter for gleam.io tweets and return their outbound link hrefs.

    For tweets whose visible timeline-link href is empty and that carry no
    card, the link is recovered from the embedded card iframe instead.

    :type driver: selenium.webdriver.firefox.webdriver.WebDriver
    :return: list of href strings, one per tweet that had a timeline link
    """
    twitter_url = "https://twitter.com/search?f=tweets&vertical=default&q=gleam.io&src=typd&lang=en"
    driver.get(twitter_url)
    URLs = driver.find_elements_by_css_selector("li[data-item-id]")
    lurl = []
    for tweet in URLs:
        if tweet.find_elements_by_class_name("twitter-timeline-link"):
            linkr = tweet.find_element_by_class_name("twitter-timeline-link")
            text = linkr.get_attribute("href")
            # no card and an empty href: the real link lives in the embedded iframe
            if len(tweet.find_elements_by_class_name("card2")) == 0 and len(
                    text) == 0:
                # iframe ids start with 'xdm'
                if len(
                        tweet.find_elements_by_xpath(
                            ".//*[starts-with(@id,'xdm')]")) != 0:
                    frame = tweet.find_element_by_xpath(
                        ".//*[starts-with(@id,'xdm')]")
                    driver.switch_to.frame(frame)
                    # NOTE(review): absolute xpath into the card iframe —
                    # fragile against markup changes; verify it still matches.
                    link = driver.find_element_by_xpath("/html/body/div/div/a")
                    text = link.get_attribute("href")
                    driver.switch_to.default_content()
            lurl.append(text)
    return lurl
def get_image_urls(wd: webdriver, max_links, query):
    """Search Google Images for *query* and collect up to *max_links* image URLs.

    Skips Google's "encrypted" preview srcs so only real image URLs are kept.

    :param wd: an already-started selenium webdriver
    :param max_links: stop once this many URLs are collected
    :param query: search term inserted into the results URL
    :return: list of image source URLs
    """

    def scroll_to_end(wd):
        # scroll to the page bottom so more thumbnails lazy-load
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    search_url = f"https://www.google.com/search?tbm=isch&sxsrf=ALeKk00SMu3Udk8ijCHEDJ_BC6AHQG0Leg%3A1612191933731&source=hp&biw=1920&bih=979&ei=vRgYYPiiKpm-9QO8kIzACg&q={query}&oq={query}&gs_lcp=CgNpbWcQAzIFCAAQsQMyBQgAELEDMgUIABCxAzIFCAAQsQMyBQgAELEDMgUIABCxAzIFCAAQsQMyBQgAELEDMgIIADIFCAAQsQM6CAgAELEDEIMBUNYCWO8RYNgSaABwAHgAgAGkAogB1hKSAQUwLjUuNpgBAKABAaoBC2d3cy13aXotaW1n&sclient=img&ved=0ahUKEwi49ZH8-sjuAhUZX30KHTwIA6gQ4dUDCAY&uact=5"
    wd.get(search_url)
    image_urls = []
    count = 0
    while count < max_links:
        scroll_to_end(wd)
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        num_results = len(thumbnail_results)
        print(
            f"Found {num_results} search result. Getting source of {num_results}:..{max_links}"
        )
        for img in thumbnail_results:
            try:
                img.click()
            except Exception:
                # BUG FIX: was a bare `except:` which also swallows
                # KeyboardInterrupt/SystemExit; narrowed to Exception.
                pass
            time.sleep(3)  # change according to resolution
            actual_images = wd.find_elements_by_css_selector("img.n3VNCb")
            for actual in actual_images:
                if "http" in actual.get_attribute(
                        'src') and "encrypted" not in actual.get_attribute(
                            'src'):
                    image_urls.append(actual.get_attribute('src'))
                    print(f"{count+1}:{actual.get_attribute('src')}")
            count = len(image_urls)
            if count >= max_links:
                print(f"Fetched {count} urls...Downloading!")
                return image_urls
        else:
            # for-else: this pass finished without reaching the target count.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)
            # BUG FIX: a bare `return` here exited with None (callers expect a
            # list) and made the "load more" click below unreachable; removed.
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")
    return image_urls
def retrieve_image_url(search: str, max_links: int, wd: webdriver, sleep_bw_interact: float = 1):
    """Collect up to *max_links* full-size image URLs from Google Images.

    Repeatedly scrolls, clicks thumbnails, and harvests the opened full-size
    image srcs until enough distinct links have been gathered.

    :param search: search term to look up
    :param max_links: target number of distinct URLs
    :param wd: an already-started selenium webdriver
    :param sleep_bw_interact: seconds to wait after each thumbnail click
    :return: set of image source URLs
    """
    # output set of image urls
    img_urls = set()
    google_img_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'

    # time to get the page
    wd.get(google_img_url.format(q=search))

    img_count = 0
    start_ind = 0  # first thumbnail index not yet processed
    pass_number = 0
    while img_count < max_links:
        pass_number += 1
        print('count started: ', pass_number)

        thumbnails = wd.find_elements_by_css_selector('img.Q4LuWd')
        scroll_to_end(wd)  # module-level helper: scroll so more results load
        print('thumbnails found ', len(thumbnails))

        found_total = len(thumbnails)
        print(f'start with: {start_ind} and end with : {found_total} ')

        for thumb in thumbnails[start_ind:found_total]:
            # clicking a thumbnail opens the full-size image in a side panel
            try:
                thumb.click()
                time.sleep(sleep_bw_interact)
            except Exception as e:
                print(e, 'while doing thumbnail click')
                continue

            # after click extract image url
            for candidate in wd.find_elements_by_css_selector('img.n3VNCb'):
                if candidate.get_attribute('src') and 'http' in candidate.get_attribute('src'):
                    img_urls.add(candidate.get_attribute('src'))

            img_count = len(img_urls)
            if img_count >= max_links:
                print('got enough links ', img_count)
                break

        start_ind = len(thumbnails)

    return img_urls
def get_price_list(driver: webdriver) -> []: price_list = [] asking_prices = driver.find_elements_by_css_selector('.bfg-gallery-price') for i in asking_prices: price_str = i.text.split()[2][1:].replace(',', '') price_list.append(float(price_str)) return price_list
def extract_info_from_iframe(browser: webdriver) -> list:
    """Expand all comment pages/threads on the page and scrape them.

    Clicks the "load next page" button until it disappears, then opens every
    sub-comment thread, parses the resulting DOM with BeautifulSoup, and
    builds `Comment` objects (author, text, reaction count). Quits the
    browser before returning.

    :param browser: webdriver already on the article/comments page
    :return: list of Comment objects
    """
    try:
        # open all other pages with comments
        button = browser.find_element_by_css_selector(
            "button[data-dot='strankovani/nacist_dalsi']")
        while button:
            try:
                action = ActionChains(browser)
                action.move_to_element(button).click().perform()
            except (ElementClickInterceptedException,
                    StaleElementReferenceException):
                # overlay in the way or DOM re-rendered — retry on next loop
                pass
            # give the page time to load; scaled by the configured lazy factor
            sleep(0.5 * SETTINGS["lazy_factor"])
            button = browser.find_element_by_css_selector(
                "button[data-dot='strankovani/nacist_dalsi']")
    except NoSuchElementException:
        # no (more) "next page" button — all pages are loaded
        pass
    try:
        # open threads of subcomments
        for button in browser.find_elements_by_css_selector(
                "button[data-dot='nacist_nove_podkomentare']"):
            try:
                action = ActionChains(browser)
                action.move_to_element(button).click().perform()
            except (ElementClickInterceptedException,
                    StaleElementReferenceException):
                pass
            sleep(0.3 * SETTINGS["lazy_factor"])
    except NoSuchElementException:
        pass
    # now we can use familiar beautiful soup
    soup = BeautifulSoup(browser.page_source, "html.parser")
    # NOTE(review): these obfuscated class names ('f_bO', 'd_aJ', 'f_cQ') are
    # generated by the site's build and may change between deployments.
    authors = soup.select("a[class='f_bO'] span")
    texts = soup.select("p[class='d_aJ']")
    reactions = soup.select("a[class='f_cQ']")
    comments = []
    progress_bar = tqdm(total=len(authors), desc="Comments", position=0)
    for i in range(len(authors)):
        author = authors[i].text
        text = texts[i].text
        # expect that reaction can be missing in some comments
        reactions_count = 0
        if len(reactions) > i:
            reactions_count = reactions[i].text
        comments.append(
            Comment(author=author, text=text, reactions=reactions_count))
        progress_bar.update(1)
    # browser doesn't need to be open from now on
    browser.quit()
    return comments
def fetch_image_urls(query: str, max_links_to_fetch: int, wd: webdriver, sleep_between_interactions: int = 1): def scroll_to_end(wd): wd.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(sleep_between_interactions) search_url = "https://www.google.com/search?q={q}&source=lnms&tbm=isch" # load the page wd.get(search_url.format(q=query)) image_urls = set() image_count = 0 results_start = 0 while image_count < max_links_to_fetch: for _ in range(10): scroll_to_end(wd) # get all image thumbnail results thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd") number_results = len(thumbnail_results) for img in thumbnail_results[results_start:number_results]: if img.get_attribute('src') and 'http' in img.get_attribute('src'): image_urls.add(img.get_attribute('src')) if img.get_attribute('src') and 'data' in img.get_attribute('src'): image_urls.add(img.get_attribute('src')) image_count = len(image_urls) if len(image_urls) >= max_links_to_fetch: print(f"Found: {len(image_urls)} image links, done!") break else: print("Found:", len(image_urls), "image links, looking for more ...") # return load_more_button = wd.find_element_by_css_selector(".mye4qd") if load_more_button: wd.execute_script("document.querySelector('.mye4qd').click();") time.sleep(3) # end_of_page = wd.find_element_by_xpath("//div[@class='OuJzKb Yu2Dnd']") end_of_page = wd.find_elements_by_xpath( "//*[ contains (text(), 'Looks like') ]") if end_of_page: print("end of the page") break # move the result startpoint further down results_start = len(thumbnail_results) return image_urls
def get_documents(self, browser: webdriver, url: str):
    """Open *url*, wait for the headline links, and scrape each linked document.

    :param browser: webdriver to navigate with
    :param url: listing page containing headline anchors
    :return: list of per-document results from self.get_document_details,
        or [] if the page never showed any headline links
    """
    browser.get(url)
    try:
        # wait up to DELAY seconds for at least one headline anchor.
        # NOTE(review): 'headlineContaniner' spelling is used consistently in
        # both selectors — presumably it matches the site's own markup typo;
        # confirm before "fixing" it.
        WebDriverWait(browser, DELAY).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.headlineContaniner > a')))
    except TimeoutException:
        return []
    try:
        links = [link.get_attribute('href') for link in browser.find_elements_by_css_selector('div.headlineContaniner > a')]
    except StaleElementReferenceException:
        # the DOM re-rendered while we were reading it; give it a moment and retry once
        time.sleep(2)
        links = [link.get_attribute('href') for link in browser.find_elements_by_css_selector('div.headlineContaniner > a')]
    documents = []
    for link in links:
        documents.append(self.get_document_details(browser, link))
    return documents
def get_price(driver: webdriver) -> Tuple[datetime, str, Set[str]]:
    """Load the ticket page, wait for it to render, screenshot it, and return prices.

    :param driver: webdriver to navigate with
    :return: (timestamp of the scrape, currency label text, set of price strings)
    """
    driver.get(URL)
    # wait up to 10s until the booking button renders — proxy for "page ready"
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.LINK_TEXT, "Book")  # Button like 'Book' or 'Confirm'
        ))
    # CSS selectors for currency and price elements on page
    currency = driver.find_elements_by_css_selector("p.price_info span")
    prices = driver.find_elements_by_css_selector("p.price_info span.num")
    timestamp = datetime.now()
    # keep a dated screenshot of the page alongside the scraped values
    driver.get_screenshot_as_file(
        f"{SCREENSHOT_PATH}/ticket-prices_{timestamp.strftime('%d-%m-%Y_%H-%M-%S')}.png"
    )
    # NOTE(review): currency[0] raises IndexError if the selector matched
    # nothing — confirm the price block is always present after the wait.
    return timestamp, currency[0].text, {i.text for i in prices}
def scrape_followers(
        driver: webdriver,
        username: str,
        cookies: List[Dict[str, Any]] = None) -> Tuple[str, str, Set[str], Set[str]]:
    """Scrape an Instagram account's follower and following lists.

    Optionally authenticates by injecting *cookies*, opens the profile, reads
    the displayed counts, then opens each dialog and collects the visible
    usernames.

    :param driver: webdriver to navigate with
    :param username: Instagram handle whose profile to open
    :param cookies: optional session cookies to install before loading
    :return: (follower count text, following count text,
              set of follower names, set of following names)
    """
    # CSS Selector for followers and following lists
    list_css: str = "div[role='dialog'] a.notranslate"
    if cookies:
        # Load any page before setting cookies
        driver.get("https://www.instagram.com/data/manifest.json")
        for cookie in cookies:
            driver.add_cookie(cookie)
    # Load account page
    driver.get(f"https://www.instagram.com/{username}/")
    # counts as displayed on the profile header (strings like "1,234")
    num_followers: str = driver.find_element_by_css_selector(
        "a[href*='followers'] span").text
    num_following: str = driver.find_element_by_css_selector(
        "a[href*='following'] span").text
    # Click the 'Followers' link
    driver.find_element_by_partial_link_text("followers").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling Magic here — without it only the first batch of names
    # rendered in the dialog is captured.
    _followers: List = driver.find_elements_by_css_selector(list_css)
    followers: Set[str] = {i.text for i in _followers}
    driver.find_element_by_css_selector(
        "div[role='dialog'] button span[aria-label='Close']").click()
    # Click the 'Following' link
    driver.find_element_by_partial_link_text("following").click()
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, list_css)))
    # TODO: Scrolling Magic here
    _following: List = driver.find_elements_by_css_selector(list_css)
    following: Set[str] = {i.text for i in _following}
    return (num_followers, num_following, followers, following)
def get_img(driver: webdriver) -> List[str]: try: driver.implicitly_wait(5) imgs: List[str] = driver.find_elements_by_css_selector( 'div.smallImg>ul>li>a>img') src: List[str] = [] for img in imgs: src.append(img.get_attribute('src').replace('_S', '_L')) return src except Exception as e: print('Can\'t get Images SRC List. Reason %s.' % e)
def get_img(self, driver: webdriver) -> List[str]: try: driver.implicitly_wait(5) imgs: List[str] = driver.find_elements_by_css_selector( 'body > div.page-popup.exhibited-vehicle > div.clfix > div.vehicle-photo-wrap > div.vehicle-thumbnail > ul > li > a > img' ) src: List[str] = [] for img in imgs: src.append(img.get_attribute('src')) return src except Exception as e: print('Can\'t get Images SRC List. Reason %s.' % e)
def find_sizes(self, driver: webdriver) -> List[Size]:
    """Build a Size object for every entry in the product size list.

    Each element's data-name looks like "<type> (<number>..."; the part
    before " (" is the size type and the remainder (cleaned via
    self.clean_nubmer_size) is the numeric size. A "disabled" class on the
    element marks the size as unavailable.

    :param driver: webdriver already on the product page
    :return: list of Size objects in page order
    """
    size_elements = driver.find_elements_by_css_selector(
        '.size-list .product-size')

    collected = []
    for element in size_elements:
        # split data-name into at most two parts around " ("
        name_parts = collections.deque(
            element.get_attribute('data-name').split(' ('), 2)
        size_type = name_parts.popleft()
        size_number = self.clean_nubmer_size(name_parts.popleft())
        is_disabled = "disabled" in element.get_attribute("class")
        collected.append(Size(size_type, size_number, is_disabled))
    return collected
def fetch_image_urls_from_google(query: str, wd: webdriver, sleep_between_interactions: int = 1): """ Fetches all the urls of images found on the first result page for received search query. :param query: query to search for :param wd: selenium web driver :param sleep_between_interactions: time for browser to load photos :return: set of found urls """ # Google search - large images search_url = "https://www.google.com/search?q={q}&tbm=isch&hl=en-US&hl=en-US&tbs=isz%3Al&client=ubuntu&hs=hdu&ved" \ "=0CAEQpwVqFwoTCKDZh9KqmOgCFQAAAAAdAAAAABAD&biw=1908&bih=955 " image_urls = set() # load the page wd.get(search_url.format(q=query)) wd.execute_script("window.scrollTo(0, document.body.scrollHeight);") time.sleep(sleep_between_interactions) # get all image thumbnail results thumbnails = wd.find_elements_by_css_selector("img.Q4LuWd") for thumbnail in thumbnails: try: # get big image from the thumbnail thumbnail.click() time.sleep(sleep_between_interactions) except Exception: continue # extract image urls images = wd.find_elements_by_css_selector('img.n3VNCb') for img in images: if img.get_attribute('src') and 'http' in img.get_attribute('src'): image_urls.add(img.get_attribute('src')) print(f"Found: {len(image_urls)} image links for the search query: {query}") return image_urls
def get_neighborhood_state_zip(driver: webdriver) -> []: state_list = [] zipcode_list = [] neighborhood_list = [] addresses_2 = driver.find_elements_by_css_selector('.bfg-gallery-address2') for t in addresses_2: t_list = t.text.split() neighborhood_list.append(t_list[0][:-1]) state_list.append(t_list[1]) zipcode_list.append(int(t_list[2])) return neighborhood_list, state_list, zipcode_list
def launch_twitter(driver: webdriver): """ :type driver: selenium.webdriver.firefox.webdriver.WebDriver """ twitter_url = "https://twitter.com/search?f=tweets&vertical=default&q=discord.gg&src=unkn" driver.get(twitter_url) URLs = driver.find_elements_by_css_selector("li[data-item-id]") lurl = [] for tweet in URLs: if tweet.find_elements_by_class_name("twitter-timeline-link"): linkr = tweet.find_element_by_class_name("twitter-timeline-link") text = linkr.get_attribute("href") lurl.append(text) return lurl
def get_img_str(driver: webdriver) -> str: try: driver.implicitly_wait(5) imgs: List[str] = driver.find_elements_by_css_selector( 'div.smallImg>ul>li>a>img') img_str: str = '' for img in imgs: src_str = img.get_attribute('src') str_arr = str(src_str).split('/') str_arr.reverse() img_str += str_arr[0].replace( '_S', '_L') + '[:param:][alt=' + driver.find_element_by_css_selector( 'h1.vehicle-Tit' ).text + '][title=' + driver.find_element_by_css_selector( 'h1.vehicle-Tit').text + ']|' return img_str[:-1] except Exception as e: print('Can\'t get Images str. Reason %s' % e)
def retrieve_child_elements(self, web_driver, parent_element: im_webdriver, target_identity: str, identity_type: int):
    """
    find child elements

    :args:
        - parent_element: parent element used to find child elements
        - target_identity: partial url of child elements
        - identity_type: type of identity
            0: href link
            1: class name
    :return:
        - child elements list
    """
    # NOTE(review): only identity_type == 0 is implemented; any other value
    # (including the documented 1: class name) falls through and returns None.
    # `web_driver` and `target_identity` are currently unused — confirm
    # whether the class-name branch was meant to be added.
    if identity_type == 0:
        child_elements = parent_element.find_elements_by_css_selector(
            'div[class="div_t"]>a')
        return child_elements