def getFormItems(driver: webdriver):
    """Collect metadata for every <input> and <select> element on the page.

    Returns a list of dicts, one per form control, each holding the label,
    id, name, type, value and on-page (x, y) location of the element.
    """
    tags = []
    # Plain text/checkbox/etc. controls.
    for element in driver.find_elements_by_tag_name('input'):
        position = element.location
        tags.append({
            'label': getLabel(element),
            'id': element.get_attribute('id'),
            'name': element.get_attribute('name'),
            'type': element.get_attribute('type'),
            'value': element.get_attribute('value'),
            'location_x': position['x'],
            'location_y': position['y'],
        })
    # Drop-downs: the "value" is the full option list instead of a scalar.
    for element in driver.find_elements_by_tag_name('select'):
        position = element.location
        tags.append({
            'label': getLabel(element),
            'id': element.get_attribute('id'),
            'name': element.get_attribute('name'),
            'type': 'select',
            'value': getSelectOptions(element),
            'location_x': position['x'],
            'location_y': position['y'],
        })
    return tags
def get_image_links(driver: webdriver, site: str) -> Links:
    """Extract the chapter's image URLs from the currently loaded page.

    :param driver: webdriver with the reader page already loaded
    :param site: 'comics' (KissComics) or 'manga' (KissManga); any other
                 value yields an empty list
    :return: flat list of image URL strings

    On a side note, I am genuinely surprised how they managed to obfuscate
    the images on each of their sites. On KissComics, they have all the links
    in some script located in the page, this script holds the image links.
    On KissManga, they have img tags that somehow only load the image links
    once the page is loaded in a browser; viewing the raw HTML leads to the
    images somehow not being there.
    """
    image_links = []
    if site == 'comics':
        # URLs are embedded in an inline script as lstImages.push("...") calls.
        regex = re.compile(r'lstImages.push\("(.*?)"')
        for script in driver.find_elements_by_tag_name('script'):
            # BUG FIX: extend (not append) keeps the result a flat list.
            # The old append + np.array(...).flatten() silently failed to
            # flatten when scripts contributed different numbers of links
            # (numpy builds a 1-D object array from ragged input).
            image_links.extend(regex.findall(script.get_attribute('innerHTML')))
    elif site == 'manga':
        # Image tags only materialise once the page runs in a real browser.
        for img in driver.find_elements_by_xpath(
                '//img[@onerror="onErrorImg(this)"]'):
            image_links.append(img.get_attribute('src'))
    print(image_links)
    return image_links
def execSearch(browser: webdriver, inifile):
    """Log in through the search page, then search each trending keyword."""
    # Extract search keywords from the trend page's <h1> headings; the
    # first two headings are page chrome, not keywords, so drop them.
    browser.get(_get(inifile, 'search', 'url_trend'))
    words = [heading.text for heading in
             browser.find_elements_by_tag_name('h1')][2:]

    # Open the search results for the first keyword, then press the login
    # button (shared flow for search and "mail DE point").
    browser.get(_get(inifile, 'search', 'url') + '/Web?qt=' + words[0])
    browser.find_element_by_link_text(
        _get(inifile, 'search', 'login_context')).click()
    browser.find_element_by_name(
        _get(inifile, 'user', 'id_name')).send_keys(_get(inifile, 'user', 'id'))
    browser.find_element_by_name(
        _get(inifile, 'user', 'pass_name')).send_keys(_get(inifile, 'user', 'pass'))
    browser.find_element_by_name("submit").click()
    sleep(_getRandomNum())

    # Process each remaining keyword, one search at a time.
    for word in words[1:]:
        box = browser.find_element_by_name("qt")
        box.clear()
        box.send_keys(word)
        browser.find_element_by_id('searchBtn').click()
        sleep(_getRandomNum())
def collectLinks(driver: webdriver):
    """Return location, text and href for every anchor element on the page."""
    return [
        {
            'location': anchor.location,
            'text': anchor.text,
            'href': anchor.get_attribute('href'),
        }
        for anchor in driver.find_elements_by_tag_name('a')
    ]
def click_a_element_by_languagecode_name(self, browser: webdriver, name: str):
    """Click the first anchor whose "languagecode" attribute equals *name*.

    Scans every <a> element; anchors after the first match are untouched.
    """
    for anchor in browser.find_elements_by_tag_name('a'):
        if anchor.get_attribute("languagecode") != name:
            continue
        anchor.click()
        break
def getCourseList(driver: webdriver) -> []:
    """Interactively ask which of the listed courses the user is enrolled in.

    For each <li> course entry, the course id is parsed out of the first
    <div>'s class attribute (the token after the '-') and the user is asked
    to confirm enrollment on stdin. Returns [classroom_url, course_name]
    pairs for every course answered with 'y'.
    """
    enrolled = []
    for item in driver.find_elements_by_tag_name('li'):
        course_id = (item.find_element_by_tag_name('div')
                     .get_attribute("class").split('-')[1])
        title = (item.find_element_by_tag_name('h2')
                 .find_element_by_tag_name('div').text)
        print("Are you currently enrolled in", title, "(Y/N)? ", end='')
        if input().lower() == 'y':
            enrolled.append(
                ["https://classroom.google.com/u/0/c/" + course_id, title])
    return enrolled
def download_images(driver: webdriver) -> []:
    """Download every page image except the first one.

    Each image is saved under images/ with a name derived from its src URL
    plus a running counter; the list of saved file names is returned.
    """
    saved = []
    # The first <img> is skipped (not part of the downloadable content).
    for count, image in enumerate(
            driver.find_elements_by_tag_name('img')[1:], start=1):
        src = image.get_attribute('src')
        # TODO: change naming method — slicing src[69:len-4] is brittle.
        target = "images/%s%d.png" % (src[69:len(src) - 4], count)
        urllib.request.urlretrieve(src, target)
        saved.append(target)
    return saved
def removeKataomoi(browser: webdriver, safe_accounts):
    """Log in to kataomoi.net and unfollow one-way follows.

    Accounts whose name appears in safe_accounts are never removed; the
    module-level remain_remove_count budget caps how many are unfollowed.

    :param browser: webdriver
    :param safe_accounts: account names that must not be unfollowed
    """
    global remain_remove_count
    # Open kataomoi.net; it redirects to the Twitter OAuth flow.
    browser.get('http://kataomoi.net/redirect.php')
    sleep(1)
    if browser.current_url.startswith(
            "https://api.twitter.com/oauth/authorize?oauth_token"):
        # Already signed in to Twitter: just confirm the OAuth dialog.
        browser.find_element_by_id("allow").click()
        sleep(1)
    else:
        # Enter the Twitter credentials and submit the login form.
        browser.find_element_by_xpath(
            "//*[@id='username_or_email']").send_keys(USER_NAME)
        password_field = browser.find_element_by_xpath("//*[@id='password']")
        password_field.send_keys(PASSWORD)
        password_field.submit()
        sleep(1)
    browser.get('http://kataomoi.net/find_one_way.php')
    sleep(1)
    rows = browser.find_elements_by_tag_name("tr")
    print('▼フォロー解除中…▼')
    # Walk the table in reverse so the oldest entries are handled first.
    for row in reversed(rows):
        if remain_remove_count <= 0:
            break
        cells = row.find_elements_by_tag_name("td")
        if len(cells) > 1 and not (
                cells[1].find_element_by_tag_name("a").text in safe_accounts):
            print(cells[1].find_element_by_tag_name("a").text)
            row.find_elements_by_tag_name("span")[0].click()
            remain_remove_count = remain_remove_count - 1
            sleep(0.5)
def scrape_once(driver: webdriver, save_into_dict: bool, tagDict=None):
    """Scrape hashtags from the Instagram explore page in one pass.

    :param driver: webdriver used to load the explore page
    :param save_into_dict: when True, per-tag counts are accumulated in tagDict
    :param tagDict: optional mapping of tag -> count; a fresh dict is used
                    when omitted. (BUG FIX: the old default was a shared
                    mutable list ``[]``, which both leaked state between
                    calls and crashed with TypeError the moment it was
                    indexed by a tag string.)
    """
    if tagDict is None:
        tagDict = {}
    tagCount = 0
    postCount = 0
    driver.get('https://www.instagram.com/explore')
    # Wait for the feed to render before collecting links.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'article')))
    for link in driver.find_elements_by_tag_name('a'):
        href = link.get_attribute('href')
        if not href.endswith('?explore=true'):
            continue
        media_html = requests.get(href).text
        soup = BeautifulSoup(media_html, 'lxml')
        timestamp: int = time.time()
        for tagElement in soup.findAll(attrs={"property": "instapp:hashtags"}):
            tag = tagElement.get('content')
            if save_into_dict:
                # .get() avoids a KeyError the first time a tag is seen
                # (the old `tagDict[tag] += 1` blew up on a plain dict too).
                tagDict[tag] = tagDict.get(tag, 0) + 1
            put_tag(tag, timestamp)
            tagCount += 1
        postCount += 1
    print('%d tags processed in %d posts' % (tagCount, postCount))
def __duo_authenticator(driver: webdriver, duo_bypass: str):
    """Deals with the DUO Authenticator step of auth flow"""
    logger.info('Authenticating with Duo')
    iframes = driver.find_elements_by_tag_name("iframe")
    if not len(iframes) > 0:
        # No Duo iframe present — nothing to authenticate.
        return
    # Duo needs to be authenticated: drive the bypass-code flow inside
    # the first iframe.
    logger.info('Authenticating with Duo bypass code...')
    driver.switch_to.frame(iframes[0])
    driver.find_element_by_xpath("//button[@id='passcode']").click()
    passcode_box = driver.find_element_by_xpath(
        "//input[@class='passcode-input']")
    passcode_box.clear()
    passcode_box.send_keys(duo_bypass)
    driver.find_element_by_xpath('//button[text()="Log In"]').click()
    logger.info('Successfully authenticated with Duo bypass code')
    time.sleep(2)
    return
def fetch_image_urls(self, query: str, wd: webdriver,
                     sleep_between_interactions: int = 1, max_timeout=5,
                     imgs_offset=5):
    """Search Google Images for *query* and collect full-size image URLs.

    Scrolls the result grid, clicks thumbnails so their target links appear,
    opens each link in a new window and harvests the first sufficiently
    large <img> src there. Stops once self.imgs2download + imgs_offset
    links are gathered (the offset buys slack for later download failures).

    :param query: search terms
    :param wd: webdriver to drive
    :param sleep_between_interactions: pause (s) after scrolls/clicks
    :param max_timeout: explicit-wait timeout (s)
    :param imgs_offset: extra links to gather beyond self.imgs2download
    :return: (image_urls, reference_imgs_urls) — harvested src URLs and the
             thumbnail link hrefs that were visited
    """
    def scroll_to_end(wd):
        # Scroll to the bottom so Google lazily loads more thumbnails.
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = ("https://www.google.com/search?safe=off&site=&tbm=isch"
                  "&source=hp&q={q}&oq={q}&gs_l=img")
    # load the page
    wd.get(search_url.format(q=query))
    image_urls = list()
    reference_imgs_urls = list()
    wait = WebDriverWait(wd, max_timeout)
    results_start = 0
    while len(image_urls) < (self.imgs2download + imgs_offset):
        scroll_to_end(wd)
        # Locate the thumbnail grid: #islrg > .islrc > per-result <div>s.
        wait.until(EC.presence_of_element_located((By.ID, "islrg")))
        thumbnail_div = wd.find_element_by_id('islrg')
        WebDriverWait(thumbnail_div, max_timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "islrc")))
        thumbnail_div = thumbnail_div.find_elements_by_class_name('islrc')[0]
        WebDriverWait(thumbnail_div, max_timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "div")))
        div_with_link_img = thumbnail_div.find_elements_by_tag_name('div')
        n_found_divs = len(div_with_link_img)
        # Only visit divs discovered since the previous scroll pass.
        for div_of_img in div_with_link_img[results_start:n_found_divs]:
            try:
                if len(image_urls) >= (self.imgs2download + imgs_offset):
                    break
                wd.switch_to_window(wd.window_handles[0])
                possible_img_link = div_of_img.find_elements_by_tag_name('a')
                for pos_link in possible_img_link:
                    possible_imgs = pos_link.find_elements_by_tag_name('img')
                    p_links = pos_link.get_attribute("href")
                    print("PL: ", p_links)
                    if p_links is not None:
                        reference_imgs_urls.append(p_links)
                    # Click on imgs in order to let the link appear.
                    # BUG FIX: iterate over a copy — removing from the list
                    # being iterated silently skipped the next element.
                    for img in list(possible_imgs):
                        # BUG FIX: height was read from the "width"
                        # attribute, so tall-but-narrow icons slipped past
                        # the size filter.
                        w = int(img.get_attribute("width"))
                        h = int(img.get_attribute("height"))
                        if w < 60 or h < 60:
                            possible_imgs.remove(img)
                            continue
                        img.click()
                        # Wait until click has had effect.
                        time.sleep(sleep_between_interactions)
                    if len(possible_imgs) > 0:
                        new_img_url = pos_link.get_attribute("href")
                        if new_img_url is not None:
                            # Open the target in a second window and grab
                            # the first big enough image there.
                            wd.execute_script("window.open()")
                            wd.switch_to_window(wd.window_handles[1])
                            wd.get(new_img_url)
                            # Wait until the new page has loaded.
                            time.sleep(sleep_between_interactions)
                            wait.until(EC.presence_of_element_located(
                                (By.TAG_NAME, "img")))
                            for big_img in wd.find_elements_by_tag_name('img'):
                                w = int(big_img.get_attribute("width"))
                                # BUG FIX: same width/height mix-up as above.
                                h = int(big_img.get_attribute("height"))
                                if w < 60 or h < 60:
                                    continue
                                print("IMG:", big_img.get_attribute("src"))
                                image_urls.append(big_img.get_attribute("src"))
                                break
                            wd.close()
                            wd.switch_to_window(wd.window_handles[0])
            except Exception as e:
                # Best effort: report, close any stray windows, carry on.
                print(f"ERROR - {e} (continue ...)")
                for i in range(1, len(wd.window_handles)):
                    wd.close()
                wd.switch_to_window(wd.window_handles[0])
        # LOAD MORE: press the load button if not enough imgs yet.
        if len(image_urls) >= (self.imgs2download + imgs_offset):
            print(f"Found: {len(image_urls)} image links, done!")
            break
        else:
            print("Found:", len(image_urls),
                  "image links, looking for more ...")
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script(
                    "document.querySelector('.mye4qd').click();")
        # Move the result startpoint further down.
        results_start = n_found_divs
    return image_urls, reference_imgs_urls