def read_single_goods_commodity(self, element: WebElement) \
        -> (Commodity, None):
    """Build a ``Commodity`` from one entry of the goods list.

    :param element: An item of the list obtained from ``get_goods_list()``.
    :return: A ``Commodity`` whose keyword has not been set, or ``None`` when
        the item URL or title could not be read. Missing store information is
        only logged; the commodity is still returned.
    """
    try:
        comm = Commodity()
        # Reading the item URL may fail.
        comm.item_url = self.get_item_url(element)
        # XXX: the keyword is not specified here.
        comm.item_title = self.get_item_name(element)
        # The item name is taken from the title.
        comm.item_name = comm.item_title
        # Note from the original author: the category does not exist in the
        # list view, so it is not read here.
    except NoSuchElementException:
        # format_exc() returns the traceback as text. The previous
        # print_exc() call returns None, so the log line always ended in
        # 'None' instead of the traceback.
        logging.warning('[Get single goods, No such element]:'
                        + (element.get_attribute("outerHTML") or "None ")
                        + (traceback.format_exc() or 'None'))
        return None
    try:
        comm.store_name = self.get_store_name(element)
        comm.store_url = self.get_store_url(element)
        # Note from the original author: the visit count defaults to 0
        # (it is not set in this method).
    except NoSuchElementException:
        # Store details are optional: log and fall through to the return.
        logging.warning("Get single goods, Can't get store info:"
                        + (element.get_attribute("outerHTML") or "None ")
                        + (traceback.format_exc() or 'None'))
    return comm
def download_lesson(driver: WebDriver, lesson: WebElement,
                    module_path: pathlib.Path) -> None:
    """Open a lesson in a new window and download its YouTube video.

    The lesson's ``href`` is loaded in a freshly opened window. If
    ``get_youtube_url`` finds a video URL, the video is downloaded into
    ``module_path``. The extra window is always closed and focus returns to
    the window that was current on entry.

    :param driver: browser session currently showing the course page
    :param lesson: link element whose ``href`` points to the lesson
    :param module_path: directory that receives the downloaded file
    """
    lesson_url = lesson.get_attribute('href')
    course_page = driver.current_window_handle
    driver.execute_script('window.open()')
    driver.switch_to.window(driver.window_handles[1])
    try:
        driver.get(lesson_url)
        youtube_url = get_youtube_url(driver)
        if youtube_url is not None:
            # The YoutubeDL Python API reads the output template from the
            # 'outtmpl' key; 'output' is only the command-line flag name and
            # was silently ignored.
            output_path = module_path / '%(title)s.%(ext)s'
            ydl_opts = {
                'outtmpl': str(output_path),
                'retries': 10,
            }
            try:
                with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([youtube_url])
            except youtube_dl.utils.DownloadError as e:
                # A failed download is reported but does not abort the caller.
                print(e)
    finally:
        # Close the lesson window even when navigation or lookup raised, so
        # the driver is never left focused on a stray window.
        driver.close()
        driver.switch_to.window(course_page)
def __init__(self, element: WebElement) -> None:
    """Wrap an accordion section element and make sure it is expanded.

    Locates the section's title link and content container, clicking the
    title first when the section does not carry the ``active`` class.

    :param element: element whose class attribute contains
        ``accordion__section``
    :raises Exception: if the class attribute does not contain
        ``accordion__section``
    """
    if "accordion__section" not in element.get_attribute("class"):
        raise Exception("Element is not an accordion section")

    header_xpath = (
        "a[contains(concat(' ', normalize-space(@class), ' '), "
        "' accordion__section__title ')]"
    )
    content_xpath = (
        "div[contains(concat(' ', normalize-space(@class), ' '),"
        "' accordion__section__content ')]"
    )

    self._element = element
    self._header_element: WebElement = self._element.find_element_by_xpath(
        header_xpath)

    already_expanded = "active" in self._element.get_attribute("class")
    if not already_expanded:
        self._header_element.click()
        # Give the accordion a moment to finish expanding.
        sleep(0.5)

    title_span: WebElement = self._header_element.find_element_by_tag_name(
        "span")
    self._content_element: WebElement = self._element.find_element_by_xpath(
        content_xpath)
    self.title: str = title_span.text
def enterElem(e: WebElement, func):
    """Navigate the module-level ``driver`` to the element's ``href``.

    Both the current URL and the target URL are printed through ``uprint``
    before navigating. After the page load is requested, ``func`` is called
    with the URL the driver was on beforehand.

    :param e: element whose ``href`` attribute is the navigation target
    :param func: callable invoked as ``func(previous_url)``
    """
    origin_url = driver.current_url
    uprint('prev_url=' + origin_url)

    target_url = e.get_attribute('href')
    uprint('next_url=' + target_url)

    driver.get(target_url)
    func(origin_url)
def get_text(self, locator="", locator_type="id",
             element: WebElement = None, info=""):
    """Return the stripped text of an element, or ``None`` on failure.

    Provide either ``element`` or a ``locator``/``locator_type`` pair; a
    non-empty ``locator`` takes precedence and is resolved through
    ``self.get_element_``. When the element's ``text`` is empty, the
    ``innerText`` attribute is used instead.

    :param locator: locator value; empty means "use ``element``"
    :param locator_type: locator strategy passed to ``get_element_``
    :param element: already-located element
    :param info: label used in log messages
    :return: the text (stripped when non-empty), or ``None`` if anything
        went wrong while reading it
    """
    try:
        if locator:
            self.log.debug("In locator condition")
            element = self.get_element_(locator, locator_type)
        self.log.debug("Before finding text")
        text = element.text
        self.log.debug("After finding element, size is: " + str(len(text)))
        if len(text) == 0:
            text = element.get_attribute("innerText")
        if len(text) != 0:
            self.log.info("Getting text on element :: " + info)
            self.log.info("The text is :: '" + text + "'")
            text = text.strip()
    except Exception:
        # Catch Exception, not everything: a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        self.log.error("Failed to get text on element " + info)
        print_stack()
        text = None
    return text
def analyze_element(self, el: WebElement):
    """Extract term, price and product name from a plan element.

    Elements carrying the ``featured-plan`` class are delegated to
    ``__featured_analyze_element``. For all others the term is the first
    whole number in the first list item (``1`` when none is found), the
    price is the first number in the ``h2`` and the name is the ``h4`` text.
    The document linked by the element's first anchor is then passed to
    ``__download_pdf``.

    :param el: plan element to analyse
    :return: dict with keys ``term``, ``price`` and ``product_name``
    """
    css_classes = el.get_attribute("class").split(" ")
    if "featured-plan" not in css_classes:
        term_text = el.find_element_by_xpath(".//ul/li[1]").text
        term_match = re.search(r"\b\d+\b", term_text)
        term = term_match.group() if term_match else 1

        price_text = el.find_element_by_css_selector("div.product2 h2").text
        price = re.search(r"(\d+(\.\d+)?)", price_text).group()

        product_name = el.find_element_by_css_selector(
            "div.product2 h4").text
    else:
        term, price, product_name = self.__featured_analyze_element(el)

    efl_link = el.find_element_by_xpath(".//a[1]")
    self.__download_pdf(efl_link.get_attribute("href"))

    return dict(term=term, price=price, product_name=product_name)
def getElemAttrib(e: WebElement, attName: str,
                  trialCount=20, pollInterval=3, default=None) -> str:
    """Read an attribute from an element, retrying on transient errors.

    :param e: element to read from
    :param attName: attribute name
    :param trialCount: maximum number of read attempts
    :param pollInterval: seconds to sleep after a failed attempt
    :param default: value returned when every attempt failed
    :return: the attribute value, or ``default``
    """
    for _attempt in range(trialCount):
        try:
            value = e.get_attribute(attName)
        except (StaleElementReferenceException, NoSuchElementException,
                TimeoutException):
            # Wait, then try again.
            sleep(pollInterval)
            continue
        return value
    return default
def _get_status_from_class_string(status_element: WebElement) -> JobStatus:
    """Return the first ``JobStatus`` whose value occurs in the class string.

    :param status_element: element whose ``class`` attribute encodes a status
    :return: the matching ``JobStatus`` member, in enum iteration order
    :raises KeyError: if no member's value is contained in the class string
    """
    class_string = status_element.get_attribute('class')
    matched = next(
        (status for status in JobStatus if status.value in class_string),
        None,
    )
    if matched is None:
        raise KeyError('Job status not found in class string: %s'
                       % str(class_string))
    return matched
def findVidSrc(vidEl: WebElement):
    """Derive the ``.mp4`` URL and file name from a video's poster image.

    The id is the part of the ``data-postersrc`` attribute between its last
    ``/`` and the following ``.``.

    :param vidEl: video element carrying a ``data-postersrc`` attribute
    :return: ``[url, file_name]``; ``['', '']`` when the attribute is missing
        or no id can be extracted from it
    """
    postersrc = vidEl.get_attribute('data-postersrc')
    if not postersrc:
        return ['', '']
    vid_ids = re.findall(r'^.*\/(.*?)\..*?$', postersrc)
    if not vid_ids:
        # The poster URL has no "/<id>.<ext>" tail: report "nothing found"
        # like the missing-attribute case instead of raising IndexError.
        return ['', '']
    file_name = f'{vid_ids[0]}.mp4'
    return [
        f'https://i.kinja-img.com/gawker-media/image/upload/{file_name}',
        file_name,
    ]
def submit_form(self, form: WebElement) -> None:
    """Wait for a form's submit button and click it.

    The form is addressed by a CSS selector built from its class names;
    when ``form`` is falsy, or has no class names, the generic ``form``
    selector is used.

    :param form: the form element, or a falsy value for "any form"
    """
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, which would otherwise produce empty tokens and an invalid
    # selector such as ".a..b" or ".".
    class_names = (form.get_attribute("class") or "").split() if form else []
    form_selector = "." + ".".join(class_names) if class_names else 'form'
    submit_button_selector = (
        f'{form_selector} input[type="submit"], '
        '.form-footer__content__main-controls button[id="submit-form"]')
    submit_button = self.wait_for_element(submit_button_selector)
    submit_button.click()
def get_value_of_elem(elem: WebElement):
    """Translate an element's ``class`` attribute into a cell value.

    :param elem: element whose class string encodes its state
    :return: ``'o'`` if the class contains ``blank``, ``'x'`` if it contains
        ``bombflagged``, otherwise the last character of the class string
        converted to ``int``
    :raises Exception: if the class contains ``bombdeath``
    """
    css_class = elem.get_attribute('class')
    if 'blank' in css_class:
        return 'o'
    if 'bombflagged' in css_class:
        return 'x'
    if 'bombdeath' in css_class:
        raise Exception("It's a boy!")
    # Any other class is expected to end in a digit.
    return int(css_class[-1])
def parse_link(story_element: WebElement) -> Tuple:
    """Dump a story's HTML to disk and return its title link.

    The element's ``outerHTML`` overwrites ``./parser/story.html`` on every
    call. The link is the descendant with class ``story__title-link``.

    :param story_element: story container element
    :return: ``(href, link_text)``
    """
    with open('./parser/story.html', 'w') as dump:
        dump.write(story_element.get_attribute('outerHTML'))

    # Disabled guard kept from the original:
    # if not story_element.find_elements_by_class_name("story__title-link"):
    #     return
    title_link = story_element.find_element_by_class_name("story__title-link")
    target = title_link.get_attribute('href')
    return target, title_link.text
def handle_script(script: WebElement) -> str:
    """Return the download URL of the highest quality version.

    The script's ``innerHTML`` is searched with the module-level
    ``config_regex``; its first group is parsed as JSON and the entry with
    the greatest ``height`` under ``request.files.progressive`` is chosen.

    :param script: ``<script>`` element containing the player configuration
    :return: the ``url`` of the tallest progressive file
    :raises IndexError: if the list of progressive files is empty
    """
    script_text = script.get_attribute("innerHTML")
    mobj = config_regex.search(script_text)
    unparsed_json = mobj.groups()[0]
    parsed_json = json.loads(unparsed_json)
    files = parsed_json["request"]["files"]["progressive"]
    by_height_desc = sorted(files, key=lambda f: f["height"], reverse=True)
    # Index 0 is the tallest entry. The previous index 1 returned the
    # second-best quality and failed when only one file was offered.
    top_quality = by_height_desc[0]["url"]
    return top_quality
def __init__(self, element: WebElement) -> None:
    """Wrap a tile element and read its title, summary and link target.

    :param element: element whose class attribute contains ``tiles__tile``
    :raises Exception: if the class attribute does not contain
        ``tiles__tile``
    """
    is_tile = "tiles__tile" in element.get_attribute("class")
    if not is_tile:
        raise Exception("Element is not a tile")

    # The heading and paragraph are looked up inside the tile's anchor.
    link: WebElement = element.find_element_by_xpath("a")
    self._link_element = link
    self._title_element: WebElement = link.find_element_by_tag_name("h2")
    self._summary_element: WebElement = link.find_element_by_tag_name("p")

    self.title: str = self._title_element.text
    self.summary: str = self._summary_element.text
    self.url: str = link.get_attribute("href")
def __web_element_to_dict(self, element: WebElement) -> dict:
    """Convert an item element into a plain dict.

    ``name`` and ``game`` come from ``data-*`` attributes. All other values
    are read through ``__get_text`` with XPath locators relative to the
    element; numeric-looking fields are additionally passed through
    ``__value_of``.

    :param element: item element to convert
    :return: dict with keys ``name``, ``game``, ``floatvalue``, ``paint``,
        ``price``, ``wear``, ``stattrack`` (bool) and ``locked``
    """
    def by_xpath(expression):
        return By.XPATH, expression

    def text_at(locator):
        return self.__get_text(element, locator)

    float_locator = by_xpath(
        './/span[contains(text(), "Float")]//parent::div')
    paint_locator = by_xpath(
        './/span[contains(text(), "Paint")]//parent::div')
    price_locator = by_xpath('.//span[@class="pricetext"]')
    wear_locator = by_xpath('.//div[@class="quality"]')
    stattrack_locator = by_xpath('.//div[@class="stattrak"]')
    locked_locator = by_xpath('.//div[@class="bot-icon"]')

    item_name = str(element.get_attribute('data-original-title')).strip()
    game_id = str(element.get_attribute('data-appid')).strip()
    return {
        'name': item_name,
        'game': game_id,
        'floatvalue': self.__value_of(text_at(float_locator)),
        'paint': self.__value_of(text_at(paint_locator)),
        'price': self.__value_of(text_at(price_locator)),
        'wear': text_at(wear_locator),
        'stattrack': bool(text_at(stattrack_locator)),
        'locked': text_at(locked_locator),
    }
def get_attribute_from_element(self, attribute_name: str,
                               locator_type: By = None,
                               locator_value: str = None,
                               element: WebElement = None,
                               from_element: WebElement = None,
                               time_to_wait: int = TIME_TO_WAIT) -> str:
    """Return an attribute of an element, locating the element if needed.

    :param attribute_name: name of the attribute to read
    :param locator_type: locator strategy, used when ``element`` is falsy
    :param locator_value: locator value, used when ``element`` is falsy
    :param element: already-located element; skips the lookup when truthy
    :param from_element: passed through to ``find_element_by``
    :param time_to_wait: passed through to ``find_element_by``
    :return: the attribute value
    """
    target = element or self.find_element_by(
        locator_type, locator_value, from_element, time_to_wait)
    return target.get_attribute(attribute_name)
def findImgSrc(imgEl: WebElement):
    """Derive the upload URL and file name for an image element.

    ``srcset`` (or ``data-srcset``) is preferred: the file name is the last
    path segment of its ``80w`` candidate. Without a srcset, the last path
    segment of ``src`` is used.

    :param imgEl: image element to inspect
    :return: ``[url, file_name]``; ``['', '']`` when no usable file name can
        be extracted
    """
    upload_base = 'https://i.kinja-img.com/gawker-media/image/upload/'
    srcset = imgEl.get_attribute('srcset')
    if not srcset:
        srcset = imgEl.get_attribute('data-srcset')
    if not srcset:
        src = imgEl.get_attribute('src')
        if not src:
            return ['', '']
        src_ids = re.findall(r'^.*\/(.*?)$', src)
        if len(src_ids) == 0 or src_ids[0].startswith('http'):
            return ['', '']
        return [f'{upload_base}{src_ids[0]}', src_ids[0]]
    img_ids = re.findall(r'^.*\/(.*?)\s80w', srcset)
    if not img_ids:
        # The srcset has no "80w" candidate: report "nothing found" like the
        # other failure paths instead of raising IndexError.
        return ['', '']
    return [f'{upload_base}{img_ids[0]}', img_ids[0]]
def getElemAttr(elem: WebElement, attr: str,
                timeOut: float = 30, pollFreq: float = 1) -> str:
    """Read an attribute, retrying while the element reference is stale.

    :param elem: element to read from
    :param attr: attribute name
    :param timeOut: total seconds to keep retrying
    :param pollFreq: seconds to sleep after a stale-reference failure
    :return: the attribute value
    :raises TimeoutException: if the accumulated retry time reaches
        ``timeOut`` without a successful read
    """
    waited = 0.0
    while waited < timeOut:
        started_at = time.time()
        try:
            value = elem.get_attribute(attr)
        except StaleElementReferenceException:
            # Pause, then charge the whole attempt against the budget.
            time.sleep(pollFreq)
            waited += time.time() - started_at
        else:
            return value
    raise TimeoutException('[getElemAttr] Time out elem=' + str(elem))
def getElemAttrib(e: WebElement, attName: str, trialCount=20,
                  pollInterval=3, default=None) -> str:
    """Fetch an element attribute with a bounded number of retries.

    :param e: element to read from
    :param attName: attribute name
    :param trialCount: maximum number of attempts
    :param pollInterval: seconds slept after each failed attempt
    :param default: fallback returned once the attempts are used up
    :return: the attribute value, or ``default``
    """
    for _ in range(trialCount):
        try:
            result = e.get_attribute(attName)
        except (StaleElementReferenceException, NoSuchElementException,
                TimeoutException):
            sleep(pollInterval)
        else:
            return result
    return default
def clickDownloadableElem(elem:WebElement)->str:
    """Record a download link's ``href`` in the ``dlink`` table.

    The element's text (via ``getElemText``) is used as the file name and
    its ``href`` attribute as the link. Both are printed through ``uprint``
    and written with an ``UPDATE dlink`` statement through ``sql``.

    NOTE(review): despite the name, nothing in this body clicks the element,
    and the function returns ``None`` although it is annotated ``-> str`` --
    it looks unfinished; confirm with the author.

    :param elem: link element to record
    :raises AssertionError: if the element has no ``href`` attribute
        (only while assertions are enabled)
    """
    # NOTE: the local names below are part of the behaviour. locals() is
    # handed to sql() as the named-parameter mapping and to the %-format, so
    # ``href`` and ``fileName`` must keep their names, and every other local
    # is passed along as well.
    pollFreq=1
    filesOld=os.listdir(dlDir)
    fileName=getElemText(elem)
    href = elem.get_attribute('href')
    assert href is not None
    uprint('fileName="%s" href="%s"'%(fileName,href))
    # Named placeholders (:href, :fileName) are filled from locals().
    sql("UPDATE dlink SET href=:href WHERE"
        " file_name=:fileName",locals())
    # Print the same statement with the values interpolated.
    uprint("UPDATE dlink SET href=%(href)s WHERE"
           " file_name=%(fileName)s"%locals())
    return
def _get_team_id_from_link(link: WebElement) -> str:
    """Extract the team ID that follows ``teamId=`` in a link's ``href``.

    :param link: anchor element whose ``href`` contains ``teamId=<id>``
    :return: everything after the last ``teamId=``, as a string
    :raises RuntimeError: if that text is not an integer
    """
    href_attribute = link.get_attribute("href")
    team_id = href_attribute.rpartition("teamId=")[2]
    try:
        # Validation only; the ID is returned as a string.
        int(team_id)
    except ValueError as e:
        logger.error(
            f"Team ID {team_id} does not seem correct (not an integer)")
        raise RuntimeError(
            f"Could not get team ID from `href` attribute: {href_attribute}"
        ) from e
    return team_id
def _get_team_id_from_class(element: WebElement) -> str:
    """Extract the team ID from a ``teamId-<id>`` class token.

    :param element: element whose ``class`` attribute contains
        ``teamId-<id>``
    :return: the ID following the last ``teamId-``, as a string
    :raises RuntimeError: if the extracted text is not an integer
    """
    class_attribute = element.get_attribute("class")
    # Cut at the next space so class tokens that follow the ID are ignored,
    # matching how _get_player_id_from_class handles ``playerNameId-``.
    team_id = class_attribute.split("teamId-")[-1].split(" ")[0]
    try:
        # Validation only; the ID is returned as a string.
        _ = int(team_id)
    except ValueError as e:
        logger.error(
            f"Team ID {team_id} does not seem correct (not an integer)")
        raise RuntimeError(
            f"Could not get team ID from `class` attribute: {class_attribute}"
        ) from e
    return team_id
def from_web_element(self, web_element: WebElement):
    """Populate this session's fields from a session element.

    The element text is split on single spaces: token 0 is the status and
    token 1 the session id. The session name is the text of the child with
    class ``title``; the URL is ``data-vote-url`` with ``/vote`` removed.
    The speaker text is what remains of tokens ``2..-3`` once the session
    name and the word ``about`` are removed.

    :param web_element: element describing one session
    """
    tokens = web_element.text.split(' ')
    self.status = tokens[0]
    self.session_id = tokens[1]

    title_element = web_element.find_element_by_class_name('title')
    self.session_name = title_element.text

    vote_url = web_element.get_attribute('data-vote-url')
    self.url = vote_url.replace('/vote', '')

    middle_text = " ".join(tokens[2:-3])
    without_title = middle_text.replace(self.session_name, '')
    self.speaker_text = without_title.replace('about', '').strip()
def element_to_image_info(element: Element,
                          with_thumbs: bool) -> Optional[ImageInfo]:
    """Build an ``ImageInfo`` from a search-result element.

    The image URL is read from the JSON in the element's ``data-bem``
    attribute (``serp-item`` -> ``img_href``). When ``with_thumbs`` is set,
    a screenshot of the ``serp-item__thumb`` child is attached.

    :param element: search-result item element
    :param with_thumbs: whether a thumbnail screenshot is required
    :return: the ``ImageInfo``, or ``None`` when the URL is missing or a
        required thumbnail could not be captured
    """
    bem = json.loads(element.get_attribute('data-bem') or "{}")
    url = bem.get('serp-item', {}).get('img_href', None)
    if not url:
        return None
    thumb = find(element, 'serp-item__thumb') if with_thumbs else None
    # Screenshot the located thumbnail element. The previous code read
    # ``thumb_png`` here before assigning it, which raised
    # UnboundLocalError whenever a thumbnail was found.
    thumb_png = thumb.screenshot_as_png if thumb else None
    if with_thumbs and not thumb_png:
        return None
    return ImageInfo(url=url, thumb_png=thumb_png)
def getElemAttr(elem: WebElement, attr: str, timeOut: float = 30,
                pollFreq: float = 1) -> str:
    """Return an element attribute, polling through stale references.

    :param elem: element to read from
    :param attr: attribute name
    :param timeOut: retry budget in seconds
    :param pollFreq: pause in seconds after each stale-reference failure
    :return: the attribute value
    :raises TimeoutException: once the time spent on failed attempts
        reaches ``timeOut``
    """
    spent = 0.0
    while True:
        if not spent < timeOut:
            break
        tick = time.time()
        try:
            return elem.get_attribute(attr)
        except StaleElementReferenceException:
            pass
        time.sleep(pollFreq)
        spent += time.time() - tick
    raise TimeoutException('[getElemAttr] Time out elem=' + str(elem))
def _get_player_id_from_class(element: WebElement) -> str:
    """Extract the player ID from a ``playerNameId-<id>`` class token.

    :param element: element whose ``class`` attribute contains
        ``playerNameId-<id>``
    :return: the text between the last ``playerNameId-`` and the next
        space, as a string
    :raises RuntimeError: if that text is not an integer
    """
    class_attribute = element.get_attribute("class")
    after_marker = class_attribute.rpartition("playerNameId-")[2]
    player_id = after_marker.partition(" ")[0]
    try:
        # Validation only; the ID is returned as a string.
        int(player_id)
    except ValueError as e:
        logger.error(
            f"Player ID {player_id} does not seem correct (not an integer)"
        )
        raise RuntimeError(
            f"Could not get player ID from `class` attribute: {class_attribute}"
        ) from e
    return player_id
def robust_input(self, ele: WebElement, text: str, use_paste=False,
                 ensure_value=False) -> bool:
    """Clear an input and enter ``text``, optionally verifying the result.

    :param ele: input element to fill
    :param text: text to enter
    :param use_paste: copy ``text`` to the clipboard with the ``clip``
        command and paste it with Ctrl+V instead of typing it
    :param ensure_value: compare the element's ``value`` attribute with
        ``text`` afterwards
    :return: ``False`` only when ``ensure_value`` is set and the value
        differs (a warning is logged); ``True`` otherwise
    """
    ele.clear()
    if not use_paste:
        ele.send_keys(text)
    else:
        import os
        # NOTE(review): the command line is built by string formatting, so
        # shell metacharacters in ``text`` are interpreted by the shell.
        os.system("echo {}|clip".format(text))
        ele.send_keys(Keys.CONTROL, "v")

    if not ensure_value:
        return True
    if ele.get_attribute("value") == text:
        return True
    logger_driver.warning({
        "target input": text,
        "inputted": ele.get_attribute("value")
    })
    return False
def get_attrs(self, element: WebElement, method: str = 'bs4',
              verbose: int = 0):
    """Return the attributes of ``element`` as a dict.

    WebDriver has no API for listing attributes; see
    https://stackoverflow.com/questions/27307131/selenium-webdriver-how-do-i-find-all-of-an-elements-attributes

    For ``<div class="login-greeting">Hi LCA Editor Tester,</div>`` the two
    methods differ in value type:
    bs4 gives ``{'class': ['login-greeting']}``,
    js gives ``{'class': 'login-greeting'}``.

    :param element: element to inspect
    :param method: ``'bs4'`` parses the element's ``outerHTML`` with
        BeautifulSoup; ``'js'`` reads the attributes in the browser
    :param verbose: when truthy, log the ``outerHTML`` (bs4 method only)
    :return: dict of attribute name to value; empty when the element has no
        ``outerHTML``
    :raises RuntimeError: if ``method`` is neither ``'bs4'`` nor ``'js'``
    """
    if method == 'bs4':
        html: str = element.get_attribute('outerHTML')
        if verbose:
            tplog(f"outerHTML={html}")
        if not html:
            return {}
        soup = BeautifulSoup(html, 'html.parser')
        # outerHTML includes the element's descendants. Only the first tag
        # is the element itself; merging every tag's attrs let child
        # attributes overwrite the element's own (e.g. its ``class``).
        root = soup.find(True)
        return dict(root.attrs) if root is not None else {}
    if method == 'js':
        # JavaScript: duplicate attribute names overwrite each other.
        js_script = (
            'var items = {}; '
            'for (index = 0; index < arguments[0].attributes.length; ++index) { '
            ' items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value'
            '};'
            'return items;'
        )
        return self.driver.execute_script(js_script, element)
    raise RuntimeError(
        f"unsupported method={method}. accepted: bs4 or js")
def analyze_element(self, el: WebElement):
    """Extract term, price and product name from a plan card.

    Cards whose ``data-product-promo`` is in ``self.plans_promo_blacklist``
    get the fixed term ``"1"``; otherwise the term is the number in front
    of ``Months`` in the card's list. After reading price and name, the
    card's modal-footer link is clicked, the ``iframe`` source in the second
    window is loaded, that window is closed and focus returns to the first
    window.

    :param el: plan card element
    :return: dict with keys ``term``, ``price`` and ``product_name``
    :raises Exception: if the term text contains no ``<n> Months``
    """
    promo = el.get_attribute("data-product-promo")
    if promo in self.plans_promo_blacklist:
        term = "1"
    else:
        term_text = el.find_element_by_css_selector(
            "div.card-body div ul").text
        months = re.search(r"(\d+)\s+Months", term_text)
        if not months:
            raise Exception("Term could not match. (%s)" % term_text)
        term = months.group(1)

    price_text = el.find_element_by_css_selector(
        "div.card-body div.modal-body h1,"
        "div.card-body div.product2.cards_div2 h2").text
    # Only the part in front of the cent sign is searched for the number.
    cents_part = price_text.split("¢")[0]
    price = re.search(r"(\d+(\.\d+)?)", cents_part).group(1)

    product_name = el.find_element_by_css_selector(
        "div.card-body div.modal-body h2,"
        "div.card-body div.product2.cards_div2 h4").text

    el.find_element_by_css_selector("div.modal-footer a").click()
    browser = self.client
    browser.switch_to_window(browser.window_handles[1])
    pdf_url = browser.find_element_by_tag_name("iframe").get_attribute("src")
    browser.get(pdf_url)
    browser.close()
    browser.switch_to_window(browser.window_handles[0])
    self.wait_for()

    return dict(term=term, price=price, product_name=product_name)
def send_keys(self,  # pylint: disable=no-self-use
              element: WebElement,
              value: str,
              retries: int = 10) -> None:
    '''Send keys to an element and verify that they arrived.

    According to the original author, interactions with text elements do
    not always register in non-headless mode, leaving inputs empty. Each
    attempt clears the element, sends the keys and compares the element's
    ``value`` attribute with the expected string; an error is logged when
    no attempt succeeds.
    '''
    for _attempt in range(retries):
        element.clear()
        element.send_keys(value)
        if element.get_attribute('value') != value:
            continue
        return
    logging.error('Failed to send keys to the element')
def get_info_from_page(self, elm: WebElement, input_info, flag):
    """Extract a piece of information from a page element.

    :param elm: page element to read from
    :param input_info: attribute name to read when ``flag`` is
        ``"ATTRIBUTE"`` (for example ``value``)
    :param flag: ``"TEXT"`` to read the element's text, ``"ATTRIBUTE"`` to
        read the attribute named by ``input_info``
    :return: the extracted text or attribute value
    :raises UnsupportedValueException: if ``flag`` is any other value
    """
    if flag == "TEXT":
        # Element text.
        return elm.text
    if flag == "ATTRIBUTE":
        # Element attribute value.
        return elm.get_attribute(input_info)
    raise UnsupportedValueException('flag', flag, ['TEXT', 'ATTRIBUTE'])
def get_value_with_type(
        self, input_el: WebElement,
        **data: InputValue) -> t.Tuple[InputValue, InputType]:
    """Look up the value for an input and decide how it should be entered.

    The input's name (from ``self.get_input_name``) selects the entry in
    ``data``. A tuple entry is taken as an explicit ``(type, value)`` pair;
    otherwise the type is inferred: date-like values, radio/checkbox
    inputs (value wrapped in a list), or the default type.

    :param input_el: the form input element
    :param data: form data keyed by input name
    :return: ``(value, input_type)``
    :raises MissingDataException: if ``data`` has no entry for the input
    """
    input_name = self.get_input_name(input_el)
    try:
        datum = data[input_name]
    except KeyError:
        raise MissingDataException(f"Form is missing '{input_name}' "
                                   f"input in {data}")
    # The inference branches work on ``datum``: ``value`` is only bound in
    # the tuple branch, so reading it there raised UnboundLocalError for
    # every plain (non-tuple) entry.
    if isinstance(datum, tuple):
        el_type, value = datum  # type: ignore
    # TODO: this regex should be more flexible / defined in settings
    elif self.is_valid_date(datum):
        el_type, value = InputTypes.Date, datum
    elif input_el.get_attribute('type') in ['radio', 'checkbox']:
        el_type, value = InputTypes.Checkbox, [datum]
    else:
        value = datum
        el_type = InputTypes.Default
    return value, el_type
def set_val(self, element: WebElement, val: Any, send_enter: bool = False,
            send_keys: bool = False, **kw) -> None:
    """Set an input's value by typing it or by running a script.

    If setting the value fails, a warning is logged; unless
    ``self.suppress_errors`` is set, the value is then typed with
    ``send_keys`` as a fallback.

    Parameters
    ----------
    element : WebElement
    val : Any
        converted with ``str()`` before use
    send_enter : bool, optional
        press enter after sending, default False
    send_keys : bool, optional
        type the value instead of assigning it by script, default False
    """
    driver = self.driver
    text = str(val)
    try:
        if send_keys:
            # Type the raw text. The JS-escaped form used previously put
            # literal backslashes into the field.
            element.send_keys(text)
        else:
            # Pass the element and value as script arguments: no quoting or
            # escaping is needed, and the element does not need an id.
            driver.execute_script(
                "arguments[0].value = arguments[1];", element, text)
        if send_enter:
            # Date fields need an ENTER to set the value properly.
            element.send_keys(Keys.ENTER)
    except Exception:
        log.warning(f'couldn\'t set value: {text}')
        if self.suppress_errors:
            return
        element.send_keys(text)
def _parse_root(self, category: WebElement):
    """Open a root category and parse the columns underneath it.

    Categories whose name equals an entry of
    ``self.crawl_config.exclude_category`` are skipped. Otherwise the
    category is activated by sending ENTER to its ``home_<class>`` element,
    the root is stored through ``_insert`` and its ``co_col`` children are
    handed to ``_parse_co_col``.

    :param category: root category element
    :return: ``None``
    """
    # Root name.
    root_name: str = category.text
    logging.info('rootName : ' + root_name)

    excluded = self.crawl_config.exclude_category
    if any(eq(root_name, name) for name in excluded):
        return None

    class_att = category.get_attribute('class')
    click_xpath = '//*[@id="home_{0}"]'.format(class_att)
    # Inner XPath derived from the same class attribute.
    xpath_cate = '//*[@id="home_{0}_inner"]/div[1]'.format(class_att)

    self.driver.implicitly_wait(5)
    # Try a first click.
    self.driver.find_element_by_xpath(click_xpath).send_keys(Keys.ENTER)
    time.sleep(1)

    # Root category container.
    element: WebElement = None
    while element is None:
        # Keep clicking until the click registers and the container is found.
        self.driver.find_element_by_xpath(click_xpath).send_keys(Keys.ENTER)
        self.driver.implicitly_wait(4)
        time.sleep(1)
        element = self.driver.find_element_by_xpath(xpath_cate)

    self._insert(None, root_name, None, True)

    # Root -> sub categories.
    co_col_elements = element.find_elements(By.CLASS_NAME, 'co_col')
    self._parse_co_col(co_col_elements, root_name)
def is_expanded(self, node: WebElement):
    """Return whether ``dynatree-expanded`` occurs in the node's class.

    :param node: tree node element
    :return: ``True`` if the ``class`` attribute contains the substring
        ``dynatree-expanded``
    """
    node_classes = node.get_attribute('class')
    return 'dynatree-expanded' in node_classes
def elem_has_class(elem: WebElement, className: str):
    """Return whether ``className`` is one of the element's class tokens.

    :param elem: element to inspect
    :param className: exact class name to look for
    :return: ``True`` if ``className`` equals one of the
        whitespace-separated tokens of the ``class`` attribute
    """
    return className in elem.get_attribute('class').split()
def get_value(element: WebElement) -> str:
    """Return the element's ``value`` attribute.

    :param element: element to read from
    :return: whatever ``get_attribute('value')`` yields
    """
    current_value = element.get_attribute('value')
    return current_value