def analyze_element(self, el: WebElement):
    """Extract term, price and product name from one plan card element.

    Also clicks the card's last grid-plan link (download trigger) before
    returning.

    Returns a dict with keys 'term', 'price', 'product_name'.
    """
    # term: first integer in the bullet text, e.g. "12 month plan" -> "12"
    term_full_text = el.find_element_by_xpath(
        ".//ul[@class='price-bullet']/li").text
    try:
        term = re.search(r'\b\d+\b', term_full_text).group()
    except AttributeError:
        # re.search() returned None (no digits found) -> .group() raised.
        # Was `except Exception`; narrowed to the only expected failure.
        # NOTE(review): this fallback is an int while the parsed value is a
        # str — callers should confirm which type they rely on.
        term = 1
    # price
    price = el.find_element_by_xpath(
        ".//div[@class='price-container']/h4/span").text
    # product
    product_name = el.find_element_by_xpath('.//h3').text
    # download: click the last grid-plan link
    # close the survey dialog box if it is open
    link = el.find_elements_by_xpath(
        './/div[@class="gridPlanLinks"]/a')[-1]
    link.click()
    return {
        'term': term,
        'price': price,
        'product_name': product_name,
    }
def __get_all_similar_question_in_row(self, row: WebElement):
    """Collect the text of every draggable similar-question span in *row*."""
    xpath = "./td[1]/div[1]//span[@draggable]"
    return [span.text for span in row.find_elements_by_xpath(xpath)]
def get_times_items(tbody: WebElement):
    """Pair each row of *tbody* with its parsed time column.

    Reads the second cell of every <tr>, upper-cases it and parses it with
    the module-level TIME_FORMAT.

    Returns a list of (row WebElement, datetime, raw upper-cased string)
    tuples.
    """
    parsed = []
    for row in tbody.find_elements_by_xpath("tr"):
        raw_time = row.find_element_by_xpath("td[2]").text.upper()
        parsed.append((row, datetime.strptime(raw_time, TIME_FORMAT), raw_time))
    return parsed
def find_child_hrefs(button: WebElement) -> List[str]:
    """Collect the non-None ``href`` attributes of all descendants of *button*.

    Best-effort: a descendant that raises while being queried (e.g. a stale
    element) is skipped rather than aborting the whole scan.
    """
    hrefs: List[str] = []
    for child in button.find_elements_by_xpath(".//*"):
        try:
            href = child.get_attribute("href")
        except Exception:
            # Was a bare `except:` — never swallow SystemExit/KeyboardInterrupt.
            continue
        if href is not None:
            hrefs.append(href)
    return hrefs
def get_caption(node: WebElement):
    """Concatenate the reformatted text of every direct child of *node*."""
    parts = []
    for child in node.find_elements_by_xpath('child::*'):
        raw = MediumScraper.safe_get_attribute(child, 'text', '')
        parts.append(child_reformat(raw))
    return ''.join(parts)
def _get_children(element: WebElement) -> typing.List[WebElement]:
    """Return all descendants of *element*: direct children first, then each
    child's own descendants (recursively), in document order per level.

    An InvalidSelectorException on the lookup yields an empty list.
    """
    try:
        direct = element.find_elements_by_xpath('./*')
    except InvalidSelectorException:
        return []
    descendants: typing.List[WebElement] = []
    for node in direct:
        descendants.extend(WebElementHandler._get_children(node))
    return direct + descendants
def _get_parents(element: WebElement) -> typing.List[WebElement]:
    """Return the ancestor chain of *element*: its parent first, then that
    parent's ancestors (recursively).

    An InvalidSelectorException on the lookup yields an empty list.
    """
    try:
        direct = element.find_elements_by_xpath('..')
    except InvalidSelectorException:
        return []
    ancestors: typing.List[WebElement] = []
    for node in direct:
        ancestors.extend(WebElementHandler._get_parents(node))
    return direct + ancestors
def make_card_instance(self, html_element: WebElement, card: MagicCard, store: MagicStore):
    """Build a MagicCardInstance from one store-listing table row.

    Parses language, condition, quantity, price and foil flag out of the
    row text, and the edition from the set-icon tooltip in the first cell.
    Returns None when the listed quantity is zero.
    """
    columns = html_element.find_elements_by_xpath(".//td")
    parsed = re.search(r' (.*?) (.*) (.*?) unid. (.*) (.*$)',
                       html_element.text, re.S)
    amount = int(parsed.group(3))
    if amount == 0:
        return None
    # Edition comes from the tooltip of the set icon in the first column.
    edition = columns[0].find_element_by_xpath(
        ".//a/img").get_attribute("title")
    language = parsed.group(1)
    quality = next(q for q in self.qualities if q in parsed.group(2))
    # Price like "1.234,56" -> 1234.56 (pt-BR formatting).
    price = float(parsed.group(5).replace(".", "").replace(",", "."))
    foil = "Foil" in parsed.group(2)
    return MagicCardInstance(price, quality, amount, language, edition,
                             foil, card, store)
def node_find_elements_by_xpath(node: WebElement, xpath: str,
                                raise_exc: bool = True) -> Union[None, List[WebElement]]:
    """Safe wrapper around ``node.find_elements_by_xpath``.

    Returns [] when *node* is None.  On WebDriverException the exception is
    re-raised when *raise_exc* is True, otherwise None is returned.
    NOTE(review): the missing-node case yields [] while the suppressed-error
    case yields None — callers should confirm they distinguish the two.
    """
    if node is None:
        return []
    try:
        return node.find_elements_by_xpath(xpath)
    except WebDriverException:
        if raise_exc:
            raise  # bare raise preserves the original traceback (was `raise exc`)
        return None
def analyze_element(self, el: WebElement):
    """Extract term, price and product name from one plan card.

    Opens the card-footer dropdown and clicks the download link, retrying
    the link lookup up to five times while the dropdown animates in.

    Returns a dict with keys 'term', 'price', 'product_name'.
    """
    card_body_elements = el.find_elements_by_xpath(
        './/div[contains(@class, "card")]'
        '/div[contains(@class, "card-body")]/div')
    # term, e.g. "Term: 12 months" -> "12"
    # NOTE(review): lstrip/rstrip strip character SETS, not prefixes —
    # safe only while the remaining value is purely numeric.
    term_element = card_body_elements[1].find_element_by_xpath(
        ".//div/span")
    term = term_element.text.lstrip("Term:").rstrip("months").strip()
    # price, e.g. "9.5¢" -> "9.5"
    price_element = card_body_elements[0].find_element_by_xpath(".//div")
    price = price_element.text.rstrip('¢')
    # product
    product_name = el.find_element_by_xpath('.//div/div/h2').text
    # download: open the footer dropdown, then click the revealed link
    # close the survey dialog box if it is open
    footer_element = el.find_element_by_xpath(
        './/div[contains(@class, "card")]'
        '/div[contains(@class, "card-footer")]')
    dropdown_element = footer_element.find_element_by_xpath('.//button')
    self.wait_for()
    dropdown_element.click()
    # find_elements (plural) so a not-yet-rendered link yields None instead
    # of raising NoSuchElementException, making the retry loop reachable.
    links = footer_element.find_elements_by_xpath(".//div/a")
    link = links[0] if links else None
    retries = 0
    while link is None and retries < 5:
        self.wait_for()
        links = footer_element.find_elements_by_xpath(".//div/a")
        link = links[0] if links else None
        retries += 1  # was `retries = 0`: the loop could never terminate
    if link:
        link.click()
        self.wait_for()
    return {
        'term': term,
        'price': price,
        'product_name': product_name,
    }
def parse_row(tr: WebElement) -> Tuple[str, str]:
    """Use for parsing trade card or lot card.

    Two-cell rows are (field name, field value); the value cell may embed a
    change-history <table>, in which case only the first inner cell's text
    is taken.  One-cell rows carry the name in <b> and the value in <div>.

    Raises:
        ValueError: for any other cell count.
    """
    cells = tr.find_elements_by_xpath('./td')
    if len(cells) == 2:
        field_name = cells[0].get_attribute('innerText').replace('.', ' ').strip()
        inner_tables = cells[1].find_elements_by_tag_name('table')
        if inner_tables:
            # (translated) Sometimes the cell contains a change history
            # rendered as a table — take the first inner cell's text.
            field_value = inner_tables[0].find_element_by_tag_name(
                'td').get_attribute('innerText')
        else:
            field_value = cells[1].get_attribute('innerText')
    elif len(cells) == 1:
        field_name = cells[0].find_element_by_tag_name('b').get_attribute(
            'innerText').strip()
        field_value = cells[0].find_element_by_tag_name('div').get_attribute(
            'innerText')
    else:
        # Fixed typo ("Wrang") and dropped the placeholder-less f-prefix.
        raise ValueError('Wrong amount of cells')
    return field_name, field_value
def get_child_nodes(self, node: WebElement) -> list:
    """Return the direct child elements of *node*."""
    direct_children = node.find_elements_by_xpath('./*')
    return direct_children
def parse_experience_row(experience_row: WebElement) -> dict:
    """Parse a single LinkedIn "experience" row into a plain dict.

    Two layouts are handled in sequence:
      * ONE POSITION  — a single company block with one role;
      * MANY POSITIONS — a company block with several role sub-entries
        (attempted second; it overwrites the single-position fallbacks
        when its company-name selector matches).

    Returns a dict with keys 'company', 'duration_summary' and
    'positions' (list of position dicts).  Missing pieces degrade to ''
    or empty entries; this function never raises.
    """
    experience = {'positions': []}
    # ONE POSITION
    try:
        experience['company'] = clean_company_name(
            experience_row.find_element_by_xpath(
                selectors['profile_company_name_with_one_position']).text)
        experience['duration_summary'] = parse_duration(experience_row)
        position = {
            'name': parse_one_position_name(experience_row),
            'location': parse_location(experience_row),
            'description': parse_description(experience_row),
            'dates': parse_dates_from_to(experience_row)
        }
        # With a single position, the overall summary duration is also the
        # role's duration.
        position['dates']['duration'] = experience['duration_summary']
        experience['positions'].append(position)
    except NoSuchElementException as e:
        # Expected when the row actually uses the many-positions layout.
        experience['company'] = ''
        logging.debug(
            f"profile_company_name_with_one_position not found (maybe because it's many positions?) {e}"
        )
    except Exception as e:
        # NOTE(review): broad catch keeps the scrape alive but can hide
        # real bugs — confirm this is intentional.
        experience['company'] = ''
        logging.debug(f"Unknown Exception {e}")
    # MANY POSITIONS
    try:
        experience['company'] = clean_company_name(
            experience_row.find_element_by_xpath(
                selectors['profile_company_name_with_many_positions']).text)
        try:
            experience[
                'duration_summary'] = experience_row.find_element_by_xpath(
                    selectors[
                        'profile_company_summary_duration_with_many_positions']
                ).text
        except NoSuchElementException as e:
            experience['duration_summary'] = ''
            logging.debug(
                f"Can't find profile_company_summary_duration_with_many_positions {e}"
            )
            print(
                f"Can't find profile_company_summary_duration_with_many_positions"
            )
        except Exception as e:
            experience['duration_summary'] = ''
            logging.debug(f"Unknown Exception {e}")
        try:
            for role in experience_row.find_elements_by_xpath(
                    selectors['profile_experience_role_for_many_positions']):
                # Roles may be lazily rendered; bring each into view before
                # parsing it.
                scroll_to_element(
                    role, 'profile_experience_role_for_many_positions role')
                position = {
                    'name': parse_many_position_name(role),
                    'description': parse_description(role),
                    'dates': parse_dates_from_to(role),
                    'location': parse_location(role)
                }
                position['dates']['duration'] = parse_duration(role)
                experience['positions'].append(position)
        except NoSuchElementException as e:
            # Keep the result shape stable with one empty position entry.
            experience['positions'].append({
                'name': '',
                'location': '',
                'description': '',
                'dates': {
                    'from': '',
                    'to': '',
                    'duration': ''
                }
            })
            logging.debug(
                f"Can't find profile_experience_role_for_many_positions {e}")
        except Exception as e:
            logging.debug(f"Unknown Exception {e}")
            experience['positions'].append({
                'name': '',
                'location': '',
                'description': '',
                'dates': {
                    'from': '',
                    'to': '',
                    'duration': ''
                }
            })
    except NoSuchElementException as e:
        # Normal whenever the row was the single-position layout.
        logging.debug(
            f'profile_company_name_with_many_positions not found (its normal!) {e}'
        )
    except Exception as e:
        logging.debug(f"Unknown Exception {e}")
    return experience
def scrape_post(self, e: WebElement, url: str) -> dict:
    """Scrape one Facebook post element into an article dict.

    Extracts the (Dutch-locale) post date, headline, message text, post
    url and engagement counters; missing pieces fall back to "-" / 0.

    Fixes vs. the previous revision:
      * removed a dead ``article = dict(...)`` that referenced ``msg``
        before assignment (guaranteed NameError);
      * ``datetime.datetime.strptime`` used consistently (bare
        ``datetime.strptime`` fails under ``import datetime``);
      * the already-parsed date is kept when the legacy ``abbr._5ptz``
        timestamp is absent, instead of being clobbered with 01-01-1990;
      * the counter loop no longer resets a counter found in an earlier
        iteration.
    """
    date = e.find_element_by_css_selector("span.tojvnm2t.a6sixzi8.abs2jz4q.a8s20v7p.t1p8iaqh.k5wvi7nf.q3lfd5jv.pk4s997a.bipmatt0.cebpdrjk.qowsmv63.owwhemhu.dp1hu0rb.dhp61c6y.iyyx5f41").text
    print(0, date)
    if not date:
        # alternative (anchor-styled) timestamp node
        date = e.find_element_by_css_selector("a.oajrlxb2.g5ia77u1.qu0x051f.esr5mh6w.e9989ue4.r7d6kgcz.rq0escxv.nhd2j8a9.nc684nl6.p7hjln8o.kvgmc6g5.cxmmr5t8.oygrvhab.hcukyx3x.jb3vyjys.rz4wbd8a.qt6c0cv9.a8nywdso.i1ao9s8h.esuyzwwr.f1sip0of.lzcic4wl.gmql0nx0.gpro0wi8.b1v8xokw").text
        print(1, date)
    # Dutch month names are needed for strptime below.
    locale.setlocale(locale.LC_ALL, 'nl_NL.UTF-8')
    if re.search(r'\du', date):
        # "<n>u" = n hours ago -> today
        date = datetime.datetime.today()
        print(2, date)
    elif re.search(r'\dm', date):
        # "<n>m" = n minutes ago -> today
        date = datetime.datetime.today()
        print(3, date)
    elif 'Gisteren' in date:
        # "Gisteren" = yesterday
        datum = datetime.datetime.today()
        date = datum - datetime.timedelta(days=1)
        print(4, date)
    elif re.search(r'om', date):
        # "<day> <month> om <HH:MM>"
        m = re.match(r"(\d+ \w+) om (\d\d:\d\d)", date)
        date, time = m.groups()
        date = f"{date} 2020 {time}"  # NOTE(review): hard-coded year — confirm
        date = datetime.datetime.strptime(date, "%d %B %Y %H:%M")
        print(5, date)
    else:
        m = re.match(r"(\d+ \w+)", date)
        date = m[1]
        date = f"{date} 2020"  # NOTE(review): hard-coded year — confirm
        print(9, date)
        date = datetime.datetime.strptime(date, "%d %B %Y")
    print(6, date)
    try:
        headline = e.find_element_by_css_selector("a.oajrlxb2.g5ia77u1.qu0x051f.esr5mh6w.e9989ue4.r7d6kgcz.rq0escxv.nhd2j8a9.nc684nl6.p7hjln8o.kvgmc6g5.cxmmr5t8.oygrvhab.hcukyx3x.jb3vyjys.rz4wbd8a.qt6c0cv9.a8nywdso.i1ao9s8h.esuyzwwr.f1sip0of.lzcic4wl.oo9gr5id.gpro0wi8.lrazzd5p").text
    except NoSuchElementException:
        logging.debug(f"No headline by: {e}")
        headline = "-"
    print(headline)
    try:
        url = e.find_element_by_css_selector(".fsm > ._5pcq")
        url = fbposturl(url.get_attribute("href"))
    except NoSuchElementException:
        logging.debug(f"No url by: {e}")
        url = "-"
    print(url)
    try:
        # The legacy layout carries an exact timestamp; prefer it when present.
        exact = e.find_element_by_css_selector("abbr._5ptz").get_attribute("title")
        date = datetime.datetime.strptime(exact, "%d-%m-%Y %H:%M")
    except NoSuchElementException:
        # Keep the date parsed above instead of clobbering it with a default.
        logging.debug(f"No exact timestamp by: {e}")
    print(date)
    try:
        msg = e.find_element_by_css_selector("div.kvgmc6g5.cxmmr5t8.oygrvhab.hcukyx3x.c1et5uql.ii04i59q").text
    except NoSuchElementException:
        logging.debug(f"No message by: {e}")
        msg = "-"
    if msg.strip() == "":
        logging.debug(f"No message by: {e}")
        msg = "-"
    article = dict(title=headline, date=date, text=msg, url=url,
                   medium="dtvnieuws")
    try:
        lijst = [x.text for x in e.find_elements_by_xpath(".//div[@class='oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl l9j0dhe7 abiwlrkh gpro0wi8 dwo3fsh8 ow4ym5g4 auili1gw du4w35lb gmql0nx0']")]
        # Default both counters once; a match in one iteration must not be
        # reset by the `else` of a later iteration.
        article['nremarks'] = 0
        article['nshares'] = 0
        for i in lijst:
            if 'opmerkingen' in i:
                article['nremarks'] = fbnumber(i)
            if 'gedeeld' in i:
                article['nshares'] = fbnumber(i)
    except NoSuchElementException:
        logging.debug(f"No remarks by: {e}")
    try:
        share = e.find_element_by_xpath("//div[@class='bp9cbjyn m9osqain j83agx80 jq4qci2q bkfpd7mw a3bd9o3v kvgmc6g5 wkznzc2l oygrvhab dhix69tm jktsbyx5 rz4wbd8a osnr6wyh a8nywdso s1tcr66n']").text
        article["shares"] = fbnumber(share)
    except NoSuchElementException:
        logging.debug(f"No shares by: {e}")
    try:
        link = e.find_element_by_css_selector("._52c6").get_attribute("href")
        article["article_url"] = fburl(link)
    except NoSuchElementException:
        logging.debug(f"No link by: {e}")
    return article
async def analyzecomment(comment: WebElement):
    """Analyze one mbasic Facebook comment for suspicious profile tags.

    Extracts the comment text, id and author id, then checks whether the
    first <a> inside the comment looks like a user-profile tag.  If so
    (and not already warned about), opens the tagged link to verify it is
    a real profile, posts a templated warning reply via the comment's
    answer link, screenshots it, and records a warning in the history
    file.  Relies on module-level ``driver``, ``files`` and ``switchtab``.
    """
    # print(comment.text)
    files.analyzed_comments += 1
    commenttext = comment.find_element_by_xpath("div[1]")
    commenttexttext = str(commenttext.text)
    # The parent node's id attribute is the comment id.
    commentid = str(comment.find_element_by_xpath('..').get_attribute("id"))
    commentauthorid: str = comment.find_element_by_xpath("h3/a").get_attribute(
        "href").replace("https://mbasic.facebook.com/", "")
    if "profile.php" in commentauthorid:
        # Numeric profile url: keep only the id= value.
        commentauthorid = commentauthorid.replace("profile.php?id=",
                                                  "").split("&")[0]
    else:
        # Vanity-url profile: drop any query string.
        commentauthorid = commentauthorid.split("?")[0]
    try:
        tag = commenttext.find_element_by_xpath("a")
    except NoSuchElementException:
        # No link in the comment -> nothing to analyze.
        return
    href = tag.get_attribute("href")
    tagtext = tag.text
    if "Djy No" in tagtext or "Paolo Curetti" in tagtext:
        # (translated) Something really odd with this name keeps making the
        # bot flag it in a loop, so for now just ignore it.
        return
    if "mbasic.facebook.com/" in href and "/groups/" not in href and "/hashtag/" not in href and tagtext not in href:
        if len(commenttexttext) < len(tagtext) + 10:
            temphistory = await files.readhistory()
            try:
                # Skip tags we have already warned this author about.
                if str(commentid) in str(
                        temphistory["warnings"][commentauthorid]):
                    # print("---- Already Seen Tag: \"" + tagtext.replace("\n", "") + "\"\n" + driver.current_url)
                    return
            except KeyError:
                pass
            await switchtab(3)  # (translated) page-verification tab
            driver.get(href)
            # A real profile shows "profile picture" in the avatar alt text;
            # otherwise the tag points at a page and is only logged.
            if "profile picture" not in driver.find_element_by_css_selector(
                    "div#root").find_element_by_xpath(
                        "div/div/div[2]/div/div/div/a/img").get_attribute(
                            "alt"):
                print("---- Page Tag: \"" + tagtext.replace("\n", "") +
                      "\"\n" + driver.current_url)
                await switchtab(1)  # (translated) Facebook post & comments tab
                return
            await switchtab(1)  # (translated) Facebook post & comments tab
            answerlink = ""
            for element in comment.find_elements_by_xpath("div[3]/a"):
                # "répon..." = the French "reply" link.
                if "répon" in element.text.lower():
                    answerlink = element.get_attribute("href")
            await switchtab(2)  # (translated) comment-reply tab
            if answerlink != "":
                driver.get(answerlink)
                with open('messages.json', encoding="utf-8") as messages_json:
                    # NOTE(review): json.load's ``encoding`` kwarg was removed
                    # in Python 3.9 — confirm the runtime version in use.
                    messages = json.load(messages_json, encoding="utf-8")
                # Compose: prefix + random wildtag template + suffix, with
                # "{}" placeholders replaced by the comment id.
                driver.find_element_by_css_selector(
                    "#composerInput").send_keys(
                        (messages["prefix"] +
                         messages["wildtag"][random.randint(
                             0, len(messages["wildtag"]) - 1)] +
                         messages["suffix"]).replace("{}", commentid))
                driver.find_element_by_xpath(
                    "//input[@type='submit'][@value='Répondre']").click()
                driver.get_screenshot_as_file("screenshots/" + commentid +
                                              ".png")
            else:
                print("---- NO ANSWER BUTTON " + driver.current_url)
                return
            print("---- Potential Tag: \"" + tagtext.replace("\n", "") +
                  "\"\n" + driver.current_url)
            warning_content = {
                commentid: {
                    "date": str(date.today()),
                    "comment": commenttexttext,
                    "publication": driver.current_url
                }
            }
            await files.addtohistory("warnings", commentauthorid,
                                     warning_content)
            await files.printstats()
            await switchtab(1)  # (translated) Facebook post & comments tab
def nf_get_all_posts_on_element(element: WebElement) -> List[WebElement]:
    """Return all post links (href starting with "/p/") under *element*.

    Uses the relative ``.//a`` axis: the previous ``//a`` searched the whole
    document regardless of *element* — the classic Selenium XPath pitfall —
    which contradicts this function's "on_element" contract.
    """
    return element.find_elements_by_xpath('.//a[starts-with(@href, "/p/")]')