def save_page(url, title, intro):
    try:
        w = '/wiki/'
        name = url[url.find(w) + len(w):]
        file = open(name + '.txt', 'a', encoding='utf-8')
        file.write(url + "\n")
        file.write(clean(h2t(title)) + ": " + "\n")
        file.write(clean(h2t(intro)))
        file.close()
    except Exception as e:
        print(str(e))
        return "Error"
    return name
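# A minimal usage sketch for save_page. The clean() helper is not shown in
# these snippets, so a hypothetical stand-in is stubbed here; h2t is assumed
# to be html2text.html2text, as in the neighboring snippets.
from html2text import html2text as h2t

def clean(text):
    # hypothetical stand-in: collapse whitespace left over after conversion
    return " ".join(text.split())

page = save_page(
    "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "<h1>Python (programming language)</h1>",
    "<p>Python is a high-level, general-purpose programming language.</p>",
)
print(page)  # the article slug on success, "Error" otherwise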
def parse_reference(self, text, full_chap, title, emb_color):
    # Remove cross references
    for sup in text.find_all("sup", {"class": "crossreference"}):
        sup.decompose()
    # Change headers to markdown
    for h3 in text.find_all("h3"):
        h3.name = "b"
    for h4 in text.find_all("h4"):
        h4.name = "b"
    text = h2t(str(text))
    pages = []
    raw = list(pagify(text, page_length=4000))
    size = len(raw)
    for i, page in enumerate(raw, 1):
        emb = discord.Embed(title=title, description=page, colour=emb_color)
        emb.url = full_chap
        emb.set_footer(text=f"Powered by Biblegateway.com | Page {i}/{size}")
        pages.append(emb)
    return pages
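# parse_reference relies on pagify(), presumably the helper from
# Red-DiscordBot's redbot.core.utils.chat_formatting. A rough stand-in that
# just slices the text into fixed-size chunks (the real helper prefers to
# break on line boundaries) might look like this:
def pagify(text, page_length=2000):
    for start in range(0, len(text), page_length):
        yield text[start:start + page_length]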
def parseContent(self):
    if self.html is not None:
        data = h2t(self.html.decode('UTF-8'))
        self.logger.info("Content Parsed")
        return data
    else:
        return None
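# What parseContent does in isolation: decode the stored response bytes and
# run them through html2text. self.html and self.logger belong to the
# enclosing class, which is not part of the snippet.
from html2text import html2text as h2t

print(h2t(b"<p>hello <b>world</b></p>".decode('UTF-8')))  # "hello **world**"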
def parser_text(self, text, soup=None, cards: bool = True):
    """My bad logic for scraping"""
    if not soup:
        soup = BeautifulSoup(text, features="html.parser")
    final = []
    kwargs = {"stats": h2t(str(soup.find("div", id="result-stats")))}
    if cards:
        get_card(soup, final, kwargs)
    for res in soup.findAll("div", class_="g"):
        if name := res.find("div", class_="yuRUbf"):
            url = name.a["href"]
            if title := name.find("h3", "LC20lb DKV0Md"):
                title = title.text
            else:
                title = url
        else:
            url = None
            title = None
        if desc := res.find("div", class_="IsZvec"):
            if remove := desc.find("span", class_="f"):
                remove.decompose()
            if final_desc := desc.find_all("span"):
                desc = h2t(str(final_desc[-1]))[:500]
            else:
                desc = "Nothing found"
        else:
            desc = "Not found"
        if title:
            final.append(s(url, title, desc.replace("\n", " ")))
    return final, kwargs
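# Hedged usage sketch for parser_text, assuming these snippets share a module
# so that h2t and the s(...) result container are in scope. The Google CSS
# classes (g, yuRUbf, LC20lb DKV0Md, IsZvec) are obfuscated and rotate over
# time, so a freshly fetched page may not match them.
from bs4 import BeautifulSoup

class Scraper:
    parser_text = parser_text  # reuse the function above as a method

with open("google_results.html", encoding="utf-8") as f:  # hypothetical saved page
    results, kwargs = Scraper().parser_text(f.read(), cards=False)
for result in results:
    print(result)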
def html2text(s):
    s = re.compile('</*en-media[^>]*?>').sub('', s)
    return h2t(s)
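# Example, assuming the wrapper above is in scope as html2text: the regex
# strips Evernote <en-media> tags, which html2text would otherwise pass
# through as noise, before converting.
import re
from html2text import html2text as h2t

note = '<p>see attachment <en-media hash="abc123" type="image/png"/> here</p>'
print(html2text(note))  # en-media tag removed, leaving "see attachment here"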
def get_card(soup, final, kwargs):
    """Getting cards if present, here started the pain"""
    # common card
    if card := soup.find("div", class_="g mnr-c g-blk"):
        if desc := card.find("span", class_="hgKElc"):
            final.append(s(None, "Google Info Card:", h2t(str(desc))))
            return
    # another webpull card: what is the language JetBrains made?
    # TODO fix this, depends on too many classes as of now
    if card := soup.find("div", class_="kp-blk c2xzTb"):
        if head := card.find("div", class_="Z0LcW XcVN5d AZCkJd"):
            if desc := card.find("div", class_="iKJnec"):
                final.append(s(None, f"Answer: {head.text}", h2t(str(desc))))
                return
    # calculator card
    if card := soup.find("div", class_="tyYmIf"):
        if question := card.find("span", class_="vUGUtc"):
            if answer := card.find("span", class_="qv3Wpe"):
                tmp = h2t(str(question)).strip("\n")
                final.append(
                    s(None, "Google Calculator:", f"**{tmp}** {h2t(str(answer))}"))
                return
    # sidepage card
    if card := soup.find("div", class_="liYKde g VjDLd"):
        if thumbnail := card.find("g-img", attrs={"data-lpage": True}):
            ...
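# get_card (and parser_text above) build results with s(url, title, desc),
# which is never defined in these snippets. A plausible stand-in is a simple
# namedtuple record:
from collections import namedtuple

s = namedtuple("SearchResult", ["url", "title", "desc"])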
#! python3
from html2text import html2text as h2t
import pyperclip

pyperclip.copy(h2t(pyperclip.paste()))
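# The same pipeline as a guarded function: pyperclip.paste() normally returns
# an empty string when the clipboard holds no text, so skip the conversion in
# that case (a small hardening on top of the original one-liner).
def convert_clipboard():
    raw = pyperclip.paste()
    if raw:
        pyperclip.copy(h2t(raw))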
def html2text(value):
    """Convert HTML text to plaintext"""
    return h2t(value)
def html2text(text):
    return h2t(text).replace('#', '').replace('**', '').replace('__', '')
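# Quick check of what the replacements strip, assuming the wrapper above is
# in scope as html2text: h2t renders headings as "# ..." and bold/emphasis
# with **/__ markers, and the wrapper deletes those markers (stray spaces and
# blank lines from h2t remain).
from html2text import html2text as h2t

print(html2text("<h1>Title</h1><p><b>bold</b> text</p>"))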
def raw_text(self, html_field):
    # Produce practically raw text from HTML: strip the markdown-style
    # markers and line breaks that the h2t conversion leaves behind.
    return sub('[\t\n\r*#]+', ' ', h2t(html_field))
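# raw_text assumes `from re import sub`. Roughly, markdown markers and line
# breaks left by h2t collapse into single spaces:
from re import sub
from html2text import html2text as h2t

print(sub('[\t\n\r*#]+', ' ', h2t('<h2>News</h2><p><b>Breaking</b> story</p>')))
# -> roughly " News Breaking story "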
def parser_image(self, html):
    # first 2 are google static logo images
    return self.link_regex.findall(html)[2:], {}
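# parser_image depends on self.link_regex, which never appears in these
# snippets. A hypothetical pattern that pulls image URLs out of the raw HTML
# (the [2:] slice then drops Google's two static logo images) might be:
import re

link_regex = re.compile(r"https?://[^\s\"'<>]+\.(?:png|jpe?g|gif)")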