Пример #1
0
def save_page(url, title, intro):
    """Append a wiki page's URL, title and intro (as plain text) to '<name>.txt'.

    The file name is the URL path segment after '/wiki/'.
    Returns the derived name on success, or the string "Error" on failure.
    """
    try:
        w = '/wiki/'
        name = url[url.find(w) + len(w):]
        # 'with' guarantees the handle is closed even if a write raises,
        # unlike the manual open()/close() pair this replaces.
        with open(name + '.txt', 'a', encoding='utf-8') as file:
            file.write(url + "\n")
            file.write(clean(h2t(title)) + ": " + "\n")
            file.write(clean(h2t(intro)))
    except Exception as e:
        # Best-effort: report the problem and signal failure to the caller
        # instead of propagating.
        print(str(e))
        return "Error"
    return name
Пример #2
0
def get_card(soup, final, kwargs):
    """Extract a Google info card from the result soup, if one is present."""
    # Common info card: a description span inside the card container.
    card = soup.find("div", class_="g mnr-c g-blk")
    if card is not None:
        desc = card.find("span", class_="hgKElc")
        if desc is not None:
            final.append(s(None, "Google Info Card:", h2t(str(desc))))
            return
Пример #3
0
    def parse_reference(self, text, full_chap, title, emb_color):
        """Turn a passage's HTML into a list of embeds, 4000 chars per page."""
        # Strip cross-reference superscripts entirely.
        for sup in text.find_all("sup", {"class": "crossreference"}):
            sup.decompose()

        # Downgrade h3/h4 headers to bold so the markdown keeps them inline.
        for header in text.find_all(["h3", "h4"]):
            header.name = "b"

        plain = h2t(str(text))
        chunks = list(pagify(plain, page_length=4000))
        total = len(chunks)

        embeds = []
        for number, chunk in enumerate(chunks, 1):
            embed = discord.Embed(title=title,
                                  description=chunk,
                                  colour=emb_color)
            embed.url = full_chap
            embed.set_footer(
                text=f"Powered by Biblegateway.com | Page {number}/{total}")
            embeds.append(embed)
        return embeds
Пример #4
0
 def parseContent(self):
     """Return the text extracted from self.html, or None when it is unset."""
     if self.html is None:
         return None
     # Decode the raw bytes before handing off to the html->text converter.
     data = h2t(self.html.decode('UTF-8'))
     self.logger.info("Content Parsed")
     return data
Пример #5
0
    def parser_text(self, text, soup=None, cards: bool = True):
        """My bad logic for scraping"""
        if not soup:
            soup = BeautifulSoup(text, features="html.parser")

        final = []
        kwargs = {"stats": h2t(str(soup.find("div", id="result-stats")))}

        if cards:
            get_card(soup, final, kwargs)

        for res in soup.findAll("div", class_="g"):
            if name := res.find("div", class_="yuRUbf"):
                url = name.a["href"]
                if title := name.find("h3", "LC20lb DKV0Md"):
                    title = title.text
                else:
                    title = url
Пример #6
0
def html2text(s):
    """Convert HTML to plain text, dropping Evernote en-media tags first."""
    return h2t(re.sub('</*en-media[^>]*?>', '', s))
Пример #7
0
    return query


def get_card(soup, final, kwargs):
    """Getting cards if present, here started the pain"""
    # common card
    if card := soup.find("div", class_="g mnr-c g-blk"):
        if desc := card.find("span", class_="hgKElc"):
            final.append(s(None, "Google Info Card:", h2t(str(desc))))
            return
    # another webpull card: what is the language JetBrains made? TODO fix this, depends on too many classes as of now
    if card := soup.find("div", class_="kp-blk c2xzTb"):
        if head := card.find("div", class_="Z0LcW XcVN5d AZCkJd"):
            if desc := card.find("div", class_="iKJnec"):
                final.append(s(None, f"Answer: {head.text}", h2t(str(desc))))
                return

    # calculator card
    if card := soup.find("div", class_="tyYmIf"):
        if question := card.find("span", class_="vUGUtc"):
            if answer := card.find("span", class_="qv3Wpe"):
                tmp = h2t(str(question)).strip("\n")
                final.append(
                    s(None, "Google Calculator:",
                      f"**{tmp}** {h2t(str(answer))}"))
                return

    # sidepage card
    if card := soup.find("div", class_="liYKde g VjDLd"):
        if thumbnail := card.find("g-img", attrs={"data-lpage": True}):
Пример #8
0
#! python3

from html2text import html2text as h2t
import pyperclip

# Read the clipboard, convert its HTML to plain text, and put it back.
clipboard_html = pyperclip.paste()
pyperclip.copy(h2t(clipboard_html))
Пример #9
0
def html2text(value):
    """Strip HTML markup from *value*, returning plain text."""
    plain = h2t(value)
    return plain
Пример #10
0
def html2text(s):
    """Convert HTML to plain text after removing Evernote en-media tags."""
    media_tag = re.compile('</*en-media[^>]*?>')
    stripped = media_tag.sub('', s)
    return h2t(stripped)
Пример #11
0
def html2text(text):
    """Convert HTML to text and drop leftover markdown markers."""
    plain = h2t(text)
    # Same removal order as sequential .replace() calls: '#', '**', '__'.
    for marker in ('#', '**', '__'):
        plain = plain.replace(marker, '')
    return plain

    
Пример #12
0
 def raw_text(self, html_field):
     """Make practically raw text out of an HTML field."""
     converted = h2t(html_field)
     # Collapse whitespace runs and the '*'/'#' markers h2t leaves behind.
     return sub('[\t\n\r*#]+', ' ', converted)
Пример #13
0
        if cards:
            get_card(soup, final, kwargs)

        for res in soup.findAll("div", class_="g"):
            if name := res.find("div", class_="yuRUbf"):
                url = name.a["href"]
                if title := name.find("h3", "LC20lb DKV0Md"):
                    title = title.text
                else:
                    title = url
            else:
                url = None
                title = None
            if desc := res.find("div", class_="IsZvec"):
                if remove := desc.find("span", class_="f"):
                    remove.decompose()
                if final_desc := desc.find_all("span"):
                    desc = h2t(str(final_desc[-1]))[:500]
                else:
                    desc = "Nothing found"
            else:
                desc = "Not found"
            if title:
                final.append(s(url, title, desc.replace("\n", " ")))
        return final, kwargs

    def parser_image(self, html):
        """Return image links found in *html*; the first two matches are
        Google's static logo images and are skipped."""
        links = self.link_regex.findall(html)
        return links[2:], {}