import re

import html2text

# `escape_md_section_override` and `replace_link_img_matches` are helper
# functions defined elsewhere in this module.


def strip_html_from_markdown(org_html, keep_empty_lines=False):
    """
    Take an `org_html` string and return a Markdown string.

    Empty lines are removed from the result, unless `keep_empty_lines`
    is set to `True`.

    >>> strip_html_from_markdown(u"<p>this <strong>was</strong> a <em>triumph</em>!</p>")
    u'this **was** a _triumph_!\\n'
    """
    if not org_html:
        return u""
    # assert isinstance(org_html, unicode), "Input `org_html` is not Unicode"

    # disable the escaping of Markdown-sensitive characters
    html2text.escape_md_section = escape_md_section_override
    h2t = html2text.HTML2Text()
    h2t.body_width = 0  # do not hard-wrap the output
    md_text = h2t.handle(org_html)

    clean_md = u""
    if keep_empty_lines:
        clean_md = md_text
    else:
        # remove blank lines
        for line in md_text.split(u"\n"):
            if line:
                clean_md += (line + u"\n")

    # undo the html2text escaping of dots (which interferes with the
    # creation of ordered lists)
    dot_regex = re.compile(ur"(\d+)\\(\.\s)")
    clean_md = re.sub(dot_regex, ur"\g<1>\g<2>", clean_md)

    # this is needed to keep inner parentheses and whitespace in links and
    # images from prematurely ending the Markdown syntax; e.g. the target
    # in ![](image (1).jpg) would otherwise break on the whitespace in the
    # filename and on the inner parentheses, so the escaping and whitespace
    # inside link and image targets are normalized to prevent this
    left_paren_regex = re.compile(ur"\\\(")
    clean_md = replace_link_img_matches(left_paren_regex, u"(", clean_md)
    right_paren_regex = re.compile(ur"\\\)")
    clean_md = replace_link_img_matches(right_paren_regex, u")", clean_md)
    whitespace_regex = re.compile(ur"\s+")
    clean_md = replace_link_img_matches(whitespace_regex, u" ", clean_md)
    # assert isinstance(clean_md, unicode)

    return clean_md
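# Usage sketch (hypothetical driver; it assumes html2text is installed and
# that the helpers referenced above are defined in this module). Expected
# output is indicated in the comments.
if __name__ == "__main__":
    sample = u"<p>one</p><p>two</p>"
    print(strip_html_from_markdown(sample))
    # -> u"one\ntwo\n" (the blank line between paragraphs is removed)
    print(strip_html_from_markdown(sample, keep_empty_lines=True))
    # -> keeps the blank line that html2text puts between paragraphs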
from django.conf import settings


def get_html2text_converter():
    try:
        import html2text
    except ImportError:
        raise Exception(
            "HTML2Text is required for sending an EmailNotification "
            "with auto HTML to text conversion."
        )

    h = html2text.HTML2Text()
    if hasattr(settings, 'HERALD_HTML2TEXT_CONFIG'):
        # apply every configured key/value pair as an attribute on the
        # html2text.HTML2Text instance
        for k, v in settings.HERALD_HTML2TEXT_CONFIG.items():
            setattr(h, k, v)
    return h
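# Usage sketch: HERALD_HTML2TEXT_CONFIG is a plain dict of html2text
# attribute overrides. The option names below are real html2text settings;
# the values are illustrative. In settings.py:
#
#     HERALD_HTML2TEXT_CONFIG = {
#         'ignore_images': True,  # drop <img> tags from the text version
#         'body_width': 0,        # 0 disables hard line wrapping
#     }
#
# converter = get_html2text_converter()
# plain_text = converter.handle(u"<p>Hello <strong>world</strong></p>")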
import sys
import hashlib
from datetime import datetime

import feedparser
import html2text as h2t
import MySQLdb as mdb

# `dsc` (the Diaspora* pod client) and `remove_img_tags` are provided
# elsewhere in this project.


def activBotRSS():
    try:
        # credentials are redacted in this listing
        con = mdb.connect('localhost', '**!**', '**!**', '**!**')
    except Exception:
        print "Error: connect db"
        return False

    with con:
        cur = con.cursor(mdb.cursors.DictCursor)
        cur.execute("SELECT * FROM sites WHERE status='1' AND feed_type='rss'")
        rows = cur.fetchall()
        for row in rows:
            try:
                feed = feedparser.parse(row["feed_url"])
            except Exception:
                print "Error: loading feeds"
                continue  # skip this site and move on to the next one
            print "url=%s amnt=%s \n" % (row["feed_url"], len(feed['entries']))

            try:
                c = dsc.Client(row["pod_url"], row["usrnm"], row["pswrd"])
            except Exception as inst:
                print "Error: connect pod ", sys.exc_info()[0]
                print inst
                continue  # skip this site and move on to the next one

            # collect the feed entries into a uniform structure
            entries = []
            for pFone in feed['entries']:
                strTgs = ""
                dataP = [{
                    "tags": "",
                    "published": "",
                    "author": "",
                    "summary": "",
                    "link": "",
                    "title": "",
                    "dt": ""
                }]
                if hasattr(pFone, 'published'):
                    dataP[0]["published"] = pFone.published
                if hasattr(pFone, 'author'):
                    dataP[0]["author"] = pFone.author
                if hasattr(pFone, 'tags'):
                    strTgs = "\n_ _ _ \n"
                    for tag in pFone.tags:
                        try:
                            tgX = tag.term.replace(" ", "-")
                            strTgs += " #" + tgX
                        except Exception:
                            pass  # just skip tags without a usable term
                    dataP[0]["tags"] = strTgs
                if hasattr(pFone, 'link'):
                    dataP[0]["link"] = pFone.link
                if hasattr(pFone, 'summary'):
                    dataP[0]["summary"] = pFone.summary
                dataP[0]["title"] = pFone.title
                try:
                    dataP[0]["dt"] = datetime.strptime(
                        str(dataP[0]["published"]), "%Y-%m-%d %H:%M:%S")
                except Exception:
                    dataP[0]["dt"] = datetime.now()
                entries.extend(dataP)

            #sorted_entries = sorted(entries, key=lambda entry: entry["published"])
            sorted_entries = sorted(entries, key=lambda entry: entry["dt"])
            sorted_entries.reverse()  # most recent entries first

            insertNewFeedAmnt = 0
            for post in sorted_entries:
                try:
                    oneURLcheck = hashlib.sha256(
                        post["link"].encode('cp866')).hexdigest()
                except Exception:
                    print "Error: hash URL link"
                    continue  # skip this entry and move on to the next one

                curPost = con.cursor(mdb.cursors.DictCursor)
                curPost.execute(
                    "SELECT COUNT(*) as amnt FROM feeds WHERE hash=%s",
                    (oneURLcheck,))
                #curPost.execute("SELECT COUNT(*) as amnt FROM feeds WHERE idst=%s AND hash=%s", (row["id"], oneURLcheck))
                rowsPost = curPost.fetchall()
                for rowPost in rowsPost:
                    if rowPost["amnt"] == 0:
                        # the entry has not been posted yet: record it,
                        # render it to Markdown and push it to the pod
                        curPostEx = con.cursor(mdb.cursors.DictCursor)
                        curPostEx.execute(
                            "INSERT INTO feeds (`hash`, `idusr`, `idst`) VALUES (%s, %s, %s)",
                            (oneURLcheck, row["idusr"], row["id"]))
                        #curPostEx.execute("INSERT INTO feeds (`hash`, `idusr`) VALUES (%s, %s)", (oneURLcheck, row["idusr"]))
                        insertNewFeedAmnt += 1

                        h = h2t.HTML2Text()
                        retTxt = h.handle(remove_img_tags(post["summary"]))
                        strTgs = post["tags"]
                        if post["author"] != "":
                            authPost = post["author"]
                        else:
                            authPost = "none"
                        strAuthDate = "*" + post["published"] + ", by " + authPost + "*" + "\n"
                        if post["link"] != "":
                            if row["view_url"] == 1:
                                postD = ("### [" + post["title"] + "](" + post["link"] + ") \n"
                                         + strAuthDate + "\n" + retTxt + "\n" + strTgs)
                            else:
                                postD = ("### " + post["title"] + "\n"
                                         + strAuthDate + "\n" + retTxt + "\n" + strTgs)
                        else:
                            postD = ("### " + post["title"] + " \n"
                                     + strAuthDate + "\n" + retTxt + "\n" + strTgs)

                        if row["string_footer"] != "":
                            # strip non-ASCII characters from the footer
                            dataUTF8 = row["string_footer"]
                            udata = dataUTF8.decode("utf-8", "ignore")
                            asciidata = udata.encode("ascii", "ignore")
                            postD += "\n\n %s" % (asciidata)
                        #print postD + "\n"

                        try:
                            c.post(postD)
                        except Exception:
                            print "Error: post message"
                            continue  # skip this entry and move on to the next one
                        curPostEx.execute(
                            "UPDATE feeds SET addfeed = %s WHERE hash = %s AND idusr = %s",
                            ("1", oneURLcheck, row["idusr"]))

            if insertNewFeedAmnt != 0:
                print "....add feeds = %s" % (insertNewFeedAmnt)

    return True
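# Standalone sketch (hypothetical helper, not part of the original bot) of
# the dedupe scheme used above: each entry link is hashed with SHA-256 and
# looked up in the `feeds` table, so an item is only posted once per user.
def already_posted(con, link, idusr):
    digest = hashlib.sha256(link.encode('cp866')).hexdigest()
    cur = con.cursor()
    cur.execute("SELECT COUNT(*) FROM feeds WHERE hash=%s AND idusr=%s",
                (digest, idusr))
    return cur.fetchone()[0] > 0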
import requests
import json
import feedparser
from time import mktime

import html2text

text_maker = html2text.HTML2Text()
text_maker.ignore_images = True
text_maker.ignore_tables = True
text_maker.ignore_links = True


def clean(content):
    # convert HTML to plain text, then join the soft-wrapped lines inside
    # each paragraph while preserving the blank lines between paragraphs
    content = text_maker.handle(content)
    return content.replace('\n\n', ':::').replace('\n', ' ').replace(':::', '\n\n').strip()


def fetch_articles(url, source='dailycal'):
    d = feedparser.parse(url)
    articles = []
    for entry in d.entries:
        if source == 'dailycal':
            summary = clean(entry['summary']).replace(
                'Read More…', '').replace('The Daily Californian', '').strip()
            # drop the first line of the cleaned summary
            summary = ' '.join(summary.split('\n')[1:]).strip()
        else:
            summary = clean(entry['summary']).replace(
                'Read More…', '').replace('The Daily Californian', '').strip()
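# Example of what clean() does (the HTML input is illustrative): html2text
# separates paragraphs with blank lines and soft-wraps long lines; clean()
# joins the wrapped lines while keeping the paragraph breaks.
if __name__ == '__main__':
    sample = '<p>first paragraph</p><p>second paragraph</p>'
    print(clean(sample))  # -> "first paragraph\n\nsecond paragraph"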