import re

import html2text

# `escape_md_section_override` and `replace_link_img_matches` are helper
# functions defined elsewhere in this module.


def strip_html_from_markdown(org_html, keep_empty_lines=False):
    """
    Take an `org_html` string and return a Markdown string.

    Empty lines are removed from the result, unless `keep_empty_lines`
    is set to `True`.

    >>> strip_html_from_markdown(u"<p>this <strong>was</strong> a <em>triumph</em>!</p>")
    u'this **was** a _triumph_!\\n'
    """
    if not org_html:
        return u""
    # assert isinstance(org_html, unicode), "Input `org_html` is not Unicode"

    # disable the escaping of Markdown-sensitive characters
    html2text.escape_md_section = escape_md_section_override
    h2t = html2text.HTML2Text()
    h2t.body_width = 0  # do not hard-wrap the output
    md_text = h2t.handle(org_html)

    clean_md = u""
    if keep_empty_lines:
        clean_md = md_text
    else:
        # remove blank lines
        for line in md_text.split(u"\n"):
            if line:
                clean_md += (line + u"\n")

    # undo the html2text escaping of dots (which interferes with the
    # creation of ordered lists)
    dot_regex = re.compile(ur"(\d+)\\(\.\s)")
    clean_md = re.sub(dot_regex, ur"\g<1>\g<2>", clean_md)

    # this is needed to keep inner parentheses and whitespace in links and
    # images from prematurely ending the Markdown syntax; e.g. the target
    # in ![](image (1).jpg) would otherwise break on the whitespace in the
    # filename and on the inner parentheses, so the escaping and whitespace
    # inside link and image targets are normalized to prevent this
    left_paren_regex = re.compile(ur"\\\(")
    clean_md = replace_link_img_matches(left_paren_regex, u"(", clean_md)
    right_paren_regex = re.compile(ur"\\\)")
    clean_md = replace_link_img_matches(right_paren_regex, u")", clean_md)
    whitespace_regex = re.compile(ur"\s+")
    clean_md = replace_link_img_matches(whitespace_regex, u" ", clean_md)
    # assert isinstance(clean_md, unicode)

    return clean_md
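# Usage sketch (hypothetical driver; it assumes html2text is installed and
# that the helpers referenced above are defined in this module). Expected
# output is indicated in the comments.
if __name__ == "__main__":
    sample = u"<p>one</p><p>two</p>"
    print(strip_html_from_markdown(sample))
    # -> u"one\ntwo\n" (the blank line between paragraphs is removed)
    print(strip_html_from_markdown(sample, keep_empty_lines=True))
    # -> keeps the blank line that html2text puts between paragraphs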
from django.conf import settings


def get_html2text_converter():
    try:
        import html2text
    except ImportError:
        raise Exception(
            "HTML2Text is required for sending an EmailNotification "
            "with auto HTML to text conversion."
        )

    h = html2text.HTML2Text()
    if hasattr(settings, 'HERALD_HTML2TEXT_CONFIG'):
        # apply every configured key/value pair as an attribute on the
        # html2text.HTML2Text instance
        for k, v in settings.HERALD_HTML2TEXT_CONFIG.items():
            setattr(h, k, v)
    return h
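# Usage sketch: HERALD_HTML2TEXT_CONFIG is a plain dict of html2text
# attribute overrides. The option names below are real html2text settings;
# the values are illustrative. In settings.py:
#
#     HERALD_HTML2TEXT_CONFIG = {
#         'ignore_images': True,  # drop <img> tags from the text version
#         'body_width': 0,        # 0 disables hard line wrapping
#     }
#
# converter = get_html2text_converter()
# plain_text = converter.handle(u"<p>Hello <strong>world</strong></p>")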
import sys
import hashlib
from datetime import datetime

import feedparser
import html2text as h2t
import MySQLdb as mdb

# `dsc` (the Diaspora* pod client) and `remove_img_tags` are provided
# elsewhere in this project.


def activBotRSS():
    try:
        # credentials are redacted in this listing
        con = mdb.connect('localhost', '**!**', '**!**', '**!**')
    except Exception:
        print "Error: connect db"
        return False

    with con:
        cur = con.cursor(mdb.cursors.DictCursor)
        cur.execute("SELECT * FROM sites WHERE status='1' AND feed_type='rss'")
        rows = cur.fetchall()
        for row in rows:
            try:
                feed = feedparser.parse(row["feed_url"])
            except Exception:
                print "Error: loading feeds"
                continue  # skip this site and move on to the next one
            print "url=%s amnt=%s \n" % (row["feed_url"], len(feed['entries']))

            try:
                c = dsc.Client(row["pod_url"], row["usrnm"], row["pswrd"])
            except Exception as inst:
                print "Error: connect pod ", sys.exc_info()[0]
                print inst
                continue  # skip this site and move on to the next one

            # collect the feed entries into a uniform structure
            entries = []
            for pFone in feed['entries']:
                strTgs = ""
                dataP = [{
                    "tags": "",
                    "published": "",
                    "author": "",
                    "summary": "",
                    "link": "",
                    "title": "",
                    "dt": ""
                }]
                if hasattr(pFone, 'published'):
                    dataP[0]["published"] = pFone.published
                if hasattr(pFone, 'author'):
                    dataP[0]["author"] = pFone.author
                if hasattr(pFone, 'tags'):
                    strTgs = "\n_ _ _ \n"
                    for tag in pFone.tags:
                        try:
                            tgX = tag.term.replace(" ", "-")
                            strTgs += " #" + tgX
                        except Exception:
                            pass  # just skip tags without a usable term
                    dataP[0]["tags"] = strTgs
                if hasattr(pFone, 'link'):
                    dataP[0]["link"] = pFone.link
                if hasattr(pFone, 'summary'):
                    dataP[0]["summary"] = pFone.summary
                dataP[0]["title"] = pFone.title
                try:
                    dataP[0]["dt"] = datetime.strptime(
                        str(dataP[0]["published"]), "%Y-%m-%d %H:%M:%S")
                except Exception:
                    dataP[0]["dt"] = datetime.now()
                entries.extend(dataP)

            #sorted_entries = sorted(entries, key=lambda entry: entry["published"])
            sorted_entries = sorted(entries, key=lambda entry: entry["dt"])
            sorted_entries.reverse()  # most recent entries first

            insertNewFeedAmnt = 0
            for post in sorted_entries:
                try:
                    oneURLcheck = hashlib.sha256(
                        post["link"].encode('cp866')).hexdigest()
                except Exception:
                    print "Error: hash URL link"
                    continue  # skip this entry and move on to the next one

                curPost = con.cursor(mdb.cursors.DictCursor)
                curPost.execute(
                    "SELECT COUNT(*) as amnt FROM feeds WHERE hash=%s",
                    (oneURLcheck,))
                #curPost.execute("SELECT COUNT(*) as amnt FROM feeds WHERE idst=%s AND hash=%s", (row["id"], oneURLcheck))
                rowsPost = curPost.fetchall()
                for rowPost in rowsPost:
                    if rowPost["amnt"] == 0:
                        # the entry has not been posted yet: record it,
                        # render it to Markdown and push it to the pod
                        curPostEx = con.cursor(mdb.cursors.DictCursor)
                        curPostEx.execute(
                            "INSERT INTO feeds (`hash`, `idusr`, `idst`) VALUES (%s, %s, %s)",
                            (oneURLcheck, row["idusr"], row["id"]))
                        #curPostEx.execute("INSERT INTO feeds (`hash`, `idusr`) VALUES (%s, %s)", (oneURLcheck, row["idusr"]))
                        insertNewFeedAmnt += 1

                        h = h2t.HTML2Text()
                        retTxt = h.handle(remove_img_tags(post["summary"]))
                        strTgs = post["tags"]
                        if post["author"] != "":
                            authPost = post["author"]
                        else:
                            authPost = "none"
                        strAuthDate = "*" + post["published"] + ", by " + authPost + "*" + "\n"
                        if post["link"] != "":
                            if row["view_url"] == 1:
                                postD = ("### [" + post["title"] + "](" + post["link"] + ") \n"
                                         + strAuthDate + "\n" + retTxt + "\n" + strTgs)
                            else:
                                postD = ("### " + post["title"] + "\n"
                                         + strAuthDate + "\n" + retTxt + "\n" + strTgs)
                        else:
                            postD = ("### " + post["title"] + " \n"
                                     + strAuthDate + "\n" + retTxt + "\n" + strTgs)

                        if row["string_footer"] != "":
                            # strip non-ASCII characters from the footer
                            dataUTF8 = row["string_footer"]
                            udata = dataUTF8.decode("utf-8", "ignore")
                            asciidata = udata.encode("ascii", "ignore")
                            postD += "\n\n %s" % (asciidata)
                        #print postD + "\n"

                        try:
                            c.post(postD)
                        except Exception:
                            print "Error: post message"
                            continue  # skip this entry and move on to the next one
                        curPostEx.execute(
                            "UPDATE feeds SET addfeed = %s WHERE hash = %s AND idusr = %s",
                            ("1", oneURLcheck, row["idusr"]))

            if insertNewFeedAmnt != 0:
                print "....add feeds = %s" % (insertNewFeedAmnt)

    return True
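# Standalone sketch (hypothetical helper, not part of the original bot) of
# the dedupe scheme used above: each entry link is hashed with SHA-256 and
# looked up in the `feeds` table, so an item is only posted once per user.
def already_posted(con, link, idusr):
    digest = hashlib.sha256(link.encode('cp866')).hexdigest()
    cur = con.cursor()
    cur.execute("SELECT COUNT(*) FROM feeds WHERE hash=%s AND idusr=%s",
                (digest, idusr))
    return cur.fetchone()[0] > 0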
import requests
import json
import feedparser
from time import mktime

import html2text

text_maker = html2text.HTML2Text()
text_maker.ignore_images = True
text_maker.ignore_tables = True
text_maker.ignore_links = True


def clean(content):
    # convert HTML to plain text, then join the soft-wrapped lines inside
    # each paragraph while preserving the blank lines between paragraphs
    content = text_maker.handle(content)
    return content.replace('\n\n', ':::').replace('\n', ' ').replace(':::', '\n\n').strip()


def fetch_articles(url, source='dailycal'):
    d = feedparser.parse(url)
    articles = []
    for entry in d.entries:
        if source == 'dailycal':
            summary = clean(entry['summary']).replace(
                'Read More…', '').replace('The Daily Californian', '').strip()
            # drop the first line of the cleaned summary
            summary = ' '.join(summary.split('\n')[1:]).strip()
        else:
            summary = clean(entry['summary']).replace(
                'Read More…', '').replace('The Daily Californian', '').strip()
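# Example of what clean() does (the HTML input is illustrative): html2text
# separates paragraphs with blank lines and soft-wraps long lines; clean()
# joins the wrapped lines while keeping the paragraph breaks.
if __name__ == '__main__':
    sample = '<p>first paragraph</p><p>second paragraph</p>'
    print(clean(sample))  # -> "first paragraph\n\nsecond paragraph"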