def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.cssfilter import legacy_s3_url

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    if images:
        [img_swap(image) for image in images]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def wikimarkdown(text):
    from r2.lib.cssfilter import legacy_s3_url

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    target = None
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI, enable_toc=True)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text)
    images = soup.findAll('img')
    if images:
        [img_swap(image) for image in images]

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def safemarkdown(text, nofollow=False, target=None, lang=None, wrap=True):
    from r2.lib.c_markdown import c_markdown
    from r2.lib.py_markdown import py_markdown

    if c.user.pref_no_profanity:
        text = profanity_filter(text)

    if not text:
        return None

    if c.cname and not target:
        target = "_top"

    if lang is None:
        lang = g.markdown_backend

    if lang == "snudown":
        text = snudown.markdown(_force_utf8(text), nofollow, target)
    elif lang == "c":
        text = c_markdown(text, nofollow, target)
    elif lang == "py":
        text = py_markdown(text, nofollow, target)
    else:
        raise ValueError("weird lang [%s]" % lang)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return text
def normalize_markdown_text(parser, source):
    rendered = markdown(unicode(source).encode('utf-8'))
    html_body = ' '.join(rendered.splitlines())
    soup = BeautifulSoup(html_body)
    text = ' '.join(soup.findAll(text=True))
    text = parser.unescape(text)
    return unicode(' '.join(text.splitlines()).replace(',', ' ')).encode('utf-8')
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    from r2.lib.utils import generate_affiliate_link, domain

    if not text:
        return None

    target = kwargs.get("target", None)
    text = snudown.markdown(_force_utf8(text), nofollow, target)

    to_affiliate = kwargs.get("affiliate", False)
    if to_affiliate:
        soup = BeautifulSoup(text.decode('utf-8'))
        links = soup.findAll('a')
        update_text = False

        def detect_affiliate(markdown_link):
            return domain(markdown_link.get('href')) \
                in g.merchant_affiliate_domains

        for link in filter(detect_affiliate, links):
            update_text = True
            link['class'] = 'affiliate'
            link['data-href-url'] = link.get('href')
            link['data-affiliate-url'] = generate_affiliate_link(
                link.get('href')
            )

        if update_text:
            text = str(soup)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
def hello():
    messages = rds.zrevrangebyscore('goygoy', '+inf', '-inf')
    msgs = []
    for i in messages:
        msg = json.loads(i)
        msgs.append(dict(
            msg=_force_unicode(snudown.markdown(_force_utf8(msg['msg']))),
            username='******'
        ))
    return render_template('index.html', messages=msgs)
def strip_markdown(text):
    """Extract text from a markdown string."""
    html = markdown(text.encode('utf-8'))
    soup = BeautifulSoup(
        html,
        "html.parser",
        from_encoding='utf8'
    )
    return "".join(soup.findAll(text=True))
def extract_urls_from_markdown(md):
    "Extract URLs that will be hot links from a piece of raw Markdown."
    html = snudown.markdown(_force_utf8(md))
    links = SoupStrainer("a")
    for link in BeautifulSoup(html, parseOnlyThese=links):
        url = link.get('href')
        if url:
            yield url
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    if not text:
        return None

    target = kwargs.get("target", None)
    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
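# The SC_OFF/SC_ON and MD_START/MD_END constants used by the safemarkdown()
# variants above are defined elsewhere in reddit's r2 codebase; the values
# below are an assumption given for illustration only, showing the shape of
# the wrapped output: comment markers around a "md" div containing the
# rendered HTML.
SC_OFF = "<!-- SC_OFF -->"     # assumed value
SC_ON = "<!-- SC_ON -->"       # assumed value
MD_START = '<div class="md">'  # assumed value
MD_END = '</div>'              # assumed value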
def runTest(self):
    output = snudown.markdown(self.input)
    for i, (a, b) in enumerate(zip(repr(self.expected_output), repr(output))):
        if a != b:
            io = StringIO.StringIO()
            print >> io, "TEST FAILED:"
            print >> io, " input: %s" % repr(self.input)
            print >> io, " expected: %s" % repr(self.expected_output)
            print >> io, " actual: %s" % repr(output)
            print >> io, " %s" % (" " * i + "^")
            self.fail(io.getvalue())
def safemarkdown(text, nofollow=False, target=None, wrap=True):
    if not text:
        return None

    if c.cname and not target:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return text
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.template_helpers import make_url_protocol_relative

    # this hard codes the stylesheet page for now, but should be parameterized
    # in the future to allow per-page images.
    from r2.models.wiki import ImagesByWikiPage
    from r2.lib.utils import UrlParser
    from r2.lib.template_helpers import add_sr
    page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet")

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and name in page_images:
            url = page_images[name]
            url = make_url_protocol_relative(url)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    if images:
        [img_swap(image) for image in images]

    def add_ext_to_link(link):
        url = UrlParser(link.get('href'))
        if url.is_reddit_url():
            link['href'] = add_sr(link.get('href'), sr_path=False)

    if c.render_style == 'compact':
        links = soup.findAll('a')
        [add_ext_to_link(a) for a in links]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def safemarkdown(text, nofollow=False, wrap=True, **kwargs):
    if not text:
        return None

    # this lets us skip the c.cname lookup (which is apparently quite
    # slow) if target was explicitly passed to this function.
    target = kwargs.get("target", None)
    if "target" not in kwargs and c.cname:
        target = "_top"

    text = snudown.markdown(_force_utf8(text), nofollow, target)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
def runTest(self):
    output = snudown.markdown(self.input)
    for i, (a, b) in enumerate(zip(repr(self.expected_output), repr(output))):
        if a != b:
            try:
                io = StringIO.StringIO()
            except:
                io = StringIO()
            print("TEST FAILED:", file=io)
            print(" input: %s" % repr(self.input), file=io)
            print(" expected: %s" % repr(self.expected_output), file=io)
            print(" actual: %s" % repr(output), file=io)
            print(" %s" % (' ' * i + '^'), file=io)
            self.fail(io.getvalue())
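# A minimal sketch (an assumption, not taken from the original test suite) of
# how the runTest methods above could be wired into the standard unittest
# runner: each case carries an `input` and `expected_output` attribute, and
# the diff-printing loop above could be dropped in place of the plain
# assertEqual used here. The expected HTML in the example case is illustrative.
import unittest
import snudown

class SnudownCase(unittest.TestCase):
    def __init__(self, input_text, expected_output):
        # default methodName is 'runTest'
        unittest.TestCase.__init__(self)
        self.input = input_text
        self.expected_output = expected_output

    def runTest(self):
        self.assertEqual(snudown.markdown(self.input), self.expected_output)

def build_suite(cases):
    """cases: iterable of (markdown_source, expected_html) pairs."""
    suite = unittest.TestSuite()
    for source, expected in cases:
        suite.addTest(SnudownCase(source, expected))
    return suite

if __name__ == '__main__':
    unittest.TextTestRunner().run(build_suite([
        ('*hi*', '<p><em>hi</em></p>\n'),  # illustrative expected output
    ]))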
def process_self(self, submission):
    html = snudown.markdown(submission.selftext.encode('UTF-8'))
    soup = BeautifulSoup(html)
    refs = {}
    # Iterate through all links, get xkcd json
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue
        j = self.xkcd_fetcher.get_json(href)
        if not j:
            logger.warn(
                'Data could not be fetched for {url}'.format(url=href))
            continue
        refs[int(j.get('num', -1))] = {'data': j, 'href': href}
    return self.process_references(submission, refs)
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.cssfilter import legacy_s3_url

    nofollow = True
    text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.template_helpers import s3_https_if_secure

    # this hard codes the stylesheet page for now, but should be parameterized
    # in the future to allow per-page images.
    from r2.models.wiki import ImagesByWikiPage
    page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet")

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and name in page_images:
            url = page_images[name]
            url = s3_https_if_secure(url)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    if images:
        [img_swap(image) for image in images]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.template_helpers import media_https_if_secure

    # this hard codes the stylesheet page for now, but should be parameterized
    # in the future to allow per-page images.
    from r2.models.wiki import ImagesByWikiPage
    page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet")

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and name in page_images:
            url = page_images[name]
            url = media_https_if_secure(url)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    if images:
        [img_swap(image) for image in images]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def parseComment(redditComment, postAuthorName, postAuthorExists, isRoot=True):
    commentAuthorName = ''
    commentAuthorExists = 0
    try:
        commentAuthorName = fixUnicode(redditComment.author.name)
        commentAuthorExists = 1
    except AttributeError:
        commentAuthorExists = 0
    if isRoot:
        htmlFile.write('<div id="' + str(redditComment.id))
        htmlFile.write('" class="comment">\n')
    else:
        htmlFile.write('<div id="' + str(redditComment.id))
        htmlFile.write('" class="comment" style="margin-bottom:10px;margin-left:0px;">\n')
    htmlFile.write('<div class="commentinfo">\n')
    if commentAuthorExists:
        if postAuthorExists and postAuthorName == commentAuthorName:
            htmlFile.write('<a href="' + redditComment.author._url)
            htmlFile.write('" class="postOP-comment">' + commentAuthorName + '</a> <em>')
        else:
            htmlFile.write('<a href="' + redditComment.author._url)
            htmlFile.write('">' + commentAuthorName + '</a> <em>')
    else:
        htmlFile.write('<strong>[Deleted]</strong> <em>')
    htmlFile.write(str(redditComment.ups - redditComment.downs))
    htmlFile.write(' Points </em><em>')
    htmlFile.write('Posted at ')
    postDate = time.gmtime(redditComment.created_utc)
    htmlFile.write(str(postDate.tm_hour) + ':')
    htmlFile.write(str(postDate.tm_min) + ' UTC on ')
    htmlFile.write(monthsList[postDate.tm_mon - 1] + ' ')
    htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    htmlFile.write('</em></div>\n')
    htmlFile.write(snudown.markdown(fixMarkdown(redditComment.body)))
    for reply in redditComment._replies:
        parseComment(reply, postAuthorName, postAuthorExists, False)
    htmlFile.write('</div>\n')
def wikimarkdown(text, include_toc=True, target=None):
    from r2.lib.cssfilter import legacy_s3_url

    def img_swap(tag):
        name = tag.get('src')
        name = custom_img_url.search(name)
        name = name and name.group(1)
        if name and c.site.images.has_key(name):
            url = c.site.images[name]
            url = legacy_s3_url(url, c.site)
            tag['src'] = url
        else:
            tag.extract()

    nofollow = True
    text = snudown.markdown(_force_utf8(text), nofollow, target,
                            renderer=snudown.RENDERER_WIKI)

    # TODO: We should test how much of a load this adds to the app
    soup = BeautifulSoup(text.decode('utf-8'))
    images = soup.findAll('img')
    if images:
        [img_swap(image) for image in images]

    if include_toc:
        tocdiv = generate_table_of_contents(soup, prefix="wiki")
        if tocdiv:
            soup.insert(0, tocdiv)

    text = str(soup)
    return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def emailmarkdown(text, wrap=True):
    if not text:
        return None

    text = snudown.markdown(_force_utf8(text))

    soup = BeautifulSoup(text.decode('utf-8'))
    links = soup.findAll('a')
    update_text = False
    base = g.https_endpoint or g.origin
    for link in links:
        # if link is relative
        if link['href'].startswith('/'):
            update_text = True
            link['href'] = urljoin(base, link['href'])

    if update_text:
        text = str(soup)

    if wrap:
        return SC_OFF + MD_START + text + MD_END + SC_ON
    else:
        return SC_OFF + text + SC_ON
def renderwith(renderer, body):
    body_utf8 = _force_utf8(body)
    if renderer is snudown:
        return snudown.markdown(body_utf8)
    nodesend(renderer, body_utf8)
    return nodereceive(renderer)
def parsePost(postObject):
    writeHeader(fixUnicode(postObject.title))
    postObject.replace_more_comments()
    postAuthorName = ''
    postAuthorExists = 0
    try:
        postAuthorName = fixUnicode(postObject.author.name)
        postAuthorExists = 1
    except AttributeError:
        postAuthorExists = 0
    htmlFile.write('<div class="title">\n')
    if postObject.is_self:
        # The post is a self post
        htmlFile.write(fixUnicode(postObject.title))
        htmlFile.write('\n<br/><strong>')
    else:
        # The post is a link post
        htmlFile.write('<a id="postlink" href="' + fixUnicode(postObject.url))
        htmlFile.write('">')
        htmlFile.write(fixUnicode(postObject.title))
        htmlFile.write('</a>\n<br/><strong>')
    if postAuthorExists:
        htmlFile.write('Posted by <a id="userlink" href="' + fixUnicode(postObject.author._url))
        htmlFile.write('">')
        htmlFile.write(postAuthorName)
        htmlFile.write('</a>. </strong><em>')
    else:
        htmlFile.write('Posted by [Deleted]. </strong><em>')
    htmlFile.write('Posted at ')
    postDate = time.gmtime(postObject.created_utc)
    htmlFile.write(str(postDate.tm_hour) + ':')
    htmlFile.write(str(postDate.tm_min) + ' UTC on ')
    htmlFile.write(monthsList[postDate.tm_mon - 1] + ' ')
    htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    htmlFile.write('. ' + str(postObject.ups - postObject.downs))
    if postObject.is_self:
        htmlFile.write(' Points. </em><em>(self.<a id="selfLink" href="')
    else:
        htmlFile.write(' Points. </em><em>(<a id="selfLink" href="')
    htmlFile.write(postObject.subreddit._url)
    htmlFile.write('">' + postObject.subreddit.display_name)
    if postObject.is_self:
        htmlFile.write('</a>)</em><em>')
    else:
        htmlFile.write('</a> Subreddit)</em><em>')
    htmlFile.write(' (<a id="postpermalink" href="')
    htmlFile.write(fixUnicode(postObject.permalink))
    htmlFile.write('">Permalink</a>)</em>\n')
    if postObject.is_self:
        htmlFile.write('<div class="post">\n')
        htmlFile.write(snudown.markdown(fixMarkdown(postObject.selftext)))
        htmlFile.write('</div>\n')
    else:
        htmlFile.write('<div class="post">\n<p>\n')
        htmlFile.write(postObject.url)
        htmlFile.write('</p>\n</div>\n')
    htmlFile.write('</div>\n')
    for comment in postObject._comments:
        parseComment(comment, postAuthorName, postAuthorExists)
    htmlFile.write('<hr id="footerhr">\n')
    htmlFile.write('<div id="footer"><em>Archived on ')
    htmlFile.write(str(datetime.datetime.utcnow()))
    htmlFile.write(' UTC</em></div>')
    htmlFile.write('\n\n</body>\n</html>\n')
def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False):
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if os.path.isfile(filepath):
        return True

    created = datetime.utcfromtimestamp(int(link['created_utc']))
    sorted_comments = []
    if len(link['comments']) > 0:
        sorted_comments = sort_comments(link['comments'], hide_deleted_comments)

    # traverse up to root dir, depends on id length
    static_include_path = ''
    for i in range(len(link['id']) + 2):
        static_include_path += '../'

    image = None
    if not args.noimages:
        i = is_imgur(link['url'])
        # if we have an imgur client id and the url in the loop is an imgur link then get the URL
        if i[0]:
            # Extract url from json and download the image itself
            imu = get_imgur_image_link(link['url'])
            if imu is not None:
                image = retrieve_media(imu)
        elif i[1]:
            # TODO: Implement Imgur albums support
            pass
        else:
            image = retrieve_media(link['url'])
        # Finally, if the image is downloaded then generate a path and attach it
        # to the url entry in the link dict, so when it's used as an href it will
        # point to the path instead of the url itself:
        # URL + /images/ + ID + . + image extension
        if image is not None:
            link['url'] = subreddit + "/images/" + link['id'] + "." + image[0]

    # render comments
    comments_html = ''
    for c in sorted_comments:
        css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <= max_comment_depth
                               else str(max_comment_depth))
        if c['author'] == link['author'] and c['author'] not in removed_content_identifiers:
            css_classes += ' op'
        if c['stickied'].lower() == 'true' or c['stickied'] is True:
            css_classes += ' stickied'

        # author link
        url = static_include_path + 'user/' + c['author'] + '.html'
        author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])

        comment_data_map = {
            '###ID###': c['id'],
            '###PARENT_ID###': c['parent_id'],
            '###DEPTH###': str(c['depth']),
            '###DATE###': created.strftime('%Y-%m-%d'),
            '###SCORE###': str(c['score']) if len(str(c['score'])) > 0 else missing_comment_score_label,
            '###BODY###': snudown.markdown(c['body'].replace('&gt;', '>')),
            '###CSS_CLASSES###': css_classes,
            '###CLASS_SCORE###': 'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else 'badge-secondary',
            '###HTML_AUTHOR_URL###': author_link_html,
        }
        comment_html = template_comment
        for key, value in comment_data_map.items():
            comment_html = comment_html.replace(key, value)
        comments_html += comment_html + '\n'

    # render subreddits list
    subs_menu_html = ''
    for sub in subreddits:
        sub_url = static_include_path + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)

    # render selftext
    selftext_html = ''
    if len(link['selftext']) > 0:
        selftext_html = template_selftext.replace(
            '###SELFTEXT###', snudown.markdown(link['selftext'].replace('&gt;', '>')))

    # author link
    url = static_include_path + 'user/' + link['author'] + '.html'
    author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])

    # html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
    if image is None:
        html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
    else:
        html_title = template_url.replace(
            '#HREF#', static_include_path + link['url']).replace('#INNER_HTML#', link['title'])
    if link['is_self'] is True or link['is_self'].lower() == 'true':
        html_title = link['title']

    # render link page
    link_data_map = {
        '###INCLUDE_PATH###': static_include_path,
        '###SUB###': subreddit,
        '###TITLE###': link['title'],
        '###ID###': link['id'],
        '###DATE###': created.strftime('%Y-%m-%d'),
        '###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d')
                              if link['retrieved_on'] != '' else 'n/a',
        '###SCORE###': str(link['score']),
        '###NUM_COMMENTS###': str(link['num_comments']),
        '###URL_PROJECT###': url_project,
        '###URL_SUBS###': static_include_path + 'index.html',
        '###URL_SUB###': static_include_path + subreddit + '/index.html',
        '###URL_SUB_CMNT###': static_include_path + subreddit + '/index-' + sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_SUB_DATE###': static_include_path + subreddit + '/index-' + sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###': static_include_path + subreddit + '/search.html',
        '###HTML_SUBS_MENU###': subs_menu_html,
        '###HTML_SELFTEXT###': selftext_html,
        '###HTML_COMMENTS###': comments_html,
        '###HTML_AUTHOR_URL###': author_link_html,
        '###HTML_TITLE###': html_title,
    }
    html = template_link
    for key, value in link_data_map.items():
        html = html.replace(key, value)

    # write html
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(html)
        # print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))

    if image is not None:
        if not os.path.isfile(link['url']):
            os.makedirs("r/" + subreddit + "/images/", exist_ok=True)
            open("r/" + link['url'], 'wb').write(image[1])
            print("Writing media: %s " % link['url'])
        # Add a '../' because we will reuse the file location for the index file
        link['url'] = "../" + link['url']

    return True
        return self.tag_PRE()

    def tag_IMG(self):
        src = self.e.get("src", "")
        title = self.e.get("title", "")
        alt = self.e.get("alt")
        alt = ' "%s"' % alt if alt else ""
        return "![%s](%s%s)" % (title, src, alt)

    def tag_DEL(self):
        return "~~%s~~" % self.default()

    def tag_P(self):
        return "%s\n\n" % self.default()

    def tag_BR(self):
        return " \n"

    def tag_A(self):
        return "[%s](%s)" % (self.default(), self.e.get("href", ""))


if __name__ == "__main__":
    import snudown

    template = "<textarea>%s</textarea><hr/>%s"
    original = unicode(open("input3.html", "r").read(), "utf-8")
    markdowned = MarkDowner(BeautifulSoup(original)).content.encode("ascii", "xmlcharrefreplace")
    final = snudown.markdown(markdowned, renderer=snudown.RENDERER_WIKI)
    open("output.html", "w").write(template % (markdowned, final))
def write_link_page(subreddits, link, subreddit='', hide_deleted_comments=False):
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if os.path.isfile(filepath):
        return True

    created = datetime.utcfromtimestamp(int(link['created_utc']))
    sorted_comments = []
    if len(link['comments']) > 0:
        sorted_comments = sort_comments(link['comments'], hide_deleted_comments)

    # traverse up to root dir, depends on id length
    static_include_path = ''
    for i in range(len(link['id']) + 2):
        static_include_path += '../'

    # render comments
    comments_html = ''
    for c in sorted_comments:
        css_classes = 'ml-' + (str(c['depth']) if int(c['depth']) <= max_comment_depth
                               else str(max_comment_depth))
        if c['author'] == link['author'] and c['author'] not in removed_content_identifiers:
            css_classes += ' op'
        if c['stickied'].lower() == 'true' or c['stickied'] is True:
            css_classes += ' stickied'

        # author link
        url = static_include_path + 'user/' + c['author'] + '.html'
        author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', c['author'])

        comment_data_map = {
            '###ID###': c['id'],
            '###PARENT_ID###': c['parent_id'],
            '###DEPTH###': str(c['depth']),
            '###DATE###': created.strftime('%Y-%m-%d'),
            '###SCORE###': str(c['score']) if len(str(c['score'])) > 0 else missing_comment_score_label,
            '###BODY###': snudown.markdown(c['body'].replace('&gt;', '>')),
            '###CSS_CLASSES###': css_classes,
            '###CLASS_SCORE###': 'badge-danger' if len(c['score']) > 0 and int(c['score']) < 1 else 'badge-secondary',
            '###HTML_AUTHOR_URL###': author_link_html,
        }
        comment_html = template_comment
        for key, value in comment_data_map.items():
            comment_html = comment_html.replace(key, value)
        comments_html += comment_html + '\n'

    # render subreddits list
    subs_menu_html = ''
    for sub in subreddits:
        sub_url = static_include_path + sub + '/index.html'
        subs_menu_html += template_sub_link.replace('###URL_SUB###', sub_url).replace('###SUB###', sub)

    # render selftext
    selftext_html = ''
    if len(link['selftext']) > 0:
        selftext_html = template_selftext.replace(
            '###SELFTEXT###', snudown.markdown(link['selftext'].replace('&gt;', '>')))

    # author link
    url = static_include_path + 'user/' + link['author'] + '.html'
    author_link_html = template_user_url.replace('###URL_AUTHOR###', url).replace('###AUTHOR###', link['author'])

    html_title = template_url.replace('#HREF#', link['url']).replace('#INNER_HTML#', link['title'])
    if link['is_self'] is True or link['is_self'].lower() == 'true':
        html_title = link['title']

    # render link page
    link_data_map = {
        '###INCLUDE_PATH###': static_include_path,
        '###SUB###': subreddit,
        '###TITLE###': link['title'],
        '###ID###': link['id'],
        '###DATE###': created.strftime('%Y-%m-%d'),
        '###ARCHIVE_DATE###': datetime.utcfromtimestamp(int(link['retrieved_on'])).strftime('%Y-%m-%d')
                              if link['retrieved_on'] != '' else 'n/a',
        '###SCORE###': str(link['score']),
        '###NUM_COMMENTS###': str(link['num_comments']),
        '###URL_PROJECT###': url_project,
        '###URL_SUBS###': static_include_path + 'index.html',
        '###URL_SUB###': static_include_path + subreddit + '/index.html',
        '###URL_SUB_CMNT###': static_include_path + subreddit + '/index-' + sort_indexes['num_comments']['slug'] + '/index.html',
        '###URL_SUB_DATE###': static_include_path + subreddit + '/index-' + sort_indexes['created_utc']['slug'] + '/index.html',
        '###URL_SEARCH###': static_include_path + subreddit + '/search.html',
        '###HTML_SUBS_MENU###': subs_menu_html,
        '###HTML_SELFTEXT###': selftext_html,
        '###HTML_COMMENTS###': comments_html,
        '###HTML_AUTHOR_URL###': author_link_html,
        '###HTML_TITLE###': html_title,
    }
    html = template_link
    for key, value in link_data_map.items():
        html = html.replace(key, value)

    # write html
    # reddit: https://www.reddit.com/r/conspiracy/comments/8742iv/happening_now_classmate_former_friend_of/
    # archive: r/conspiracy/comments/8/7/4/2/i/v/happening_now_classmate_former_friend_of.html
    idpath = '/'.join(list(link['id']))
    filepath = link['permalink'].lower().strip('/') + '.html'
    filepath = filepath.replace(link['id'], idpath)
    if not os.path.isfile(filepath):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(html)
        # print('wrote %s %s' % (created.strftime('%Y-%m-%d'), filepath))

    return True
def markdown(value):
    return snudown.markdown(value)
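# For reference, a minimal standalone sketch of the snudown calls seen in the
# snippets above. Only what those snippets show is assumed (positional
# nofollow/target arguments, the renderer keyword, UTF-8 byte input); anything
# beyond that should be checked against the installed snudown version.
import snudown

def render_comment(md_text, make_nofollow=True, link_target=None):
    # Default user-text rendering, as in the safemarkdown() variants.
    return snudown.markdown(md_text.encode('utf-8'), make_nofollow, link_target)

def render_wiki(md_text):
    # Wiki rendering, as in the wikimarkdown() variants.
    return snudown.markdown(md_text.encode('utf-8'),
                            renderer=snudown.RENDERER_WIKI)

if __name__ == '__main__':
    print(render_comment(u'*hello* [link](http://example.com)'))
    print(render_wiki(u'# heading\n\nsome wiki text'))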