if img_src.startswith('files/'): img_src = img_src[len('files/'):] else: print('invalid image: {}'.format(path)) img_alt = img.get('alt') img.drop_tree() else: img_src = None img_alt = None # make absolute links relative doc.rewrite_links(make_relative) body_value = lxml.html.tostring(doc, encoding='unicode') body_value = HTML2Text(bodywidth=0).handle(body_value) body_value = re.sub('\n\n$', '\n', body_value, flags=re.M) if Path(path).exists(): path += '-DUPLICATE.md' with open(path, 'w') as f: # using yaml.dump only for single fields because it does not maintain order f.write('---\n') f.write(yaml.dump(dict(title=title), allow_unicode=True)) f.write('date: %s\n' % date) if author: f.write('author: %s\n' % author) if tags != 'NULL': f.write('tags: [%s]\n' % tags) if img_src:
def _convert_html_markdown(self, title, text):
    """Render *title* (as an <h1>) followed by *text* through HTML2Text.

    Returns the accumulated plain-text/Markdown output. Uses the
    incremental feed()/close() parser API rather than handle().
    """
    parser = HTML2Text(None, "")
    parser.feed(f"<h1>{title}</h1>")
    parser.feed(text)
    return parser.close()
def main():
    """CLI entry point: parse options, read HTML from a file, URL or stdin,
    and write the Markdown conversion via wrapwrite().

    NOTE(review): this uses Python 2 APIs (`urllib.urlopen`); presumably a
    legacy html2text fork — confirm target interpreter.
    """
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis",
                 action="store_true", default=config.IGNORE_EMPHASIS,
                 help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS,
                 help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help=("protect links from line breaks surrounding them " +
                       "with angle brackets"))
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES,
                 help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT,
                 help="Discard image data, only keep alt text")
    p.add_option(
        "--images-with-size", dest="images_with_size", action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False,
                 help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true",
                 dest="ul_style_dash", default=False,
                 help="use a dash rather than a star for unordered list items")
    p.add_option(
        "-e", "--asterisk-emphasis", action="store_true",
        dest="em_style_asterisk", default=False,
        help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store",
                 type="int", default=config.BODY_WIDTH,
                 help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent",
                 action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT,
                 help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true",
                 dest="hide_strikethrough", default=False,
                 help="hide strike-through text. only relevant when -g is "
                      "specified as well")
    p.add_option(
        "--escape-all", action="store_true", dest="escape_snob",
        default=False,
        help="Escape all special characters. Output is less readable, but "
             "avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES,
                 help="Format tables in HTML rather than Markdown syntax.")
    p.add_option(
        "--single-line-break", action="store_true", dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=("Use a single line break after a block element rather than two "
              "line breaks. NOTE: Requires --body-width=0"))
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding is initialised to "utf-8" here, so the
    # `if encoding is None` auto-detection branches below are unreachable —
    # confirm whether the default was meant to be None.
    encoding = "utf-8"
    if len(args) > 0 and args[0] != '-':
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                # Best effort charset sniffing; falls back to utf-8 when
                # feedparser's private helper is unavailable.
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                # chardet-based detection when reading local files.
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    if hasattr(data, 'decode'):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    # NOTE(review): html2text's attribute is usually `google_list_indent`
    # (cf. the argparse-based main elsewhere in this file); verify this
    # version actually honors `list_indent`, otherwise -i is a silent no-op.
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break

    wrapwrite(h.handle(data))
# Wikimedia Commons API query: up to 500 files in a category plus image info.
COMMONS_CAT_TEMPLATE = u"https://commons.wikimedia.org/w/api.php?action=query&\
generator=categorymembers&iiurlwidth={0}&gcmtitle=\
Category:{1}&gcmlimit=500&gcmtype=file&prop=imageinfo&\
iiprop=url|timestamp|user|mime|extmetadata&format=json"
HEADERS = {'user-agent': 'Science Stories API ([email protected])'}

# Whitelist-based sanitizer for HTML snippets returned by the wiki APIs.
sanitizer = Sanitizer({
    'tags': {'a', 'b', 'br', 'i', 'img', 'p', 'span'},
    'attributes': {
        # BUG FIX: the original had 'a': ('href') — parentheses without a
        # trailing comma are just grouping, so the value was the *string*
        # 'href' and attribute checks degraded to substring matching.
        # It must be a tuple of attribute names, like the 'img' entry.
        'a': ('href',),
        'img': ('src', 'alt')
    },
    'empty': {'br'},
    'separate': {'a', 'p'}
})

# Shared converter used to reduce sanitized HTML to plain text.
html_converter = HTML2Text()
html_converter.ignore_links = True


def main():
    """Call Main Function."""
    return iiif_cat_manifest(CATEGORY_STRING)


def safe_str(obj):
    """Return unicode encoding."""
    # NOTE(review): `unicode` only exists on Python 2 — confirm interpreter.
    return unicode(obj)


def sanitise(html):
    """Sanitize html."""
    # NOTE(review): implementation appears truncated in this chunk;
    # presumably delegates to sanitizer.sanitize(html) — confirm upstream.
def process_field(field, item, FEED, channel):
    """Resolve one configured output *field* against a feed *item*.

    The field spec is dispatched on its surface syntax:
      "literal"   -> returned verbatim from config
      *x*/_x_/<x> -> item value wrapped in that markup
      ```x```     -> item value as a fenced code block
      `x`         -> item value as inline code
      @x          -> comma-separated tags mapped to Discord role mentions
      [d]x.k      -> item[x] is a list of dicts; join each entry's k with d
      plain       -> item value, HTML converted to Markdown
    Returns '' (empty string) when the named field is missing.
    """
    logger.debug("%s:process_field:%s: started", FEED, field)
    item_url_base = FEED.get('item_url_base', None)
    # 'guid' gets special handling: prefix it with the configured URL base.
    if field == 'guid' and item_url_base is not None:
        if 'guid' in item:
            return item_url_base + item['guid']
        else:
            logger.error(
                'process_field:guid:no such field; try show_sample_entry.py on feed'
            )
            return ''

    logger.debug("%s:process_field:%s: checking regexes", FEED, field)
    stringmatch = re.match('^"(.+?)"$', field)
    highlightmatch = re.match('^([*_~<]+)(.+?)([*_~>]+)$', field)
    bigcodematch = re.match('^```(.+)```$', field)
    codematch = re.match('^`(.+)`$', field)
    tagmatch = re.match('^@(.+)$', field)  # new tag regex
    # NOTE(review): non-raw pattern — \[ \] \. are invalid escape sequences
    # and raise DeprecationWarning on Python 3.6+; should be a raw string.
    dictmatch = re.match('^\[(.+)\](.+)\.(.+)$', field)  # new dict regex
    if stringmatch is not None:
        # Return an actual string literal from config:
        logger.debug("%s:process_field:%s:isString", FEED, field)
        return stringmatch.group(1)  # string from config
    elif highlightmatch is not None:
        logger.debug("%s:process_field:%s:isHighlight", FEED, field)
        # If there's any markdown on the field, return field with that
        # markup on it:
        begin, field, end = highlightmatch.groups()
        if field in item:
            if field == "link":
                # NOTE(review): key "feed-url" here vs 'feed_url' in the
                # plain branch below — one of them is probably wrong.
                url = urljoin(FEED.get("feed-url"), item[field])
                return begin + url + end
            else:
                return begin + item[field] + end
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif bigcodematch is not None:
        logger.debug("%s:process_field:%s:isCodeBlock", FEED, field)
        # Code blocks are a bit different, with a newline and stuff:
        field = bigcodematch.group(1)
        if field in item:
            return "```\n%s\n```" % (item[field])
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif codematch is not None:
        logger.debug("%s:process_field:%s:isCode", FEED, field)
        # Since code chunk can't have other highlights, also do them
        # separately:
        field = codematch.group(1)
        if field in item:
            return "`%s`" % (item[field])
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif tagmatch is not None:
        logger.debug("%s:process_field:%s:isTag", FEED, field)
        field = tagmatch.group(1)
        if field in item:
            # Assuming tags are ', ' separated
            taglist = item[field].split(', ')
            # Iterate through channel roles, see if a role is mentionable and
            # then substitute the role for its id
            for role in client.get_channel(channel['id']).server.roles:
                rn = str(role.name)
                taglist = [
                    "<@&%s>" % (role.id) if rn == str(i) else i
                    for i in taglist
                ]
            return ", ".join(taglist)
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif dictmatch is not None:
        logger.debug("%s:process_field:%s:isDict", FEED, field)
        delim = dictmatch.group(1)
        field = dictmatch.group(2)
        dictkey = dictmatch.group(3)
        if field in item:
            return delim.join([x[dictkey] for x in item[field]])
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    else:
        logger.debug("%s:process_field:%s:isPlain", FEED, field)
        # Just asking for plain field:
        if field in item:
            # If field is special field "link",
            # then use urljoin to turn relative URLs into absolute URLs
            if field == 'link':
                return urljoin(FEED.get('feed_url'), item[field])
            # Else assume it's a "summary" or "content" or whatever field
            # and turn HTML into markdown and don't add any markup:
            else:
                htmlfixer = HTML2Text()
                logger.debug(htmlfixer)
                htmlfixer.ignore_links = True
                htmlfixer.ignore_images = True
                htmlfixer.ignore_emphasis = False
                htmlfixer.body_width = 1000
                htmlfixer.unicode_snob = True
                htmlfixer.ul_item_mark = '-'  # Default of "*" likely
                # to bold things, etc...
                markdownfield = htmlfixer.handle(item[field])
                # Try to strip any remaining HTML out. Not "safe", but
                # simple and should catch most stuff:
                markdownfield = re.sub('<[^<]+?>', '', markdownfield)
                return markdownfield
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
import json
import os
from textwrap import TextWrapper

from html2text import HTML2Text

# Shared converter instance (left at its default configuration).
html_to_text = HTML2Text()

# Read every file under input/ so its text is available for processing.
# FIX: the original rebound one name (`file`) to three different things —
# the bare filename, the prefixed path, and the open file object — and
# applied a no-op ''.join() to the already-read string; both cleaned up.
for entry in os.listdir('input'):
    print(entry)  # progress output: which input file is being read
    path = 'input/' + entry
    with open(path, "r+") as handle:
        lines = handle.read()
def strip_tags(html):
    """Reduce *html* to whitespace-trimmed plain text; link markup is dropped."""
    converter = HTML2Text()
    converter.ignore_links = True
    return converter.handle(html).strip()
def _html_to_markdown(self, s):
    """Convert an HTML snippet to Markdown, rendering colored text as bold."""
    # <font color=...> carries emphasis in the source HTML; rewrite it to
    # <strong> so html2text emits **bold** for it.
    emphasized = re.sub(r'<font color=[^> ]+>(.+?)</font>',
                        r'<strong>\1</strong>', s)
    converter = HTML2Text(bodywidth=0)  # bodywidth=0: no line wrapping
    return converter.handle(emphasized).strip()
#!/usr/bin/env python3 from io import BytesIO from warcio.warcwriter import WARCWriter from html2text import HTML2Text from libzim.reader import File as ZIMFile from urllib.parse import quote handler = HTML2Text() handler.ignore_links = True handler.images_to_alt = True html2text = handler.handle with open('example.warc.wet.gz', 'wb') as output: writer = WARCWriter(output, gzip=True) with ZIMFile("data/wikipedia_en_simple_all_nopic_2020-12.zim") as reader: for uid in range(0, reader.article_count): if uid % 10_000 == 0: print("{} out of {}".format(uid, reader.article_count)) article = reader.get_article_by_id(uid) try: if article.mimetype != "text/html": continue except RuntimeError: continue if article.is_redirect: continue url = 'https://simple.wikipedia.org/wiki/{}'.format(quote(article.url))
# Email helper: renders Markdown bodies to HTML and builds flask_mail
# messages using app-wide sender/recipient defaults.
from run import app, mail, config, markdown
from flask_mail import Message
from html2text import HTML2Text

# Converter reserved for producing plain-text alternatives (links dropped).
text_maker = HTML2Text()
text_maker.ignore_links = True

default_sender = tuple(config['email']['default_sender'])
default_recipient = tuple(config['email']['default_recipient'])

# Markdown renderer without math extensions, bound to the Flask app.
from flaskext.markdown import Markdown
markdown_no_math = Markdown(app, extensions=['extra'])

import textwrap


def send_email(body_md, sender=None, recipients=None):
    """Build an HTML email from a Markdown body whose first line is
    'Subject: ...'; sender/recipients default to the configured values.

    NOTE(review): the visible code constructs `msg` but never calls
    mail.send — presumably the send happens past this chunk; confirm.
    """
    body_md = body_md.strip()
    # NOTE(review): assert is stripped under `python -O`; an explicit
    # ValueError would be safer for input validation.
    assert body_md.startswith('Subject: ')
    # First line is "Subject: <text>"; strip the prefix, keep the rest
    # of the document as the body.
    subject, _, body_md = body_md.partition('\n')
    _, _, subject = subject.partition(' ')
    if not sender:
        sender = default_sender
    if not recipients:
        recipients = [default_recipient]
    msg = Message(subject)
    msg.recipients = recipients
    msg.sender = sender
    # fill() keeps rendered lines under ~1000 chars (SMTP line-length limit).
    msg.html = textwrap.fill(markdown_no_math(body_md), 990)
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

from html2text import HTML2Text
from jinja2 import Template

# Converter intended for generating the plain-text alternative part.
html_parser = HTML2Text()


def build_from_template(source: str, jinja_env: dict = None) -> MIMEMultipart:
    """
    Build an email from a Jinja template and create a MIMEMultipart containing
    the resulting HTML and alternative text.

    :param source: Path to the template file.
    :param jinja_env: Jinja context.
    :return: A MIMEMultipart containing the rendered message.
    """
    jinja_env = jinja_env or {}
    # 'alternative' lets clients pick between the HTML and text parts.
    multipart = MIMEMultipart('alternative')
    with open(source, 'r') as f:
        html = f.read()
    template = Template(html)
    html = template.render(**jinja_env)
    html_mime = MIMEText(html, 'html')
    multipart.attach(html_mime)
    # generate alt text from HTML
    # NOTE(review): function appears truncated in this chunk — the alt-text
    # generation (presumably via html_parser.handle) and the `return
    # multipart` promised by the annotation are not visible.
'''
Github:https://github.com/kennethreitz/requests-html
'''
# Demo script: fetch python.org with requests-html, inspect links and an
# element, then convert one element's HTML to Markdown.
from html2text import HTML2Text
from requests_html import HTML
import requests_html

session = requests_html.Session()
r = session.get('https://python.org/')
# .links: raw hrefs as found; .absolute_links: resolved against the page URL
res = r.html.links
res2 = r.html.absolute_links
print(res)
print(res2)

# CSS-select the "#about" element; first=True returns one Element, not a list
about = r.html.find('#about', first = True)
print(about.text, about.attrs)
# print(str(about.html))

# Parse a standalone snippet (no network) and list its links.
doc = """<a href='https://httpbin.org'>"""
html1 = HTML(html=doc, url='fakeurl', default_encoding='utf-8')
print(html1.links)

# Convert the selected element's HTML to Markdown.
h = HTML2Text()
print(h.handle(about.html))
def html2text(html):
    """Return the plain-text rendering of *html* with links and images
    stripped from the output."""
    converter = HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(to_unicode(html))
from html.parser import HTMLParser
from io import StringIO
from pprint import pprint

from html2text import HTML2Text
from markdown import Markdown
import frontmatter
from frontmatter.default_handlers import YAMLHandler

# HTML to MARKDOWN
# h2m
from waltz.tools import yaml

# Module-wide HTML -> Markdown converter.
html_to_markdown = HTML2Text()
html_to_markdown.single_line_break = False
html_to_markdown.skip_internal_links = False
# NOTE(review): _skip_a_class_check and _class_stack are not standard
# HTML2Text attributes — presumably consumed by a local patch/subclass of
# html2text elsewhere in the project; confirm.
html_to_markdown._skip_a_class_check = False
html_to_markdown._class_stack = []

# CSS class marking hidden Waltz metadata blocks in exported HTML.
WALTZ_METADATA_CLASS = "-waltz-metadata-hidden"


class ExtractWaltzMetadata(HTMLParser):
    """HTML parser that collects Waltz metadata from a document.

    NOTE(review): the rest of the class (handle_* methods) is not visible
    in this chunk.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        # True while the parser is inside a metadata-marked element.
        self.inside_metadata = False
        self.strict = False
        self.convert_charrefs = True
        # Accumulated metadata fragments.
        self.data = []
def POST(self):
    """Capture a web page (from an uploaded file or a URL), extract its
    text, resource lists and a Markdown rendering, optionally archive the
    Markdown to disk, and render the result template.

    Any exception is caught and rendered as an error page.
    """
    try:
        file = xutils.get_argument("file", {})
        address = xutils.get_argument("url", "")
        name = xutils.get_argument("name", "")
        filename = ""
        if hasattr(file, "filename"):
            filename = file.filename
        plain_text = ""
        if not isempty(address):
            # URL takes precedence over an uploaded file.
            html = readhttp(address)
        else:
            # read the uploaded file
            html = ""
            for chunk in file.file:
                html += chunk.decode("utf-8")

        print("Read html, filename={}, length={}".format(
            filename, len(html)))

        # Drop script/style contents so get_text() yields readable text.
        soup = BeautifulSoup(html, "html.parser")
        element_list = soup.find_all(["script", "style"])
        for element in element_list:
            element.extract()
        plain_text = soup.get_text(separator=" ")
        plain_text = clean_whitespace(plain_text)

        images = soup.find_all("img")
        links = soup.find_all("a")
        csses = soup.find_all("link")
        scripts = soup.find_all("script")
        # texts = soup.find_all(["p", "span", "div", "h1", "h2", "h3", "h4"])
        # Markdown rendering, with relative links resolved against the URL.
        h = HTML2Text(baseurl=address)
        text = "From %s\n\n" % address + h.handle(html)
        texts = [text]
        images = get_addr_list(images)
        scripts = get_addr_list(scripts)

        if name != "" and name != None:
            # Archive the Markdown under DATA_DIR/archive/YYYY/mm/dd/.
            dirname = os.path.join(xconfig.DATA_DIR,
                                   time.strftime("archive/%Y/%m/%d"))
            xutils.makedirs(dirname)
            path = os.path.join(
                dirname, "%s_%s.md" % (name, time.strftime("%H%M%S")))
            xutils.savetofile(path, text)
            print("save file %s" % path)
        # NOTE(review): dead branch (disabled note-creation); it references
        # an undefined name `content` and would NameError if re-enabled.
        if False:
            user_name = xauth.get_current_name()
            xutils.call("note.create",
                        name=name,
                        content=content,
                        type="md",
                        tags=["来自网络"],
                        creator=user_name)
        return xtemplate.render(self.template_path,
                                show_aside=False,
                                images=images,
                                links=links,
                                csses=csses,
                                scripts=scripts,
                                texts=texts,
                                address=address,
                                url=address,
                                plain_text=plain_text)
    except Exception as e:
        xutils.print_stacktrace()
        return xtemplate.render(self.template_path,
                                show_aside=False,
                                error=str(e))
def html2md(title, text):
    """Convert *text* to plain text, prefixed by *title* as an <h1>.

    Uses the incremental feed()/close() parser API; close() returns the
    accumulated output.
    """
    converter = HTML2Text(None, "")
    converter.feed(f"<h1>{title}</h1>")
    converter.feed(text)
    return converter.close()
def main():
    """CLI entry point for html2text: parse arguments, read HTML from a file
    or stdin, and write the Markdown conversion to stdout."""
    baseurl = ""

    # ANSI escape codes for the decode-error warning below.
    class bcolors:
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is "
        "specified as well",
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters. Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) "
        "while keeping rows.",
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--backquote-code",
        action="store_true",
        dest="backquote_code",
        default=config.BACKQUOTE_CODE,
        help="Wrap program code blocks with ```...```",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    args = p.parse_args()

    # Read raw bytes from the named file, or stdin when absent or "-".
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        data = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError as err:
        # Point the user at --decode-errors before re-raising.
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        print(warning)
        raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    # Copy every CLI option onto the converter instance.
    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote
    h.backquote_code = args.backquote_code

    sys.stdout.write(h.handle(data))
def process_field(field, item, feed, channel):
    """
    This looks at the field from the config, and returns the processed string
    naked item in fields: return that field from the feed item
    *, **, _, ~, `, ```: markup the field and return it from the feed item
    " around the field: string literal
    Added new @, turns each comma separated tag into a group mention

    Returns "" (empty string) whenever the named field is missing.
    """
    logger.debug(f"{feed}:process_field:{field}: started")
    item_url_base = feed.get('item_url_base', None)
    # 'guid' gets special handling: prefix it with the configured URL base.
    if field == 'guid' and item_url_base is not None:
        if 'guid' in item:
            return item_url_base + item['guid']
        else:
            logger.error(
                'process_field:guid:no such field; try show_sample_entry.py on feed'
            )
            return ""

    logger.debug(f"{feed}:process_field:{field}: checking regexes")
    stringmatch = re.match('^"(.+?)"$', field)
    highlightmatch = re.match('^([*_~<]+)(.+?)([*_~>]+)$', field)
    bigcodematch = re.match('^```(.+)```$', field)
    codematch = re.match('^`(.+)`$', field)
    tagmatch = re.match('^@(.+)$', field)  # new tag regex
    if stringmatch is not None:
        # Return an actual string literal from config:
        logger.debug(f"{feed}:process_field:{field}:isString")
        return stringmatch.group(1)  # string from config
    elif highlightmatch is not None:
        logger.debug(f"{feed}:process_field:{field}:isHighlight")
        # If there's any markdown on the field, return field with that
        # markup on it:
        begin, field, end = highlightmatch.groups()
        if field in item:
            if field == "link":
                # NOTE(review): key "feed-url" here vs 'feed_url' in the
                # plain branch below — one of them is probably wrong.
                url = urljoin(feed.get("feed-url"), item[field])
                return begin + url + end
            else:
                return begin + item[field] + end
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    elif bigcodematch is not None:
        logger.debug(f"{feed}:process_field:{field}:isCodeBlock")
        # Code blocks are a bit different, with a newline and stuff:
        field = bigcodematch.group(1)
        if field in item:
            # BUG FIX: the original returned the *literal* string
            # "```\n{item[field]}\n```" because the f prefix was missing
            # (the %-formatting variant of this function interpolates
            # the value here).
            return f"```\n{item[field]}\n```"
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    elif codematch is not None:
        logger.debug(f"{feed}:process_field:{field}:isCode")
        # Since code chunk can't have other highlights, also do them
        # separately:
        field = codematch.group(1)
        if field in item:
            return f"`{item[field]}`"
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    elif tagmatch is not None:
        logger.debug(f"{feed}:process_field:{field}:isTag")
        field = tagmatch.group(1)
        if field in item:
            # Assuming tags are ', ' separated
            taglist = item[field].split(', ')
            # Iterate through channel roles, see if a role is mentionable and
            # then substitute the role for its id
            for role in client.get_channel(channel['id']).server.roles:
                rn = str(role.name)
                taglist = [
                    f"<@&{role.id}>" if rn == str(i) else i
                    for i in taglist
                ]
            return ", ".join(taglist)
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    else:
        logger.debug(f"{feed}:process_field:{field}:isPlain")
        # Just asking for plain field:
        if field in item:
            # If field is special field "link",
            # then use urljoin to turn relative URLs into absolute URLs
            if field == 'link':
                return urljoin(feed.get('feed_url'), item[field])
            # Else assume it's a "summary" or "content" or whatever field
            # and turn HTML into markdown and don't add any markup:
            else:
                htmlfixer = HTML2Text()
                logger.debug(htmlfixer)
                htmlfixer.ignore_links = True
                htmlfixer.ignore_images = True
                htmlfixer.ignore_emphasis = False
                htmlfixer.body_width = 1000
                htmlfixer.unicode_snob = True
                htmlfixer.ul_item_mark = '-'  # Default of "*" likely
                # to bold things, etc...
                markdownfield = htmlfixer.handle(item[field])
                # Try to strip any remaining HTML out. Not "safe", but
                # simple and should catch most stuff:
                markdownfield = re.sub('<[^<]+?>', '', markdownfield)
                return markdownfield
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
import discord, asyncio
from discord.ext import commands

from ace import log
from utils.docs_search import docs_search
from utils.string_manip import welcomify, to_markdown, shorten
from cogs.base import TogglableCogMixin
from html2text import HTML2Text
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta

# Shared converter; body_width=0 disables line wrapping in converted text.
htt = HTML2Text()
htt.body_width = 0

# for verification stuff
# Discord snowflake IDs for the guild channels/roles used below.
GENERAL_ID = 115993023636176902
STAFF_ID = 311784919208558592
MEMBER_ID = 509526426198999040

# Welcome message; {user} is substituted with the new member's mention.
WELCOME_MSG = '''
Welcome to our Discord community {user}!
A collection of useful tips are in <#407666416297443328> and recent \
announcements can be found in <#367301754729267202>.
'''

# for rss
# NOTE(review): the first FORUM_ID is superseded by the reassignment below.
#FORUM_ID = 517692823621861409
FORUM_ID = 536785342959845386

# for roles
ROLES_CHANNEL = 513071256283906068
# Script: export translated AppStream descriptions for the Peek app.
gi.require_version('AppStreamGlib', '1.0')
from gi.repository import AppStreamGlib
from html2text import HTML2Text
from subprocess import call

APP_ID = 'com.uploadedlobster.peek'
APPSTREAM_TMP_FILE = '/tmp/%s.appdata.xml' % APP_ID
DEFAULT_LOCALE = 'C'

# Locales to process come from argv; default to the untranslated 'C' locale.
locales = [DEFAULT_LOCALE]
if len(sys.argv) > 1:
    locales = sys.argv[1:]

# Configure html2text
html2text = HTML2Text()
html2text.body_width = 0
html2text.ignore_links = True
html2text.ignore_images = True
html2text.ul_item_mark = '-'


def format_description(text):
    """Convert an HTML description to compact plain text."""
    # BUG(review): this reads the global name `description` instead of the
    # `text` parameter — `text` is ignored and this raises NameError unless
    # a global `description` happens to exist. Should be
    # html2text.handle(text).
    text = html2text.handle(description).strip()
    # Collapse runs of 3+ blank-ish lines to a single blank line.
    text = re.sub(r"(\s*\n){3,}", "\n\n", text)
    return text


def translate_appstream_template(output_file):
    # NOTE(review): function is truncated in this chunk — the expression
    # below is cut off mid-call.
    cwd = os.path.dirname(os.path.abspath(__file__))
    appstream_template = os.path.join(cwd,
def convert_using_html2text(text):
    """Convert HTML *text* to Markdown without line wrapping.

    unicode_snob keeps unicode characters rather than ASCII substitutes.
    """
    converter = HTML2Text(bodywidth=0)
    converter.unicode_snob = True
    result = converter.handle(text)
    return result
def extract(self,
            year,
            month,
            day,
            update_existing=False,
            max_thumb_size=None,
            use_cached=False):
    """Scrape the NASA APOD page for the given date into a POTD record.

    Returns the saved POTD instance, or None when an existing record is
    kept (update_existing=False) or the page image type is unsupported.
    Raises for non-OK HTTP responses via raise_for_status().
    """
    if max_thumb_size:
        # This source offers no thumbnail size control; option is ignored.
        logger.debug(
            'extractor nasa_apod_en: max_thumb_size not available')
    potd_at = date(year=year, month=month, day=day)
    potd_kwargs = {'potd_at': potd_at, 'source_type': self.source_type}
    # Reuse an existing record when present; bail out unless updating.
    try:
        potd = POTD.objects.get(**potd_kwargs)
        if not update_existing:
            logger.info(
                'extractor nasa_apod_en: potd already existing and not updating: {}'
                .format(potd))
            return
        else:
            logger.debug(
                'extractor nasa_apod_en: using already existing potd: {}'.
                format(potd_kwargs))
    except POTD.DoesNotExist:
        logger.debug(
            'extractor nasa_apod_en: creating new potd for {}'.format(
                potd_kwargs))
        potd = POTD(**potd_kwargs)

    # APOD URLs use two-digit year and non-zero-padded month/day.
    source_potd_url = NASAAPODEnExtractor.APOD_DETAIL_URL_BY_DATE.format(
        yy=str(year)[-2:], mm=int(month), dd=int(day))
    potd.source_url = source_potd_url
    logger.debug('extractor nasa_apod_en: source potd url: {}'.format(
        source_potd_url))

    # Serve from the compressed cached markup when allowed, else re-fetch.
    # NOTE(review): field name says "scaping" (sic) — matches the model.
    if use_cached and potd.raw_scaping_data_binary_compressed:
        response = self._FakeResponse(
            potd.raw_scaping_data_binary_uncompressed)
        logger.debug('extractor nasa_apod_en: using cached markup')
    else:
        response = requests.get(source_potd_url)
        potd.raw_scaping_data_binary_compressed = self._compress(
            response.content)

    if response.ok:
        # Marked up like it's 1995...
        tree = html.fromstring(response.content)
        # The explanation is the <p> containing a <b>Explanation</b> label.
        _explanation = tree.xpath(
            "//b[starts-with(normalize-space(text()),'Explanation')]/parent::p"
        )[0]
        h = HTML2Text()
        h.ignore_links = True
        # Strip the bolded "Explanation:" label from the converted text.
        potd.description = h.handle(
            tostring(_explanation, encoding='unicode')).strip().replace(
                '** Explanation: ** ', '').strip()
        # Title is the first <b> after the first inline image.
        potd.title = tree.xpath(
            "//img[starts-with(@src,'image/')][1]/following::b[1]"
        )[0].text_content().strip()
        _image_filename = tree.xpath(
            "//img[starts-with(@src,'image/')][1]/attribute::src")[0]
        # Skip non-image pages (e.g. video of the day).
        if not _image_filename.lower().endswith(
                settings.ALLOWED_IMAGE_EXTENSIONS):
            logger.info(
                'extractor nasa_apod_en: not a matching potd type: {}'.
                format(_image_filename))
            return
        potd.image_url = NASAAPODEnExtractor.APOD_BASE_URL + _image_filename

        # nice to have's
        try:
            # Copyright line lives in the <center> holding the title.
            center_elem = tree.xpath(
                "//img[starts-with(@src,'image/')][1]/following::b[1]/.."
            )[0]
            center_elem_html = tostring(center_elem,
                                        encoding='unicode').replace(
                                            '\n', ' ')
            # Ugly, but works... # TODO: reimplement this with elem.drop_tree()
            _copyright = NASAAPODEnExtractor.UGLY_TAG_STRIP_RE.sub(
                ' ', center_elem_html)
            # Re-enable links for the copyright text (h is reused from above).
            h.ignore_links = False
            potd.copyright_info = h.handle(_copyright.strip()).replace(
                '\n', ' ')
            # Prefer the hi-res image linked from the inline image, keeping
            # the inline one as thumbnail.
            _image_hires_url = tree.xpath(
                "//img[contains(@src,'image/')][1]/../attribute::href")[0]
            if _image_hires_url.lower().endswith(
                    settings.ALLOWED_IMAGE_EXTENSIONS):
                potd.image_thumbnail_url = potd.image_url
                if _image_hires_url.startswith('http'):
                    potd.image_url = _image_hires_url
                else:
                    potd.image_url = NASAAPODEnExtractor.APOD_BASE_URL + _image_hires_url
        except Exception as e:
            # Best-effort block: extras are optional, failures only logged.
            logger.debug(
                'extractor nasa_apod_en: fail for additional info: {}'.
                format(e))
    else:
        logger.error(
            'extractor nasa_apod_en: http status code {} for url: {}'.
            format(response.status_code, source_potd_url))
        response.raise_for_status()

    potd.retrieved_from_source_at = now()
    potd.save()
    logger.info(
        'extractor nasa_apod_en: potd okay for {}-{}-{} "{}"'.format(
            year, month, day, potd.title))
    return potd
def main():
    """Command-line entry point: read HTML from a file, URL or stdin and
    write the Markdown conversion to stdout.

    Positional args: [(filename|url) [encoding]].  All HTML2Text options are
    exposed as optparse flags and copied onto the converter instance below.
    """
    baseurl = ''

    class bcolors:  # pragma: no cover
        # ANSI escape codes used only for the decode-error warning message.
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + ".".join(map(str, __version__)))
    p.add_option("--pad-tables", dest="pad_tables", action="store_true",
                 default=config.PAD_TABLES,
                 help="pad the cells to equal column width in tables")
    p.add_option("--no-wrap-links", dest="wrap_links", action="store_false",
                 default=config.WRAP_LINKS,
                 help="wrap links during conversion")
    p.add_option("--ignore-emphasis", dest="ignore_emphasis",
                 action="store_true", default=config.IGNORE_EMPHASIS,
                 help="don't include any formatting for emphasis")
    p.add_option("--reference-links", dest="inline_links",
                 action="store_false", default=config.INLINE_LINKS,
                 help="use reference style links instead of inline links")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS,
                 help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help=("protect links from line breaks surrounding them " +
                       "with angle brackets"))
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES,
                 help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT,
                 help="Discard image data, only keep alt text")
    p.add_option(
        "--images-with-size", dest="images_with_size", action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False,
                 help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true",
                 dest="ul_style_dash", default=False,
                 help="use a dash rather than a star for unordered list items")
    p.add_option(
        "-e", "--asterisk-emphasis", action="store_true",
        dest="em_style_asterisk", default=False,
        help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store",
                 type="int", default=config.BODY_WIDTH,
                 help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent",
                 action="store", type="int", default=config.GOOGLE_LIST_INDENT,
                 help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true",
                 dest="hide_strikethrough", default=False,
                 help="hide strike-through text. only relevant when -g is "
                      "specified as well")
    p.add_option(
        "--escape-all", action="store_true", dest="escape_snob", default=False,
        help="Escape all special characters. Output is less readable, but "
             "avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES,
                 help="Format tables in HTML rather than Markdown syntax.")
    p.add_option(
        "--single-line-break", action="store_true", dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=("Use a single line break after a block element rather than two "
              "line breaks. NOTE: Requires --body-width=0"))
    p.add_option("--unicode-snob", action="store_true", dest="unicode_snob",
                 default=config.UNICODE_SNOB,
                 help="Use unicode throughout document")
    p.add_option("--no-automatic-links", action="store_false",
                 dest="use_automatic_links", default=config.USE_AUTOMATIC_LINKS,
                 help="Do not use automatic links wherever applicable")
    p.add_option("--no-skip-internal-links", action="store_false",
                 dest="skip_internal_links", default=config.SKIP_INTERNAL_LINKS,
                 help="Do not skip internal links")
    p.add_option("--links-after-para", action="store_true",
                 dest="links_each_paragraph",
                 default=config.LINKS_EACH_PARAGRAPH,
                 help="Put links after each paragraph instead of document")
    p.add_option("--mark-code", action="store_true", dest="mark_code",
                 default=config.MARK_CODE,
                 help="Mark program code blocks with [code]...[/code]")
    p.add_option(
        "--decode-errors", dest="decode_errors", action="store", type="string",
        default=config.DECODE_ERRORS,
        help=
        "What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
    )
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]
        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn(
                "Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
                DeprecationWarning)
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): encoding defaults to "utf-8" above and is never
            # None, so this charset-detection branch looks unreachable — confirm.
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            # NOTE(review): same as above — encoding cannot be None here.
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # Read from stdin.
        data = wrap_read()

    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError as err:
            # NOTE(review): no space before 'flag.' — the message renders as
            # "...ignoreflag." — confirm and fix upstream.
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + 'flag.'
            print(warning)
            raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options — copy every parsed flag onto the converter instance.
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links
    h.pad_tables = options.pad_tables

    wrapwrite(h.handle(data))
# coding:utf-8
from html2text import HTML2Text
import re
import json
import os
from collections import OrderedDict
from maya import OpenMaya
from maya import OpenMayaAnim
from maya import OpenMayaFX
from maya import OpenMayaMPx
from maya import OpenMayaRender
from maya import OpenMayaUI

# Shared HTML -> Markdown converter for scraping the Maya C++ API docs.
parser = HTML2Text()
parser.wrap_links = False
parser.skip_internal_links = True
parser.inline_links = True
# FIX: HTML2Text exposes `ignore_links` and `ignore_tables`; the previous
# names `ignore_anchors` / `ignore_table` are not converter attributes, so
# those two settings were silently ignored.
parser.ignore_links = True
parser.ignore_images = True
parser.ignore_emphasis = True
parser.ignore_tables = True

# ! ----------------------------------------
# Source (offline Maya developer help) and destination directories.
DIR = os.path.dirname(__file__)
WEB = os.path.join(
    r"D:\Users\Administrator\Desktop\MayaDoc\maya-2019-developer-help_enu_offline",
    "cpp_ref")
FOLDER = os.path.join(DIR, "cpp_ref")
def POST(self):
    """Import a web page (from an uploaded file or a URL) and render a preview.

    Extracts plain text, a Markdown conversion, and the page's images, links,
    CSS and script addresses, then renders the handler's template.  Any
    failure is caught and rendered as an error page instead of propagating.
    """
    try:
        # `upload` is a file-upload object when a file was posted; renamed
        # from `file` to avoid shadowing the builtin.
        upload = xutils.get_argument("file", {})
        address = xutils.get_argument("url", "")
        name = xutils.get_argument("name", "")
        filename = ""
        if hasattr(upload, "filename"):
            filename = upload.filename
        plain_text = ""
        if not isempty(address):
            html = readhttp(address)
        else:
            # Read the uploaded file; join chunks once instead of the
            # quadratic `html += chunk` accumulation.
            html = "".join(chunk.decode("utf-8") for chunk in upload.file)
            print("Read html, filename={}, length={}".format(
                filename, len(html)))

        soup = BeautifulSoup(html, "html.parser")
        # Drop script/style elements so they don't pollute the plain text.
        for element in soup.find_all(["script", "style"]):
            element.extract()
        plain_text = clean_whitespace(soup.get_text(separator=" "))

        images = soup.find_all("img")
        links = soup.find_all("a")
        csses = soup.find_all("link")
        # NOTE(review): scripts were extracted above, so this is likely
        # always empty — confirm intent.
        scripts = soup.find_all("script")
        title = get_html_title(soup)

        h = HTML2Text(baseurl=address)
        text = "From %s\n\n" % address + h.handle(html)
        texts = [text]
        images = get_addr_list(images)
        scripts = get_addr_list(scripts)

        # FIX: `name != "" and name != None` simplified; `if name:` is
        # equivalent for a str-or-None value and uses `is not None` semantics.
        if name:
            save_to_archive_dir(name)

        return xtemplate.render(self.template_path,
                                show_aside=False,
                                images=images,
                                links=links,
                                csses=csses,
                                scripts=scripts,
                                texts=texts,
                                address=address,
                                url=address,
                                article_title=title,
                                plain_text=plain_text)
    except Exception as e:
        xutils.print_stacktrace()
        return xtemplate.render(self.template_path,
                                show_aside=False,
                                error=str(e))
def html2md(text, width=0):
    """Convert an HTML string to Markdown.

    :param text: HTML source to convert
    :param width: output line width; 0 disables wrapping
    :return: the Markdown rendering of *text*
    """
    converter = HTML2Text()
    converter.body_width = width
    return converter.handle(text)
def get_markdown_content(self):
    """Return this object's HTML content converted to Markdown.

    Uses single line breaks between blocks and no hard wrapping.
    """
    converter = HTML2Text()
    converter.single_line_break = True
    converter.body_width = 0  # 0 => do not wrap output lines
    markup = str(self.get_text())
    return converter.handle(markup)
def __init__(self):
    """Build the API service client and set up an HTML-to-text cleaner."""
    service = build(SERVICE, VERSION, developerKey=API_KEY)
    # Keep the translations() resource for later translate calls.
    self.translator = service.translations()
    self.cleaner = HTML2Text()
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0',
}

# Retry/timeout settings for page fetches.
MAX_TRY_AGAIN_TIME = 3
TIMEOUT = 3

# url is the page address; detail is a dict of {content field: extraction rule, ...}
UrlDetail = namedtuple('UrlDetail', ['url', 'detail'])

LOGGER = MyLogger(__file__)
# Used to deduplicate already-seen URLs.
BloomFilter = MyBloomFilter()

# Shared HTML -> plain-text converter: strip links/images/tables, single line breaks.
TextMaker = HTML2Text()
TextMaker.ignore_links = True
TextMaker.ignore_images = True
TextMaker.ignore_tables = True
TextMaker.single_line_break = True


class Item:
    """Task unit for a category page."""
    __slots__ = ['url', 'detail', 'is_direct', 'is_json']

    def __init__(self, url, detail, is_direct=False, is_json=False):
        self.url = url
        self.detail = detail
        self.is_direct = is_direct  # whether the category links directly to content
        # self.is_json = is_json  # whether the content is in JSON format
def __init__(self, source):
    """Store the HTML source and create the HTML-to-Markdown converter."""
    self.source = source
    self.html_handle = HTML2Text()