if img_src.startswith('files/'): img_src = img_src[len('files/'):] else: print('invalid image: {}'.format(path)) img_alt = img.get('alt') img.drop_tree() else: img_src = None img_alt = None # make absolute links relative doc.rewrite_links(make_relative) body_value = lxml.html.tostring(doc, encoding='unicode') body_value = HTML2Text(bodywidth=0).handle(body_value) body_value = re.sub('\n\n$', '\n', body_value, flags=re.M) if Path(path).exists(): path += '-DUPLICATE.md' with open(path, 'w') as f: # using yaml.dump only for single fields because it does not maintain order f.write('---\n') f.write(yaml.dump(dict(title=title), allow_unicode=True)) f.write('date: %s\n' % date) if author: f.write('author: %s\n' % author) if tags != 'NULL': f.write('tags: [%s]\n' % tags) if img_src:
def _convert_html_markdown(self, title, text):
    """Render *title* (as an <h1>) followed by *text* through HTML2Text.

    Returns the accumulated plain-text/Markdown output. Uses the
    incremental feed()/close() parser API rather than handle().
    """
    parser = HTML2Text(None, "")
    parser.feed(f"<h1>{title}</h1>")
    parser.feed(text)
    return parser.close()
def main():
    """CLI entry point: parse options, read HTML from a file, URL or stdin,
    and write the Markdown conversion via wrapwrite().

    NOTE(review): this uses Python 2 APIs (`urllib.urlopen`); presumably a
    legacy html2text fork — confirm target interpreter.
    """
    baseurl = ''
    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + __version__)
    p.add_option("--ignore-emphasis", dest="ignore_emphasis",
                 action="store_true", default=config.IGNORE_EMPHASIS,
                 help="don't include any formatting for emphasis")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS,
                 help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help=("protect links from line breaks surrounding them " +
                       "with angle brackets"))
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES,
                 help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT,
                 help="Discard image data, only keep alt text")
    p.add_option(
        "--images-with-size", dest="images_with_size", action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False,
                 help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true",
                 dest="ul_style_dash", default=False,
                 help="use a dash rather than a star for unordered list items")
    p.add_option(
        "-e", "--asterisk-emphasis", action="store_true",
        dest="em_style_asterisk", default=False,
        help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store",
                 type="int", default=config.BODY_WIDTH,
                 help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent",
                 action="store", type="int",
                 default=config.GOOGLE_LIST_INDENT,
                 help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true",
                 dest="hide_strikethrough", default=False,
                 help="hide strike-through text. only relevant when -g is "
                      "specified as well")
    p.add_option(
        "--escape-all", action="store_true", dest="escape_snob",
        default=False,
        help="Escape all special characters. Output is less readable, but "
             "avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES,
                 help="Format tables in HTML rather than Markdown syntax.")
    p.add_option(
        "--single-line-break", action="store_true", dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=("Use a single line break after a block element rather than two "
              "line breaks. NOTE: Requires --body-width=0"))
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding is initialised to "utf-8" here, so the
    # `if encoding is None` auto-detection branches below are unreachable —
    # confirm whether the default was meant to be None.
    encoding = "utf-8"
    if len(args) > 0 and args[0] != '-':
        file_ = args[0]
        if len(args) == 2:
            encoding = args[1]
        if len(args) > 2:
            p.error('Too many arguments')

        if file_.startswith('http://') or file_.startswith('https://'):
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            if encoding is None:
                # Best effort charset sniffing; falls back to utf-8 when
                # feedparser's private helper is unavailable.
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            if encoding is None:
                # chardet-based detection when reading local files.
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    if hasattr(data, 'decode'):
        data = data.decode(encoding)

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    # NOTE(review): html2text's attribute is usually `google_list_indent`
    # (cf. the argparse-based main elsewhere in this file); verify this
    # version actually honors `list_indent`, otherwise -i is a silent no-op.
    h.list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break

    wrapwrite(h.handle(data))
# Wikimedia Commons API query: up to 500 files in a category plus image info.
COMMONS_CAT_TEMPLATE = u"https://commons.wikimedia.org/w/api.php?action=query&\
generator=categorymembers&iiurlwidth={0}&gcmtitle=\
Category:{1}&gcmlimit=500&gcmtype=file&prop=imageinfo&\
iiprop=url|timestamp|user|mime|extmetadata&format=json"
HEADERS = {'user-agent': 'Science Stories API ([email protected])'}

# Whitelist-based sanitizer for HTML snippets returned by the wiki APIs.
sanitizer = Sanitizer({
    'tags': {'a', 'b', 'br', 'i', 'img', 'p', 'span'},
    'attributes': {
        # BUG FIX: the original had 'a': ('href') — parentheses without a
        # trailing comma are just grouping, so the value was the *string*
        # 'href' and attribute checks degraded to substring matching.
        # It must be a tuple of attribute names, like the 'img' entry.
        'a': ('href',),
        'img': ('src', 'alt')
    },
    'empty': {'br'},
    'separate': {'a', 'p'}
})

# Shared converter used to reduce sanitized HTML to plain text.
html_converter = HTML2Text()
html_converter.ignore_links = True


def main():
    """Call Main Function."""
    return iiif_cat_manifest(CATEGORY_STRING)


def safe_str(obj):
    """Return unicode encoding."""
    # NOTE(review): `unicode` only exists on Python 2 — confirm interpreter.
    return unicode(obj)


def sanitise(html):
    """Sanitize html."""
    # NOTE(review): implementation appears truncated in this chunk;
    # presumably delegates to sanitizer.sanitize(html) — confirm upstream.
def process_field(field, item, FEED, channel):
    """Resolve one configured output *field* against a feed *item*.

    The field spec is dispatched on its surface syntax:
      "literal"   -> returned verbatim from config
      *x*/_x_/<x> -> item value wrapped in that markup
      ```x```     -> item value as a fenced code block
      `x`         -> item value as inline code
      @x          -> comma-separated tags mapped to Discord role mentions
      [d]x.k      -> item[x] is a list of dicts; join each entry's k with d
      plain       -> item value, HTML converted to Markdown
    Returns '' (empty string) when the named field is missing.
    """
    logger.debug("%s:process_field:%s: started", FEED, field)
    item_url_base = FEED.get('item_url_base', None)
    # 'guid' gets special handling: prefix it with the configured URL base.
    if field == 'guid' and item_url_base is not None:
        if 'guid' in item:
            return item_url_base + item['guid']
        else:
            logger.error(
                'process_field:guid:no such field; try show_sample_entry.py on feed'
            )
            return ''

    logger.debug("%s:process_field:%s: checking regexes", FEED, field)
    stringmatch = re.match('^"(.+?)"$', field)
    highlightmatch = re.match('^([*_~<]+)(.+?)([*_~>]+)$', field)
    bigcodematch = re.match('^```(.+)```$', field)
    codematch = re.match('^`(.+)`$', field)
    tagmatch = re.match('^@(.+)$', field)  # new tag regex
    # NOTE(review): non-raw pattern — \[ \] \. are invalid escape sequences
    # and raise DeprecationWarning on Python 3.6+; should be a raw string.
    dictmatch = re.match('^\[(.+)\](.+)\.(.+)$', field)  # new dict regex
    if stringmatch is not None:
        # Return an actual string literal from config:
        logger.debug("%s:process_field:%s:isString", FEED, field)
        return stringmatch.group(1)  # string from config
    elif highlightmatch is not None:
        logger.debug("%s:process_field:%s:isHighlight", FEED, field)
        # If there's any markdown on the field, return field with that
        # markup on it:
        begin, field, end = highlightmatch.groups()
        if field in item:
            if field == "link":
                # NOTE(review): key "feed-url" here vs 'feed_url' in the
                # plain branch below — one of them is probably wrong.
                url = urljoin(FEED.get("feed-url"), item[field])
                return begin + url + end
            else:
                return begin + item[field] + end
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif bigcodematch is not None:
        logger.debug("%s:process_field:%s:isCodeBlock", FEED, field)
        # Code blocks are a bit different, with a newline and stuff:
        field = bigcodematch.group(1)
        if field in item:
            return "```\n%s\n```" % (item[field])
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif codematch is not None:
        logger.debug("%s:process_field:%s:isCode", FEED, field)
        # Since code chunk can't have other highlights, also do them
        # separately:
        field = codematch.group(1)
        if field in item:
            return "`%s`" % (item[field])
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif tagmatch is not None:
        logger.debug("%s:process_field:%s:isTag", FEED, field)
        field = tagmatch.group(1)
        if field in item:
            # Assuming tags are ', ' separated
            taglist = item[field].split(', ')
            # Iterate through channel roles, see if a role is mentionable and
            # then substitute the role for its id
            for role in client.get_channel(channel['id']).server.roles:
                rn = str(role.name)
                taglist = [
                    "<@&%s>" % (role.id) if rn == str(i) else i
                    for i in taglist
                ]
            return ", ".join(taglist)
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    elif dictmatch is not None:
        logger.debug("%s:process_field:%s:isDict", FEED, field)
        delim = dictmatch.group(1)
        field = dictmatch.group(2)
        dictkey = dictmatch.group(3)
        if field in item:
            return delim.join([x[dictkey] for x in item[field]])
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
    else:
        logger.debug("%s:process_field:%s:isPlain", FEED, field)
        # Just asking for plain field:
        if field in item:
            # If field is special field "link",
            # then use urljoin to turn relative URLs into absolute URLs
            if field == 'link':
                return urljoin(FEED.get('feed_url'), item[field])
            # Else assume it's a "summary" or "content" or whatever field
            # and turn HTML into markdown and don't add any markup:
            else:
                htmlfixer = HTML2Text()
                logger.debug(htmlfixer)
                htmlfixer.ignore_links = True
                htmlfixer.ignore_images = True
                htmlfixer.ignore_emphasis = False
                htmlfixer.body_width = 1000
                htmlfixer.unicode_snob = True
                htmlfixer.ul_item_mark = '-'  # Default of "*" likely
                # to bold things, etc...
                markdownfield = htmlfixer.handle(item[field])
                # Try to strip any remaining HTML out. Not "safe", but
                # simple and should catch most stuff:
                markdownfield = re.sub('<[^<]+?>', '', markdownfield)
                return markdownfield
        else:
            logger.error("process_field:%s:no such field", field)
            return ""
import json
import os
from textwrap import TextWrapper

from html2text import HTML2Text

# Shared converter instance (left at its default configuration).
html_to_text = HTML2Text()

# Read every file under input/ so its text is available for processing.
# FIX: the original rebound one name (`file`) to three different things —
# the bare filename, the prefixed path, and the open file object — and
# applied a no-op ''.join() to the already-read string; both cleaned up.
for entry in os.listdir('input'):
    print(entry)  # progress output: which input file is being read
    path = 'input/' + entry
    with open(path, "r+") as handle:
        lines = handle.read()
def strip_tags(html):
    """Reduce *html* to whitespace-trimmed plain text; link markup is dropped."""
    converter = HTML2Text()
    converter.ignore_links = True
    return converter.handle(html).strip()
def _html_to_markdown(self, s):
    """Convert an HTML snippet to Markdown, rendering colored text as bold."""
    # <font color=...> carries emphasis in the source HTML; rewrite it to
    # <strong> so html2text emits **bold** for it.
    emphasized = re.sub(r'<font color=[^> ]+>(.+?)</font>',
                        r'<strong>\1</strong>', s)
    converter = HTML2Text(bodywidth=0)  # bodywidth=0: no line wrapping
    return converter.handle(emphasized).strip()
#!/usr/bin/env python3 from io import BytesIO from warcio.warcwriter import WARCWriter from html2text import HTML2Text from libzim.reader import File as ZIMFile from urllib.parse import quote handler = HTML2Text() handler.ignore_links = True handler.images_to_alt = True html2text = handler.handle with open('example.warc.wet.gz', 'wb') as output: writer = WARCWriter(output, gzip=True) with ZIMFile("data/wikipedia_en_simple_all_nopic_2020-12.zim") as reader: for uid in range(0, reader.article_count): if uid % 10_000 == 0: print("{} out of {}".format(uid, reader.article_count)) article = reader.get_article_by_id(uid) try: if article.mimetype != "text/html": continue except RuntimeError: continue if article.is_redirect: continue url = 'https://simple.wikipedia.org/wiki/{}'.format(quote(article.url))
# Email helper: renders Markdown bodies to HTML and builds flask_mail
# messages using app-wide sender/recipient defaults.
from run import app, mail, config, markdown
from flask_mail import Message
from html2text import HTML2Text

# Converter reserved for producing plain-text alternatives (links dropped).
text_maker = HTML2Text()
text_maker.ignore_links = True

default_sender = tuple(config['email']['default_sender'])
default_recipient = tuple(config['email']['default_recipient'])

# Markdown renderer without math extensions, bound to the Flask app.
from flaskext.markdown import Markdown
markdown_no_math = Markdown(app, extensions=['extra'])

import textwrap


def send_email(body_md, sender=None, recipients=None):
    """Build an HTML email from a Markdown body whose first line is
    'Subject: ...'; sender/recipients default to the configured values.

    NOTE(review): the visible code constructs `msg` but never calls
    mail.send — presumably the send happens past this chunk; confirm.
    """
    body_md = body_md.strip()
    # NOTE(review): assert is stripped under `python -O`; an explicit
    # ValueError would be safer for input validation.
    assert body_md.startswith('Subject: ')
    # First line is "Subject: <text>"; strip the prefix, keep the rest
    # of the document as the body.
    subject, _, body_md = body_md.partition('\n')
    _, _, subject = subject.partition(' ')
    if not sender:
        sender = default_sender
    if not recipients:
        recipients = [default_recipient]
    msg = Message(subject)
    msg.recipients = recipients
    msg.sender = sender
    # fill() keeps rendered lines under ~1000 chars (SMTP line-length limit).
    msg.html = textwrap.fill(markdown_no_math(body_md), 990)
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

from html2text import HTML2Text
from jinja2 import Template

# Converter intended for generating the plain-text alternative part.
html_parser = HTML2Text()


def build_from_template(source: str, jinja_env: dict = None) -> MIMEMultipart:
    """
    Build an email from a Jinja template and create a MIMEMultipart containing
    the resulting HTML and alternative text.

    :param source: Path to the template file.
    :param jinja_env: Jinja context.
    :return: A MIMEMultipart containing the rendered message.
    """
    jinja_env = jinja_env or {}
    # 'alternative' lets clients pick between the HTML and text parts.
    multipart = MIMEMultipart('alternative')
    with open(source, 'r') as f:
        html = f.read()
    template = Template(html)
    html = template.render(**jinja_env)
    html_mime = MIMEText(html, 'html')
    multipart.attach(html_mime)
    # generate alt text from HTML
    # NOTE(review): function appears truncated in this chunk — the alt-text
    # generation (presumably via html_parser.handle) and the `return
    # multipart` promised by the annotation are not visible.
'''
Github:https://github.com/kennethreitz/requests-html
'''
# Demo script: fetch python.org with requests-html, inspect links and an
# element, then convert one element's HTML to Markdown.
from html2text import HTML2Text
from requests_html import HTML
import requests_html

session = requests_html.Session()
r = session.get('https://python.org/')
# .links: raw hrefs as found; .absolute_links: resolved against the page URL
res = r.html.links
res2 = r.html.absolute_links
print(res)
print(res2)

# CSS-select the "#about" element; first=True returns one Element, not a list
about = r.html.find('#about', first = True)
print(about.text, about.attrs)
# print(str(about.html))

# Parse a standalone snippet (no network) and list its links.
doc = """<a href='https://httpbin.org'>"""
html1 = HTML(html=doc, url='fakeurl', default_encoding='utf-8')
print(html1.links)

# Convert the selected element's HTML to Markdown.
h = HTML2Text()
print(h.handle(about.html))
def html2text(html):
    """Return the plain-text rendering of *html* with links and images
    stripped from the output."""
    converter = HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(to_unicode(html))
from html.parser import HTMLParser
from io import StringIO
from pprint import pprint

from html2text import HTML2Text
from markdown import Markdown
import frontmatter
from frontmatter.default_handlers import YAMLHandler

# HTML to MARKDOWN
# h2m
from waltz.tools import yaml

# Module-wide HTML -> Markdown converter.
html_to_markdown = HTML2Text()
html_to_markdown.single_line_break = False
html_to_markdown.skip_internal_links = False
# NOTE(review): _skip_a_class_check and _class_stack are not standard
# HTML2Text attributes — presumably consumed by a local patch/subclass of
# html2text elsewhere in the project; confirm.
html_to_markdown._skip_a_class_check = False
html_to_markdown._class_stack = []

# CSS class marking hidden Waltz metadata blocks in exported HTML.
WALTZ_METADATA_CLASS = "-waltz-metadata-hidden"


class ExtractWaltzMetadata(HTMLParser):
    """HTML parser that collects Waltz metadata from a document.

    NOTE(review): the rest of the class (handle_* methods) is not visible
    in this chunk.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        # True while the parser is inside a metadata-marked element.
        self.inside_metadata = False
        self.strict = False
        self.convert_charrefs = True
        # Accumulated metadata fragments.
        self.data = []
def POST(self):
    """Capture a web page (from an uploaded file or a URL), extract its
    text, resource lists and a Markdown rendering, optionally archive the
    Markdown to disk, and render the result template.

    Any exception is caught and rendered as an error page.
    """
    try:
        file = xutils.get_argument("file", {})
        address = xutils.get_argument("url", "")
        name = xutils.get_argument("name", "")
        filename = ""
        if hasattr(file, "filename"):
            filename = file.filename
        plain_text = ""
        if not isempty(address):
            # URL takes precedence over an uploaded file.
            html = readhttp(address)
        else:
            # read the uploaded file
            html = ""
            for chunk in file.file:
                html += chunk.decode("utf-8")

        print("Read html, filename={}, length={}".format(
            filename, len(html)))

        # Drop script/style contents so get_text() yields readable text.
        soup = BeautifulSoup(html, "html.parser")
        element_list = soup.find_all(["script", "style"])
        for element in element_list:
            element.extract()
        plain_text = soup.get_text(separator=" ")
        plain_text = clean_whitespace(plain_text)

        images = soup.find_all("img")
        links = soup.find_all("a")
        csses = soup.find_all("link")
        scripts = soup.find_all("script")
        # texts = soup.find_all(["p", "span", "div", "h1", "h2", "h3", "h4"])
        # Markdown rendering, with relative links resolved against the URL.
        h = HTML2Text(baseurl=address)
        text = "From %s\n\n" % address + h.handle(html)
        texts = [text]
        images = get_addr_list(images)
        scripts = get_addr_list(scripts)

        if name != "" and name != None:
            # Archive the Markdown under DATA_DIR/archive/YYYY/mm/dd/.
            dirname = os.path.join(xconfig.DATA_DIR,
                                   time.strftime("archive/%Y/%m/%d"))
            xutils.makedirs(dirname)
            path = os.path.join(
                dirname, "%s_%s.md" % (name, time.strftime("%H%M%S")))
            xutils.savetofile(path, text)
            print("save file %s" % path)
        # NOTE(review): dead branch (disabled note-creation); it references
        # an undefined name `content` and would NameError if re-enabled.
        if False:
            user_name = xauth.get_current_name()
            xutils.call("note.create",
                        name=name,
                        content=content,
                        type="md",
                        tags=["来自网络"],
                        creator=user_name)
        return xtemplate.render(self.template_path,
                                show_aside=False,
                                images=images,
                                links=links,
                                csses=csses,
                                scripts=scripts,
                                texts=texts,
                                address=address,
                                url=address,
                                plain_text=plain_text)
    except Exception as e:
        xutils.print_stacktrace()
        return xtemplate.render(self.template_path,
                                show_aside=False,
                                error=str(e))
def html2md(title, text):
    """Convert *text* to plain text, prefixed by *title* as an <h1>.

    Uses the incremental feed()/close() parser API; close() returns the
    accumulated output.
    """
    converter = HTML2Text(None, "")
    converter.feed(f"<h1>{title}</h1>")
    converter.feed(text)
    return converter.close()
def main():
    """CLI entry point for html2text: parse arguments, read HTML from a file
    or stdin, and write the Markdown conversion to stdout."""
    baseurl = ""

    # ANSI escape codes for the decode-error warning below.
    class bcolors:
        HEADER = "\033[95m"
        OKBLUE = "\033[94m"
        OKGREEN = "\033[92m"
        WARNING = "\033[93m"
        FAIL = "\033[91m"
        ENDC = "\033[0m"
        BOLD = "\033[1m"
        UNDERLINE = "\033[4m"

    p = argparse.ArgumentParser()
    p.add_argument(
        "--default-image-alt",
        dest="default_image_alt",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones",
    )
    p.add_argument(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables",
    )
    p.add_argument(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="don't wrap links during conversion",
    )
    p.add_argument(
        "--wrap-list-items",
        dest="wrap_list_items",
        action="store_true",
        default=config.WRAP_LIST_ITEMS,
        help="wrap list items during conversion",
    )
    p.add_argument(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis",
    )
    p.add_argument(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links",
    )
    p.add_argument(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links",
    )
    p.add_argument(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help="protect links from line breaks surrounding them with angle brackets",
    )
    p.add_argument(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images",
    )
    p.add_argument(
        "--images-as-html",
        dest="images_as_html",
        action="store_true",
        default=config.IMAGES_AS_HTML,
        help=(
            "Always write image tags as raw html; preserves `height`, `width` and "
            "`alt` if possible."
        ),
    )
    p.add_argument(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text",
    )
    p.add_argument(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help=(
            "Write image tags with height and width attrs as raw html to retain "
            "dimensions"
        ),
    )
    p.add_argument(
        "-g",
        "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document",
    )
    p.add_argument(
        "-d",
        "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items",
    )
    p.add_argument(
        "-e",
        "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text",
    )
    p.add_argument(
        "-b",
        "--body-width",
        dest="body_width",
        type=int,
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap",
    )
    p.add_argument(
        "-i",
        "--google-list-indent",
        dest="list_indent",
        type=int,
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists",
    )
    p.add_argument(
        "-s",
        "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is "
        "specified as well",
    )
    p.add_argument(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help=(
            "Escape all special characters. Output is less readable, but avoids "
            "corner case formatting issues."
        ),
    )
    p.add_argument(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax.",
    )
    p.add_argument(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) "
        "while keeping rows.",
    )
    p.add_argument(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two line "
            "breaks. NOTE: Requires --body-width=0"
        ),
    )
    p.add_argument(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document",
    )
    p.add_argument(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable",
    )
    p.add_argument(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links",
    )
    p.add_argument(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document",
    )
    p.add_argument(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]",
    )
    p.add_argument(
        "--decode-errors",
        dest="decode_errors",
        default=config.DECODE_ERRORS,
        help=(
            "What to do in case of decode errors.'ignore', 'strict' and 'replace' are "
            "acceptable values"
        ),
    )
    p.add_argument(
        "--open-quote",
        dest="open_quote",
        default=config.OPEN_QUOTE,
        help="The character used to open quotes",
    )
    p.add_argument(
        "--close-quote",
        dest="close_quote",
        default=config.CLOSE_QUOTE,
        help="The character used to close quotes",
    )
    p.add_argument(
        "--backquote-code",
        action="store_true",
        dest="backquote_code",
        default=config.BACKQUOTE_CODE,
        help="Wrap program code blocks with ```...```",
    )
    p.add_argument(
        "--version", action="version", version=".".join(map(str, __version__))
    )
    p.add_argument("filename", nargs="?")
    p.add_argument("encoding", nargs="?", default="utf-8")
    args = p.parse_args()

    # Read raw bytes from the named file, or stdin when absent or "-".
    if args.filename and args.filename != "-":
        with open(args.filename, "rb") as fp:
            data = fp.read()
    else:
        data = sys.stdin.buffer.read()

    try:
        data = data.decode(args.encoding, args.decode_errors)
    except UnicodeDecodeError as err:
        # Point the user at --decode-errors before re-raising.
        warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
        warning += " Use the " + bcolors.OKGREEN
        warning += "--decode-errors=ignore" + bcolors.ENDC + " flag."
        print(warning)
        raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if args.ul_style_dash:
        h.ul_item_mark = "-"
    if args.em_style_asterisk:
        h.emphasis_mark = "*"
        h.strong_mark = "__"

    # Copy every CLI option onto the converter instance.
    h.body_width = args.body_width
    h.google_list_indent = args.list_indent
    h.ignore_emphasis = args.ignore_emphasis
    h.ignore_links = args.ignore_links
    h.protect_links = args.protect_links
    h.ignore_images = args.ignore_images
    h.images_as_html = args.images_as_html
    h.images_to_alt = args.images_to_alt
    h.images_with_size = args.images_with_size
    h.google_doc = args.google_doc
    h.hide_strikethrough = args.hide_strikethrough
    h.escape_snob = args.escape_snob
    h.bypass_tables = args.bypass_tables
    h.ignore_tables = args.ignore_tables
    h.single_line_break = args.single_line_break
    h.inline_links = args.inline_links
    h.unicode_snob = args.unicode_snob
    h.use_automatic_links = args.use_automatic_links
    h.skip_internal_links = args.skip_internal_links
    h.links_each_paragraph = args.links_each_paragraph
    h.mark_code = args.mark_code
    h.wrap_links = args.wrap_links
    h.wrap_list_items = args.wrap_list_items
    h.pad_tables = args.pad_tables
    h.default_image_alt = args.default_image_alt
    h.open_quote = args.open_quote
    h.close_quote = args.close_quote
    h.backquote_code = args.backquote_code

    sys.stdout.write(h.handle(data))
def process_field(field, item, feed, channel):
    """
    This looks at the field from the config, and returns the processed string
    naked item in fields: return that field from the feed item
    *, **, _, ~, `, ```: markup the field and return it from the feed item
    " around the field: string literal
    Added new @, turns each comma separated tag into a group mention

    Returns "" (empty string) whenever the named field is missing.
    """
    logger.debug(f"{feed}:process_field:{field}: started")
    item_url_base = feed.get('item_url_base', None)
    # 'guid' gets special handling: prefix it with the configured URL base.
    if field == 'guid' and item_url_base is not None:
        if 'guid' in item:
            return item_url_base + item['guid']
        else:
            logger.error(
                'process_field:guid:no such field; try show_sample_entry.py on feed'
            )
            return ""

    logger.debug(f"{feed}:process_field:{field}: checking regexes")
    stringmatch = re.match('^"(.+?)"$', field)
    highlightmatch = re.match('^([*_~<]+)(.+?)([*_~>]+)$', field)
    bigcodematch = re.match('^```(.+)```$', field)
    codematch = re.match('^`(.+)`$', field)
    tagmatch = re.match('^@(.+)$', field)  # new tag regex
    if stringmatch is not None:
        # Return an actual string literal from config:
        logger.debug(f"{feed}:process_field:{field}:isString")
        return stringmatch.group(1)  # string from config
    elif highlightmatch is not None:
        logger.debug(f"{feed}:process_field:{field}:isHighlight")
        # If there's any markdown on the field, return field with that
        # markup on it:
        begin, field, end = highlightmatch.groups()
        if field in item:
            if field == "link":
                # NOTE(review): key "feed-url" here vs 'feed_url' in the
                # plain branch below — one of them is probably wrong.
                url = urljoin(feed.get("feed-url"), item[field])
                return begin + url + end
            else:
                return begin + item[field] + end
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    elif bigcodematch is not None:
        logger.debug(f"{feed}:process_field:{field}:isCodeBlock")
        # Code blocks are a bit different, with a newline and stuff:
        field = bigcodematch.group(1)
        if field in item:
            # BUG FIX: the original returned the *literal* string
            # "```\n{item[field]}\n```" because the f prefix was missing
            # (the %-formatting variant of this function interpolates
            # the value here).
            return f"```\n{item[field]}\n```"
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    elif codematch is not None:
        logger.debug(f"{feed}:process_field:{field}:isCode")
        # Since code chunk can't have other highlights, also do them
        # separately:
        field = codematch.group(1)
        if field in item:
            return f"`{item[field]}`"
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    elif tagmatch is not None:
        logger.debug(f"{feed}:process_field:{field}:isTag")
        field = tagmatch.group(1)
        if field in item:
            # Assuming tags are ', ' separated
            taglist = item[field].split(', ')
            # Iterate through channel roles, see if a role is mentionable and
            # then substitute the role for its id
            for role in client.get_channel(channel['id']).server.roles:
                rn = str(role.name)
                taglist = [
                    f"<@&{role.id}>" if rn == str(i) else i
                    for i in taglist
                ]
            return ", ".join(taglist)
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
    else:
        logger.debug(f"{feed}:process_field:{field}:isPlain")
        # Just asking for plain field:
        if field in item:
            # If field is special field "link",
            # then use urljoin to turn relative URLs into absolute URLs
            if field == 'link':
                return urljoin(feed.get('feed_url'), item[field])
            # Else assume it's a "summary" or "content" or whatever field
            # and turn HTML into markdown and don't add any markup:
            else:
                htmlfixer = HTML2Text()
                logger.debug(htmlfixer)
                htmlfixer.ignore_links = True
                htmlfixer.ignore_images = True
                htmlfixer.ignore_emphasis = False
                htmlfixer.body_width = 1000
                htmlfixer.unicode_snob = True
                htmlfixer.ul_item_mark = '-'  # Default of "*" likely
                # to bold things, etc...
                markdownfield = htmlfixer.handle(item[field])
                # Try to strip any remaining HTML out. Not "safe", but
                # simple and should catch most stuff:
                markdownfield = re.sub('<[^<]+?>', '', markdownfield)
                return markdownfield
        else:
            logger.error(f"process_field:{field}:no such field")
            return ""
import discord, asyncio
from discord.ext import commands

from ace import log
from utils.docs_search import docs_search
from utils.string_manip import welcomify, to_markdown, shorten
from cogs.base import TogglableCogMixin
from html2text import HTML2Text
from bs4 import BeautifulSoup
from datetime import datetime, timezone, timedelta

# Shared converter; body_width=0 disables line wrapping in converted text.
htt = HTML2Text()
htt.body_width = 0

# for verification stuff
# Discord snowflake IDs for the guild channels/roles used below.
GENERAL_ID = 115993023636176902
STAFF_ID = 311784919208558592
MEMBER_ID = 509526426198999040

# Welcome message; {user} is substituted with the new member's mention.
WELCOME_MSG = '''
Welcome to our Discord community {user}!
A collection of useful tips are in <#407666416297443328> and recent \
announcements can be found in <#367301754729267202>.
'''

# for rss
# NOTE(review): the first FORUM_ID is superseded by the reassignment below.
#FORUM_ID = 517692823621861409
FORUM_ID = 536785342959845386

# for roles
ROLES_CHANNEL = 513071256283906068
# Script: export translated AppStream descriptions for the Peek app.
gi.require_version('AppStreamGlib', '1.0')
from gi.repository import AppStreamGlib
from html2text import HTML2Text
from subprocess import call

APP_ID = 'com.uploadedlobster.peek'
APPSTREAM_TMP_FILE = '/tmp/%s.appdata.xml' % APP_ID
DEFAULT_LOCALE = 'C'

# Locales to process come from argv; default to the untranslated 'C' locale.
locales = [DEFAULT_LOCALE]
if len(sys.argv) > 1:
    locales = sys.argv[1:]

# Configure html2text
html2text = HTML2Text()
html2text.body_width = 0
html2text.ignore_links = True
html2text.ignore_images = True
html2text.ul_item_mark = '-'


def format_description(text):
    """Convert an HTML description to compact plain text."""
    # BUG(review): this reads the global name `description` instead of the
    # `text` parameter — `text` is ignored and this raises NameError unless
    # a global `description` happens to exist. Should be
    # html2text.handle(text).
    text = html2text.handle(description).strip()
    # Collapse runs of 3+ blank-ish lines to a single blank line.
    text = re.sub(r"(\s*\n){3,}", "\n\n", text)
    return text


def translate_appstream_template(output_file):
    # NOTE(review): function is truncated in this chunk — the expression
    # below is cut off mid-call.
    cwd = os.path.dirname(os.path.abspath(__file__))
    appstream_template = os.path.join(cwd,
def convert_using_html2text(text):
    """Convert HTML *text* to Markdown without line wrapping.

    unicode_snob keeps unicode characters rather than ASCII substitutes.
    """
    converter = HTML2Text(bodywidth=0)
    converter.unicode_snob = True
    result = converter.handle(text)
    return result
def extract(self,
            year,
            month,
            day,
            update_existing=False,
            max_thumb_size=None,
            use_cached=False):
    """Scrape the NASA APOD page for the given date into a POTD record.

    Returns the saved POTD instance, or None when an existing record is
    kept (update_existing=False) or the page image type is unsupported.
    Raises for non-OK HTTP responses via raise_for_status().
    """
    if max_thumb_size:
        # This source offers no thumbnail size control; option is ignored.
        logger.debug(
            'extractor nasa_apod_en: max_thumb_size not available')
    potd_at = date(year=year, month=month, day=day)
    potd_kwargs = {'potd_at': potd_at, 'source_type': self.source_type}
    # Reuse an existing record when present; bail out unless updating.
    try:
        potd = POTD.objects.get(**potd_kwargs)
        if not update_existing:
            logger.info(
                'extractor nasa_apod_en: potd already existing and not updating: {}'
                .format(potd))
            return
        else:
            logger.debug(
                'extractor nasa_apod_en: using already existing potd: {}'.
                format(potd_kwargs))
    except POTD.DoesNotExist:
        logger.debug(
            'extractor nasa_apod_en: creating new potd for {}'.format(
                potd_kwargs))
        potd = POTD(**potd_kwargs)

    # APOD URLs use two-digit year and non-zero-padded month/day.
    source_potd_url = NASAAPODEnExtractor.APOD_DETAIL_URL_BY_DATE.format(
        yy=str(year)[-2:], mm=int(month), dd=int(day))
    potd.source_url = source_potd_url
    logger.debug('extractor nasa_apod_en: source potd url: {}'.format(
        source_potd_url))

    # Serve from the compressed cached markup when allowed, else re-fetch.
    # NOTE(review): field name says "scaping" (sic) — matches the model.
    if use_cached and potd.raw_scaping_data_binary_compressed:
        response = self._FakeResponse(
            potd.raw_scaping_data_binary_uncompressed)
        logger.debug('extractor nasa_apod_en: using cached markup')
    else:
        response = requests.get(source_potd_url)
        potd.raw_scaping_data_binary_compressed = self._compress(
            response.content)

    if response.ok:
        # Marked up like it's 1995...
        tree = html.fromstring(response.content)
        # The explanation is the <p> containing a <b>Explanation</b> label.
        _explanation = tree.xpath(
            "//b[starts-with(normalize-space(text()),'Explanation')]/parent::p"
        )[0]
        h = HTML2Text()
        h.ignore_links = True
        # Strip the bolded "Explanation:" label from the converted text.
        potd.description = h.handle(
            tostring(_explanation, encoding='unicode')).strip().replace(
                '** Explanation: ** ', '').strip()
        # Title is the first <b> after the first inline image.
        potd.title = tree.xpath(
            "//img[starts-with(@src,'image/')][1]/following::b[1]"
        )[0].text_content().strip()
        _image_filename = tree.xpath(
            "//img[starts-with(@src,'image/')][1]/attribute::src")[0]
        # Skip non-image pages (e.g. video of the day).
        if not _image_filename.lower().endswith(
                settings.ALLOWED_IMAGE_EXTENSIONS):
            logger.info(
                'extractor nasa_apod_en: not a matching potd type: {}'.
                format(_image_filename))
            return
        potd.image_url = NASAAPODEnExtractor.APOD_BASE_URL + _image_filename

        # nice to have's
        try:
            # Copyright line lives in the <center> holding the title.
            center_elem = tree.xpath(
                "//img[starts-with(@src,'image/')][1]/following::b[1]/.."
            )[0]
            center_elem_html = tostring(center_elem,
                                        encoding='unicode').replace(
                                            '\n', ' ')
            # Ugly, but works... # TODO: reimplement this with elem.drop_tree()
            _copyright = NASAAPODEnExtractor.UGLY_TAG_STRIP_RE.sub(
                ' ', center_elem_html)
            # Re-enable links for the copyright text (h is reused from above).
            h.ignore_links = False
            potd.copyright_info = h.handle(_copyright.strip()).replace(
                '\n', ' ')
            # Prefer the hi-res image linked from the inline image, keeping
            # the inline one as thumbnail.
            _image_hires_url = tree.xpath(
                "//img[contains(@src,'image/')][1]/../attribute::href")[0]
            if _image_hires_url.lower().endswith(
                    settings.ALLOWED_IMAGE_EXTENSIONS):
                potd.image_thumbnail_url = potd.image_url
                if _image_hires_url.startswith('http'):
                    potd.image_url = _image_hires_url
                else:
                    potd.image_url = NASAAPODEnExtractor.APOD_BASE_URL + _image_hires_url
        except Exception as e:
            # Best-effort block: extras are optional, failures only logged.
            logger.debug(
                'extractor nasa_apod_en: fail for additional info: {}'.
                format(e))
    else:
        logger.error(
            'extractor nasa_apod_en: http status code {} for url: {}'.
            format(response.status_code, source_potd_url))
        response.raise_for_status()

    potd.retrieved_from_source_at = now()
    potd.save()
    logger.info(
        'extractor nasa_apod_en: potd okay for {}-{}-{} "{}"'.format(
            year, month, day, potd.title))
    return potd
def main():
    """Command-line entry point: read HTML from a file, URL or stdin and
    write the Markdown conversion to stdout.

    Positional args: [(filename|url) [encoding]].  All HTML2Text options are
    exposed as optparse flags and copied onto the converter instance below.
    """
    baseurl = ''

    class bcolors:  # pragma: no cover
        # ANSI escape codes used only for the decode-error warning message.
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
                              version='%prog ' + ".".join(map(str, __version__)))
    p.add_option("--pad-tables", dest="pad_tables", action="store_true",
                 default=config.PAD_TABLES,
                 help="pad the cells to equal column width in tables")
    p.add_option("--no-wrap-links", dest="wrap_links", action="store_false",
                 default=config.WRAP_LINKS,
                 help="wrap links during conversion")
    p.add_option("--ignore-emphasis", dest="ignore_emphasis",
                 action="store_true", default=config.IGNORE_EMPHASIS,
                 help="don't include any formatting for emphasis")
    p.add_option("--reference-links", dest="inline_links",
                 action="store_false", default=config.INLINE_LINKS,
                 help="use reference style links instead of inline links")
    p.add_option("--ignore-links", dest="ignore_links", action="store_true",
                 default=config.IGNORE_ANCHORS,
                 help="don't include any formatting for links")
    p.add_option("--protect-links", dest="protect_links", action="store_true",
                 default=config.PROTECT_LINKS,
                 help=("protect links from line breaks surrounding them " +
                       "with angle brackets"))
    p.add_option("--ignore-images", dest="ignore_images", action="store_true",
                 default=config.IGNORE_IMAGES,
                 help="don't include any formatting for images")
    p.add_option("--images-to-alt", dest="images_to_alt", action="store_true",
                 default=config.IMAGES_TO_ALT,
                 help="Discard image data, only keep alt text")
    p.add_option(
        "--images-with-size", dest="images_with_size", action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions")
    p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
                 default=False,
                 help="convert an html-exported Google Document")
    p.add_option("-d", "--dash-unordered-list", action="store_true",
                 dest="ul_style_dash", default=False,
                 help="use a dash rather than a star for unordered list items")
    p.add_option(
        "-e", "--asterisk-emphasis", action="store_true",
        dest="em_style_asterisk", default=False,
        help="use an asterisk rather than an underscore for emphasized text")
    p.add_option("-b", "--body-width", dest="body_width", action="store",
                 type="int", default=config.BODY_WIDTH,
                 help="number of characters per output line, 0 for no wrap")
    p.add_option("-i", "--google-list-indent", dest="list_indent",
                 action="store", type="int", default=config.GOOGLE_LIST_INDENT,
                 help="number of pixels Google indents nested lists")
    p.add_option("-s", "--hide-strikethrough", action="store_true",
                 dest="hide_strikethrough", default=False,
                 help="hide strike-through text. only relevant when -g is "
                      "specified as well")
    p.add_option(
        "--escape-all", action="store_true", dest="escape_snob", default=False,
        help="Escape all special characters. Output is less readable, but "
             "avoids corner case formatting issues.")
    p.add_option("--bypass-tables", action="store_true", dest="bypass_tables",
                 default=config.BYPASS_TABLES,
                 help="Format tables in HTML rather than Markdown syntax.")
    p.add_option(
        "--single-line-break", action="store_true", dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=("Use a single line break after a block element rather than two "
              "line breaks. NOTE: Requires --body-width=0"))
    p.add_option("--unicode-snob", action="store_true", dest="unicode_snob",
                 default=config.UNICODE_SNOB,
                 help="Use unicode throughout document")
    p.add_option("--no-automatic-links", action="store_false",
                 dest="use_automatic_links", default=config.USE_AUTOMATIC_LINKS,
                 help="Do not use automatic links wherever applicable")
    p.add_option("--no-skip-internal-links", action="store_false",
                 dest="skip_internal_links", default=config.SKIP_INTERNAL_LINKS,
                 help="Do not skip internal links")
    p.add_option("--links-after-para", action="store_true",
                 dest="links_each_paragraph",
                 default=config.LINKS_EACH_PARAGRAPH,
                 help="Put links after each paragraph instead of document")
    p.add_option("--mark-code", action="store_true", dest="mark_code",
                 default=config.MARK_CODE,
                 help="Mark program code blocks with [code]...[/code]")
    p.add_option(
        "--decode-errors", dest="decode_errors", action="store", type="string",
        default=config.DECODE_ERRORS,
        help=
        "What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
    )
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]
        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn(
                "Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
                DeprecationWarning)
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
            # NOTE(review): encoding defaults to "utf-8" above and is never
            # None, so this charset-detection branch looks unreachable — confirm.
            if encoding is None:
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
                    enc = lambda x, y: ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
        else:
            data = open(file_, 'rb').read()
            # NOTE(review): same as above — encoding cannot be None here.
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        # Read from stdin.
        data = wrap_read()

    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError as err:
            # NOTE(review): no space before 'flag.' — the message renders as
            # "...ignoreflag." — confirm and fix upstream.
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + 'flag.'
            print(warning)
            raise err

    h = HTML2Text(baseurl=baseurl)
    # handle options — copy every parsed flag onto the converter instance.
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links
    h.pad_tables = options.pad_tables

    wrapwrite(h.handle(data))
# coding:utf-8
from html2text import HTML2Text
import re
import json
import os
from collections import OrderedDict
from maya import OpenMaya
from maya import OpenMayaAnim
from maya import OpenMayaFX
from maya import OpenMayaMPx
from maya import OpenMayaRender
from maya import OpenMayaUI

# Shared HTML -> Markdown converter for scraping the Maya C++ API docs.
parser = HTML2Text()
parser.wrap_links = False
parser.skip_internal_links = True
parser.inline_links = True
# FIX: HTML2Text exposes `ignore_links` and `ignore_tables`; the previous
# names `ignore_anchors` / `ignore_table` are not converter attributes, so
# those two settings were silently ignored.
parser.ignore_links = True
parser.ignore_images = True
parser.ignore_emphasis = True
parser.ignore_tables = True

# ! ----------------------------------------
# Source (offline Maya developer help) and destination directories.
DIR = os.path.dirname(__file__)
WEB = os.path.join(
    r"D:\Users\Administrator\Desktop\MayaDoc\maya-2019-developer-help_enu_offline",
    "cpp_ref")
FOLDER = os.path.join(DIR, "cpp_ref")
def POST(self):
    """Import a web page (from an uploaded file or a URL) and render a preview.

    Extracts plain text, a Markdown conversion, and the page's images, links,
    CSS and script addresses, then renders the handler's template.  Any
    failure is caught and rendered as an error page instead of propagating.
    """
    try:
        # `upload` is a file-upload object when a file was posted; renamed
        # from `file` to avoid shadowing the builtin.
        upload = xutils.get_argument("file", {})
        address = xutils.get_argument("url", "")
        name = xutils.get_argument("name", "")
        filename = ""
        if hasattr(upload, "filename"):
            filename = upload.filename
        plain_text = ""
        if not isempty(address):
            html = readhttp(address)
        else:
            # Read the uploaded file; join chunks once instead of the
            # quadratic `html += chunk` accumulation.
            html = "".join(chunk.decode("utf-8") for chunk in upload.file)
            print("Read html, filename={}, length={}".format(
                filename, len(html)))

        soup = BeautifulSoup(html, "html.parser")
        # Drop script/style elements so they don't pollute the plain text.
        for element in soup.find_all(["script", "style"]):
            element.extract()
        plain_text = clean_whitespace(soup.get_text(separator=" "))

        images = soup.find_all("img")
        links = soup.find_all("a")
        csses = soup.find_all("link")
        # NOTE(review): scripts were extracted above, so this is likely
        # always empty — confirm intent.
        scripts = soup.find_all("script")
        title = get_html_title(soup)

        h = HTML2Text(baseurl=address)
        text = "From %s\n\n" % address + h.handle(html)
        texts = [text]
        images = get_addr_list(images)
        scripts = get_addr_list(scripts)

        # FIX: `name != "" and name != None` simplified; `if name:` is
        # equivalent for a str-or-None value and uses `is not None` semantics.
        if name:
            save_to_archive_dir(name)

        return xtemplate.render(self.template_path,
                                show_aside=False,
                                images=images,
                                links=links,
                                csses=csses,
                                scripts=scripts,
                                texts=texts,
                                address=address,
                                url=address,
                                article_title=title,
                                plain_text=plain_text)
    except Exception as e:
        xutils.print_stacktrace()
        return xtemplate.render(self.template_path,
                                show_aside=False,
                                error=str(e))
def html2md(text, width=0):
    """Convert an HTML string to Markdown.

    :param text: HTML source to convert
    :param width: output line width; 0 disables wrapping
    :return: the Markdown rendering of *text*
    """
    converter = HTML2Text()
    converter.body_width = width
    return converter.handle(text)
def get_markdown_content(self):
    """Return this object's HTML content converted to Markdown.

    Uses single line breaks between blocks and no hard wrapping.
    """
    converter = HTML2Text()
    converter.single_line_break = True
    converter.body_width = 0  # 0 => do not wrap output lines
    markup = str(self.get_text())
    return converter.handle(markup)
def __init__(self):
    """Build the API service client and set up an HTML-to-text cleaner."""
    service = build(SERVICE, VERSION, developerKey=API_KEY)
    # Keep the translations() resource for later translate calls.
    self.translator = service.translations()
    self.cleaner = HTML2Text()
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0',
}

# Retry/timeout settings for page fetches.
MAX_TRY_AGAIN_TIME = 3
TIMEOUT = 3

# url is the page address; detail is a dict of {content field: extraction rule, ...}
UrlDetail = namedtuple('UrlDetail', ['url', 'detail'])

LOGGER = MyLogger(__file__)
# Used to deduplicate already-seen URLs.
BloomFilter = MyBloomFilter()

# Shared HTML -> plain-text converter: strip links/images/tables, single line breaks.
TextMaker = HTML2Text()
TextMaker.ignore_links = True
TextMaker.ignore_images = True
TextMaker.ignore_tables = True
TextMaker.single_line_break = True


class Item:
    """Task unit for a category page."""
    __slots__ = ['url', 'detail', 'is_direct', 'is_json']

    def __init__(self, url, detail, is_direct=False, is_json=False):
        self.url = url
        self.detail = detail
        self.is_direct = is_direct  # whether the category links directly to content
        # self.is_json = is_json  # whether the content is in JSON format
def __init__(self, source):
    """Store the HTML source and create the HTML-to-Markdown converter."""
    self.source = source
    self.html_handle = HTML2Text()