Exemplo n.º 1
0
# -*- coding: utf-8 -*-
import re
import cgi

from talon.quotations import (register_xpath_extensions, extract_from_html,
                              extract_from_plain)  # noqa
# Executed at import time, before the import statements that follow.
register_xpath_extensions()

from HTMLParser import HTMLParser


# http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """HTML parser that tracks whether it is inside a tag to be stripped.

    While the parser is between the start tag of one of ``strippedTags``
    and the next end tag, ``strip_tag_contents_mode`` is True.

    NOTE(review): ``fed`` is only initialised in this excerpt; presumably a
    data handler outside the excerpt appends text to it -- confirm.
    """

    # Tag names (lower-case) whose contents should be dropped.
    strippedTags = ["title", "script", "style"]

    def __init__(self):
        # Use the base initializer instead of calling reset() directly.
        # On Python 2 the two are equivalent; on Python 3 the base
        # initializer also sets attributes (e.g. convert_charrefs) that
        # feed() needs, so skipping it makes feed() fail.
        HTMLParser.__init__(self)
        self.fed = []
        self.strip_tag_contents_mode = False

    def handle_starttag(self, tag, attrs):
        """Enter strip mode when ``tag`` is one of ``strippedTags``.

        ``attrs`` is accepted to match the HTMLParser callback signature
        and is not used.
        """
        # Strip the contents of a tag when it's
        # in strippedTags. We can do this because
        # HTMLParser won't try to parse the inner
        # contents of a tag.
        if tag.lower() in MLStripper.strippedTags:
            self.strip_tag_contents_mode = True

    def handle_endtag(self, tag):
        """Leave strip mode on any end tag, whatever ``tag`` is."""
        self.strip_tag_contents_mode = False
Exemplo n.º 2
0
def init(path_to_models=None):
    """Run the package setup calls.

    Always calls ``register_xpath_extensions()``. When ``ML_ENABLED`` is
    truthy, also calls ``signature.initialize(path_to_models)``.

    ``path_to_models`` is forwarded unchanged to ``signature.initialize``
    and defaults to None.
    """
    register_xpath_extensions()
    if not ML_ENABLED:
        # Nothing further to set up when ML is disabled.
        return
    signature.initialize(path_to_models)
Exemplo n.º 3
0
Arquivo: html.py Projeto: 0xcd03/inbox
# -*- coding: utf-8 -*-
import re
import cgi
from HTMLParser import HTMLParser, HTMLParseError
from talon.quotations import (register_xpath_extensions, extract_from_html,
                              extract_from_plain)  # noqa
# Executed at import time, before the import statements that follow.
register_xpath_extensions()

from inbox.log import get_logger

# Names exported by a star-import of this module. strip_tags and
# plaintext2html are not defined within this excerpt; extract_from_html and
# extract_from_plain are re-exported from talon.quotations.
__all__ = ['strip_tags', 'plaintext2html', 'extract_from_html',
           'extract_from_plain']


# http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """HTML parser that flags when it has entered a tag to be stripped.

    ``strip_tag_contents_mode`` becomes True once the start tag of one of
    ``strippedTags`` is seen.

    NOTE(review): nothing in this excerpt clears the flag or appends to
    ``fed``; presumably further handlers exist beyond the excerpt -- confirm.
    """

    # Tag names (lower-case) whose contents should be dropped.
    strippedTags = ["title", "script", "style"]

    def __init__(self):
        # Use the base initializer instead of calling reset() directly.
        # On Python 2 the two are equivalent; on Python 3 the base
        # initializer also sets attributes (e.g. convert_charrefs) that
        # feed() needs, so skipping it makes feed() fail.
        HTMLParser.__init__(self)
        self.fed = []
        self.strip_tag_contents_mode = False

    def handle_starttag(self, tag, attrs):
        """Enter strip mode when ``tag`` is one of ``strippedTags``.

        ``attrs`` is accepted to match the HTMLParser callback signature
        and is not used.
        """
        # Strip the contents of a tag when it's
        # in strippedTags. We can do this because
        # HTMLParser won't try to parse the inner
        # contents of a tag.
        if tag.lower() in MLStripper.strippedTags:
            self.strip_tag_contents_mode = True
Exemplo n.º 4
0
def init():
    """Call ``register_xpath_extensions()``; no other setup is performed."""
    register_xpath_extensions()
Exemplo n.º 5
0
def init():
    """Run the package setup calls.

    Always calls ``register_xpath_extensions()``. When ``ML_ENABLED`` is
    truthy, also calls ``signature.initialize()``.
    """
    register_xpath_extensions()
    if not ML_ENABLED:
        # Nothing further to set up when ML is disabled.
        return
    signature.initialize()
Exemplo n.º 6
0
def init():
    """Call ``register_xpath_extensions()`` and then ``signature.initialize()``."""
    register_xpath_extensions()
    signature.initialize()
Exemplo n.º 7
0
def init():
    """Run the package setup calls.

    Always calls ``register_xpath_extensions()``. When ``ML_ENABLED`` is
    truthy, also calls ``signature.initialize()``.
    """
    register_xpath_extensions()
    if not ML_ENABLED:
        # Nothing further to set up when ML is disabled.
        return
    signature.initialize()
Exemplo n.º 8
0
def init():
    """Call ``register_xpath_extensions()`` and then ``signature.initialize()``."""
    register_xpath_extensions()
    signature.initialize()
Exemplo n.º 9
0
def init():
    """Call ``register_xpath_extensions()``; no other setup is performed."""
    register_xpath_extensions()