Python Cleaner.processing_instructions示例

编程语言: Python

命名空间/包名称: lxml.html.clean

类/类型: Cleaner

方法/功能: processing_instructions

hotexamples.com的示例: 7

Python Cleaner.processing_instructions - 已找到7个示例。这些是从开源项目中提取的最受好评的lxml.html.clean.Cleaner.processing_instructions现实Python示例。您可以评价示例，以帮助我们提高示例质量。

常用方法

显示隐藏

Cleaner(30)

clean_html(30)

style(30)

kill_tags(30)

javascript(30)

remove_tags(23)

scripts(21)

page_structure(19)

meta(19)

links(16)

remove_unknown_tags(15)

comments(14)

allow_tags(13)

safe_attrs_only(12)

embedded(11)

forms(11)

frames(9)

annoying_tags(8)

html(7)

processing_instructions(7)

inline_style(4)

safe_attrs(3)

xpath(2)

add_nofollow(2)

__call__(2)

allow_tag(1)

javasript(1)

remove_attributes(1)

host_whitelist(1)

replace(1)

frame(1)

embeded(1)

script(1)

allow_attributes(1)

startswith(1)

__init__(1)

whitelist_tags(1)

allow_embedded_url(1)

示例#1

显示文件

def html2text(html):

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')

    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()

        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None

示例#2

显示文件

文件： Lake_Utils.py 项目： hitallocavas/code-cup

def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript  # This is True because we want to activate the javascript filter
    cleaner.scripts = scripts  # This is True because we want to activate the scripts filter
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # Keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)

示例#3

显示文件

文件： Task1.py 项目： dipanjan44/Python-Projects

def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner

示例#4

显示文件

文件： multi-processing-example.py 项目： anuragkumar/python-multiprocessing-bfs-crawler

def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a', 'u', 'i', 'body', 'div', 'span', 'p']
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript', 'style', 'meta', 'label', 'li', 'ul',
                         'ol', 'sup', 'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner

示例#5

显示文件

def f_parse(args):
    def isAlphabet(word):

        alphabet = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
            'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
        ]
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []

    for document in corpuses:
        #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore')
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)

                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()

                if (len(parsed_text) > MINSIZE_CHARSDOC):
                    parsed_text = parsed_text.lower()

                    tokenizer = RegexpTokenizer(r'\w+')

                    # create English stop words list
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')

                    # Create p_stemmer of class PorterStemmer
                    #p_stemmer = PorterStemmer()

                    # clean and tokenize document string
                    tokens = tokenizer.tokenize(parsed_text)

                    # remove stop words from tokens
                    stopped_tokens1 = [i for i in tokens if not i in en_stop]
                    stopped_tokens2 = [
                        i for i in stopped_tokens1 if not i in it_stop
                    ]
                    stopped_tokens3 = [
                        i for i in stopped_tokens2 if not i in sp_stop
                    ]
                    stopped_tokens4 = [
                        i for i in stopped_tokens3 if not i in ge_stop
                    ]
                    stopped_tokens5 = [
                        i for i in stopped_tokens4 if not i in fr_stop
                    ]

                    for word in stopped_tokens5:
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1:
                                #check if the word has the alphabet character
                                if isAlphabet(word):
                                    ret.append(word)
            except:
                print('Exception : Document empty')
    return [loc, ret]

示例#6

显示文件

文件： htmlprocessing.py 项目： vbarbaresi/trafilatura

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
        cleaning_list.append('table')

示例#7

显示文件

文件： core.py 项目： jay-kakkad/StockMarketPrediction_using_SVR

    'DATE_ORDER': 'DMY'
}

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try: