Python Cleaner.add_nofollow примеры использования

Язык программирования: Python

Пространство имен/Пакет: lxml.html.clean

Класс/Тип: Cleaner

Метод/Функция: add_nofollow

Примеров на hotexamples.com: 3

Python Cleaner.add_nofollow - 3 примера найдено. Это лучшие примеры Python кода для lxml.html.clean.Cleaner.add_nofollow, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

Cleaner(30)

clean_html(30)

style(30)

kill_tags(30)

javascript(30)

remove_tags(23)

scripts(21)

page_structure(19)

meta(19)

links(16)

remove_unknown_tags(15)

comments(14)

allow_tags(13)

safe_attrs_only(12)

embedded(11)

forms(11)

frames(9)

annoying_tags(8)

html(7)

processing_instructions(7)

inline_style(4)

safe_attrs(3)

xpath(2)

add_nofollow(2)

__call__(2)

allow_tag(1)

javasript(1)

remove_attributes(1)

host_whitelist(1)

replace(1)

frame(1)

embeded(1)

script(1)

allow_attributes(1)

startswith(1)

__init__(1)

whitelist_tags(1)

allow_embedded_url(1)

Пример #1

Показать файл

def html2text(html):
    """Strip markup from an HTML document and return its lower-cased text.

    The document is parsed with ``lxml.html.document_fromstring``, passed
    through an ``lxml`` ``Cleaner`` with the filters configured below, then
    serialized and handed to BeautifulSoup to extract the plain text.

    Args:
        html: HTML source to parse.

    Returns:
        The extracted text, lower-cased, when it is longer than
        ``MINSIZE_CHARSDOC`` characters; ``None`` when it is shorter or when
        any step of the parse/clean/extract pipeline raises.
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    try:
        document = lxml.html.document_fromstring(html)
        cleaned = cleaner.clean_html(document)
        serialized = lxml.html.tostring(cleaned)

        soup = BeautifulSoup(serialized, 'lxml')
        parsed_text = soup.get_text()

        if len(parsed_text) > MINSIZE_CHARSDOC:
            return parsed_text.lower()
        return None
    except Exception:
        # Best-effort: an unparsable document yields None. ``Exception`` is
        # used instead of a bare ``except`` so that KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        return None

Пример #2

Показать файл

def f_parse(args):
    """Extract filtered word tokens from a collection of HTML documents.

    Each non-empty document is parsed and cleaned with an ``lxml``
    ``Cleaner``, converted to plain text with BeautifulSoup, lower-cased and
    tokenized on ``\\w+``. Tokens found in the English, Italian, Spanish,
    German or French stop-word lists are dropped, as are tokens of length 1
    and tokens containing any character outside the accepted alphabet.

    Args:
        args: Two-item sequence ``(loc, corpuses)``. ``loc`` is returned
            unchanged; ``corpuses`` is an iterable of HTML documents.

    Returns:
        A two-item list ``[loc, words]`` where ``words`` holds the surviving
        tokens of all documents, in document order.
    """
    # Characters a token may consist of: a-z plus a few accented vowels.
    alphabet = frozenset('abcdefghijklmnopqrstuvwxyz') | frozenset(
        ('à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú')
    )
    stop_word_languages = ('en', 'it', 'es', 'de', 'fr')

    def isAlphabet(word):
        """Return True when every character of *word* is in the alphabet."""
        return all(ch in alphabet for ch in word)

    loc = args[0]
    corpuses = args[1]

    MINSIZE_CHARSDOC = 100

    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # Built on first use (inside the try below) and then reused for every
    # following document instead of being recreated per document.
    tokenizer = None
    stop_words = None

    ret = []

    for document in corpuses:
        if not document:
            continue
        try:
            document = lxml.html.document_fromstring(document)
            cleaned = cleaner.clean_html(document)
            html = lxml.html.tostring(cleaned)

            soup = BeautifulSoup(html, 'lxml')
            parsed_text = soup.get_text()

            # Documents with too little text are ignored.
            if len(parsed_text) <= MINSIZE_CHARSDOC:
                continue

            if stop_words is None:
                tokenizer = RegexpTokenizer(r'\w+')
                # One set over all languages: a token is dropped when it is
                # a stop word in any of them.
                stop_words = set().union(
                    *(get_stop_words(lang) for lang in stop_word_languages)
                )

            # clean and tokenize document string
            tokens = tokenizer.tokenize(parsed_text.lower())

            for word in tokens:
                if word in stop_words:
                    continue
                # No separate digit test is needed: digits are not part of
                # the alphabet, so isAlphabet() already rejects them.
                if len(word) > 1 and isAlphabet(word):
                    ret.append(word)
        except Exception:
            # Best-effort per document; ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
            print('Exception : Document empty')
    return [loc, ret]

Пример #3

Показать файл

Файл: oh_tags.py Проект: ZaoRiTian/ooi-hack

from django import template
from django.utils.safestring import mark_safe
import lxml.html
from lxml.html.clean import Cleaner

# Template-tag registry for this module.
register = template.Library()

# Shared HTML cleaner: lxml's default safe attributes plus ``style``, with
# add_nofollow switched on.
cleaner = Cleaner(
    safe_attrs=lxml.html.defs.safe_attrs | {'style'},
    add_nofollow=True,
)


@register.filter(name='xss_safe')
def xss_safe(value):
    """Template filter: clean *value* with the module cleaner and mark it safe.

    The value is passed through ``cleaner.clean_html`` and the result is
    wrapped with ``mark_safe`` before being returned.
    """
    cleaned_markup = cleaner.clean_html(value)
    return mark_safe(cleaned_markup)