Exemplos de Cleaner.add_nofollow em Python

Linguagem de programação: Python

Espaço para nome / nome do pacote: lxml.html.clean

Classe / Tipo: Cleaner

Método / Função: add_nofollow

Exemplos em hotexamples.com: 3

Cleaner.add_nofollow em Python - 3 exemplos encontrados. Esses são os exemplos do mundo real mais bem avaliados de lxml.html.clean.Cleaner.add_nofollow em Python extraídos de projetos de código aberto. Você pode avaliar os exemplos para nos ajudar a melhorar a qualidade deles.

Métodos Frequentes

Exibir Ocultar

Cleaner(30)

clean_html(30)

style(30)

kill_tags(30)

javascript(30)

remove_tags(23)

scripts(21)

page_structure(19)

meta(19)

links(16)

remove_unknown_tags(15)

comments(14)

allow_tags(13)

safe_attrs_only(12)

embedded(11)

forms(11)

frames(9)

annoying_tags(8)

html(7)

processing_instructions(7)

inline_style(4)

safe_attrs(3)

xpath(2)

add_nofollow(2)

__call__(2)

allow_tag(1)

javasript(1)

remove_attributes(1)

host_whitelist(1)

replace(1)

frame(1)

embeded(1)

script(1)

allow_attributes(1)

startswith(1)

__init__(1)

whitelist_tags(1)

allow_embedded_url(1)

Métodos Frequentes

Cleaner (30)

clean_html (30)

style (30)

kill_tags (30)

javascript (30)

remove_tags (23)

scripts (21)

page_structure (19)

meta (19)

links (16)

Métodos Frequentes

remove_unknown_tags (15)

comments (14)

allow_tags (13)

safe_attrs_only (12)

embedded (11)

forms (11)

frames (9)

annoying_tags (8)

html (7)

processing_instructions (7)

inline_style (4)

safe_attrs (3)

xpath (2)

add_nofollow (2)

__call__ (2)

allow_tag (1)

javasript (1)

remove_attributes (1)

host_whitelist (1)

replace (1)

Métodos Frequentes

inline_style (4)

safe_attrs (3)

xpath (2)

add_nofollow (2)

__call__ (2)

allow_tag (1)

javasript (1)

remove_attributes (1)

host_whitelist (1)

replace (1)

frame (1)

embeded (1)

script (1)

allow_attributes (1)

startswith (1)

__init__ (1)

whitelist_tags (1)

allow_embedded_url (1)

Métodos Frequentes

frame (1)

embeded (1)

script (1)

allow_attributes (1)

startswith (1)

__init__ (1)

whitelist_tags (1)

allow_embedded_url (1)

Exemplo n.º 1

0

Exibir arquivo

def html2text(html):
    """Extract the lower-cased visible text of an HTML document.

    The markup is parsed with lxml, passed through an ``lxml.html.clean``
    ``Cleaner`` with the filters configured below, serialized again and
    finally reduced to plain text with BeautifulSoup.

    Args:
        html: The HTML document to convert.

    Returns:
        The extracted text in lower case when it is longer than
        ``MINSIZE_CHARSDOC`` characters; ``None`` when the text is too short
        or when the document could not be parsed or cleaned.
    """
    cleaner = Cleaner()
    # This is True because we want to activate the javascript filter
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # Only the parsing/cleaning pipeline is treated as best-effort. Catching
    # ``Exception`` (instead of a bare ``except``) keeps KeyboardInterrupt and
    # SystemExit propagating to the caller.
    try:
        document = lxml.html.document_fromstring(html)
        cleaned = cleaner.clean_html(document)
        markup = lxml.html.tostring(cleaned)
        parsed_text = BeautifulSoup(markup, 'lxml').get_text()
    except Exception:
        return None

    if len(parsed_text) > MINSIZE_CHARSDOC:
        return parsed_text.lower()
    return None

Exemplo n.º 2

0

Exibir arquivo

def f_parse(args):
    """Turn a batch of HTML documents into a flat list of filtered words.

    Each document is parsed with lxml, cleaned with an ``lxml.html.clean``
    ``Cleaner``, reduced to plain text with BeautifulSoup, lower-cased and
    tokenized on ``\\w+``. Tokens are kept only when they are not stop words
    in English, Italian, Spanish, German or French, are longer than one
    character and consist solely of characters from the allowed alphabet.

    Args:
        args: A two-item sequence ``(loc, corpuses)``. ``loc`` is returned
            unchanged; ``corpuses`` is an iterable of HTML documents.

    Returns:
        A list ``[loc, words]`` where ``words`` holds the surviving tokens of
        all documents, in document order.
    """
    # Kept as individual string literals so membership is an exact match
    # against single characters of the token.
    alphabet = {
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
        'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú',
    }

    def isAlphabet(word):
        """Return True when every character of *word* is in the alphabet."""
        return all(char in alphabet for char in word)

    loc = args[0]
    corpuses = args[1]

    MINSIZE_CHARSDOC = 100

    cleaner = Cleaner()
    # This is True because we want to activate the javascript filter
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # Loop-invariant helpers are built once instead of once per document.
    tokenizer = RegexpTokenizer(r'\w+')
    # Dropping tokens found in any of the five lists is the same as dropping
    # tokens found in their union; a set makes each lookup O(1).
    stop_words = set()
    for language in ('en', 'it', 'es', 'de', 'fr'):
        stop_words.update(get_stop_words(language))

    ret = []
    for document in corpuses:
        if not document:
            continue

        # Parsing is best-effort: a broken document is reported and skipped.
        # ``Exception`` rather than a bare ``except`` lets KeyboardInterrupt
        # and SystemExit through.
        try:
            tree = lxml.html.document_fromstring(document)
            cleaned = cleaner.clean_html(tree)
            markup = lxml.html.tostring(cleaned)
            parsed_text = BeautifulSoup(markup, 'lxml').get_text()
        except Exception:
            print('Exception : Document empty')
            continue

        if len(parsed_text) <= MINSIZE_CHARSDOC:
            continue

        for word in tokenizer.tokenize(parsed_text.lower()):
            if word in stop_words:
                continue
            # isAlphabet() also rejects tokens containing digits, since no
            # digit is part of the alphabet, so no separate digit test is
            # needed.
            if len(word) > 1 and isAlphabet(word):
                ret.append(word)

    return [loc, ret]

Exemplo n.º 3

0

Exibir arquivo

Arquivo: oh_tags.py Projeto: ZaoRiTian/ooi-hack

from django import template
from django.utils.safestring import mark_safe
import lxml.html
from lxml.html.clean import Cleaner

register = template.Library()

# Shared cleaner used by the ``xss_safe`` filter: lxml's default safe
# attribute set extended with ``style``, and ``add_nofollow`` switched on.
cleaner = Cleaner()
cleaner.safe_attrs = lxml.html.defs.safe_attrs | {'style'}
cleaner.add_nofollow = True


@register.filter(name='xss_safe')
def xss_safe(value):
    """Clean *value* with the module-level cleaner and mark it safe.

    Args:
        value: The HTML fragment coming from the template.

    Returns:
        The cleaned markup wrapped with ``mark_safe``. Empty values (``''``,
        ``None``) are returned unchanged.
    """
    # NOTE(review): lxml's HTML parser is expected to reject an empty
    # document with a ParserError, which would break template rendering;
    # an empty value has nothing to sanitize, so hand it back as is.
    if not value:
        return value
    return mark_safe(cleaner.clean_html(value))