Exemplo n.º 1
0
def cleaner_parameters():
    """Build an lxml ``Cleaner`` with every boolean filter used here enabled.

    Tags in ``reject_list`` are assigned to ``kill_tags`` (the element is
    dropped together with its content); tags in ``accept_list`` are assigned
    to ``remove_tags`` (only the tag itself is stripped, its content is
    pulled up into the parent).

    Returns:
        Cleaner: the configured cleaner instance.
    """
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math',
    ]
    # Each entry needs its own comma: adjacent string literals are silently
    # concatenated ('h6' 'span' would become the single bogus tag 'h6span').
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body',
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
Exemplo n.º 2
0
def lxml_extractor(html, url):
    """Clean *html* with lxml, then pass the result to ``soup_extractor``.

    Args:
        html: raw markup of the page.
        url: address of the page; used as the parse base URL and forwarded
            to ``soup_extractor``.

    Returns:
        Whatever ``soup_extractor`` returns. If a ``ValueError`` is raised
        while parsing, cleaning or extracting, ``soup_extractor`` is called
        on the raw, uncleaned markup instead.
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    # NOTE(review): ``allow_tag`` is not a Cleaner option (the option is
    # spelled ``allow_tags``), so this assignment has no effect on cleaning.
    # Switching to ``allow_tags`` would also need remove_unknown_tags=False
    # and would change the output -- confirm the intent before renaming.
    cleaner.allow_tag = POSITIVE_K
    cleaner.safe_attrs_only = True
    try:
        # Keep the raw markup in ``html`` so the fallback below never
        # receives a parsed element.
        parsed = lxml.html.fromstring(html, base_url=url)
        tree = cleaner.clean_html(parsed)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    return doc
Exemplo n.º 3
0
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Parse *html_text*, run it through an lxml ``Cleaner`` and serialize it.

    Each keyword argument is copied onto the cleaner option of the same
    name (True = remove, False = keep). ``page_structure`` is always set to
    False so the page structure is kept.

    Returns:
        The output of ``lxml.html.tostring`` for the cleaned document.
    """
    settings = (
        ('javascript', javascript),
        ('scripts', scripts),
        ('style', style),
        ('embedded', embedded),
        ('links', links),
        ('forms', forms),
        ('frames', frames),
        ('comments', comments),
        ('page_structure', False),  # keep page structure
        ('annoying_tags', annoying_tags),
        ('meta', meta),
        ('safe_attrs_only', safe_attrs_only),
        ('remove_unknown_tags', remove_unknown_tags),
        ('processing_instructions', processing_instructions),
    )
    # Options are assigned after construction on purpose; they are not
    # passed to the Cleaner constructor.
    cleaner = Cleaner()
    for option_name, option_value in settings:
        setattr(cleaner, option_name, option_value)

    parsed = lxml.html.fromstring(html_text)
    return lxml.html.tostring(cleaner.clean_html(parsed))
Exemplo n.º 4
0
def html2text(html):
    """Return the lower-cased visible text of an HTML document.

    The markup is parsed with lxml, cleaned, re-serialized and then reduced
    to text with BeautifulSoup.

    Args:
        html: markup of the document.

    Returns:
        The extracted text in lower case when it is longer than
        ``MINSIZE_CHARSDOC`` characters; otherwise ``None``. ``None`` is
        also returned when parsing, cleaning or text extraction raises.
    """
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    try:
        document = lxml.html.document_fromstring(html)
        cleaned = cleaner.clean_html(document)
        html = lxml.html.tostring(cleaned)

        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
    except Exception:
        # Best effort: an unparsable document yields no text. ``Exception``
        # (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        return None

    if len(parsed_text) > MINSIZE_CHARSDOC:
        return parsed_text.lower()
    return None
Exemplo n.º 5
0
def cleanpage(html):
    """Run *html* through an lxml ``Cleaner`` and return the cleaned markup.

    Scripts, styles (including inline styles), frames, embedded objects,
    comments and "annoying" tags are filtered; ``img`` and ``script``
    elements are killed together with their content. Page structure, meta
    tags, links and all attributes are kept.

    Args:
        html: markup to clean.

    Returns:
        The result of ``Cleaner.clean_html``; an empty unicode string when
        cleaning raises (for example "ValueError: Unicode strings with
        encoding declaration are not supported").
    """
    # Indentation is spaces only; mixing tabs and spaces is a TabError on
    # Python 3.
    cleaner = Cleaner()
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.kill_tags = ['img', 'script']

    try:
        content = cleaner.clean_html(html)
    except Exception:
        content = u""
    return content
Exemplo n.º 6
0
    def get_data(self):
        """Fetch the page at ``self.url`` and return it with comments cleaned.

        The markup comes from ``self._by_pass_get_html`` and is returned
        after a pass through an lxml ``Cleaner`` whose ``comments`` filter
        is enabled.
        """
        raw_markup = self._by_pass_get_html(self.url)

        comment_stripper = Cleaner()
        comment_stripper.comments = True
        return comment_stripper.clean_html(raw_markup)
Exemplo n.º 7
0
 def cleaned_html(self):
     """Clean ``self.dom`` in place of the stored value and serialize it.

     ``self.dom`` is replaced by the output of an lxml ``Cleaner`` with the
     scripts, javascript, comments and style filters enabled. An
     ``AssertionError`` is raised when the cleaned value has length zero.

     Returns:
         The output of ``lxml.html.tostring`` for the cleaned ``self.dom``.
     """
     scrubber = Cleaner()
     for flag in ('scripts', 'javascript', 'comments', 'style'):
         setattr(scrubber, flag, True)

     self.dom = scrubber.clean_html(self.dom)
     assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
     return lxml.html.tostring(self.dom)
Exemplo n.º 8
0
 def cleaned_html(self):
     """Strip scripts, javascript, comments and styles from ``self.dom``.

     The cleaned result is stored back on ``self.dom`` before it is
     checked; an ``AssertionError`` is raised if its length is zero.

     Returns:
         The cleaned ``self.dom`` serialized with ``lxml.html.tostring``.
     """
     space_saver = Cleaner()
     space_saver.style = True
     space_saver.comments = True
     space_saver.javascript = True
     space_saver.scripts = True

     self.dom = space_saver.clean_html(self.dom)
     assert len(self.dom), 'The html needs to be parsed to get the cleaned html'
     serialized = lxml.html.tostring(self.dom)
     return serialized
Exemplo n.º 9
0
def cleaned_html(htmlString):
    """Return *htmlString* after an lxml ``Cleaner`` pass.

    The comments, javascript, scripts and style filters are enabled to
    strip information that is not needed and save space.
    """
    stripper = Cleaner()
    for flag in ('comments', 'javascript', 'scripts', 'style'):
        setattr(stripper, flag, True)
    return stripper.clean_html(htmlString)
Exemplo n.º 10
0
def clean_text(data):
    """Return the text of *data* as one space-separated string.

    *data* is cleaned with lxml, serialized, decoded as UTF-8 (undecodable
    bytes ignored) and parsed with BeautifulSoup. Every text node is
    stripped, empty ones are dropped and the rest are joined with spaces.
    """
    sanitizer = Cleaner()
    for flag in ('javascript', 'style', 'scripts', 'comments', 'meta',
                 'annoying_tags'):
        setattr(sanitizer, flag, True)

    markup = lxml.html.tostring(sanitizer.clean_html(data))
    soup = BeautifulSoup(markup.decode('utf-8', 'ignore'))

    stripped_nodes = (node.strip() for node in soup.findAll(text=True))
    return ' '.join(piece for piece in stripped_nodes if piece)
Exemplo n.º 11
0
    def crawNews(self, url):
        """Download an article page, extract its fields and store them.

        The page at *url* is parsed with lxml and serialized; title, body,
        publication date and author details are then pulled out with
        regular expressions. The article is saved via ``self.save_news``,
        the author is looked up (and saved when not already an ``Author``),
        and both are linked via ``self.newsAuthor``.

        No check is made for patterns that fail to match; a missing match
        surfaces as an ``AttributeError`` on ``.group``.
        """
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.comments = True

        def plain_text(fragment):
            # Clean the fragment, drop remaining tags, collapse whitespace,
            # then decode and trim.
            text = re.sub(r'<[^>]*?>', '', cleaner.clean_html(fragment))
            text = re.sub(r'\s+', ' ', text)
            return text.decode('utf-8').strip()

        page_source = lxml.html.tostring(lxml.html.parse(url))

        title_match = re.search(r'<h1.*>(.*)</h1', page_source, re.S)
        body_match = re.search(
            r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->',
            page_source, re.S)
        date_match = re.search(r'name="sailthru.date"\scontent="(.*?)"',
                               page_source)
        author_match = re.search(
            r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>',
            page_source)

        author_url = "http://techcrunch.com" + author_match.group(1)
        author_name = author_match.group(2)
        author_twitter = author_match.group(3)

        title = plain_text(title_match.group(1))
        content = plain_text(body_match.group(1)).strip('\n')
        published_on = datetime.datetime.strptime(date_match.group(1),
                                                  '%Y-%m-%d %H:%M:%S')

        news = self.save_news(url, title, content, published_on)

        author = self.findAuthorByUrl(author_url)
        if not isinstance(author, Author):
            author = self.save_author(author_url, author_name, author_twitter,
                                      '')

        self.newsAuthor(news, author)
Exemplo n.º 12
0
    def clean_html(self):
        """
            Clean ``self.response.content`` and return it parsed.

            Note: cleaning removes the Title tags from HTML. Do not clean
            before grabbing titles!

            Returns the parsed document, or None when the content is empty
            or when cleaning/parsing raises (the error is logged).
        """
        if not len(self.response.content):
            return None

        cleaner = Cleaner()
        for flag in ('javascript', 'scripts', 'style', 'comments'):
            setattr(cleaner, flag, True)

        try:
            cleaned_markup = cleaner.clean_html(self.response.content)
            return html.fromstring(cleaned_markup)
        except Exception as err:
            logging.error(err)
            return None
def clean_text(data):
    """Return the text of *data* as an ASCII-encoded, space-joined string.

    The encoding reported by ``UnicodeDammit`` is given to the lxml parser.
    The parsed document is cleaned, serialized, decoded as UTF-8
    (undecodable bytes ignored) and parsed with BeautifulSoup. Text nodes
    are stripped, empty ones dropped, the rest joined with single spaces,
    and the result is encoded to ASCII with unencodable characters ignored.
    """
    sanitizer = Cleaner()
    for flag in ('javascript', 'style', 'scripts', 'comments', 'meta',
                 'annoying_tags'):
        setattr(sanitizer, flag, True)

    detected = UnicodeDammit(data, is_html=True)
    encoding_aware_parser = html.HTMLParser(
        encoding=detected.original_encoding)
    tree = html.document_fromstring(data, parser=encoding_aware_parser)
    markup = lxml.html.tostring(sanitizer.clean_html(tree))

    soup = BeautifulSoup(markup.decode('utf-8', 'ignore'))
    stripped_nodes = (node.strip() for node in soup.findAll(text=True))
    joined = ' '.join(piece for piece in stripped_nodes if piece)

    return joined.encode('ascii', 'ignore')
Exemplo n.º 14
0
def extract_content(bytehtml, doc):
    """
    Extract the blog post text from an HTML document.

    *bytehtml* is parsed with lxml and cleaned, then
    ``find_content_element`` picks the content element and its text is
    passed through ``tidy_content``. *doc* is not used in this function.

    Raises ``Exception('no content')`` when no content element is found.
    """
    tree = lxml.html.document_fromstring(bytehtml)

    sanitizer = Cleaner()
    sanitizer.scripts = True
    sanitizer.comments = True
    sanitizer.style = True
    sanitizer.kill_tags = ['head', 'noscript']
    sanitizer.remove_tags = ['p', 'i', 'b', 'strong', 'em', 'blockquote']
    # Called for its effect on ``tree``; the return value is not used.
    sanitizer(tree)

    content_el = find_content_element(tree)
    # NOTE(review): this is a truthiness test. If find_content_element
    # returns an lxml element, an element without children is falsy --
    # confirm whether ``is None`` is what is meant.
    if not content_el:
        debug(2, 'no content found!')
        raise Exception('no content')

    debug(3, 'content quality {}'.format(content_el._quality))
    return tidy_content(content_el.text_content())
Exemplo n.º 15
0
    def crawNews(self, url):
        """Scrape one article page and persist the article and its author.

        The page at *url* is parsed and serialized with lxml; regular
        expressions pick out the title, the article body, the publication
        date and the author link. The article goes to ``self.save_news``;
        the author is fetched with ``self.findAuthorByUrl`` and created
        with ``self.save_author`` when the lookup does not return an
        ``Author``; ``self.newsAuthor`` then links the two.

        A pattern that does not match leads to an ``AttributeError`` when
        its groups are read; this is not handled here.
        """
        markup_cleaner = Cleaner()
        for flag in ('javascript', 'style', 'comments'):
            setattr(markup_cleaner, flag, True)

        tech_content = lxml.html.tostring(lxml.html.parse(url))

        title_re = re.compile(r'<h1.*>(.*)</h1', re.S)
        content_re = re.compile(r'<!-- Begin: Wordpress Article Content -->(.*)<!-- End: Wordpress Article Content -->', re.S)
        published_re = re.compile(r'name="sailthru.date"\scontent="(.*?)"')
        author_re = re.compile(r'<a\shref="(.*?)"\stitle.*?rel="author">(.*?)<\/a>.*?rel="external">(.*?)<\/a>')

        title_hit = title_re.search(tech_content)
        content_hit = content_re.search(tech_content)
        date_hit = published_re.search(tech_content)
        author_hit = author_re.search(tech_content)

        author_path, author_name, author_twitter = author_hit.group(1, 2, 3)
        author_url = "http://techcrunch.com" + author_path

        # Title: clean, remove tags, collapse whitespace, decode, trim.
        title = markup_cleaner.clean_html(title_hit.group(1))
        title = re.sub(r'\s+', ' ', re.sub(r'<[^>]*?>', '', title))
        title = title.decode('utf-8').strip()

        # Body: same steps, plus trimming newlines at both ends.
        content = markup_cleaner.clean_html(content_hit.group(1))
        content = re.sub(r'\s+', ' ', re.sub(r'<[^>]*?>', '', content))
        content = content.decode('utf-8').strip().strip('\n')

        published_on = datetime.datetime.strptime(date_hit.group(1), '%Y-%m-%d %H:%M:%S')

        news = self.save_news(url, title, content, published_on)

        author = self.findAuthorByUrl(author_url)
        if not isinstance(author, Author):
            author = self.save_author(author_url, author_name, author_twitter, '')

        self.newsAuthor(news, author)
Exemplo n.º 16
0
def scrape(lineHashDB, html, encoding):
    """Extract the long text passages of an HTML page.

    The page is cleaned with lxml, squeezed with a series of regex
    substitutions (attributes, comments, short inter-tag snippets and empty
    containers are removed), and text nodes longer than 100 characters
    under ``p``/``div`` parents are selected with XPath.

    Args:
        lineHashDB: not used by this function.
        html: markup of the page.
        encoding: not used by this function.

    Returns:
        The selected passages, whitespace-normalized, one per line (each
        followed by a newline). Passages shorter than 40 characters after
        normalization are dropped. An empty unicode string is returned
        when the cleaner raises.
    """
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True  # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    try:
        page = cleaner.clean_html(html)
    except Exception:
        # e.g. ValueError: Unicode strings with encoding declaration are
        # not supported.
        return u""

    page8 = re.sub('\n', ' ', page)  # remove NL
    page8 = re.sub('&#13;', ' ', page8)  # remove CR
    page8 = re.sub('<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(' id=".*?"', ' ', page8)
    page8 = re.sub(' rel=".*?"', ' ', page8)
    page8 = re.sub(r'\[an error occurred while processing this directive\]',
                   ' ', page8)
    page8 = re.sub(r'>\s*?<', '><', page8)  # remove blanks between tags

    # Repeat so that containers emptied by one pass are removed by the next.
    for _ in range(1, 20):
        # remove snippets of at most 10 chars between tags
        page8 = re.sub('>.{0,10}<', '><', page8)
        page8 = re.sub('<div></div>', ' ', page8)
        page8 = re.sub('<p></p>', ' ', page8)
        page8 = re.sub('<span></span>', ' ', page8)

    page8 = re.sub(r'\s+', ' ', page8)  # remove repeated blanks

    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'
    sel = Selector(text=page8, type="html")

    kept = []
    for s in sel.xpath(xpath).extract():
        # Squash duplicate whitespace; this also trims both ends.
        s = ' '.join(s.split())
        # On empirical analysis, no sentence under 40 chars is a relevant
        # part of the article text (titles, authors, dates, etc.).
        if len(s) < 40:
            continue
        kept.append(s)
    return u"".join(line + u"\n" for line in kept)
Exemplo n.º 17
0
def f_parse(args):
    """Tokenize a batch of HTML documents into filtered lower-case words.

    Each non-empty document is parsed and cleaned with lxml, reduced to
    text with BeautifulSoup and, when the text is longer than 100
    characters, lower-cased and split into ``\\w+`` tokens. Tokens found in
    the English, Italian, Spanish, German or French stop-word lists are
    dropped; the remaining tokens are kept when they are longer than one
    character and consist only of the letters in ``alphabet``.

    Args:
        args: sequence whose first item is an identifier that is returned
            unchanged and whose second item is an iterable of HTML
            documents.

    Returns:
        list: ``[loc, words]`` where ``words`` holds the kept tokens of all
        documents in order.
    """
    alphabet = frozenset([
        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
        'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
        'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'
    ])

    def isAlphabet(word):
        # True when every character of the word is in ``alphabet``.
        return all(t in alphabet for t in word)

    loc = args[0]
    corpuses = args[1]

    MINSIZE_CHARSDOC = 100

    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # Loop-invariant helpers: build the tokenizer once and merge the five
    # stop-word lists into one set for constant-time membership tests.
    tokenizer = RegexpTokenizer(r'\w+')
    stop_words = set()
    for language in ('en', 'it', 'es', 'de', 'fr'):
        stop_words.update(get_stop_words(language))

    ret = []

    for document in corpuses:
        if len(document) == 0:
            continue
        try:
            tree = lxml.html.document_fromstring(document)
            html = lxml.html.tostring(cleaner.clean_html(tree))

            soup = BeautifulSoup(html, 'lxml')
            parsed_text = soup.get_text()

            if len(parsed_text) <= MINSIZE_CHARSDOC:
                continue

            for word in tokenizer.tokenize(parsed_text.lower()):
                if word in stop_words:
                    continue
                # Digits are not in ``alphabet``, so isAlphabet also
                # rejects any token that contains a digit.
                if len(word) > 1 and isAlphabet(word):
                    ret.append(word)
        except Exception:
            # Best effort per document; ``Exception`` (not a bare except)
            # so KeyboardInterrupt/SystemExit still propagate.
            print('Exception : Document empty')
    return [loc, ret]
Exemplo n.º 18
0
import lxml
from lxml import etree
from lxml.html.clean import autolink_html
from lxml.html.clean import Cleaner

# Module-level lxml Cleaner (LXML parser side of this module).
cleaner = Cleaner()
# Content filters.
cleaner.comments = True
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True       # activate the styles & stylesheet filter
# Element filters.
cleaner.embedded = True
cleaner.forms = True
cleaner.frames = True
# safe_attrs_only is deliberately not assigned here (left at its default).

import re
# Matches any single character that is not an ASCII letter (a-z, A-Z).
notalpha = re.compile('[^a-zA-Z]')

#BS PARSER
from bs4 import BeautifulSoup as bs
from bs4 import Comment

def make_links_absolute(soup, url):
    """Return the ``href`` of every ``<a>`` in *soup* joined against *url*.

    Only anchors that carry an ``href`` attribute are considered. The
    result is a new list.
    """
    absolute_links = []
    for anchor in soup.findAll('a', href=True):
        absolute_links.append(urlparse.urljoin(url, anchor['href']))
    return absolute_links
                        
def clean_html(soup):
    """Remove script, iframe, form, embed and style elements from *soup*.

    The elements are detached from the tree in place.

    Args:
        soup: a parsed BeautifulSoup document.

    Returns:
        The same *soup* object, without the removed elements.
    """
    # Detach the unwanted elements and keep the document. Joining the
    # extracted Tag objects with " ".join raises TypeError, and re-parsing
    # them would keep the removed elements rather than the page.
    for tag_name in ('script', 'iframe', 'form', 'embed', 'style'):
        for element in soup(tag_name):
            element.extract()
    return soup
Exemplo n.º 19
0
from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# Module-wide lxml Cleaner configuration. Option reference:
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()

# Filters that are switched on.
HTML_CLEANER.comments = True
HTML_CLEANER.processing_instructions = True

# Filters that are switched off but carry a "True" marker.
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True

# Remaining filters, all switched off.
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False

# Tag lists are currently not applied through the cleaner:
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED
Exemplo n.º 20
0
# XPath expressions selecting elements whose id, class or itemprop contains
# date-, meta-, footer- or author-like markers, plus <footer> and <small>
# elements.
# NOTE(review): presumably evaluated to locate date candidates; how the list
# is consumed is not visible here -- confirm against the caller.
DATE_EXPRESSIONS = [
    "//*[contains(@id, 'date') or contains(@id, 'Date') or contains(@id, 'datum') or contains(@id, 'Datum') or contains(@id, 'time') or contains(@class, 'post-meta-time')]",
    "//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]",
    "//*[contains(@class, 'postmeta') or contains(@class, 'post-meta') or contains(@class, 'entry-meta') or contains(@class, 'postMeta') or contains(@class, 'post_meta') or contains(@class, 'post__meta')]",
    "//*[@class='meta' or @class='meta-before' or @class='asset-meta' or contains(@id, 'article-metadata') or contains(@class, 'article-metadata') or contains(@class, 'byline') or contains(@class, 'subline')]",
    "//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]",
    "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]",
    "//footer",
    "//*[@class='post-footer' or @class='footer' or @id='footer']",
    "//small",
    "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]",
]

# Module-wide lxml Cleaner configuration.
CLEANER = Cleaner()

# Filters that are switched on.
CLEANER.embedded = True
CLEANER.frames = True
CLEANER.javascript = True
CLEANER.page_structure = True
CLEANER.processing_instructions = True
CLEANER.style = True

# Filters that are switched off.
CLEANER.comments = False
CLEANER.forms = False
CLEANER.links = False
CLEANER.meta = False
CLEANER.remove_unknown_tags = False
CLEANER.safe_attrs_only = False
CLEANER.scripts = False
CLEANER.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf',
    'svg', 'video'
Exemplo n.º 21
0
        E.BODY(
            E.H1(E.CLASS("heading"), title),
            lxml.html.fromstring(html)
            )
        )

    html_out.getroottree().write(file="summarized-roanoke.html", method="html")

if __name__ == "__main__":
    # Script entry point: download the Roanoke Colony article, remember its
    # title, clean the page and hand it to the site-specific parser.
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    # The Cleaner option is ``frames``; assigning ``frame`` only creates an
    # unused attribute and leaves frame removal off.
    cleaner.frames = True
    cleaner.meta = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.style = True
    cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"]

    url = "https://en.wikipedia.org/wiki/Roanoke_Colony"
    doc = urllib2.urlopen(url)

    tree = lxml.html.parse(doc)
    # Read the title before cleaning, while the <title> element is still
    # in the tree.
    title = tree.find(".//title").text

    tree = cleaner.clean_html(tree)

    netloc = urlparse(url).netloc
    if netloc == "en.wikipedia.org":
        parse_wiki(tree, title)