Exemplo n.º 1
0
def cleanpage(html):
    """Strip scripts, styles, comments and other noise from an HTML string.

    Returns the cleaned HTML, or an empty string when lxml refuses the
    input (e.g. a unicode string carrying an encoding declaration).
    """
    # Cleaner setup.  True = remove that content, False = keep it.
    # FIX: the original mixed tabs and 8-space indents (a TabError under
    # Python 3) and assigned `links` / `page_structure` twice.
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False   # keep <html>/<head>/<body>
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True        # activate the javascript filter
    cleaner.style = True             # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.kill_tags = ['img', 'script']

    # Invoke cleaner.
    try:
        content = cleaner.clean_html(html)
    except Exception:
        # FIX: was a bare `except:` (also swallowed KeyboardInterrupt /
        # SystemExit).  lxml raises ValueError for unicode input with an
        # encoding declaration; fall back to an empty string as before.
        content = u""
    return content
    def get_clean_html(self, html_text, text_only=True):
        """Strip Javascript and CSS from *html_text*.

        Returns collapsed plain text when *text_only* is true, otherwise the
        serialized cleaned HTML; returns the string "junk" on any failure.
        """
        try:
            doc = lxml.html.document_fromstring(html_text)
            self._is_etree(doc)

            # Configure lxml's Cleaner: drop active content, keep structure,
            # meta tags, links and all attributes.
            scrubber = Cleaner()
            scrubber.javascript = True
            scrubber.style = True
            scrubber.html = True
            scrubber.page_structure = False
            scrubber.meta = False
            scrubber.safe_attrs_only = False
            scrubber.links = False

            cleaned = scrubber.clean_html(doc)
            if text_only:
                # Collapse every whitespace run into a single space.
                return ' '.join(cleaned.text_content().split())

            res = lxml.html.tostring(cleaned)
        except Exception as e:
            logger.error(f"While parsing email in get_clean_html {e}")
            res = "junk"

        return res
Exemplo n.º 3
0
def lxml_extractor(html, url):
    '''LXML PARSER.

    Parse *html* with lxml, scrub boilerplate with a configured Cleaner,
    then hand the serialized result to soup_extractor; fall back to calling
    soup_extractor on the raw input when lxml cannot parse it.
    '''
    cleaner = Cleaner()
    cleaner.javascript = True # This is True because we want to activate the javascript filter
    cleaner.style = True      # This is True because we want to activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    # FIX: the attribute is `allow_tags` (plural) -- the old `allow_tag`
    # assignment was silently ignored.  lxml refuses allow_tags together
    # with its default remove_unknown_tags=True, so disable that explicitly.
    cleaner.remove_unknown_tags = False
    cleaner.allow_tags = POSITIVE_K
    cleaner.safe_attrs_only = True
    try:
        # FIX: pass the actual `url` variable, not the literal string "url".
        tree = lxml.html.fromstring(html, base_url=url)
        tree = cleaner.clean_html(tree)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        # e.g. unicode input carrying an encoding declaration
        doc = soup_extractor(html, url)
    return doc
Exemplo n.º 4
0
def clean_html(html_text,
               javascript=True,
               scripts=True,
               style=True,
               embedded=True,
               links=True,
               forms=True,
               frames=True,
               comments=True,
               annoying_tags=True,
               meta=True,
               safe_attrs_only=True,
               remove_unknown_tags=True,
               processing_instructions=True):
    """Clean all the javascript and styles from the HTML returning the string with only the html content"""
    # For every flag: True = remove that content, False = keep it.
    cleaner = Cleaner()
    for attr, flag in (
        ('javascript', javascript),
        ('scripts', scripts),
        ('style', style),
        ('embedded', embedded),
        ('links', links),
        ('forms', forms),
        ('frames', frames),
        ('comments', comments),
        ('annoying_tags', annoying_tags),
        ('meta', meta),
        ('safe_attrs_only', safe_attrs_only),
        ('remove_unknown_tags', remove_unknown_tags),
        ('processing_instructions', processing_instructions),
    ):
        setattr(cleaner, attr, flag)
    cleaner.page_structure = False  # always keep the page structure
    parsed = lxml.html.fromstring(html_text)
    return lxml.html.tostring(cleaner.clean_html(parsed))
Exemplo n.º 5
0
    def cleaner_li(self):
        """Build a Cleaner that drops js/style/meta, unwraps a few inline
        tags, and whitelists ``href`` as the only surviving attribute."""
        scrub = Cleaner()
        scrub.javascript = True
        scrub.style = True
        scrub.meta = True
        scrub.safe_attrs_only = True
        # Unwrap these tags but keep their text content.
        scrub.remove_tags = ['i', 'span', 'b', 'li']
        scrub.safe_attrs = ['href']
        return scrub
Exemplo n.º 6
0
def trim_html(html):
    """Takes a html string as input and returns the html without any styles nor javascript"""
    stripper = Cleaner()
    stripper.scripts = True
    stripper.javascript = True
    stripper.style = True
    # Keep the meta tags: they matter for page-redirection purposes.
    stripper.meta = False
    stripper.safe_attrs_only = False
    return stripper.clean_html(html)
Exemplo n.º 7
0
def get_clean_html(etree, text_only=False):
    """Remove Javascript and CSS from *etree*; return its text content when
    *text_only* is true, otherwise the serialized cleaned HTML."""
    _is_etree(etree)
    scrub = Cleaner()
    # Filters to enable (remove) ...
    for flag in ('javascript', 'style', 'html'):
        setattr(scrub, flag, True)
    # ... and content to keep untouched.
    for flag in ('page_structure', 'meta', 'safe_attrs_only', 'links'):
        setattr(scrub, flag, False)

    cleaned = scrub.clean_html(etree)
    return cleaned.text_content() if text_only else lxml.html.tostring(cleaned)
Exemplo n.º 8
0
def get_clean_html(etree, text_only=False):
    """Strip Javascript and CSS from *etree* and return either its plain
    text (*text_only*) or the serialized cleaned HTML."""
    _is_etree(etree)
    scrubber = Cleaner()
    scrubber.javascript, scrubber.style, scrubber.html = True, True, True
    scrubber.page_structure = scrubber.meta = False
    scrubber.safe_attrs_only = scrubber.links = False

    result = scrubber.clean_html(etree)
    if text_only:
        return result.text_content()
    return lxml.html.tostring(result)
Exemplo n.º 9
0
    def clean(self: T) -> str:
        """Run a Cleaner configured from this instance's settings over the
        stored input, cache the cleaned result, and return it."""
        scrub = Cleaner()
        scrub.style = self.__style
        scrub.links = self.__links
        scrub.page_structure = self.__page_structure
        scrub.safe_attrs_only = self.__safe_attrs_only

        # lxml forbids combining allow_tags with remove_unknown_tags,
        # so the latter is switched off when a tag whitelist is set.
        if self.__allow_tags is not None:
            scrub.remove_unknown_tags = False
            scrub.allow_tags = self.__allow_tags
        if self.__kill_tags is not None:
            scrub.kill_tags = self.__kill_tags
        if self.__remove_tags is not None:
            scrub.remove_tags = self.__remove_tags
        if self.__safe_attrs is not None:
            scrub.safe_attrs = self.__safe_attrs

        self.__input = scrub.clean_html(self.__input)
        return self.__input
Exemplo n.º 10
0
def get_clean_text(filename):
    """Parse *filename* as UTF-8 HTML and return its content as plain text
    with tags, entities and whitespace runs collapsed to single spaces."""
    utf8_parser = html.HTMLParser(encoding='utf-8')
    parsed = lxml.html.parse(filename, parser=utf8_parser)

    scrub = Cleaner()
    scrub.javascript = True
    scrub.style = True
    scrub.html = True
    scrub.page_structure = False
    scrub.meta = False
    scrub.safe_attrs_only = False
    scrub.links = False
    parsed = scrub.clean_html(parsed)

    markup = etree.tostring(parsed, encoding='unicode')
    # Replace any remaining tag with a space ...
    text = re.sub(r'<.+?>', ' ', markup)
    # ... then fold whitespace and common (possibly bare) entities together.
    text = re.sub(r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ',
                  text)

    return text
Exemplo n.º 11
0
    def handle(self, **options):
        """Poll the CouchDB changes feed forever, cleaning and indexing
        "page" documents into the search-index writer.

        NOTE(review): infinite loop with no exit condition; presumably run
        as a long-lived management command.
        """
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            # Fetch everything that changed since the last processed sequence.
            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                # NOTE(review): the format string has no placeholder, so the
                # len() value is never shown -- latent bug, left as-is.
                print("Detected new tasks ".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            # NOTE(review): only the *last* changeset's doc survives this
            # loop; earlier ones are fetched and then overwritten.
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            # Skip anything that is not a crawled page document.
            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            # Join all <title> text nodes into a single string.
            title = " ".join([title for title in tree.xpath("//title/text()")])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            # NOTE(review): the serialized result is discarded -- dead
            # statement?
            lxml.html.tostring(html)
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )

            # Commit this document and grab a fresh writer for the next pass.
            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
Exemplo n.º 12
0
# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
# For each flag: True = remove that content, False = keep it.  The trailing
# "# True" markers presumably record a previously-used value -- TODO confirm.
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# Tag lists are applied manually elsewhere instead of via the Cleaner:
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
    if include_tables is False:
        cleaning_list.append('table')
    if include_images is True:
        # Many websites have <img> inside <figure> or <picture> or <source> tag
Exemplo n.º 13
0
try:
    # Restore the interactive history from a previous session, if any.
    readline.read_history_file(histfile)
except IOError:
    pass

try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests
    # Demo run: fetch the Wikipedia front page and strip scripts/CSS.
    resp = requests.get('http://en.wikipedia.org/')
    tree = document_fromstring(resp.text)
    raw = resp.text
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(tree)
    text_content = html.text_content()
except ImportError:
    # lxml / requests not installed -- skip the demo silently.
    pass

# Persist the history on interpreter exit, then drop names no longer needed.
atexit.register(readline.write_history_file, histfile)
del os, histfile
logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

# Module-level Cleaner shared below (True = remove, False = keep):
# discard comments, embedded content, frames and page structure; leave
# forms, scripts, styles, links and unknown tags/attributes untouched.
cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
# Elements removed entirely, content included.
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'table',
    'svg', 'video'
]
# 'embed', 'figure', 'img',
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
    # try if date can be parsed using chosen outputformat
    try:
        dateobject = datetime.datetime.strptime(datestring, outputformat)
    except ValueError:
Exemplo n.º 15
0
    def handle(self, **options):
        """Continuously consume the CouchDB changes feed and index cleaned
        "page" documents via the search-index writer.

        NOTE(review): endless loop; presumably a long-running management
        command.
        """
        since = get_last_change()
        writer = get_writer()

        last_change = since
        while True:
            doc = {}

            # Pull everything that changed since the last handled sequence.
            changes = settings.db.changes(since=since)
            since = changes["last_seq"]

            if since != last_change:
                # NOTE(review): no placeholder in the format string, so the
                # len() argument never appears in the output -- latent bug.
                print("Detected new tasks ".format(len(changes)))
                print("=== changes ===")
                pprint(changes)
            # NOTE(review): each iteration overwrites `doc`, so only the
            # final changeset's document is actually processed below.
            for changeset in changes["results"]:
                try:
                    doc = settings.db[changeset["id"]]
                except couchdb.http.ResourceNotFound:
                    print("resource not found")
                    continue

            # Ignore documents that are not crawled pages.
            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            #####
            # raw, html, text
            #####################
            raw = doc['content']
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            # Concatenate all <title> text nodes.
            title = ' '.join([title for title in tree.xpath('//title/text()')])

            # enable filters to remove Javascript and CSS from HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.html = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()

            # NOTE(review): serialized output is thrown away -- dead
            # statement?
            lxml.html.tostring(html)
            description = ' '.join(
                tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc['url'],
                desc=description,
                rank=doc['rank'],
                content='\n'.join([title, doc['url'], text_content]),
                raw=raw,
            )

            # Commit and acquire a fresh writer before the next iteration.
            writer.commit()
            writer = get_writer()
            set_last_change(since)
            last_change = since
Exemplo n.º 16
0
 def html_sanitizer(content):
     """Sanitize malicious scripts out of *content* with lxml's Cleaner."""
     scrubber = Cleaner()
     # Relax only these two defaults; keep embedded content and attributes.
     scrubber.embedded = False
     scrubber.safe_attrs_only = False
     return scrubber.clean_html(content)