Example #1
class DocumentCleaner(object):
    def __init__(self, config, article):
        # config
        self.config = config

        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = article

        # nodes to remove regexp
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|^links$|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies")
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_broadcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
                                            .create("\n", "\n\n")\
                                            .append("\t")\
                                            .append("^\\s+$")

    def clean(self):
        doc_to_clean = self.article.doc
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.caption_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_broadcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.convert_wanted_tags_to_paragraphs(
            doc_to_clean, ARTICLE_ROOT_TAGS)
        return doc_to_clean

    def clean_body_classes(self, doc):
        # drop the body's class attribute: if it happened to match one of
        # the unwanted-class patterns, the whole document would be removed
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        ems = self.parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)

        return doc

    def remove_scripts_styles(self, doc):
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)

        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)

        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)

        return doc

    def clean_bad_tags(self, doc):
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)

        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)

        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)

        return doc

    def remove_nodes_regex(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)

            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        spans = self.parser.css_select(doc, 'p span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)

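        # Fold bare text nodes, together with adjacent not-yet-consumed <a>
        # siblings, into paragraph buffers; 'grv-usedalready' marks links
        # already folded in so they are not emitted twice.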
        for kid in childs:
            # node is a p
            # and already have some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                new_node = self.get_flushed_buffer(''.join(replacement_text),
                                                   doc)
                nodes_to_return.append(new_node)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    previous_sibling_node = self.parser.previousSibling(
                        kid_text_node)
                    while previous_sibling_node is not None \
                        and self.parser.getTag(previous_sibling_node) == "a" \
                        and self.parser.getAttribute(previous_sibling_node, 'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append replace_text
                    replacement_text.append(replace_text)
                    #
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                        and self.parser.getTag(next_sibling_node) == "a" \
                        and self.parser.getAttribute(next_sibling_node, 'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        # advance along the following siblings
                        next_sibling_node = self.parser.nextSibling(
                            next_sibling_node)

            # otherwise
            else:
                nodes_to_return.append(kid)

        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []

        for n in nodes_to_remove:
            self.parser.remove(n)

        return nodes_to_return

    def replace_with_para(self, doc, div):
        self.parser.replaceTag(div, 'p')

    def convert_wanted_tags_to_paragraphs(self, doc, wanted_tags):
        selected = self.parser.getElementsByTags(doc, wanted_tags)

        for elem in selected:
            if not self.parser.getElementsByTags(elem, BLOCK_ELEMENT_TAGS):
                self.replace_with_para(doc, elem)
            else:
                replacement_nodes = self.get_replacement_nodes(doc, elem)
                elem.clear()

                for c, n in enumerate(replacement_nodes):
                    elem.insert(c, n)

        return doc
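
A minimal usage sketch for this variant, assuming goose-style objects: config.get_parser() returns the Parser wrapper and article.doc holds a parsed lxml tree. Both names are placeholders, not part of the example above.

# Hypothetical wiring; config and article are assumed inputs.
cleaner = DocumentCleaner(config, article)
article.doc = cleaner.clean()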
Example #2
class DocumentCleaner(object):

    def __init__(self, config):
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies"
        )
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_broadcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
                                            .create("\n", "\n\n")\
                                            .append("\t")\
                                            .append("^\\s+$")

    def clean(self, article):

        doc_to_clean = article.doc
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.caption_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_broadcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.div_to_para(doc_to_clean, 'div')
        doc_to_clean = self.div_to_para(doc_to_clean, 'span')
        return doc_to_clean

    def clean_article_tags(self, doc):
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        ems = self.parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        items = self.parser.css_select(doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)

        return doc

    def remove_scripts_styles(self, doc):
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)

        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)

        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)

        return doc

    def clean_bad_tags(self, doc):
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)

        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)

        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)

        return doc

    def remove_nodes_regex(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        spans = self.parser.css_select(doc, 'p > span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)

        for kid in childs:
            # node is a p
            # and already have some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(new_node)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    previous_sibling_node = self.parser.previousSibling(kid_text_node)
                    while previous_sibling_node is not None \
                        and self.parser.getTag(previous_sibling_node) == "a" \
                        and self.parser.getAttribute(previous_sibling_node, 'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                    attr='grv-usedalready', value='yes')
                        previous_sibling_node = self.parser.previousSibling(previous_sibling_node)
                    # append replace_text
                    replacement_text.append(replace_text)
                    #
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                        and self.parser.getTag(next_sibling_node) == "a" \
                        and self.parser.getAttribute(next_sibling_node, 'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                    attr='grv-usedalready', value='yes')
                        # advance along the following siblings
                        next_sibling_node = self.parser.nextSibling(next_sibling_node)

            # otherwise
            else:
                nodes_to_return.append(kid)

        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []

        for n in nodes_to_remove:
            self.parser.remove(n)

        return nodes_to_return

    def replace_with_para(self, doc, div):
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        bad_divs = 0
        else_divs = 0
        divs = self.parser.getElementsByTag(doc, tag=dom_type)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul']

        for div in divs:
            items = self.parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replace_with_para(doc, div)
                bad_divs += 1
            elif div is not None:
                replacement_nodes = self.get_replacement_nodes(doc, div)
                div.clear()

                for c, n in enumerate(replacement_nodes):
                    div.insert(c, n)

                else_divs += 1

        return doc
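
Unlike Example #1, this variant keeps the article out of the constructor, so one cleaner instance can serve many articles. A sketch under the same assumptions (config and article are placeholder objects):

# Hypothetical wiring; the same cleaner is reused per article.
cleaner = DocumentCleaner(config)
for article in articles:
    article.doc = cleaner.clean(article)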
Example #3
class DocumentCleaner(object):
    def __init__(self):

        self.regExRemoveNodes = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp"
        )
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.queryNaughtyIDs = "//*[re:test(@id, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyClasses = "//*[re:test(@class, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyNames = "//*[re:test(@name, '%s', 'i')]" % self.regExRemoveNodes
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.facebookBroadcastingPattern = "facebook-broadcasting"
        self.twitterPattern = "[^-]twitter"
        self.tabsAndNewLinesReplacements = ReplaceSequence().create("\n", "\n\n").append("\t").append("^\\s+$")

    def clean(self, article):

        docToClean = article.doc
        docToClean = self.cleanEmTags(docToClean)
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeScriptsAndStyles(docToClean)
        docToClean = self.cleanBadTags(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookBroadcastingPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, "div")
        docToClean = self.convertDivsToParagraphs(docToClean, "span")
        return docToClean

    def cleanEmTags(self, doc):
        ems = Parser.getElementsByTag(doc, tag="em")
        for node in ems:
            images = Parser.getElementsByTag(node, tag="img")
            if len(images) == 0:
                node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        items = doc.cssselect("span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()

        return doc

    def removeScriptsAndStyles(self, doc):
        # remove scripts
        scripts = Parser.getElementsByTag(doc, tag="script")
        for item in scripts:
            Parser.remove(item)

        # remove styles
        styles = Parser.getElementsByTag(doc, tag="style")
        for item in styles:
            Parser.remove(item)

        # remove comments
        comments = Parser.getComments(doc)
        for item in comments:
            Parser.remove(item)

        return doc

    def cleanBadTags(self, doc):

        # ids
        naughtyList = doc.xpath(self.queryNaughtyIDs, namespaces={"re": self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)

        # class
        naughtyClasses = doc.xpath(self.queryNaughtyClasses, namespaces={"re": self.regexpNS})
        for node in naughtyClasses:
            Parser.remove(node)

        # name
        naughtyNames = doc.xpath(self.queryNaughtyNames, namespaces={"re": self.regexpNS})
        for node in naughtyNames:
            Parser.remove(node)

        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        for selector in ["id", "class"]:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = doc.xpath(reg, namespaces={"re": self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        spans = doc.cssselect("p > span")
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)

        for kid in childs:
            # node is a p
            # and already have some replacement text
            if Parser.getTag(kid) == "p" and len(replacementText) > 0:
                newNode = self.getFlushedBuffer("".join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplacements.replaceAll(kidText)
                if len(replaceText) > 1:
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while (
                        prevSibNode is not None
                        and Parser.getTag(prevSibNode) == "a"
                        and Parser.getAttribute(prevSibNode, "grv-usedalready") != "yes"
                    ):
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode, attr="grv-usedalready", value="yes")
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append replaceText
                    replacementText.append(replaceText)
                    #
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while (
                        nextSibNode is not None
                        and Parser.getTag(nextSibNode) == "a"
                        and Parser.getAttribute(nextSibNode, "grv-usedalready") != "yes"
                    ):
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode, attr="grv-usedalready", value="yes")
                        # advance along the following siblings
                        nextSibNode = Parser.nextSibling(nextSibNode)

            # otherwise
            else:
                nodesToReturn.append(kid)

        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer("".join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []

        for n in nodesToRemove:
            Parser.remove(n)

        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        Parser.replaceTag(div, "p")

    def convertDivsToParagraphs(self, doc, domType):
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = ["a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"]

        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                div.clear()

                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)

                elseDivs += 1

        return doc
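
This variant takes no configuration and calls a module-level Parser helper directly, so usage reduces to a no-argument constructor; article is again a placeholder object with a parsed .doc tree.

# Hypothetical wiring; Parser must be importable at module level.
cleaner = DocumentCleaner()
article.doc = cleaner.clean(article)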
Example #4
class DocumentCleaner(object):
    def __init__(self):

        self.regExRemoveNodes = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp")

        self.regexpNS = "http://exslt.org/regular-expressions"
        self.queryNaughtyIDs = "//*[re:test(@id, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyClasses = "//*[re:test(@class, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyNames = "//*[re:test(@name, '%s', 'i')]" % self.regExRemoveNodes

        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.twitterPattern = "[^-]twitter"

        self.tabsAndNewLinesReplacements = ReplaceSequence()\
                                            .create("\n", "\n\n")\
                                            .append("\t")\
                                            .append("^\\s+$")

    def clean(self, article):

        docToClean = article.doc
        docToClean = self.cleanEmTags(docToClean)
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeScriptsAndStyles(docToClean)
        docToClean = self.cleanBadTags(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, 'div')
        docToClean = self.convertDivsToParagraphs(docToClean, 'span')
        return docToClean

    def cleanEmTags(self, doc):
        ems = Parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = Parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        items = cache.cssselect("span[class~=dropcap], span[class~=drop_cap]",
                                doc)
        for item in items:
            item.drop_tag()

        return doc

    def removeScriptsAndStyles(self, doc):
        # remove scripts
        scripts = Parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            Parser.remove(item)

        # remove styles
        styles = Parser.getElementsByTag(doc, tag='style')
        for item in styles:
            Parser.remove(item)

        # remove comments
        comments = Parser.getComments(doc)
        for item in comments:
            Parser.remove(item)

        return doc

    def cleanBadTags(self, doc):

        # ids
        naughtyList = cache.xpath(self.queryNaughtyIDs,
                                  doc,
                                  namespaces={'re': self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)

        # class
        naughtyClasses = cache.xpath(self.queryNaughtyClasses,
                                     doc,
                                     namespaces={'re': self.regexpNS})
        for node in naughtyClasses:
            Parser.remove(node)

        # name
        naughtyNames = cache.xpath(self.queryNaughtyNames,
                                   doc,
                                   namespaces={'re': self.regexpNS})
        for node in naughtyNames:
            Parser.remove(node)

        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = cache.xpath(reg,
                                      doc,
                                      namespaces={'re': self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        spans = cache.cssselect('p > span', doc)
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)

        for kid in childs:
            # node is a p
            # and already have some replacement text
            if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplacements.replaceAll(
                    kidText)
                if len(replaceText) > 1:
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode,
                                            attr='grv-usedalready',
                                            value='yes')
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append replaceText
                    replacementText.append(replaceText)
                    #
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode,
                                            attr='grv-usedalready',
                                            value='yes')
                        # advance along the following siblings
                        nextSibNode = Parser.nextSibling(nextSibNode)

            # otherwise
            else:
                nodesToReturn.append(kid)

        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []

        #
        for n in nodesToRemove:
            Parser.remove(n)

        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        Parser.replaceTag(div, 'p')

    def convertDivsToParagraphs(self, doc, domType):
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = [
            'a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table',
            'ul'
        ]

        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                div.clear()

                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)

                elseDivs += 1

        return doc
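
This variant differs from Example #3 mainly in routing selector evaluation through a module-level cache object whose implementation is not shown. A plausible sketch, assuming it simply memoizes compiled selectors (the class body below is illustrative, not the original module):

# Assumed shape of the cache helper used above.
from lxml import etree
from lxml.cssselect import CSSSelector

class cache(object):
    _css = {}
    _xpath = {}

    @staticmethod
    def cssselect(expr, doc):
        # compile each CSS selector once, then reuse it
        if expr not in cache._css:
            cache._css[expr] = CSSSelector(expr)
        return cache._css[expr](doc)

    @staticmethod
    def xpath(expr, doc, namespaces=None):
        # compile each XPath expression once per namespace map
        key = (expr, tuple(sorted((namespaces or {}).items())))
        if key not in cache._xpath:
            cache._xpath[key] = etree.XPath(expr, namespaces=namespaces)
        return cache._xpath[key](doc)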
Example #5
class DocumentCleaner(object):

    def __init__(self, config, article):
        # config
        self.config = config

        # parser
        self.parser = self.config.get_parser()

        # article
        self.article = article

        # nodes to remove regexp
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|^links$|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies|printfriendly|share"
        )

        # dailymail remove nodes
        self.remove_nodes_re += "|related-carousel|xwv-related-videos-container"

        # nytimes remove nodes
        self.remove_nodes_re += "|visually-hidden|robots-nocontent"

        # *.wikipedia.org
        self.remove_nodes_re += "|mw-editsection|^cite_ref|noprint|References|siteSub"
        self.remove_nodes_re += "|collapsed|mw-headline-anchor|filetoc|noviewer"

        # *.wiktionary.org
        self.remove_nodes_re += "|ib-brac"

        # *.wikibooks.org
        self.remove_nodes_re += "|status-icon"

        # www.wikidata.org
        self.remove_nodes_re += "|wikibase-edittoolbar-container"

        # http://www.dailymail.co.uk/news/article-2742786/Complacent-Home-Office-loses-175-000-illegal-immigrants-Fresh-humiliation-officials-admit-went-missing-refused-permission-stay.html
        self.remove_nodes_re += "|most-read-news-wrapper|most-watched-videos-wrapper"

        self.regexp_namespace = "http://exslt.org/regular-expressions"
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_tags = ["noscript"]
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_broadcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
                                            .create("\n", "\n\n")\
                                            .append("\t")\
                                            .append("^\\s+$")

    def set_known_host_remove_selectors(self):
        self.known_host_remove_selectors = HostUtils.host_selectors(_Const().get_known_host_remove_selectors,
                                                                    self.article.domain)

    def clean(self):
        doc_to_clean = self.article.doc
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        self.set_known_host_remove_selectors()
        if self.known_host_remove_selectors:
            return self.remove_host_specific_nodes(doc_to_clean)
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_broadcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.div_to_para(doc_to_clean, 'div')
        doc_to_clean = self.div_to_para(doc_to_clean, 'span')
        return doc_to_clean

    def clean_body_classes(self, doc):
        # drop the body's class attribute: if it happened to match one of
        # the unwanted-class patterns, the whole document would be removed
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def remove_drop_caps(self, doc):
        items = self.parser.css_select(doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)

        return doc

    def remove_scripts_styles(self, doc):
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)

        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)

        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)

        return doc

    def clean_bad_tags(self, doc):
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)

        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)

        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)

        for nauthy_tag in self.nauthy_tags:
            nodes = self.parser.getElementsByTag(doc, tag=nauthy_tag)
            for node in nodes:
                images = self.parser.getElementsByTag(node, tag='img')
                if images:
                    parent = node.getparent()
                    parent_index = parent.index(node)
                    for image in images:
                        parent.insert(parent_index, image)
                else:
                    self.parser.remove(node)

        return doc

    def remove_host_specific_nodes(self, doc):
        nodes = self.parser.css_select(doc, self.known_host_remove_selectors)
        for node in nodes:
            self.parser.remove(node)

        return doc

    def remove_nodes_regex(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        spans = self.parser.css_select(doc, 'p span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)

        for kid in childs:
            # node is a p
            # and already have some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(new_node)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    previous_sibling_node = self.parser.previousSibling(kid_text_node)
                    while previous_sibling_node is not None \
                        and self.parser.getTag(previous_sibling_node) == "a" \
                        and self.parser.getAttribute(previous_sibling_node, 'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                    attr='grv-usedalready', value='yes')
                        previous_sibling_node = self.parser.previousSibling(previous_sibling_node)
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                        and self.parser.getTag(next_sibling_node) == "a" \
                        and self.parser.getAttribute(next_sibling_node, 'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                    attr='grv-usedalready', value='yes')
                        # advance along the following siblings
                        next_sibling_node = self.parser.nextSibling(next_sibling_node)

            # otherwise
            else:
                nodes_to_return.append(kid)

        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []

        for n in nodes_to_remove:
            self.parser.remove(n)

        return nodes_to_return

    def replace_with_para(self, doc, div):
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        bad_divs = 0
        else_divs = 0
        divs = self.parser.getElementsByTag(doc, tag=dom_type)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul']

        for div in divs:
            items = self.parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replace_with_para(doc, div)
                bad_divs += 1
            elif div is not None:
                replacement_nodes = self.get_replacement_nodes(doc, div)
                for child in self.parser.childNodes(div):
                    div.remove(child)

                for c, n in enumerate(replacement_nodes):
                    div.insert(c, n)

                else_divs += 1

        return doc
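
Usage matches Example #1, but clean() first consults host-specific removal selectors resolved from the article's domain (HostUtils and _Const belong to the surrounding project and are not shown):

# Hypothetical wiring; config and article are assumed inputs.
cleaner = DocumentCleaner(config, article)
article.doc = cleaner.clean()  # short-circuits to remove_host_specific_nodes
                               # when selectors are known for article.domain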
Example #6
class DocumentCleaner(object):

    def __init__(self):

        self.regExRemoveNodes = (
            "^side$|combx|retweet|fontresize|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers|rating"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|articlead"
            "|date|^print$|popup|author-dropdown|tools|socialtools"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp|menu"
        )
        self.regExNotRemoveNodes = "and|no|article|body|column|main|shadow"
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.facebookBroadcastingPattern = "facebook-broadcasting"
        self.twitterPattern = "[^-]twitter"
        self.tabsAndNewLinesReplacements = ReplaceSequence()\
                                            .create("\n", "\n\n")\
                                            .append("\t")\
                                            .append("^\\s+$")
        self.todel = self.regExRemoveNodes.lower().split('|')
        self.notdel = self.regExNotRemoveNodes.lower().split('|')

    def clean(self, article):
        docToClean = article.doc
        nodelist = self.getNodesToDelete(docToClean)
        for node in nodelist:
            Parser.remove(node)
        docToClean = self.removeListsWithLinks(docToClean)
        docToClean = self.dropTags(docToClean, ['em', 'strong'])
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookBroadcastingPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.keepLineBreaks(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, 'div')
        docToClean = self.convertDivsToParagraphs(docToClean, 'span')
        return docToClean

    def getNodesToDelete(self, doc):
        # recursively collect nodes to delete: scripts, styles, comments, and
        # nodes whose class/id/name matches a "delete" word, unless a "keep"
        # word also matches and is not contained in the matched delete word
        nodelist = []
        for node in doc:
            if node.tag in ['script', 'noscript', 'style', 'option'] \
                    or isinstance(node, lxml.html.HtmlComment):
                nodelist.append(node)
                continue
            if node.tag in ['p', 'span', 'b', 'h1', 'h2', 'h3', 'h4', 'h5'] \
                    and len(node) == 0:
                continue
            ids = ''
            if 'class' in node.attrib:
                ids += ' ' + node.attrib['class'].lower()
            if 'id' in node.attrib:
                ids += ' ' + node.attrib['id'].lower()
            if 'name' in node.attrib:
                ids += ' ' + node.attrib['name'].lower()
            good_word = ''
            for word in self.notdel:
                if ids.find(word) >= 0:
                    good_word = word
            bad_word = ''
            for word in self.todel:
                if ids.find(word) >= 0:
                    bad_word = word
                    break
            if (bad_word != '' and good_word == '') \
                    or (bad_word != '' and bad_word.find(good_word) >= 0):
                nodelist.append(node)
                continue
            nodelist += self.getNodesToDelete(node)
        return nodelist

    def keepLineBreaks(self, doc):
        # mark line breaks with U+FFFC placeholders so they survive the
        # later text extraction
        items = Parser.getElementsByTag(doc, tag='br')
        for n in items:
            if n.tail is not None:
                n.tail = u'\ufffc ' + n.tail
            else:
                n.tail = u'\ufffc'
            n.drop_tag()

        items = Parser.getElementsByTag(doc, tag='p')
        for n in items:
            if n.tail is not None:
                n.tail = u'\ufffc ' + n.tail
            else:
                n.tail = u'\ufffc'
                # if n.text is None: n.drop_tag()  # drop empty p
        return doc

    def removeWrappedLinks(self, e):
        # collect a run of consecutive siblings that each wrap nothing but
        # a single link with no surrounding text
        if e is None or len(e) != 1 or e[0].tag != 'a':
            return []
        text = ''
        if e.text is not None:
            text += e.text
        if e[0].tail is not None:
            text += e[0].tail
        if e.tail is not None:
            text += e.tail
        if re.search('[^ \t\r\n]', text):
            return []
        return [e] + self.removeWrappedLinks(Parser.nextSibling(e))

    def removeListsWithLinks(self, doc):
        # drop ol/ul lists in which more than two consecutive items contain
        # links, along with the parent when little text remains
        for tag in ['ol', 'ul']:
            items = Parser.getElementsByTag(doc, tag=tag)
            for item in items:
                fa = 0
                for li in item:
                    if Parser.getElementsByTag(li, tag='a'):
                        fa += 1
                        if fa > 2:
                            parent = item.getparent()
                            Parser.remove(item)
                            if parent is not None:
                                if len(parent) == 0 or len(Parser.getText(parent).split()) < 4:
                                    Parser.remove(parent)
                            break
                    else:
                        fa = 0

        # drop containers that are essentially link lists separated by
        # short delimiters such as '|' or ','
        items = Parser.getElementsByTag(doc, tag='a')
        for a in items:
            e = a.getparent()
            if e is None:
                continue
            text = Parser.getText(e)
            ldels = []
            for link in e:
                ltext = Parser.getText(link)
                if link.tag != 'a' and len(ltext) <= 2:
                    continue
                if link.tag != 'a' and len(ltext) > 2:
                    ldels = []
                    break
                if ltext == '':
                    continue
                ldel = text.split(ltext, 1)
                ld = ldel[0].strip()
                ldels.append(ld)
                if len(ldel) == 1:
                    break
                text = ldel[1]
            if len(ldels) == 0 or ldels[0] == ',':
                continue
            del ldels[0]
            flag = 0
            flag1 = 0
            flag2 = 0
            flag3 = 0
            for ldel in ldels:
                if ldel == ldels[0]:
                    flag += 1
                if len(ldel) > 3 or ldel.find(',') >= 0:
                    flag1 = 1
                if ldel != '':
                    flag2 = 1
                if len(ldel) > 1:
                    flag3 = 1
            if flag2 == 0 and len(ldels) > 1:
                Parser.remove(e)
                continue
            if len(ldels) == 2 and ldels[0] == '|' and ldels[1] == '|':
                Parser.remove(e)
                continue
            if len(ldels) > 3 and flag3 == 0:
                Parser.remove(e)
                continue
            if flag <= 2 and (len(ldels) <= 2 or flag1 != 0):
                continue
            Parser.remove(e)

        return doc

        # NOTE: the following pass is unreachable because of the return above
        items = Parser.getElementsByTag(doc, tag='a')
        for a in items:
            e = a.getparent()
            if e is None:
                continue
            if len(e) == 1:
                toRemove = self.removeWrappedLinks(e)
                if len(toRemove) > 2:
                    for bn in toRemove:
                        Parser.remove(bn)

        return doc

    def dropTags(self, doc, tags):
        for tag in tags:
            ems = Parser.getElementsByTag(doc, tag=tag)
            for node in ems:
                images = Parser.getElementsByTag(node, tag='img')
                if len(images) == 0:
                    node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        items = doc.cssselect("span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()

        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = doc.xpath(reg, namespaces={'re': self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        spans = doc.cssselect('p > span')
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)

        for kid in childs:
            # node is a p
            # and already have some replacement text
            if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplacements.replaceAll(kidText)
                if len(replaceText) > 0:
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode,
                                    attr='grv-usedalready', value='yes')
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append replaceText
                    replacementText.append(replaceText)
                    #
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode,
                                    attr='grv-usedalready', value='yes')
                        nextSibNode = Parser.nextSibling(nextSibNode)

            # otherwise
            else:
                if Parser.getTag(kid) == "a" and Parser.getAttribute(kid, 'grv-usedalready') == 'yes':
                    continue
                if len(replacementText) > 0:
                    newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                    nodesToReturn.append(newNode)
                    replacementText = []
                nodesToReturn.append(kid)

        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []

        for n in nodesToRemove:
            Parser.remove(n)

        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        Parser.replaceTag(div, 'p')

    def convertDivsToParagraphs(self, doc, domType):
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul']

        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                text = div.tail
                div.clear()

                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                div.tail = text
                elseDivs += 1

        return doc
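
This variant is driven like Example #3, with one extra wrinkle: keepLineBreaks marks line breaks with U+FFFC placeholders so a later step can restore them after text extraction. A sketch, where the placeholder-to-newline conversion is an assumption about that downstream step:

# Hypothetical wiring; re, lxml.html and Parser are module-level imports
# in the original source.
cleaner = DocumentCleaner()
article.doc = cleaner.clean(article)
text = Parser.getText(article.doc).replace(u'\ufffc', u'\n')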