Exemplo n.º 1
0
 def test_common_tags(self):
     """Check that cleaning with the common tags leaves only the title.

     A minimal HTML document whose only text content is the ``<title>``
     is run through an ``HTMLCleaner`` built from
     ``HTMLCleaner.get_common_tags()``; the cleaned result is expected
     to be exactly ``"TEST"``.
     """
     html = (
         "<!DOCTYPE html>\n"
         "<html>"
         "<head><title>TEST</title></head>"
         "<body>"
         "</body></html>"
     )
     tags = HTMLCleaner.get_common_tags()
     clean_html = HTMLCleaner(tags)
     # assertEqual shows both values when the check fails, whereas
     # assertTrue(a == b) would only report "False is not true".
     self.assertEqual(clean_html.clean(html), "TEST")
Exemplo n.º 2
0
    def get_replace_html():
        """Build the ordered table of replacements used to clean recipe HTML.

        Keys are the text to search for and values are the text to put in
        its place.  Several keys contain regex syntax such as ``(.*?)``, so
        the consumer presumably applies them as regular expressions, one
        after another in insertion order -- confirm against HTMLCleaner.

        The shared tag tables from ``HTMLCleaner`` (common, layout, style
        and clean-up tags) are merged in at fixed positions between the
        site-specific entries; the relative order of all entries is kept.

        :return: OrderedDict mapping each search text/pattern to its
            replacement, in the order the entries were registered
        """
        replace = OrderedDict()

        # non-ASCII fractions become numeric HTML entities
        replace[u"½"] = u"&#189;"
        replace[u"¼"] = u"&#188;"
        replace[u"¾"] = u"&#190;"

        # pull a closing </a> back onto the previous line
        replace["\n</a>"] = "</a>"

        # site-specific wrapper markup and branding text
        replace["<span itemscope itemtype='http://schema.org/Recipe'>"] = ""
        replace[" - Blue Apron"] = ""

        # update() inserts the keys in the source mapping's order and keeps
        # the position of keys that already exist, exactly like assigning
        # them one at a time.
        replace.update(HTMLCleaner.get_common_tags())

        # headers and footer
        replace["<header(.*?)</header>"] = ""
        replace["<footer(.*?)</footer>"] = ""

        replace.update(HTMLCleaner.get_layout_tags())
        replace.update(HTMLCleaner.get_style_tags())

        # layout: whole page sections and leftover attributes to drop
        replace[
            "<section class='section-rec-reviews container' id='reviews'>"
            "(.*?)</section>"] = ""
        replace[
            "Recipe: (.*?)<section class='section-rec-basics js-RecipeArea' "
            "data-area-name='basics' id='basics'>"] = ""
        replace[
            "<section class='section-rec-tools container' id='tools'>"
            "(.*?)</section>"] = ""
        replace["Per Serving(.*?)</section>"] = ""
        replace["\n\n\n"] = ""
        replace[' class="rec-splash-img"'] = ""
        replace['class="img-max"'] = ""
        replace[' class="ingredients-img"'] = ""
        replace[
            "<section class='section-rec-instructions container' "
            "id='instructions'>(.*?)</section>"] = ""
        replace[
            "<section class='section-rec-techniques container' "
            "id='techniques'>(.*?)</section>"] = ""
        replace[r" to download a PDF of this recipe."] = ""
        replace[
            "<section class='section-rec-ingredients container' "
            "id='ingredients'>"] = ""

        # anchors: the specific link classes first, the catch-all last
        replace["<a class='js-StepStoryLaunch(.*?)>(.*?)</a>"] = ""
        replace["<a class='js-IngModalLink'(.*?)>"] = ""
        replace["<a class='js-SubStory vid-tip'(.*?)>"] = ""
        replace["<a href=\"\"(.*?)>(.*?)</a>"] = ""
        replace["<a(.*?)>(.*?)</a>"] = ""

        replace.update(HTMLCleaner.clean_up_tags())

        # text-level touch-ups
        replace["Servings"] = "\n"
        replace["About\n\n"] = ""
        replace["\nCalories:"] = "\nCalories: "
        replace['</section>'] = ""

        step_numbers = ("1", "2", "3", "4", "5", "6")

        # a step number alone on its line becomes a "N) " prefix
        for step in step_numbers:
            replace[step + "\n\n"] = step + ") "

        # in case there's no text on the instruction; kept as a second loop
        # so these keys are registered after all of the "N\n\n" keys
        for step in step_numbers:
            replace[step + "\n\t"] = step + ") Step " + step + ": "

        # promotional images
        replace["<img alt=\"Introducing our Market(.*?) />"] = ""
        replace["<img alt=\"Recipe cards\" (.*?) />"] = ""

        # drop a dangling ") " left in front of an image or a line break
        replace[r"\) <img"] = "\n<img"
        replace[r"\) \n"] = "\n"

        # any closing anchor tag still left over
        replace["</a>"] = ""

        return replace