示例#1
0
    def arrange(self, parameters: Dict[str, str], contents: str,
                rss_url_prefix: str, favicon_url: str) -> str:
        """Rework a raw feed document and return it serialized.

        Blank input is returned untouched. Otherwise the XML declaration and
        any xml-stylesheet processing instruction are stripped, the document
        is parsed, its top-level element and every item are arranged, and the
        tree is serialized again behind a fresh UTF-8 XML declaration.

        Args:
            parameters: request parameters forwarded to the arranging helpers
            contents: raw feed text
            rss_url_prefix: forwarded to arrange_feed_top_level_element
            favicon_url: forwarded to arrange_feed_top_level_element

        Returns:
            str: the arranged feed; if an XMLSyntaxError is raised, the feed
            text with declaration and stylesheet instruction already stripped
        """
        result: str = contents
        if not result.strip():
            # nothing to parse, hand the input back as it came
            return result

        try:
            # I probably do not use etree as I should
            for instruction in (r'<\?xml [^>]*?>',
                                r'<\?xml-stylesheet [^>]*?>'):
                result = re.sub(instruction, '', result).strip()

            dom = etree.fromstring(result)

            self.arrange_feed_top_level_element(dom, rss_url_prefix,
                                                parameters, favicon_url)

            for feed_item in self.get_items(dom):
                self._arrange_item(feed_item, parameters)
                self._arrange_feed_link(feed_item, parameters)

            serialized = to_string(dom)
            result = '<?xml version="1.0" encoding="UTF-8"?>\n' + serialized

        except etree.XMLSyntaxError as e:
            # best effort: log and return what we have
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d - %H:%M")
            logging.getLogger().info(
                "[ %s ] - Unable to parse rss feed for module '%s' (%s), let's proceed anyway",
                timestamp, self.module_name, str(e))

        return result
示例#2
0
    def _manage_reddit_preview_images(self, content) -> str:
        """Use the target image directly instead of the reddit preview.

        Two passes are made: first every match of IMG_PREVIEW_REDDIT in the
        raw text is rewritten to its https://i.redd.it/ address, then every
        anchor whose href points to preview.redd.it is replaced by an <img>
        pointing to i.redd.it.

        Args:
            content: html content

        Returns:
            str: the content where preview images have been replaced by target
        """
        content_without_preview: str = content
        # Chain the replacements on the accumulated string: replacing on the
        # pristine "content" each time would keep only the last substitution.
        # preview[0] is the text to replace, preview[1] the i.redd.it path.
        for preview in re.findall(IMG_PREVIEW_REDDIT, content):
            content_without_preview = content_without_preview.replace(
                preview[0], "https://i.redd.it/%s" % preview[1])

        dom = etree.HTML(content_without_preview)
        for a in xpath(dom, "//a"):
            href = a.attrib.get("href", "")
            if "://preview.redd.it/" in href:
                img = etree.Element("img")
                img.set("src", href.replace("preview.redd.it", "i.redd.it"))
                # the image is appended at the end of the anchor's parent,
                # then the anchor itself is dropped
                parent = a.getparent()
                parent.append(img)
                parent.remove(a)

        return to_string(dom)
示例#3
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed and route every item link through this handler.

        Existing <link> elements are dropped and each <guid> is turned into a
        <link> whose value is prefixed with the handler URL and the
        authentication suffix. When a "filter" parameter is given, the nodes
        selected by the generated xpath expression are deleted.

        Args:
            parameters: request parameters; "filter" is optional
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        raw_feed = session.get(url=self.get_rss_url(), headers={}).text

        # Drop the original links, then promote every guid to a link.
        feed = re.sub(r'<link>[^<]*</link>', '', raw_feed)
        link = '<link>'
        for guid_open in ('<guid isPermaLink="false">',
                          '<guid isPermaLink="true">'):
            feed = feed.replace(guid_open, link)
        feed = feed.replace('</guid>', '</link>')

        handler_link = '<link>%s?%surl=' % (
            self.url_prefix, self._getAuthentificationSuffix(parameters))
        feed = feed.replace(link, handler_link)

        # I probably do not use etree as I should
        dom = etree.fromstring(re.sub(r'<\?xml [^>]*?>', '', feed).strip())

        # available filters : international, politique, societe, les-decodeurs, sport, planete, sciences, campus, afrique, pixels, actualites-medias, sante, big-browser, disparitions, podcasts
        if "filter" in parameters:
            # filter only on passed category
            nodes_to_delete = dom.xpath(
                utils.dom_utils.get_xpath_expression_for_filters(
                    parameters, "link[contains(text(), '/%s/')]",
                    "not(link[contains(text(), '/%s/')])"))
            utils.dom_utils.delete_nodes(nodes_to_delete)

        return to_string(dom)
示例#4
0
    def _arrange_item(self, item: etree._Element, parameters: dict):
        """Rebuild the first description of a feed item.

        A thumbnail is added to the description when needed, then the
        description node is replaced by a fresh node of the same tag whose
        text is the unescaped serialization of the original text and
        children. Items without a description are left untouched.

        Args:
            item: the feed item to arrange (modified in place)
            parameters: request parameters; "translateto" and "debug" are
                read here, the whole dict is also forwarded to
                _add_thumbnail_in_description
        """
        descriptions = self.get_descriptions(item)
        thumbnail_url: str = self.get_img_url(item)

        if len(descriptions) == 0:
            return

        original: etree._Element = descriptions[0]
        img_url = self._add_thumbnail_in_description(
            item, original, parameters, thumbnail_url)
        if thumbnail_url == "" and img_url != "":
            self.set_thumbnail_item(item, img_url)

        # NOTE: appending the node returned by self._get_source(item) used to
        # happen here and is currently disabled.

        if parameters.get("debug") == "true":
            # The debug paragraph must be attached *before* the description
            # is serialized below: the original node is detached and
            # replaced afterwards, so anything appended later is lost.
            p = etree.Element("p")
            i = etree.Element("i")
            i.text = "Session id: %s" % self.session_id
            p.append(i)
            original.append(p)

        # Flatten the node (text + children) into one markup string.
        # Iterating the element replaces the deprecated getchildren().
        description_xml: str = original.text or ""
        description_xml += "".join(to_string(child) for child in original)

        parent_obj = original.getparent()
        parent_obj.remove(original)

        description = etree.Element(original.tag)  # "description")
        description.text = html.unescape(description_xml.strip()).replace(
            "&nbsp;", " ")
        if "translateto" in parameters:
            dom = etree.HTML(description.text)
            translate_dom(dom, parameters["translateto"])
            description.text = to_string(dom)

        parent_obj.append(description)
示例#5
0
    def _get_thumbnail_url_from_description(
            self, description: etree._Element) -> str:
        """Return the URL of the first image found in a description.

        An <img> descendant is looked up first; when there is none, the
        serialized description is matched against IMG_URL_REGEX.

        Args:
            description: the description node to inspect

        Returns:
            str: the image URL, or "" when no image is found
        """
        imgs = xpath(description, ".//img")
        if len(imgs) > 0:
            # An <img> carries its address in "src" (that is also what
            # _add_thumbnail_in_description writes). Indexing attrib["url"]
            # raised KeyError on such nodes; "url" is kept as a fallback.
            first_img = imgs[0]
            return first_img.get("src") or first_img.get("url") or ""

        m = re.match(IMG_URL_REGEX, to_string(description))
        if m is not None:
            return m.group(1)

        return ""
示例#6
0
    def _post_processing(self, url: str):
        if len(self.contents.strip()) > 0:
            dom = etree.HTML(self.contents)
            self._post_process_tweets(dom)
            self._replace_prefix_urls(dom)
            self._manage_translation(dom, url)
            self.contents = to_string(dom)\
                .replace("<html>", "")\
                .replace("</html>", "")\
                .replace("<body>", "")\
                .replace("</body>", "")\
                .replace("<video", "<video preload=\"none\"")

            self.contents = self.contents.replace("data-src-lazyload", "src")
            self.contents = self.contents.replace("</br>", "")
示例#7
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the Sport24 feed and route item links through the handler.

        The "filter" parameter drives three things:
        - one of the sport categories below selects a dedicated feed,
          anything else falls back to the "accueil" feed;
        - "flash" keeps only items without enclosure and prefixes every
          description with a fixed image, otherwise items without enclosure
          are the ones removed;
        - any filter value is appended, capitalized, to the feed title.

        Args:
            parameters: request parameters; "filter" is optional
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        # filter only on passed category, eg /sport24/rss/tennis
        # (a membership test: `== ("tennis" or "football" ...)` only ever
        # compared against "tennis")
        category = parameters.get("filter")
        if category not in ("tennis", "football", "rugby", "cyclisme",
                            "golf"):
            category = "accueil"
        feed = session.get(url=self.get_rss_url() % category,
                           headers={}).text

        # I probably do not use etree as I should
        # Strip whatever XML declaration is present, not one exact spelling.
        feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
        # Escape bare ampersands only: predefined entities and numeric
        # character references are already valid XML.
        regex = re.compile(
            r"&(?!amp;|lt;|gt;|quot;|apos;|#\d+;|#x[0-9a-fA-F]+;)")
        myxml = regex.sub("&amp;", feed)
        dom = etree.fromstring(myxml)
        description_img: str = ""

        xpath_expression = "//item[not(enclosure)]"
        if parameters.get("filter") == "flash":
            xpath_expression = "//item[enclosure]"
            description_img = "<img src=\"https://pbs.twimg.com/profile_images/932616523285516294/sqt32oQY.jpg\"/>"

        utils.dom_utils.delete_nodes(dom.xpath(xpath_expression))

        for link in xpath(dom, "//item/link"):
            if link is not None and text(link) is not None:
                link.text = self.get_handler_url_with_parameters(
                    {"url": text(link).strip()})

        feed = to_string(dom)

        title = ""
        if "filter" in parameters:
            # capwords() strips leading whitespace, so the " - " separator
            # is added after capitalizing, not before
            title = " - " + string.capwords(parameters["filter"])

        feed = feed.replace(
            "<title>Sport24 - Toute l'actualite</title>",
            "<title>Sport24%s</title>" % title)

        if description_img != "":
            feed = feed.replace("<description>",
                                "<description>" + description_img)

        return feed
示例#8
0
    def get_feed(self, parameters: dict, session: Session) -> str:
        """Download a reddit atom feed and route its links through the handler.

        The feed of the subreddit named by the "sub" parameter is used when
        given, the default rss url otherwise. For each entry, when both a
        thumbs.redditmedia jpg and another jpg are quoted in the content, the
        thumbnail is replaced by the other picture. Every atom:link href is
        rewritten to a handler url.

        Args:
            parameters: request parameters; "sub" is optional
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        if "sub" in parameters:
            rss_url: str = "https://www.reddit.com/r/%s/.rss" % parameters[
                "sub"]
        else:
            rss_url = self.get_rss_url()

        response_text = session.get(url=rss_url, headers={}).text

        # I probably do not use etree as I should
        dom = etree.fromstring(
            re.sub(r'<\?xml [^>]*?>', '', response_text).strip())

        for entry in xpath(dom, "//atom:entry", namespaces=NAMESPACES):
            content_node = xpath(entry, "./atom:content",
                                 namespaces=NAMESPACES)[0]
            content = cast(str, content_node.text)

            # try to replace thumbnail with real picture
            quoted_jpgs = re.findall(r'"http[^"]*jpg"', content)
            thumbs = [u for u in quoted_jpgs if "thumbs.redditmedia" in u]
            others = [u for u in quoted_jpgs if "thumbs.redditmedia" not in u]
            if thumbs and others:
                # the last match of each kind wins
                swapped = content.replace(thumbs[-1], others[-1])
                content_node.text = swapped.replace("<td> &#32;",
                                                    "</tr><tr><td> &#32;")

            for link in xpath(entry, "./atom:link", namespaces=NAMESPACES):
                target = cast(str, link.attrib["href"].strip())
                link.attrib["href"] = self.get_handler_url_with_parameters(
                    {"url": target})

        return to_string(dom)
示例#9
0
    def get_reddit_content(self, url: str, session: Session,
                           with_comments: bool) -> PyRSSWContent:
        """Build the article markup of a reddit post from its JSON view.

        "<url>/.json" is fetched and parsed. When the response is not JSON,
        the status code and the page itself are shown instead. Each "t3"
        entry contributes its title followed by its content (or "Content
        removed"); "t1" entries are rendered as comments when requested.

        Args:
            url: url of the reddit post (without the "/.json" suffix)
            session: session used to fetch the page
            with_comments: whether the comments section is appended

        Returns:
            PyRSSWContent: the markup wrapped in an <article> element
        """
        content: str = ""
        page = session.get(url="%s/.json" % url, headers=self._get_headers())

        try:
            root = json.loads(page.content)
        except JSONDecodeError:
            # not JSON: show the status code and the raw page instead
            content = "<strong>Status code: %d<br/></strong>" % page.status_code
            content += to_string(etree.HTML(page.content, parser=None))
            root = {}

        for data in self._get_datatypes_json(root, "t3"):  # t3 : content
            content += "<h1>%s</h1>" % get_node_value_if_exists(data, "title")
            self_html: str = get_node_value_if_exists(data, "selftext_html")
            post_hint: str = get_node_value_if_exists(data, "post_hint")
            removed_by: str = get_node_value_if_exists(
                data, "removed_by") + get_node_value_if_exists(
                    data, "removed_by_category")
            # Append rather than assign: a plain assignment discarded the
            # title heading built just above.
            # NOTE(review): assumes _get_content_from_data does not emit the
            # title itself - confirm.
            if removed_by == "":
                content += self._get_content_from_data(data=data,
                                                       session=session,
                                                       self_html=self_html,
                                                       post_hint=post_hint)
            else:
                content += "Content removed"

        comments: str = ""
        if with_comments:
            comments = "<hr/><h2>Comments</h2>"
            # t1 : comments
            for comment_json in self._get_datatypes_json(root, "t1"):
                comments += self.get_comments(comment_json)

        content = "<article>%s%s</article>" % (content, comments)

        return PyRSSWContent(content)
示例#10
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, apply the category filter and rewrite links.

        When a "filter" parameter is given, the nodes selected by the
        generated xpath expression are deleted. Every <link> and <guid> text
        is then replaced by the matching handler url.

        Args:
            parameters: request parameters; "filter" is optional
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        raw_feed = session.get(url=self.get_rss_url()).text

        # I probably do not use etree as I should
        dom = etree.fromstring(re.sub(r'<\?xml [^>]*?>', '', raw_feed).strip())

        if "filter" in parameters:
            # filter only on passed category
            nodes_to_delete = dom.xpath(
                utils.dom_utils.get_xpath_expression_for_filters(
                    parameters, "category/text() = '%s'",
                    "not(category/text() = '%s')"))
            utils.dom_utils.delete_nodes(nodes_to_delete)

        # replace video links, they must be processed by getContent
        for node in xpath(dom, "//link|//guid"):
            handler_url = self.get_handler_url_with_parameters(
                {"url": cast(str, node.text)})
            node.text = "%s" % handler_url

        return to_string(dom)
示例#11
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed and clean the encoded content of every item.

        Links are prefixed with the handler url, the "?p=" guid elements are
        removed, and the text of each item child whose tag ends with
        "encoded" is wrapped in a blog-post div and passed to _clean_content.

        Args:
            parameters: request parameters (not read here)
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        r = session.get(url=self.get_rss_url(), headers={})

        # force encoding
        r.encoding = "utf-8"
        feed = r.text.replace("<link>", "<link>%s?url=" % self.url_prefix)
        feed = re.sub(
            r'<guid isPermaLink="false">https://lesjoiesducode.fr/\?p=[^<]*</guid>',
            r"", feed)

        # I probably do not use etree as I should
        feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()

        dom = etree.fromstring(feed)
        for item in xpath(dom, "//item"):
            # The content:encoded tag is matched on its suffix instead of
            # through a namespaced xpath. Iterating the element replaces the
            # deprecated getchildren().
            for child in item:
                # comments and processing instructions have a non-string tag
                if not isinstance(child.tag, str):
                    continue
                if child.tag.endswith("encoded"):
                    # an empty element has text None: treat it as ""
                    c = self._clean_content('<div class="blog-post">' +
                                            (child.text or "") + '</div>')
                    child.text = c  # "<![CDATA[" + c + "]]>"

        return to_string(dom)
示例#12
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, drop guids, filter and rewrite item links.

        <guid> elements are removed from the raw text. When a "filter"
        parameter is given, the nodes selected by the generated xpath
        expression are deleted. Every item link is replaced by the matching
        handler url.

        Args:
            parameters: request parameters; "filter" is optional
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        raw_feed = session.get(url=self.get_rss_url(), headers={}).text
        without_guid = re.sub(r'<guid>[^<]*</guid>', '', raw_feed)

        # I probably do not use etree as I should
        dom = etree.fromstring(
            re.sub(r'<\?xml [^>]*?>', '', without_guid).strip())

        if "filter" in parameters:
            # filter only on passed category
            nodes_to_delete = dom.xpath(
                utils.dom_utils.get_xpath_expression_for_filters(
                    parameters, "link[contains(text(), '/%s/')]",
                    "not(link[contains(text(), '/%s/')])"))
            utils.dom_utils.delete_nodes(nodes_to_delete)

        for link in xpath(dom, "//item/link"):
            article_url = cast(str, link.text).strip()
            link.text = self.get_handler_url_with_parameters(
                {"url": article_url})

        return to_string(dom)
示例#13
0
    def _add_thumbnail_in_description(self, item: etree._Element,
                                      description: etree._Element,
                                      parameters: Dict[str, str],
                                      thumbnail_url: str) -> str:
        """Make sure the description shows a picture and return its URL.

        When the description has text but no picture of its own, an <img> is
        appended to it, using the given thumbnail or, failing that, a
        thumbnails url built from the item title. When the description
        already has a picture, that picture's URL is returned instead.

        Args:
            item: the feed item owning the description
            description: the description node (may be modified in place)
            parameters: request parameters; "nsfw" and "translateto" are read
            thumbnail_url: thumbnail already known for the item, "" if none

        Returns:
            str: the URL of the picture associated with the description
        """
        nsfw: str = parameters["nsfw"] if "nsfw" in parameters else "false"
        img_url: str = thumbnail_url

        if description.text is not None:
            found_url: str = self._get_thumbnail_url_from_description(
                description)
            if found_url != "":
                img_url = found_url
            else:
                # if description does not have a picture, add one from enclosure or media:content tag if any
                title_node: etree._Element = cast(etree._Element,
                                                  self.get_title(item))
                if "translateto" in parameters:
                    translate_dom(title_node, parameters["translateto"])
                if img_url == "":
                    # uses the ThumbnailHandler to fetch an image from google search images
                    title_text = re.sub(r"</?title[^>]*>", "",
                                        to_string(title_node)).strip()
                    img_url = "%s/thumbnails?request=%s&blur=%s" % (
                        self.serving_url_prefix, quote_plus(title_text), nsfw)

                picture = etree.Element("img")
                picture.set("src", img_url)
                description.append(picture)

        # blur description images
        if nsfw == "true":
            self._manage_blur_image_link(item, description)

        return img_url
示例#14
0
    def get_feed(self, parameters: dict, session: requests.Session) -> str:
        """Download the feed, apply the category filter and rewrite links.

        When a "filter" parameter is given, the nodes selected by the
        generated xpath expression are deleted. Every <link> and <guid> text
        is then replaced by the matching handler url, and literal "\\u0027" /
        "\\u0022" escape sequences left in the output are turned into quotes.

        Args:
            parameters: request parameters; "filter" is optional
            session: session used to download the feed

        Returns:
            str: the serialized feed
        """
        # local import: this method did not rely on "re" before
        import re

        feed = session.get(url=self.get_rss_url(), headers={}).text

        # I probably do not use etree as I should
        # Strip any XML declaration, not one exact spelling: lxml refuses str
        # input that still carries an encoding declaration.
        feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
        dom = etree.fromstring(feed)

        if "filter" in parameters:
            # filter only on passed category, eg /eurosport/rss/tennis
            xpath_expression = utils.dom_utils.get_xpath_expression_for_filters(
                parameters, "category/text() = '%s'",
                "not(category/text() = '%s')")

            utils.dom_utils.delete_nodes(dom.xpath(xpath_expression))

        # replace video links, they must be processed by getContent
        for node in xpath(dom, "//link|//guid"):
            node.text = "%s" % self.get_handler_url_with_parameters(
                {"url": cast(str, node.text)})

        # NOTE(review): \u0022 is a double quote but is replaced by a single
        # one; kept as is, presumably to stay safe inside attribute values -
        # confirm.
        feed = to_string(dom).replace("\\u0027", "'").replace("\\u0022", "'")

        return feed
示例#15
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch an article page and extract its title and main content.

        The first <h1> is removed from the page after the title has been
        read, unwanted blocks are deleted, lazy-loaded images get a "src",
        and the first picture carrying a srcset is inserted at the top of the
        article body. Pages without the main s24 content container fall back
        to the <article> based layouts.

        Args:
            url: url of the article
            parameters: request parameters (not read here)
            session: session used to fetch the page

        Returns:
            PyRSSWContent: title + content, with the handler stylesheet
        """

        def first_srcset_url(srcset) -> str:
            # A srcset is "url [descriptor], url [descriptor], ...": only the
            # first url is usable as a plain "src" value.
            candidates = (srcset or "").split()
            return candidates[0].rstrip(",") if candidates else ""

        page = session.get(url=url, headers={})

        dom = etree.HTML(page.text)
        title = utils.dom_utils.get_content(dom, ["//h1"])
        h1s = xpath(dom, "//h1")
        if len(h1s) > 0:
            # sometimes there is 2 h1 for the same title in the page
            h1s[0].getparent().remove(h1s[0])
        imgsrc = ""
        imgs = dom.xpath("//img[@srcset]")
        if len(imgs) > 0:
            # use the first candidate url, as done for data-srcset below,
            # instead of the whole srcset list
            imgsrc = first_srcset_url(imgs[0].get("srcset"))

        utils.dom_utils.delete_xpaths(dom, [
            '//*[@class="s24-art-cross-linking"]',
            '//*[@class="fig-media__button"]', '//*[@class="s24-art-pub-top"]'
        ])

        self._process_dugout(session, dom)

        for img in dom.xpath("//img[@data-srcset]"):
            if "src" not in img.attrib:
                img.attrib["src"] = first_srcset_url(img.get("data-srcset"))

        contents = dom.xpath('//*[@class="s24-art__content s24-art__resize"]')
        if len(contents) > 0:
            if imgsrc != "":
                # ".//" keeps the lookup inside the content node; a leading
                # "//" searches the whole document even when called on a
                # sub-element
                bodies = contents[0].xpath('.//*[@class="s24-art-body"]')
                if len(bodies) > 0:
                    img = etree.Element("img")
                    img.set("src", imgsrc)
                    bodies[0].insert(0, img)
            content = to_string(contents[0])
        else:
            content = utils.dom_utils.get_content(
                dom,
                [
                    # handles golf.lefigaro structure
                    '//article[contains(@class,"fig-content")]',
                    # handles lefigaro.fr/sports
                    '//article[contains(@class,"fig-main")]'
                ])

        content = "%s%s" % (title, content)
        return PyRSSWContent(
            content, """
            #sport24_handler .object-left {
                display: block;
                text-align: center;
                width: auto;
                max-width: fit-content;
                float: left;
                margin: 5px;
            }

            #sport24_handler .object-left img {
                float:none;
                margin:0;
            }

            #sport24_handler .embed {
                clear:both;
            }
            
            #sport24_handler div.object-right {
                text-align:center;
            }
        """)