def arrange(self, parameters: Dict[str, str], contents: str,
            rss_url_prefix: str, favicon_url: str) -> str:
    """Parse the raw feed, rearrange its top-level element and every item,
    and return the serialized result.

    Args:
        parameters: query parameters forwarded to the item/link arrangers
        contents: raw feed XML
        rss_url_prefix: prefix used when rewriting feed links
        favicon_url: favicon to inject in the top-level element

    Returns:
        str: the rearranged feed, or the (possibly partially processed)
        input if the XML cannot be parsed.
    """
    # Fix: the original assigned `result: str = contents` twice (once before
    # and once inside the try block); the duplicate was dead code.
    result: str = contents
    try:
        if len(result.strip()) > 0:
            # I probably do not use etree as I should: strip the xml
            # declaration and stylesheet PIs before parsing.
            result = re.sub(r'<\?xml [^>]*?>', '', result).strip()
            result = re.sub(r'<\?xml-stylesheet [^>]*?>', '', result).strip()
            dom = etree.fromstring(result)
            self.arrange_feed_top_level_element(dom, rss_url_prefix,
                                                parameters, favicon_url)
            for item in self.get_items(dom):
                self._arrange_item(item, parameters)
                self._arrange_feed_link(item, parameters)
            result = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
                to_string(dom)
    except etree.XMLSyntaxError as e:
        # Best effort: keep whatever `result` holds and just log the failure.
        logging.getLogger().info(
            "[ %s ] - Unable to parse rss feed for module '%s' (%s), let's proceed anyway",
            datetime.datetime.now().strftime("%Y-%m-%d - %H:%M"),
            self.module_name, str(e))
    return result
def _manage_reddit_preview_images(self, content) -> str:
    """Use directly the image instead of the preview.

    Args:
        content ([type]): html content

    Returns:
        str: the content where preview images have been replaced by target
    """
    content_without_preview: str = content
    img_previews = re.findall(IMG_PREVIEW_REDDIT, content)
    for preview in img_previews:
        # Fix: accumulate replacements on the working copy. The original
        # replaced in `content` each time, so when several previews matched
        # only the last substitution survived.
        content_without_preview = content_without_preview.replace(
            preview[0], "https://i.redd.it/%s" % preview[1])
    dom = etree.HTML(content_without_preview)
    # Replace anchors pointing to preview.redd.it with inline <img> tags
    # targeting the full-size i.redd.it picture.
    for a in xpath(dom, "//a"):
        if "href" in a.attrib and a.attrib["href"].find(
                "://preview.redd.it/") > -1:
            img = etree.Element("img")
            img.set(
                "src",
                a.attrib["href"].replace("preview.redd.it", "i.redd.it"))
            a.getparent().append(img)
            a.getparent().remove(a)
    content_without_preview = to_string(dom)
    return content_without_preview
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the feed, turn every <guid> into a handler <link>, and
    optionally keep only the items of the requested category."""
    raw = session.get(url=self.get_rss_url(), headers={}).text
    # drop the original <link> elements, then promote guids to links
    raw = re.sub(r'<link>[^<]*</link>', '', raw)
    link_tag = '<link>'
    for guid_open in ('<guid isPermaLink="false">',
                      '<guid isPermaLink="true">'):
        raw = raw.replace(guid_open, link_tag)
    raw = raw.replace('</guid>', '</link>')
    raw = raw.replace(link_tag, '<link>%s?%surl=' % (
        self.url_prefix, self._getAuthentificationSuffix(parameters)))
    # I probably do not use etree as I should: strip the xml declaration
    raw = re.sub(r'<\?xml [^>]*?>', '', raw).strip()
    dom = etree.fromstring(raw)
    # available filters : international, politique, societe, les-decodeurs, sport, planete, sciences, campus, afrique, pixels, actualites-medias, sante, big-browser, disparitions, podcasts
    if "filter" in parameters:
        # keep only the items whose link contains the requested category
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "link[contains(text(), '/%s/')]",
            "not(link[contains(text(), '/%s/')])")
        utils.dom_utils.delete_nodes(dom.xpath(expression))
    return to_string(dom)
def _arrange_item(self, item: etree._Element, parameters: dict):
    """Rebuild the item description: inject a thumbnail, flatten its XML
    children into escaped text, optionally translate it, and append debug
    information when requested.

    Args:
        item: the feed item element to rework (mutated in place)
        parameters: query parameters ("translateto", "debug", ...)
    """
    descriptions = self.get_descriptions(item)
    thumbnail_url: str = self.get_img_url(item)
    if len(descriptions) > 0:
        description: etree._Element = descriptions[0]
        img_url = self._add_thumbnail_in_description(
            item, description, parameters, thumbnail_url)
        if thumbnail_url == "" and img_url != "":
            self.set_thumbnail_item(item, img_url)
        """
        n = self._get_source(item)
        if n is not None:
            description.append(n)
        """
        # Serialize the description's text + children into one string so the
        # rebuilt description holds plain (unescaped) HTML text.
        description_xml: str = ""
        if descriptions[0].text is not None:
            description_xml = descriptions[0].text
        for child in descriptions[0].getchildren():
            description_xml += to_string(child)
        parent_obj = descriptions[0].getparent()
        parent_obj.remove(descriptions[0])
        description = etree.Element(descriptions[0].tag)  # "description")
        # replace non-breaking spaces by regular spaces
        description.text = html.unescape(description_xml.strip()).replace(
            "\xa0", " ")
        if "translateto" in parameters:
            dom = etree.HTML(description.text)
            translate_dom(dom, parameters["translateto"])
            description.text = to_string(dom)
        parent_obj.append(description)
        if "debug" in parameters and parameters["debug"] == "true":
            p = etree.Element("p")
            i = etree.Element("i")
            i.text = "Session id: %s" % self.session_id
            p.append(i)
            # Fix: append the debug paragraph to the rebuilt `description`
            # element. The original appended to `descriptions[0]`, which was
            # already detached from the tree above, so the debug information
            # was silently lost.
            description.append(p)
def _get_thumbnail_url_from_description(
        self, description: etree._Element) -> str:
    """Return the URL of the first picture found in the description,
    or "" when none is found.

    Args:
        description: the description element to inspect

    Returns:
        str: the thumbnail URL, possibly empty
    """
    thumbnail_url: str = ""
    imgs = xpath(description, ".//img")
    if len(imgs) > 0:
        # Fix: standard HTML <img> tags carry a "src" attribute, not "url";
        # indexing attrib["url"] raised KeyError on regular images. Keep the
        # original "url" lookup first for backward compatibility, then fall
        # back to "src".
        thumbnail_url = imgs[0].get("url", "") or imgs[0].get("src", "")
    else:
        # no <img> tag: try to extract a picture URL from the raw markup
        m = re.match(IMG_URL_REGEX, to_string(description))
        if m is not None:
            thumbnail_url = m.group(1)
    return thumbnail_url
def _post_processing(self, url: str):
    """Run the post-processing pipeline on self.contents: tweet handling,
    URL prefixing, translation, then cleanup of wrapper tags."""
    if not self.contents.strip():
        return
    dom = etree.HTML(self.contents)
    self._post_process_tweets(dom)
    self._replace_prefix_urls(dom)
    self._manage_translation(dom, url)
    cleaned = to_string(dom)
    # strip the html/body wrappers added by serialization and apply the
    # remaining textual fixups in order
    for old, new in (
            ("<html>", ""),
            ("</html>", ""),
            ("<body>", ""),
            ("</body>", ""),
            ("<video", "<video preload=\"none\""),
            ("data-src-lazyload", "src"),
            ("</br>", ""),
    ):
        cleaned = cleaned.replace(old, new)
    self.contents = cleaned
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the Sport24 feed (optionally for one sport), filter items,
    rewrite links through the handler and adjust the feed title.

    Args:
        parameters: query parameters; "filter" selects a sport category or
            "flash" for enclosure-only items
        session: HTTP session used to fetch the feed

    Returns:
        str: the reworked feed
    """
    # Fix: the original tested `parameters["filter"] == ("tennis" or
    # "football" or ...)`, which evaluates to `== "tennis"` only.
    if "filter" in parameters and parameters["filter"] in (
            "tennis", "football", "rugby", "cyclisme", "golf"):
        # filter only on passed category, eg /sport24/rss/tennis
        feed = session.get(url=self.get_rss_url() % parameters["filter"],
                           headers={}).text
    else:
        feed = session.get(url=self.get_rss_url() % "accueil",
                           headers={}).text
    # I probably do not use etree as I should
    feed = feed.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    # Fix: escape bare ampersands so the XML parses. The original substituted
    # "&" for "&", a no-op; the negative lookahead shows "&amp;" was intended.
    regex = re.compile(r"&(?!amp;|lt;|gt;)")
    myxml = regex.sub("&amp;", feed)
    dom = etree.fromstring(myxml)
    description_img: str = ""
    xpath_expression = "//item[not(enclosure)]"
    if "filter" in parameters and parameters["filter"] == "flash":
        # "flash" keeps only enclosure items and brands their description
        xpath_expression = "//item[enclosure]"
        description_img = "<img src=\"https://pbs.twimg.com/profile_images/932616523285516294/sqt32oQY.jpg\"/>"
    utils.dom_utils.delete_nodes(dom.xpath(xpath_expression))
    for link in xpath(dom, "//item/link"):
        if link is not None and text(link) is not None:
            link.text = self.get_handler_url_with_parameters(
                {"url": text(link).strip()})
    feed = to_string(dom)
    title = ""
    if "filter" in parameters:
        title = " - " + parameters["filter"]
    feed = feed.replace(
        "<title>Sport24 - Toute l'actualite</title>",
        "<title>Sport24%s</title>" % string.capwords(title))
    if description_img != "":
        feed = feed.replace("<description>",
                            "<description>" + description_img)
    return feed
def get_feed(self, parameters: dict, session: Session) -> str:
    """Fetch a reddit atom feed (optionally for a given subreddit), swap
    thumbnails for full pictures, and route entry links through the handler."""
    rss_url: str = self.get_rss_url()
    if "sub" in parameters:
        rss_url = "https://www.reddit.com/r/%s/.rss" % parameters["sub"]
    raw = session.get(url=rss_url, headers={}).text
    # I probably do not use etree as I should
    raw = re.sub(r'<\?xml [^>]*?>', '', raw).strip()
    dom = etree.fromstring(raw)
    for entry in xpath(dom, "//atom:entry", namespaces=NAMESPACES):
        content_node = xpath(entry, "./atom:content",
                             namespaces=NAMESPACES)[0]
        entry_html = cast(str, content_node.text)
        # try to replace thumbnail with real picture
        thumbnail: str = ""
        full_picture: str = ""
        for candidate in re.findall(r'"http[^"]*jpg"', entry_html):
            if "thumbs.redditmedia" in candidate:
                thumbnail = candidate
            else:
                full_picture = candidate
        if thumbnail != "" and full_picture != "":
            content_node.text = entry_html.replace(
                thumbnail, full_picture).replace("<td>  ",
                                                 "</tr><tr><td>  ")
        for link in xpath(entry, "./atom:link", namespaces=NAMESPACES):
            link.attrib["href"] = self.get_handler_url_with_parameters(
                {"url": cast(str, link.attrib["href"].strip())})
    return to_string(dom)
def get_reddit_content(self, url: str, session: Session,
                       with_comments: bool) -> PyRSSWContent:
    """Build the HTML content of a reddit post from its JSON representation.

    Fetches `<url>/.json`, renders every "t3" (post) entry, and optionally
    appends the "t1" (comment) entries.

    Args:
        url: reddit post URL (without the trailing /.json)
        session: HTTP session used to fetch the JSON
        with_comments: when True, append a Comments section

    Returns:
        PyRSSWContent: the rendered <article> content
    """
    content: str = ""
    page = session.get(url="%s/.json" % url, headers=self._get_headers())
    json_content = page.content
    try:
        root = json.loads(json_content)
    except JSONDecodeError as _:
        # not JSON (error page?): surface the status code and the raw HTML
        content = "<strong>Status code: %d<br/></strong>" % page.status_code
        content += to_string(etree.HTML(page.content, parser=None))
        root = {}
    datatypes = self._get_datatypes_json(root, "t3")  # t3 : content
    for data in datatypes:
        content += "<h1>%s</h1>" % get_node_value_if_exists(data, "title")
        self_html: str = get_node_value_if_exists(data, "selftext_html")
        post_hint: str = get_node_value_if_exists(data, "post_hint")
        # a post counts as removed when either removal field is set
        removed_by: str = get_node_value_if_exists(
            data, "removed_by") + get_node_value_if_exists(
                data, "removed_by_category")
        if removed_by == "":
            # NOTE(review): this assignment overwrites the <h1> accumulated
            # just above (and any previous t3 entry) — presumably
            # _get_content_from_data returns the full body including the
            # title; confirm whether `+=` was intended here.
            content = self._get_content_from_data(data=data,
                                                  session=session,
                                                  self_html=self_html,
                                                  post_hint=post_hint)
        else:
            content = "Content removed"
    comments: str = ""
    if with_comments:
        comments = "<hr/><h2>Comments</h2>"
        comments_json = self._get_datatypes_json(root, "t1")  # t1 : comments
        for comment_json in comments_json:
            comments += self.get_comments(comment_json)
    content = "<article>%s%s</article>" % (content, comments)
    return PyRSSWContent(content)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the feed, filter by category if requested, and rewrite every
    link/guid so it goes through this handler."""
    raw = session.get(url=self.get_rss_url()).text
    # I probably do not use etree as I should: strip the xml declaration
    raw = re.sub(r'<\?xml [^>]*?>', '', raw).strip()
    dom = etree.fromstring(raw)
    if "filter" in parameters:
        # keep only the items of the requested category
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "category/text() = '%s'",
            "not(category/text() = '%s')")
        utils.dom_utils.delete_nodes(dom.xpath(expression))
    # replace video links, they must be processed by getContent
    for node in xpath(dom, "//link|//guid"):
        node.text = "%s" % self.get_handler_url_with_parameters(
            {"url": cast(str, node.text)})
    return to_string(dom)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the lesjoiesducode feed, rewrite links through the handler and
    clean every content:encoded body.

    Args:
        parameters: query parameters (unused here)
        session: HTTP session used to fetch the feed

    Returns:
        str: the reworked feed
    """
    r = session.get(url=self.get_rss_url(), headers={})
    # force encoding
    r.encoding = "utf-8"
    feed = r.text.replace("<link>", "<link>%s?url=" % self.url_prefix)
    feed = re.sub(
        r'<guid isPermaLink="false">https://lesjoiesducode.fr/\?p=[^<]*</guid>',
        r"", feed)
    # I probably do not use etree as I should
    feed = re.sub(r'<\?xml [^>]*?>', '', feed).strip()
    dom = etree.fromstring(feed)
    for item in xpath(dom, "//item"):
        # did not find how to xpath content:encoded tag
        for child in item.getchildren():
            if child.tag.endswith("encoded"):
                # Fix: guard against empty nodes — child.text is None for an
                # empty element and the original concatenation raised
                # TypeError.
                c = self._clean_content('<div class="blog-post">' +
                                        (child.text or "") + '</div>')
                child.text = c  # "<![CDATA[" + c + "]]>"
    return to_string(dom)
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the feed, drop guids, filter by category if requested, and
    rewrite item links through this handler."""
    raw = session.get(url=self.get_rss_url(), headers={}).text
    raw = re.sub(r'<guid>[^<]*</guid>', '', raw)
    # I probably do not use etree as I should: strip the xml declaration
    raw = re.sub(r'<\?xml [^>]*?>', '', raw).strip()
    dom = etree.fromstring(raw)
    if "filter" in parameters:
        # keep only the items whose link contains the requested category
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "link[contains(text(), '/%s/')]",
            "not(link[contains(text(), '/%s/')])")
        utils.dom_utils.delete_nodes(dom.xpath(expression))
    for link in xpath(dom, "//item/link"):
        link.text = self.get_handler_url_with_parameters(
            {"url": cast(str, link.text).strip()})
    return to_string(dom)
def _add_thumbnail_in_description(self, item: etree._Element,
                                  description: etree._Element,
                                  parameters: Dict[str, str],
                                  thumbnail_url: str) -> str:
    """Make sure the description carries a picture and return its URL.

    When the description has no picture, one is appended: either the
    provided thumbnail_url or a ThumbnailHandler lookup built from the
    item title. Images are blurred when nsfw is requested.

    Args:
        item: the feed item owning the description
        description: the description element (mutated in place)
        parameters: query parameters ("nsfw", "translateto", ...)
        thumbnail_url: candidate picture URL, may be empty

    Returns:
        str: the URL of the picture now present in the description
    """
    img_url: str = thumbnail_url
    nsfw: str = parameters.get("nsfw", "false")
    if description.text is None:
        return img_url
    existing_thumbnail: str = self._get_thumbnail_url_from_description(
        description)
    if existing_thumbnail != "":
        img_url = existing_thumbnail
    else:
        # the description has no picture: add one from enclosure or
        # media:content tag if any
        title_node: etree._Element = cast(etree._Element,
                                          self.get_title(item))
        if "translateto" in parameters:
            translate_dom(title_node, parameters["translateto"])
        if img_url == "":
            # uses the ThumbnailHandler to fetch an image from google
            # search images
            img_url = "%s/thumbnails?request=%s&blur=%s" % (
                self.serving_url_prefix,
                quote_plus(
                    re.sub(r"</?title[^>]*>", "",
                           to_string(title_node)).strip()), nsfw)
        img = etree.Element("img")
        img.set("src", img_url)
        description.append(img)
    # blur description images
    if nsfw == "true":
        self._manage_blur_image_link(item, description)
    return img_url
def get_feed(self, parameters: dict, session: requests.Session) -> str:
    """Fetch the Eurosport feed, filter by category if requested, and
    rewrite every link/guid so it goes through this handler."""
    raw = session.get(url=self.get_rss_url(), headers={}).text
    # I probably do not use etree as I should
    raw = raw.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    dom = etree.fromstring(raw)
    if "filter" in parameters:
        # filter only on passed category, eg /eurosport/rss/tennis
        expression = utils.dom_utils.get_xpath_expression_for_filters(
            parameters, "category/text() = '%s'",
            "not(category/text() = '%s')")
        utils.dom_utils.delete_nodes(dom.xpath(expression))
    # replace video links, they must be processed by getContent
    for node in xpath(dom, "//link|//guid"):
        # if link.text.find("/video.shtml") > -1:
        node.text = "%s" % self.get_handler_url_with_parameters(
            {"url": cast(str, node.text)})
    return to_string(dom).replace("\\u0027", "'").replace("\\u0022", "'")
def get_content(self, url: str, parameters: dict,
                session: requests.Session) -> PyRSSWContent:
    """Scrape a Sport24/Figaro article page and return its cleaned content
    (title + body, with the srcset image re-injected) plus handler CSS.

    Args:
        url: article URL to fetch
        parameters: query parameters (unused here)
        session: HTTP session used to fetch the page

    Returns:
        PyRSSWContent: the article content and its stylesheet
    """
    page = session.get(url=url, headers={})
    dom = etree.HTML(page.text)
    title = utils.dom_utils.get_content(dom, ["//h1"])
    h1s = xpath(dom, "//h1")
    if len(h1s) > 0:  # sometimes there is 2 h1 for the same title in the page
        h1s[0].getparent().remove(h1s[0])
    # remember the first srcset image so it can be re-inserted in the body
    imgsrc = ""
    imgs = dom.xpath("//img[@srcset]")
    if len(imgs) > 0:
        imgsrc = imgs[0].get("srcset")
    # strip cross-linking blocks, media buttons and top ads
    utils.dom_utils.delete_xpaths(dom, [
        '//*[@class="s24-art-cross-linking"]',
        '//*[@class="fig-media__button"]',
        '//*[@class="s24-art-pub-top"]'
    ])
    self._process_dugout(session, dom)
    # materialize lazy-loaded images: promote data-srcset to src
    for img in dom.xpath("//img[@data-srcset]"):
        if "src" not in img.attrib:
            img.attrib["src"] = img.get("data-srcset").split(" ")[0]
    contents = dom.xpath('//*[@class="s24-art__content s24-art__resize"]')
    if len(contents) > 0:
        if imgsrc != "":
            # NOTE(review): '//' xpath on an element searches the whole
            # document, not just contents[0] — presumably equivalent here;
            # confirm there is only one s24-art-body per page.
            bodies = contents[0].xpath('//*[@class="s24-art-body"]')
            if len(bodies) > 0:
                img = etree.Element("img")
                img.set("src", imgsrc)
                bodies[0].insert(0, img)
        content = to_string(contents[0])
    else:
        # fallback for figaro-style pages
        content = utils.dom_utils.get_content(
            dom,
            [
                # handles golf.lefigaro structure
                '//article[contains(@class,"fig-content")]',
                # handles lefigaro.fr/sports
                '//article[contains(@class,"fig-main")]'
            ])
    content = "%s%s" % (title, content)
    return PyRSSWContent(
        content, """
#sport24_handler .object-left {
    display: block;
    text-align: center;
    width: auto;
    max-width: fit-content;
    float: left;
    margin: 5px;
}

#sport24_handler .object-left img {
    float:none;
    margin:0;
}

#sport24_handler .embed {
    clear:both;
}

#sport24_handler div.object-right {
    text-align:center;
}
""")