def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Build readable evilmilk article content: strip site chrome and fix media tags."""
    response = session.get(url=url, headers={})
    dom = etree.HTML(response.text)
    title = utils.dom_utils.get_content(dom, ["//h1"])

    # Drop navigation, comments, modals and other boilerplate.
    utils.dom_utils.delete_xpaths(dom, [
        '//*[@class="content-info"]',
        '//*[@class="modal"]',
        '//*[@class="comments text-center"]',
        '//*[@id="undercomments"]',
        '//*[@style="padding:10px"]',
        '//*[@class="hrdash"]',
        '//*[@class="row heading bottomnav"]',
        '//*[@id="picdumpnav"]',
        '//*[@class="container-fluid"]',
        '//*[@id="myModal"]'
    ])

    # Prefer the dedicated main body node, fall back to the full document.
    main_bodies = dom.xpath('//*[@id="mainbody"]')
    source_node = main_bodies[0] if len(main_bodies) > 0 else dom
    content = self._replace_urls(etree.tostring(source_node, encoding='unicode'))
    content = self._clean_content(content)

    # Make videos responsive with controls; drop autoplay/inline attributes.
    content = content.replace("<video ", "<video width=\"100%\" controls ")
    content = content.replace('autoplay=""', '').replace('playsinline=""', '')
    # Absolutize relative poster URLs.
    content = re.sub(r'poster=(["\'])/',
                     r'poster=\1https://www.evilmilk.com/', content)

    return PyRSSWContent("%s%s" % (title, content), """
#pyrssw_wrapper #evilmilk_handler div img { max-height: 90vh;margin: 0 auto; display: block;}
#pyrssw_wrapper #evilmilk_handler div video {height: 90vh;}
""")
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Extract the detail description and infos, inlining carousel images at full size."""
    content: str = ""
    content = session.get(url=url).text
    dom = etree.HTML(content)
    if dom is not None:
        descriptions = dom.xpath(
            "//*[contains(@class, \"detailDescSummary\")]")
        if len(descriptions) > 0:
            # Move carousel images into the description node so the
            # readability extraction keeps them.
            target = descriptions[0]
            target.append(etree.Element("br"))
            for cpt, li in enumerate(
                    dom.xpath("//li[contains(@class,\"carouselListItem\")][@data-srco]"),
                    start=1):
                picture = etree.Element("img")
                # Upgrade thumbnail URL to the full-size variant.
                picture.attrib["src"] = li.attrib["data-srco"].replace(
                    "182x136", "800x600")
                picture.attrib["alt"] = "Images #%d" % cpt
                target.append(picture)
                target.append(etree.Element("br"))
                target.append(etree.Element("br"))
        content = utils.dom_utils.get_content(
            dom, ['//*[contains(@class, "detailDescSummary")]'])
        content += utils.dom_utils.get_content(
            dom, ['//*[contains(@class, "detailInfos")]'])

    return PyRSSWContent("""
<div class=\"main-content\">
    %s
</div>""" % (content))
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Extract the logicimmo offer description and block, with full-size images."""
    content: str = ""
    # For some reasons logicimmo website does not work with sessions.
    page = requests.get(url=url)
    dom = etree.HTML(page.text)
    if dom is not None:
        descriptions = dom.xpath(
            "//div[@class=\"offer-description-text\"]")
        if len(descriptions) > 0:
            # Move images into the description node (readable by readability).
            target = descriptions[0]
            for cpt, img in enumerate(
                    dom.xpath("//img[contains(@src,'182x136')]"), start=1):
                picture = etree.Element("img")
                # Upgrade thumbnail URL to the full-size variant.
                picture.attrib["src"] = img.attrib["src"].replace(
                    "182x136", "800x600")
                picture.attrib["alt"] = "Images #%d" % cpt
                target.append(picture)
                target.append(etree.Element("br"))
                target.append(etree.Element("br"))

        # Remove original nodes containing photos.
        for carousel in dom.xpath("//*[contains(@class,\"carousel-wrapper\")]"):
            carousel.getparent().remove(carousel)
        utils.dom_utils.delete_xpaths(dom, [
            '//*[@id="photo"]',
            '//button'
        ])

        content = utils.dom_utils.get_content(
            dom, ['//*[contains(@class, "offer-block")]']).replace(
                "182x136", "800x600")
        content += utils.dom_utils.get_content(
            dom, ['//*[contains(@class, "offer-description")]'])

    return PyRSSWContent("""
<div class=\"main-content\">
    %s
</div>""" % (content))
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Delegate to the generic readability extraction, using browser-like headers."""
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cache-Control": "no-cache",
        "Content-Encoding": "identity",
        "Accept-Charset": "utf-8",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
        "Connection": "keep-alive",
        "Pragma": "no-cache"
    }
    readable = self.get_readable_content(session, url, headers=browser_headers)
    return PyRSSWContent(readable, "")
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Aggregate up to three paginated izismile pages into one content blob."""
    content, page2_url = self._get_content(url, session, True)
    if page2_url != "":
        # Add a page 2.
        page2_content, page3_url = self._get_content(page2_url, session)
        content += page2_content
        # Add a page 3 — sometimes there is a redirection with an
        # ongoing page, so guard against loops and "page,1," redirects.
        if page3_url != "" and page2_url != page3_url \
                and page3_url.find("page,1,") == -1:
            page3_content, _ = self._get_content(page3_url, session)
            content += page3_content

    return PyRSSWContent(
        content, """
#pyrssw_wrapper #izismile_handler div img {float:none; max-height: 90vh;margin: 0 auto; display: block;}
""")
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Render an ad from the embedded __NEXT_DATA__ JSON blob as HTML."""
    content: str = ""
    page = session.get(url=url)
    json_obj: Optional[dict] = self._load_json(
        page.text, "__NEXT_DATA__\" type=\"application/json\">")

    # Navigate defensively down to the "ad" node.
    has_ad = (json_obj is not None
              and "props" in json_obj["root"]
              and "pageProps" in json_obj["root"]["props"]
              and "ad" in json_obj["root"]["props"]["pageProps"])
    if has_ad:
        ad = json_obj["root"]["props"]["pageProps"]["ad"]
        parts = []
        parts.append("<p><b>%s</b></p>" % get_node_value_if_exists(ad, "subject"))
        parts.append("<p><b>%s</b></p>" % self._get_price(ad))
        parts.append("<p>%s</p>" % self._get_location(ad))
        parts.append("<hr/>")
        parts.append("<b>%s</b>" % get_node_value_if_exists(ad, "body"))
        parts.append("<hr/>")
        _, other_imgs = self._process_images(ad)
        parts.append(other_imgs)
        parts.append("<hr/>")
        parts.append("<p>%s</p>" % get_node_value_if_exists(ad, "category_name"))
        parts.append("<hr/>")
        # Append every labelled attribute as its own paragraph.
        for attribute in ad.get("attributes", []):
            key_label: str = get_node_value_if_exists(attribute, "key_label")
            if key_label != "":
                parts.append("<p><strong>%s</strong>: %s</p>" % (
                    key_label,
                    get_node_value_if_exists(attribute, "value_label")))
        content = "".join(parts)

    return PyRSSWContent("""
<div class=\"main-content\">
    %s
</div>""" % (content))
def get_reddit_content(self, url: str, session: Session, with_comments: bool) -> PyRSSWContent:
    """Build the HTML content of a reddit post, optionally with its comments.

    Fetches the JSON representation of the post (``<url>/.json``). If the
    response is not valid JSON (rate limited, blocked, ...), the HTTP status
    code and the raw page are used as a diagnostic fallback.
    """
    content: str = ""
    page = session.get(url="%s/.json" % url, headers=self._get_headers())
    try:
        root = json.loads(page.content)
    except JSONDecodeError:
        content = "<strong>Status code: %d<br/></strong>" % page.status_code
        content += to_string(etree.HTML(page.content, parser=None))
        root = {}

    for data in self._get_datatypes_json(root, "t3"):  # t3 : content
        content += "<h1>%s</h1>" % get_node_value_if_exists(data, "title")
        self_html: str = get_node_value_if_exists(data, "selftext_html")
        post_hint: str = get_node_value_if_exists(data, "post_hint")
        removed_by: str = get_node_value_if_exists(
            data, "removed_by") + get_node_value_if_exists(
                data, "removed_by_category")
        # Bug fix: append instead of assigning — assignment discarded the
        # <h1> title just added above (and the decode-error fallback).
        if removed_by == "":
            content += self._get_content_from_data(data=data,
                                                   session=session,
                                                   self_html=self_html,
                                                   post_hint=post_hint)
        else:
            content += "Content removed"

    comments: str = ""
    if with_comments:
        comments = "<hr/><h2>Comments</h2>"
        for comment_json in self._get_datatypes_json(root, "t1"):  # t1 : comments
            comments += self.get_comments(comment_json)

    return PyRSSWContent("<article>%s%s</article>" % (content, comments))
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Extract the summary/description of an article, inlining background images."""
    self._update_headers(session)
    page = session.get(url=url)
    dom = etree.HTML(page.text)

    # Strip interactive chrome (bookmark/tag widgets, buttons, icons).
    utils.dom_utils.delete_xpaths(dom, [
        '//*[contains(@class, "BookmarkButtonstyled")]',
        '//*[contains(@class, "TagsWithIcon")]',
        '//button',
        '//svg'
    ])

    # Move images into a readable node (see readability).
    paragraphs = dom.xpath("//*[contains(@class,\"ShowMoreText\")]//p")
    if len(paragraphs) > 0:
        target = paragraphs[0]
        for cpt, div in enumerate(dom.xpath("//div[@data-background]"),
                                  start=1):
            picture = etree.Element("img")
            picture.attrib["src"] = div.attrib["data-background"]
            picture.attrib["alt"] = "Images #%d" % cpt
            target.append(picture)
            target.append(etree.Element("br"))
            target.append(etree.Element("br"))

    content = utils.dom_utils.get_content(dom, [
        '//*[contains(@data-test, "summary")]',
        "//*[contains(@class,\"ann_expiree g-vspace-400\")]"  # expired article
    ])
    content += utils.dom_utils.get_content(
        dom, ['//*[@id="showcase-description"]'])

    if content == "":
        raise Exception("Unable to get content: blacklisted?")

    return PyRSSWContent(content)
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Fetch a (possibly paywalled) article, always releasing authentication."""
    self._authent(parameters, session)
    try:
        page = session.get(url=url)
        content = page.text
        dom = etree.HTML(content)

        # Remove social/sharing widgets, navigation and other chrome.
        utils.dom_utils.delete_xpaths(dom, [
            '//*[contains(@class, "meta__social")]',
            '//*[contains(@class, "breadcrumb")]',
            '//*[contains(@class, "article__reactions")]',
            '//*[contains(@class, "services")]',
            '//*[contains(@class, "article__footer-single")]',
            '//*[contains(@class, "wp-socializer")]',
            '//*[contains(@class, "insert")]',
            '//*[@id="comments"]',  # blog
            '//*[contains(@class, "post-navigation")]',  # blog
            '//*[contains(@class, "entry-footer")]',  # blog
            '//*[contains(@class, "catcher")]',  # tribune
            '//aside'
        ])
        self.process_pictures(dom)
        self.process_inread(dom)

        # Le monde rss provides many sub websites with
        # different html architecture.
        content = utils.dom_utils.get_content(dom, [
            '//*[contains(@class, "zone--article")]',
            '//*[contains(@class, "article--content")]',  # tribune
            '//*[@id="post-container"]',
            '//*[@id="main"]'  # blog
        ])
    finally:
        # Always release the authenticated session state.
        self._unauthent(session)

    return PyRSSWContent(content)
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Extract the article column, reworking lazy-loaded responsive images."""
    page = session.get(url=url)
    # Pre-process the raw HTML: split tags onto lines, drop inline
    # base64 placeholders and activate lazy-loaded sources.
    raw = page.text.replace(">", ">\n")
    raw = re.sub(r'src="data:image[^"]*', '', raw)
    raw = raw.replace("data-src", "style='height:100%;width:100%' src")
    raw = raw.replace('data-fs-media', '')
    raw = raw.replace('class="fs-media"', '')
    dom = etree.HTML(raw)

    # Rework images: re-attach a plain <img> five levels up so it
    # survives the content extraction below.
    for img in dom.xpath('//img[contains(@class, "img-responsive")]'):
        plain_img = etree.Element("img")
        plain_img.set("src", img.attrib["src"])
        ancestor = img
        for _ in range(5):
            ancestor = ancestor.getparent()
        ancestor.append(plain_img)

    title = utils.dom_utils.get_content(dom, ["//h1"])
    utils.dom_utils.delete_xpaths(dom, [
        '//*[contains(@class, "module-toretain")]',
        '//*[contains(@class, "image-module")]',
        '//*[contains(@class, "social-button")]',
        '//section[contains(@class, "breadcrumb")]',
        '//section[contains(@class, "author-box")]',
        '//*[contains(@class, "ICON-QUICKREAD")]/parent::*/parent::*'
    ])

    body = utils.dom_utils.get_content(
        dom, ['//div[contains(@class,"article-column")]'])
    return PyRSSWContent("%s%s" % (title, body))
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Render an ad described by a JSON document as HTML."""
    content: str = ""
    content = session.get(url=url).text
    json_obj = json.loads(content)
    if json_obj is not None:
        parts = [
            "<p><b>%s</b></p>" % get_node_value_if_exists(json_obj, "title"),
            "<p><b>%s</b></p>" % self._get_price(json_obj),
            "<p>%s - %s</p>" % (
                get_node_value_if_exists(json_obj, "postalCode"),
                get_node_value_if_exists(json_obj, "city")),
            "<hr/>",
            "<b>%s</b>" % get_node_value_if_exists(json_obj, "description"),
            "<hr/>",
            self._build_imgs(self._get_img_urls(json_obj)),
        ]
        content = "".join(parts)

    return PyRSSWContent("""
<div class=\"main-content\">
    %s
</div>""" % (content))
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Extract a sport24/lefigaro article, reinjecting the header image."""
    page = session.get(url=url, headers={})
    dom = etree.HTML(page.text)
    title = utils.dom_utils.get_content(dom, ["//h1"])

    # Sometimes there are 2 h1 for the same title in the page:
    # drop the first occurrence since the title is prepended below.
    h1_nodes = xpath(dom, "//h1")
    if len(h1_nodes) > 0:
        h1_nodes[0].getparent().remove(h1_nodes[0])

    # Remember the first srcset image to reinject it into the body.
    header_img_src = ""
    srcset_imgs = dom.xpath("//img[@srcset]")
    if len(srcset_imgs) > 0:
        header_img_src = srcset_imgs[0].get("srcset")

    utils.dom_utils.delete_xpaths(dom, [
        '//*[@class="s24-art-cross-linking"]',
        '//*[@class="fig-media__button"]',
        '//*[@class="s24-art-pub-top"]'
    ])
    self._process_dugout(session, dom)

    # Activate lazy-loaded images (first URL of the srcset).
    for img in dom.xpath("//img[@data-srcset]"):
        if "src" not in img.attrib:
            img.attrib["src"] = img.get("data-srcset").split(" ")[0]

    article_nodes = dom.xpath('//*[@class="s24-art__content s24-art__resize"]')
    if len(article_nodes) > 0:
        if header_img_src != "":
            bodies = article_nodes[0].xpath('//*[@class="s24-art-body"]')
            if len(bodies) > 0:
                header_img = etree.Element("img")
                header_img.set("src", header_img_src)
                bodies[0].insert(0, header_img)
        content = to_string(article_nodes[0])
    else:
        content = utils.dom_utils.get_content(
            dom, [
                # handles golf.lefigaro structure
                '//article[contains(@class,"fig-content")]',
                # handles lefigaro.fr/sports
                '//article[contains(@class,"fig-main")]'
            ])

    return PyRSSWContent(
        "%s%s" % (title, content), """
#sport24_handler .object-left {
    display: block;
    text-align: center;
    width: auto;
    max-width: fit-content;
    float: left;
    margin: 5px;
}
#sport24_handler .object-left img {
    float:none;
    margin:0;
}
#sport24_handler .embed {
    clear:both;
}
#sport24_handler div.object-right {
    text-align:center;
}
""")
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Extract a francetvinfo/france3/la1ere article.

    Pre-processes the raw HTML (activate lazy images, drop base64
    placeholders), removes sharing/navigation chrome, then tries the
    known content containers of each sub-site. Falls back to the
    generic readability extraction when too little content was found.
    """
    page = session.get(url=url)
    content = page.text.replace(">", ">\n")
    content = re.sub(r'src="data:image[^"]*', '', content)
    content = content.replace("data-src",
                              "style='height:100%;width:100%' src")
    dom = etree.HTML(content)

    utils.dom_utils.delete_xpaths(
        dom, [
            '//*[contains(@class, "block-share")]',
            '//*[@id="newsletter-onvousrepond"]',
            '//*[contains(@class, "partner-block")]',
            '//*[contains(@class, "a-lire-aussi")]',
            '//aside[contains(@class, "tags")]',
            '//*[contains(@class, "breadcrumb")]',
            '//*[contains(@class, "col-left")]',
            '//*[contains(@class, "col-right")]',
            '//*[contains(@class, "c-signature")]',
            '//*[contains(@class, "publication-date__modified")]',
            '//*[contains(@class, "social-aside")]',  # france3 regions
            '//*[contains(@class, "aside-img__content")]',  # france3 regions
            '//*[contains(@class, "social-button-content")]',  # france3 regions
            '//*[contains(@class, "tags-button-content")]',  # france3 regions
            '//*[contains(@class, "article-share")]',  # france3 regions
            # fix: the duplicate "article-share-fallback" entry was removed
            '//*[contains(@class, "article-share-fallback")]',  # france3 regions
            '//*[contains(@class, "related-content")]',
            '//*[contains(@class, "article__thematics")]',
            '//*[contains(@class, "article__related ")]',
            '//*[contains(@class, "subjects-title")]',
            '//*[contains(@class, "subjects-list")]',
            '//*[contains(@class, "audio-component")]',
            '//*[contains(@class, "social-zone")]',
            '//*[contains(@class, "c-signature__images")]',
            '//*[contains(@class, "article__share")]',
            '//*[contains(@class, "audio-player-container")]'
        ])

    content = utils.dom_utils.get_content(
        dom, [
            '//div[contains(@class,"article-detail-block")]',  # francetvinfos
            '//article[contains(@class,"page-content")]',
            '//article[contains(@id,"node")]',  # france3 regions
            '//main[contains(@class,"article")]',  # france3 regions
            '//article[contains(@class,"content-live")]',  # live
            '//*[contains(@class, "article__column--left")]',  # la1ere
            '//div[contains(@class, "content")]',  # sport.francetvinfo.fr
            '//*[contains(@class,"article-detail-block")]'
        ])

    if len(content.replace("\n", "").strip()) < 150:
        # Less than 150 chars: we did not manage to get the content,
        # use the readability facility.
        content = super().get_readable_content(session, url)

    # Avoid loosing topCallImage because of remove script.
    content = content.replace("id=\"topCallImage\"",
                              "id=\"topCallImage--\"")

    return PyRSSWContent(
        content, """
#franceinfo_handler img.also-link__content__img {float:left;margin:0 10px 10px 0;}
""")
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Fetch the page and return its content cleaned by the handler's cleaner."""
    response = session.get(url=url, headers={})
    return PyRSSWContent(self._clean_content(response.text))
def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
    """Dispatch eurosport-family URLs to the right extraction strategy.

    Videos, rugbyrama stories, live scores/comments and plain articles
    each have a dedicated branch; the shared stylesheet fixes image and
    score layout.
    """
    content = ""
    if url.find("/video.shtml") > -1 and url.find("_vid") > -1:
        # Video pages.
        content = self._get_video_content(url, session)
    elif url.find("www.rugbyrama.fr") > -1:
        page = session.get(url=url)
        dom = etree.HTML(page.text)
        self._process_lazy_img(dom)
        utils.dom_utils.delete_xpaths(dom, [
            '//div[contains(@class, "storyfull__header")]',
            '//div[contains(@class, "storyfull__publisher-social-button")]',
            '//*[contains(@class, "outbrain-container")]',
            '//*[contains(@class, "related-stories")]',
            '//*[@id="header-sharing"]'
        ])
        content = utils.dom_utils.get_content(
            dom, ['//div[contains(@class, "storyfull")]'])
    elif url.find("/live.shtml") > -1 or url.find("/liveevent.shtml") > -1:
        page = session.get(url=url)
        dom = etree.HTML(page.text)
        utils.dom_utils.delete_xpaths(dom, [
            '//*[@class="nav-tab"]',
            '//*[@class="live-match-nav__sharing"]',
            '//*[@class="livecomments-nav"]',
            '//*[@id="subnavigation-nav-tabs"]',
            '//*[contains(@class,"livecomments-header")]',
            '//*[contains(@class,"score-cards--hide-desktop-sm")]'
        ])
        self._process_lazy_img(dom)
        content = utils.dom_utils.get_content(
            dom, [
                '//div[@id="content"]',  # handles live scores
                '//section[@id="content"]',  # handles live scores
                '//*[@class="livecomments-content"]'  # handles live transfers
            ])
        # Add score if any, prepended before the live content.
        content = utils.dom_utils.get_content(
            dom, [
                '//*[contains(@class,"heromatch__col heromatch__col--center")]'
            ]) + content
    else:
        content = self._get_content(url, session)

    content = content.replace("width=\"100%\"", "style=\"width:100%\"")

    # Bug fix: selectors were written "# eurosport_handler" (space after
    # '#'), which is invalid CSS and never matched; also the compound
    # class selector was missing its dots.
    return PyRSSWContent(
        content, """
#eurosport_handler .storyfull__ng-picture img {width:100%}
#eurosport_handler .live-summary__seo-picture img {width:100%}
#eurosport_handler .img-link img { float: none; display: block; margin: 0 auto; }
#eurosport_handler .storyfull__publisher-time span::before { content: ' | '; }
#eurosport_handler .heromatch__status { display: block; }
#eurosport_handler .heromatch__col.heromatch__col--center, #eurosport_handler .heromatch__score, #eurosport_handler .heromatch__score-dash, #eurosport_handler .heromatch__score { display: inline-block; }
#eurosport_handler img.livecomments-icon, #eurosport_handler img.isg-interchange { float:none; }
""")