Exemplo n.º 1
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Download *url* and return its trimmed HTML with handler CSS.

        The ``<h1>`` content is captured, a fixed list of unwanted nodes is
        deleted, and the ``#mainbody`` element (or the whole document when it
        is absent) is serialised and passed through ``_replace_urls`` and
        ``_clean_content``. ``parameters`` is not used here.
        """
        response = session.get(url=url, headers={})
        tree = etree.HTML(response.text)
        heading = utils.dom_utils.get_content(tree, ["//h1"])

        unwanted_xpaths = [
            '//*[@class="content-info"]', '//*[@class="modal"]',
            '//*[@class="comments text-center"]', '//*[@id="undercomments"]',
            '//*[@style="padding:10px"]', '//*[@class="hrdash"]',
            '//*[@class="row heading bottomnav"]', '//*[@id="picdumpnav"]',
            '//*[@class="container-fluid"]', '//*[@id="myModal"]'
        ]
        utils.dom_utils.delete_xpaths(tree, unwanted_xpaths)

        # Serialise the main body when present, the full document otherwise.
        main_nodes = tree.xpath('//*[@id="mainbody"]')
        root = main_nodes[0] if main_nodes else tree
        page_html = self._replace_urls(
            etree.tostring(root, encoding='unicode'))
        page_html = self._clean_content(page_html)

        # Videos: force full width with controls, drop autoplay/inline flags.
        for old, new in (
            ("<video ", "<video width=\"100%\" controls "),
            ('autoplay=""', ''),
            ('playsinline=""', ''),
        ):
            page_html = page_html.replace(old, new)
        # Root-relative poster paths are made absolute.
        page_html = re.sub(r'poster=(["\'])/',
                           r'poster=\1https://www.evilmilk.com/', page_html)
        page_html = "%s%s" % (heading, page_html)

        return PyRSSWContent(
            page_html, """
            #pyrssw_wrapper #evilmilk_handler div img { max-height: 90vh;margin: 0 auto; display: block;}
            #pyrssw_wrapper #evilmilk_handler div video {height: 90vh;}
        """)
Exemplo n.º 2
0
    def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
        """Fetch *url* and return its summary and info blocks in a wrapper div.

        Carousel pictures (``li.carouselListItem[data-srco]``) are appended as
        ``<img>`` elements to the first ``detailDescSummary`` node, with the
        ``182x136`` thumbnail size swapped for ``800x600``. When the page
        cannot be parsed into a DOM, the raw response text is wrapped as is.
        ``parameters`` is not used here.
        """
        raw_html = session.get(url=url).text
        content: str = raw_html

        tree = etree.HTML(raw_html)
        if tree is not None:
            summaries = tree.xpath(
                "//*[contains(@class, \"detailDescSummary\")]")
            if summaries:
                # Move the images into a node that ends up in the output.
                target = summaries[0]
                target.append(etree.Element("br"))
                slides = tree.xpath(
                    "//li[contains(@class,\"carouselListItem\")][@data-srco]")
                for index, slide in enumerate(slides, start=1):
                    picture = etree.Element("img")
                    picture.set("src", slide.attrib["data-srco"].replace(
                        "182x136", "800x600"))
                    picture.set("alt", "Images #%d" % index)

                    target.append(picture)
                    target.append(etree.Element("br"))
                    target.append(etree.Element("br"))

            summary_html = utils.dom_utils.get_content(
                tree, ['//*[contains(@class, "detailDescSummary")]'])
            infos_html = utils.dom_utils.get_content(
                tree, ['//*[contains(@class, "detailInfos")]'])
            content = summary_html + infos_html

        wrapper = """
                <div class=\"main-content\">
                    %s
                </div>"""
        return PyRSSWContent(wrapper % (content))
Exemplo n.º 3
0
    def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
        """Fetch *url* without the session and return the offer blocks.

        Thumbnails whose ``src`` contains ``182x136`` are re-added as
        ``800x600`` images under the first ``offer-description-text`` div.
        Carousel wrappers, ``#photo`` and all buttons are then removed.
        ``parameters`` and ``session`` are not used for the request.
        """
        content: str = ""

        # for some reasons logicimmo website does not work with sessions
        response = requests.get(url=url)

        tree = etree.HTML(response.text)
        if tree is not None:
            description_nodes = tree.xpath(
                "//div[@class=\"offer-description-text\"]")
            if description_nodes:
                # Move the images into a node that ends up in the output.
                target = description_nodes[0]
                thumbnails = tree.xpath("//img[contains(@src,'182x136')]")
                for index, thumbnail in enumerate(thumbnails, start=1):
                    picture = etree.Element("img")
                    picture.set("src", thumbnail.attrib["src"].replace(
                        "182x136", "800x600"))
                    picture.set("alt", "Images #%d" % index)

                    target.append(picture)
                    target.append(etree.Element("br"))
                    target.append(etree.Element("br"))

            # Remove the original nodes that contained the photos.
            for wrapper in tree.xpath("//*[contains(@class,\"carousel-wrapper\")]"):
                wrapper.getparent().remove(wrapper)

            utils.dom_utils.delete_xpaths(tree, [
                '//*[@id="photo"]',
                '//button'
            ])

            offer_html = utils.dom_utils.get_content(
                tree, ['//*[contains(@class, "offer-block")]'])
            description_html = utils.dom_utils.get_content(
                tree, ['//*[contains(@class, "offer-description")]'])
            content = offer_html.replace("182x136", "800x600") + description_html

        template = """
            <div class=\"main-content\">
                %s
            </div>"""
        return PyRSSWContent(template % (content))
Exemplo n.º 4
0
 def get_content(self, url: str, parameters: dict,
                 session: requests.Session) -> PyRSSWContent:
     """Return the readable content of *url*, requested with browser-like headers.

     The work is delegated to ``get_readable_content``; the result is
     wrapped in a ``PyRSSWContent`` with an empty second argument.
     ``parameters`` is not used here.
     """
     browser_headers = {
         "Accept":
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
         "Cache-Control": "no-cache",
         "Content-Encoding": "identity",
         "Accept-Charset": "utf-8",
         "Content-Type":
         "application/x-www-form-urlencoded; charset=utf-8",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
         "Connection": "keep-alive",
         "Pragma": "no-cache"
     }
     readable = self.get_readable_content(session,
                                          url,
                                          headers=browser_headers)
     return PyRSSWContent(readable, "")
Exemplo n.º 5
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Return the content of *url* concatenated with up to two following pages.

        ``_get_content`` returns a ``(html, next_page_url)`` pair. The second
        page is appended when a next URL is given. The third is appended only
        when its URL is non-empty, differs from the second page's URL and
        does not contain ``"page,1,"``. ``parameters`` is not used here.
        """
        content, second_page_url = self._get_content(url, session, True)

        if second_page_url != "":
            # add a page 2
            extra, third_page_url = self._get_content(second_page_url, session)
            content += extra

            # add a page 3 (sometimes there is a redirection with an ongoing page)
            has_third_page = (third_page_url != ""
                              and second_page_url != third_page_url
                              and third_page_url.find("page,1,") == -1)
            if has_third_page:
                extra, third_page_url = self._get_content(
                    third_page_url, session)
                content += extra

        return PyRSSWContent(
            content, """
            #pyrssw_wrapper #izismile_handler div img {float:none; max-height: 90vh;margin: 0 auto; display: block;}
        """)
Exemplo n.º 6
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Build an HTML summary of the ad embedded in the page's ``__NEXT_DATA__`` JSON.

        The ad is read from ``root.props.pageProps.ad`` of the object returned
        by ``_load_json``. When that path is missing the wrapper div is
        returned with empty content. ``parameters`` is not used here.
        """
        content: str = ""

        response = session.get(url=url)
        json_obj: Optional[dict] = self._load_json(
            response.text, "__NEXT_DATA__\" type=\"application/json\">")

        if json_obj is not None:
            root = json_obj["root"]
            if ("props" in root and "pageProps" in root["props"]
                    and "ad" in root["props"]["pageProps"]):
                ad = root["props"]["pageProps"]["ad"]

                # Header: subject, price and location, then the body text.
                content = "<p><b>%s</b></p>" % get_node_value_if_exists(
                    ad, "subject")
                content += "<p><b>%s</b></p>" % self._get_price(ad)
                content += "<p>%s</p>" % self._get_location(ad)
                content += "<hr/>"
                content += "<b>%s</b>" % get_node_value_if_exists(ad, "body")
                content += "<hr/>"

                # Only the second element returned by _process_images is used.
                _, gallery_html = self._process_images(ad)
                content += gallery_html

                content += "<hr/>"
                content += "<p>%s</p>" % get_node_value_if_exists(
                    ad, "category_name")
                content += "<hr/>"

                # Attributes without a key label are skipped.
                for attribute in ad["attributes"] if "attributes" in ad else []:
                    label: str = get_node_value_if_exists(
                        attribute, "key_label")
                    if label == "":
                        continue
                    content += "<p><strong>%s</strong>: %s</p>" % (
                        label,
                        get_node_value_if_exists(attribute, "value_label"))

        wrapper = """
                <div class=\"main-content\">
                    %s
                </div>"""
        return PyRSSWContent(wrapper % (content))
Exemplo n.º 7
0
    def get_reddit_content(self, url: str, session: Session,
                           with_comments: bool) -> PyRSSWContent:
        """Fetch ``<url>/.json`` and render the post (and optionally comments) as HTML.

        When the response is not valid JSON, the status code and the response
        parsed as HTML are used as content and the JSON root is an empty dict.
        ``t3`` entries are posts and ``t1`` entries are comments.
        """
        content: str = ""
        response = session.get(url="%s/.json" % url,
                               headers=self._get_headers())

        try:
            root = json.loads(response.content)
        except JSONDecodeError:
            # Not JSON: show what was received instead.
            content = "<strong>Status code: %d<br/></strong>" % response.status_code
            content += to_string(etree.HTML(response.content, parser=None))
            root = {}

        posts = self._get_datatypes_json(root, "t3")  # t3 : content
        for post in posts:
            # NOTE(review): this heading is overwritten by the assignments
            # below; confirm whether the title is meant to be kept.
            content += "<h1>%s</h1>" % get_node_value_if_exists(post, "title")
            self_html: str = get_node_value_if_exists(post, "selftext_html")
            post_hint: str = get_node_value_if_exists(post, "post_hint")
            removed_by: str = get_node_value_if_exists(
                post, "removed_by") + get_node_value_if_exists(
                    post, "removed_by_category")
            if removed_by != "":
                content = "Content removed"
                continue
            content = self._get_content_from_data(data=post,
                                                  session=session,
                                                  self_html=self_html,
                                                  post_hint=post_hint)

        comments: str = ""
        if with_comments:
            comment_nodes = self._get_datatypes_json(root,
                                                     "t1")  # t1 : comments
            comments = "<hr/><h2>Comments</h2>"
            for comment_node in comment_nodes:
                comments += self.get_comments(comment_node)

        return PyRSSWContent("<article>%s%s</article>" % (content, comments))
Exemplo n.º 8
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch *url* and return its summary and showcase description.

        Background images (``div[data-background]``) are appended as ``<img>``
        elements to the first ``ShowMoreText`` paragraph so they end up in the
        output. ``parameters`` is not used here.

        Raises:
            Exception: when no content could be extracted from the page.
        """
        self._update_headers(session)
        response = session.get(url=url)
        tree = etree.HTML(response.text)

        utils.dom_utils.delete_xpaths(tree, [
            '//*[contains(@class, "BookmarkButtonstyled")]',
            '//*[contains(@class, "TagsWithIcon")]', '//button', '//svg'
        ])

        # move images to a readable node (see readability)
        paragraphs = tree.xpath("//*[contains(@class,\"ShowMoreText\")]//p")
        if paragraphs:
            target = paragraphs[0]
            backgrounds = tree.xpath("//div[@data-background]")
            for index, holder in enumerate(backgrounds, start=1):
                picture = etree.Element("img")
                picture.set("src", holder.attrib["data-background"])
                picture.set("alt", "Images #%d" % index)

                target.append(picture)
                target.append(etree.Element("br"))
                target.append(etree.Element("br"))

        summary_xpaths = [
            '//*[contains(@data-test, "summary")]',
            # expired article
            "//*[contains(@class,\"ann_expiree g-vspace-400\")]"
        ]
        content = utils.dom_utils.get_content(tree, summary_xpaths)
        content += utils.dom_utils.get_content(
            tree, ['//*[@id="showcase-description"]'])

        if content == "":
            raise Exception("Unable to get content: blacklisted?")

        return PyRSSWContent(content)
Exemplo n.º 9
0
    def get_content(self, url: str, parameters: dict, session: requests.Session) -> PyRSSWContent:
        """Fetch *url* while authenticated and return the article HTML.

        ``_authent`` is called before the request and ``_unauthent`` always
        runs afterwards, even when fetching or parsing raises. Unwanted nodes
        are deleted, pictures and in-read blocks are processed, and the
        content is then extracted with a list of candidate XPaths.
        """
        self._authent(parameters, session)
        try:
            response = session.get(url=url)
            tree = etree.HTML(response.text)

            unwanted_xpaths = [
                '//*[contains(@class, "meta__social")]',
                '//*[contains(@class, "breadcrumb")]',
                '//*[contains(@class, "article__reactions")]',
                '//*[contains(@class, "services")]',
                '//*[contains(@class, "article__footer-single")]',
                '//*[contains(@class, "wp-socializer")]',
                '//*[contains(@class, "insert")]',
                '//*[@id="comments"]',  # blog
                '//*[contains(@class, "post-navigation")]',  # blog
                '//*[contains(@class, "entry-footer")]',  # blog
                '//*[contains(@class, "catcher")]',  # tribune
                '//aside'
            ]
            utils.dom_utils.delete_xpaths(tree, unwanted_xpaths)

            self.process_pictures(tree)
            self.process_inread(tree)

            # le monde rss provides many sub websites with different html architecture
            article_xpaths = [
                '//*[contains(@class, "zone--article")]',
                '//*[contains(@class, "article--content")]',  # tribune
                '//*[@id="post-container"]',
                '//*[@id="main"]'                               # blog
            ]
            content = utils.dom_utils.get_content(tree, article_xpaths)
        finally:
            self._unauthent(session)
        return PyRSSWContent(content)
Exemplo n.º 10
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch *url* and return the ``<h1>`` content followed by the article column.

        Before parsing, inline ``data:image`` sources are stripped and
        ``data-src`` attributes are rewritten to styled ``src`` attributes.
        Each ``img-responsive`` image is copied to its fifth ancestor.
        ``parameters`` is not used here.
        """
        markup = session.get(url=url).text.replace(">", ">\n")

        markup = re.sub(r'src="data:image[^"]*', '', markup)
        for old, new in (
            ("data-src", "style='height:100%;width:100%' src"),
            ('data-fs-media', ''),
            ('class="fs-media"', ''),
        ):
            markup = markup.replace(old, new)
        tree = etree.HTML(markup)

        # rework images: copy each one five levels up the tree
        for original in tree.xpath('//img[contains(@class, "img-responsive")]'):
            copy = etree.Element("img")
            copy.set("src", original.attrib["src"])
            ancestor = original
            for _ in range(5):
                ancestor = ancestor.getparent()
            ancestor.append(copy)

        # The heading is read before the unwanted nodes are deleted.
        heading = utils.dom_utils.get_content(tree, ["//h1"])
        utils.dom_utils.delete_xpaths(tree, [
            '//*[contains(@class, "module-toretain")]',
            '//*[contains(@class, "image-module")]',
            '//*[contains(@class, "social-button")]',
            '//section[contains(@class, "breadcrumb")]',
            '//section[contains(@class, "author-box")]',
            '//*[contains(@class, "ICON-QUICKREAD")]/parent::*/parent::*'
        ])

        article = utils.dom_utils.get_content(
            tree, ['//div[contains(@class,"article-column")]'])

        return PyRSSWContent("%s%s" % (heading, article))
Exemplo n.º 11
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch the JSON document at *url* and render it as an HTML summary.

        The summary holds the title, price, postal code and city, description
        and the images built from ``_get_img_urls``. ``parameters`` is not
        used here.
        """
        payload = session.get(url=url).text
        content: str = payload

        json_obj = json.loads(payload)
        if json_obj is not None:
            title_html = "<p><b>%s</b></p>" % get_node_value_if_exists(
                json_obj, "title")
            price_html = "<p><b>%s</b></p>" % self._get_price(json_obj)
            place_html = "<p>%s - %s</p>" % (
                get_node_value_if_exists(json_obj, "postalCode"),
                get_node_value_if_exists(json_obj, "city"))
            description_html = "<b>%s</b>" % get_node_value_if_exists(
                json_obj, "description")
            gallery_html = self._build_imgs(self._get_img_urls(json_obj))

            content = (title_html + price_html + place_html + "<hr/>" +
                       description_html + "<hr/>" + gallery_html)

        wrapper = """
                <div class=\"main-content\">
                    %s
                </div>"""
        return PyRSSWContent(wrapper % (content))
Exemplo n.º 12
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch *url* and return the article HTML prefixed by its title, plus CSS.

        The first ``<h1>`` is removed after its content has been captured.
        The ``srcset`` of the first image carrying one is re-inserted as a
        lead ``<img>`` at the top of the article body. Lazy images get a
        ``src`` taken from the first entry of their ``data-srcset``. When the
        main article container is not found, a list of fallback XPaths is
        used instead. ``parameters`` is not used here.
        """
        page = session.get(url=url, headers={})

        dom = etree.HTML(page.text)
        title = utils.dom_utils.get_content(dom, ["//h1"])
        h1s = xpath(dom, "//h1")
        if len(h1s) > 0:
            # sometimes there is 2 h1 for the same title in the page
            h1s[0].getparent().remove(h1s[0])

        # NOTE(review): the raw srcset value is used as a src below; this
        # presumably holds a single URL on this site -- confirm.
        imgsrc = ""
        imgs = dom.xpath("//img[@srcset]")
        if imgs:
            imgsrc = imgs[0].get("srcset")

        utils.dom_utils.delete_xpaths(dom, [
            '//*[@class="s24-art-cross-linking"]',
            '//*[@class="fig-media__button"]', '//*[@class="s24-art-pub-top"]'
        ])

        self._process_dugout(session, dom)

        # Give lazy-loaded images a real src (first entry of data-srcset).
        for img in dom.xpath("//img[@data-srcset]"):
            if "src" not in img.attrib:
                img.attrib["src"] = img.get("data-srcset").split(" ")[0]

        contents = dom.xpath('//*[@class="s24-art__content s24-art__resize"]')
        if contents:
            if imgsrc != "":
                # The leading "." keeps the search inside the selected
                # container; a bare "//" would search the whole document and
                # could pick a body that is not part of the serialised node.
                bodies = contents[0].xpath('.//*[@class="s24-art-body"]')
                if bodies:
                    img = etree.Element("img")
                    img.set("src", imgsrc)
                    bodies[0].insert(0, img)
            content = to_string(contents[0])
        else:
            content = utils.dom_utils.get_content(
                dom,
                [
                    # handles golf.lefigaro structure
                    '//article[contains(@class,"fig-content")]',
                    # handles lefigaro.fr/sports
                    '//article[contains(@class,"fig-main")]'
                ])

        content = "%s%s" % (title, content)
        return PyRSSWContent(
            content, """
            #sport24_handler .object-left {
                display: block;
                text-align: center;
                width: auto;
                max-width: fit-content;
                float: left;
                margin: 5px;
            }

            #sport24_handler .object-left img {
                float:none;
                margin:0;
            }

            #sport24_handler .embed {
                clear:both;
            }
            
            #sport24_handler div.object-right {
                text-align:center;
            }
        """)
Exemplo n.º 13
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch *url* and return the article HTML with handler CSS.

        Inline ``data:image`` sources are stripped and ``data-src`` is turned
        into a styled ``src`` before parsing. Unwanted nodes are deleted and
        the article is extracted with a list of candidate XPaths. When fewer
        than 150 characters are extracted, ``get_readable_content`` is used
        instead. ``parameters`` is not used here.
        """
        markup = session.get(url=url).text.replace(">", ">\n")
        markup = re.sub(r'src="data:image[^"]*', '', markup)
        markup = markup.replace("data-src",
                                "style='height:100%;width:100%' src")
        tree = etree.HTML(markup)

        unwanted_xpaths = [
            '//*[contains(@class, "block-share")]',
            '//*[@id="newsletter-onvousrepond"]',
            '//*[contains(@class, "partner-block")]',
            '//*[contains(@class, "a-lire-aussi")]',
            '//aside[contains(@class, "tags")]',
            '//*[contains(@class, "breadcrumb")]',
            '//*[contains(@class, "col-left")]',
            '//*[contains(@class, "col-right")]',
            '//*[contains(@class, "c-signature")]',
            '//*[contains(@class, "publication-date__modified")]',
            # --- france3 regions ---
            '//*[contains(@class, "social-aside")]',
            '//*[contains(@class, "aside-img__content")]',
            '//*[contains(@class, "social-button-content")]',
            '//*[contains(@class, "tags-button-content")]',
            '//*[contains(@class, "article-share")]',
            '//*[contains(@class, "article-share-fallback")]',
            '//*[contains(@class, "article-share-fallback")]',
            # --- end of france3 regions ---
            '//*[contains(@class, "related-content")]',
            '//*[contains(@class, "article__thematics")]',
            '//*[contains(@class, "article__related ")]',
            '//*[contains(@class, "subjects-title")]',
            '//*[contains(@class, "subjects-list")]',
            '//*[contains(@class, "audio-component")]',
            '//*[contains(@class, "social-zone")]',
            '//*[contains(@class, "c-signature__images")]',
            '//*[contains(@class, "article__share")]',
            '//*[contains(@class, "audio-player-container")]'
        ]
        utils.dom_utils.delete_xpaths(tree, unwanted_xpaths)

        article_xpaths = [
            '//div[contains(@class,"article-detail-block")]',
            '//article[contains(@class,"page-content")]',  # francetvinfos
            '//article[contains(@id,"node")]',  # france3 regions
            '//main[contains(@class,"article")]',  # france3 regions
            '//article[contains(@class,"content-live")]',  # live
            '//*[contains(@class, "article__column--left")]',  # la1ere
            '//div[contains(@class, "content")]',
            '//*[contains(@class,"article-detail-block")]'  # sport.francetvinfo.fr
        ]
        content = utils.dom_utils.get_content(tree, article_xpaths)

        extracted_length = len(content.replace("\n", "").strip())
        if extracted_length < 150:
            # less than 150 chars, we did not manage to get the content, use readability facility
            content = super().get_readable_content(session, url)

        # avoid losing topCallImage because of remove script
        content = content.replace("id=\"topCallImage\"",
                                  "id=\"topCallImage--\"")

        return PyRSSWContent(
            content, """
            #franceinfo_handler img.also-link__content__img {float:left;margin:0 10px 10px 0;}
        """)
Exemplo n.º 14
0
 def get_content(self, url: str, parameters: dict,
                 session: requests.Session) -> PyRSSWContent:
     """Fetch *url* and return its text after ``_clean_content``.

     ``parameters`` is not used here.
     """
     raw_html = session.get(url=url, headers={}).text
     return PyRSSWContent(self._clean_content(raw_html))
Exemplo n.º 15
0
    def get_content(self, url: str, parameters: dict,
                    session: requests.Session) -> PyRSSWContent:
        """Fetch *url* and return its content with handler CSS, by URL type.

        Four cases are handled, tested in this order:

        * video pages (``/video.shtml`` and ``_vid`` in the URL) go to
          ``_get_video_content``;
        * ``www.rugbyrama.fr`` pages return the ``storyfull`` div;
        * live pages (``/live.shtml`` or ``/liveevent.shtml``) return the
          live content, prefixed by the score block when there is one;
        * anything else goes to ``_get_content``.

        ``parameters`` is not used here.
        """
        content = ""

        if "/video.shtml" in url and "_vid" in url:
            content = self._get_video_content(url, session)
        elif "www.rugbyrama.fr" in url:
            page = session.get(url=url)
            dom = etree.HTML(page.text)
            self._process_lazy_img(dom)
            utils.dom_utils.delete_xpaths(dom, [
                '//div[contains(@class, "storyfull__header")]',
                '//div[contains(@class, "storyfull__publisher-social-button")]',
                '//*[contains(@class, "outbrain-container")]',
                '//*[contains(@class, "related-stories")]',
                '//*[@id="header-sharing"]'
            ])
            content = utils.dom_utils.get_content(
                dom, ['//div[contains(@class, "storyfull")]'])
        elif "/live.shtml" in url or "/liveevent.shtml" in url:
            page = session.get(url=url)
            dom = etree.HTML(page.text)
            utils.dom_utils.delete_xpaths(dom, [
                '//*[@class="nav-tab"]',
                '//*[@class="live-match-nav__sharing"]',
                '//*[@class="livecomments-nav"]',
                '//*[@id="subnavigation-nav-tabs"]',
                '//*[contains(@class,"livecomments-header")]',
                '//*[contains(@class,"score-cards--hide-desktop-sm")]'
            ])
            self._process_lazy_img(dom)
            content = utils.dom_utils.get_content(
                dom,
                [
                    '//div[@id="content"]',  # handles live scores
                    '//section[@id="content"]',  # handles live scores
                    '//*[@class="livecomments-content"]'  # handler live transfers
                ])

            # add score if any, ahead of the live content
            content = utils.dom_utils.get_content(
                dom,
                [
                    '//*[contains(@class,"heromatch__col heromatch__col--center")]'
                ]) + content
        else:
            content = self._get_content(url, session)

        content = content.replace("width=\"100%\"", "style=\"width:100%\"")

        # The ID selectors must be written "#eurosport_handler" with no space
        # after "#"; "# eurosport_handler" is not a valid selector and the
        # rule would be ignored.
        # NOTE(review): ".heromatch__col heromatch__col--center" below looks
        # like it was meant to be ".heromatch__col.heromatch__col--center";
        # left unchanged, confirm against the rendered page.
        return PyRSSWContent(
            content, """
            #eurosport_handler .storyfull__ng-picture img {width:100%}
            #eurosport_handler .live-summary__seo-picture img {width:100%}
            #eurosport_handler .img-link img {
                float: none;
                display: block;
                margin: 0 auto;
            }

            #eurosport_handler .storyfull__publisher-time span::before {
                content: ' | ';
            }

            #eurosport_handler .heromatch__status {
                display: block;
            }

            #eurosport_handler .heromatch__col heromatch__col--center, #eurosport_handler .heromatch__score, #eurosport_handler  .heromatch__score-dash, #eurosport_handler .heromatch__score {
                display: inline-block;
            }

            #eurosport_handler img.livecomments-icon, #eurosport_handler img.isg-interchange {
                float:none;
            }
        """)