Example #1
    def parse_item(self, response):
        il = FeedEntryItemLoader(
            response=response,
            base_url='{}/cms/'.format(self._link),
            timezone=self._timezone,
            remove_elems=['.news-latest-date', '.news-single-rightbox', 'hr',
                          'h7'],
            remove_elems_xpath=['//div[@class="news-single-item"]/b[1]',
                                '//div[@class="news-single-item"]/br[1]'],
        )

        il.add_value(
            'title',
            response.xpath('//head/title/text()').re_first(r'::: (.*)'))

        il.add_value('link', response.url)

        il.add_value(
            'updated',
            response.xpath('//div[@class="news-single-rightbox"]').
            re_first(r'(\d{2}\.\d{2}\.\d{4})'))

        il.add_value(
            'author_name',
            response.xpath('//head/meta[@name="publisher"]/@content').
            re_first('recht.at, (.*);'))
        il.add_xpath('author_name', '//head/meta[@name="author"]/@content')
        il.add_value('author_name', self.name)

        il.add_xpath('author_email', '//head/meta[@name="reply-to"]/@content')

        il.add_css('content_html', '.news-single-item h7 font strong')
        il.add_css('content_html', '.news-single-item')

        yield il.load_item()
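
All of the examples in this listing follow the same Scrapy ItemLoader flow: populate fields with add_value / add_xpath / add_css and finish with load_item(). Below is a minimal, self-contained sketch of that flow using Scrapy's stock ItemLoader; the FeedEntry item, the loader subclass and the spider are made up for illustration and are not part of the feeds project:

import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

class FeedEntry(scrapy.Item):
    # Hypothetical item with the fields used throughout these examples.
    title = scrapy.Field()
    link = scrapy.Field()
    updated = scrapy.Field()
    content_html = scrapy.Field()

class FeedEntryLoader(ItemLoader):
    default_item_class = FeedEntry
    default_output_processor = TakeFirst()  # keep a single value per field

class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com/news/1"]

    def parse(self, response):
        il = FeedEntryLoader(response=response)
        il.add_xpath("title", "//h1/text()")
        il.add_value("link", response.url)
        il.add_xpath("content_html", "//article")
        yield il.load_item()
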
Example #2
 def _parse_episode(self, response):
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     il.add_value(
         "updated",
         "{} {}".format(
             response.xpath('//meta[@name="title"]/@content').re_first(
                 r".*vom (\d{2}\.\d{2}\.\d{4}).*"
             ),
             response.meta["time"] or "00:00",
         ),
     )
     il.add_value(
         "content_html",
         '<img src="{}">'.format(
             response.xpath('//meta[@property="og:image"]/@content').extract_first()
         ),
     )
     il.add_css("content_html", ".player-video-description-intro::text")
     return il.load_item()
Example #3
 def _parse_episode(self, response):
     il = FeedEntryItemLoader(
         response=response,
         base_url=f"https://{self.name}",
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     il.add_value(
         "updated",
         "{} {}".format(
             response.xpath('//meta[@name="title"]/@content').re_first(
                 r".*vom (\d{2}\.\d{2}\.\d{4}).*"),
             response.meta["time"] or "00:00",
         ),
     )
     il.add_value(
         "content_html",
         '<img src="{}">'.format(
             response.xpath(
                 '//meta[@property="og:image"]/@content').extract_first()),
     )
     il.add_css("content_html", ".player-video-description-intro::text")
     return il.load_item()
Example #4
 def parse_release_changelog(self, response):
     il = FeedEntryItemLoader(
         response=response, parent=response.meta["il"], base_url=self._base_url
     )
     il.add_value("content_html", "<h1>Detailed Changelog</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return il.load_item()
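
Note how content_html receives both a literal add_value and an add_xpath here: repeated add_* calls on the same field accumulate values in the loader before output processing, which is how the "<h1>Detailed Changelog</h1>" heading ends up prepended to the scraped changelog. A standalone demonstration with a stock ItemLoader (field name and HTML are made up):

from scrapy.loader import ItemLoader
from scrapy.selector import Selector

sel = Selector(text="<h1>Changelog</h1><p>Fixed a bug.</p>")
il = ItemLoader(item={}, selector=sel)
il.add_value("content_html", "<h1>Detailed Changelog</h1>")
il.add_xpath("content_html", "//h1/following-sibling::*")
print(il.get_collected_values("content_html"))
# ['<h1>Detailed Changelog</h1>', '<p>Fixed a bug.</p>']
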
Example #5
 def parse_release_changelog(self, response):
     il = FeedEntryItemLoader(response=response,
                              parent=response.meta["il"],
                              base_url=self._base_url)
     il.add_value("content_html", "<h1>Detailed Changelog</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return il.load_item()
Example #6
 def parse_release_changelog(self, response):
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         base_url=self._base_url,
     )
     il.add_value('content_html', '<h1>Detailed Changelog</h1>')
     il.add_xpath('content_html', '//h1/following-sibling::*')
     yield il.load_item()
Example #7
 def _parse_news(self, response):
     il = FeedEntryItemLoader(response=response, parent=response.meta["il"])
     il.add_xpath(
         "content_html",
         '//div[@class="newsheader" and .//a[@id="{}"]]'
         '/following-sibling::div[@class="newsinner"]'.format(
             response.meta["news_id"]
         ),
     )
     return il.load_item()
Example #8
 def parse_item(self, response):
     il = FeedEntryItemLoader(
         selector=response.xpath('//div[@class="main"]'), timezone="Europe/Vienna"
     )
     il.add_xpath("title", "h1/text()")
     il.add_value("link", response.url)
     il.add_xpath("content_html", "h1/following-sibling::*")
     il.add_value("updated", response.url.rstrip("/").split("/")[-1].split("_")[0])
     il.add_value("author_name", self.name)
     return il.load_item()
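
The updated value in this example is derived purely from the URL. A quick worked run of that expression (the URL is made up, assuming the date is the leading, underscore-separated part of the last path segment):

url = "https://example.com/artikel/20180501_some-slug/"
print(url.rstrip("/").split("/")[-1].split("_")[0])  # '20180501'
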
Example #9
 def parse_item(self, response):
     remove_elems = ['h1', '.delayed-image-load']
     change_tags = {'noscript': 'div'}
     il = FeedEntryItemLoader(response=response,
                              parent=response.meta['il'],
                              remove_elems=remove_elems,
                              change_tags=change_tags,
                              base_url='http://{}'.format(self.name))
     il.add_xpath('content_html', '//div[@id="main-inner"]')
     yield il.load_item()
Example #10
 def parse_item(self, response):
     il = FeedEntryItemLoader(
         selector=response.xpath('//div[@class="main"]'),
         timezone="Europe/Vienna")
     il.add_xpath("title", "h1/text()")
     il.add_value("link", response.url)
     il.add_xpath("content_html", "h1/following-sibling::*")
     il.add_value("updated",
                  response.url.rstrip("/").split("/")[-1].split("_")[0])
     il.add_value("author_name", self.name)
     return il.load_item()
Example #11
 def parse_node(self, response, node):
     url = node.xpath("rss:loc/text()").extract_first()
     il = FeedEntryItemLoader(selector=node)
     il.add_value("link", url)
     il.add_xpath("title", "news:news/news:title/text()")
     keywords = node.xpath("news:news/news:keywords/text()").extract_first()
     if keywords:
         il.add_value("category", keywords.split(", "))
     il.add_xpath("updated", "news:news/news:publication_date/text()")
     return scrapy.Request(
         url, self.parse_item, meta={"il": il, "handle_httpstatus_list": [404]}
     )
Example #12
 def parse_node(self, response, node):
     il = FeedEntryItemLoader(selector=node)
     url = node.xpath("link/text()").extract_first()
     il.add_value("link", url)
     il.add_xpath("updated", "pubDate/text()")
     il.add_xpath(
         "title",
         "title/text()",
         # Use re.DOTALL since some titles have newlines in them.
         re=re.compile("(?:Artikel|Tagebuch): (.*)", re.DOTALL),
     )
     return scrapy.Request(url, self._parse_article, meta={"il": il})
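
The comment in this example explains why re.DOTALL is passed; a tiny illustration of the difference it makes (the sample title is made up):

import re

title = "Artikel: Ein langer\nTitel"
with_dotall = re.compile("(?:Artikel|Tagebuch): (.*)", re.DOTALL)
without_dotall = re.compile("(?:Artikel|Tagebuch): (.*)")
print(with_dotall.search(title).group(1))     # 'Ein langer\nTitel' (newline included)
print(without_dotall.search(title).group(1))  # 'Ein langer' (match stops at the newline)
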
Example #13
 def parse_item(self, response):
     remove_elems = ["h1", ".delayed-image-load"]
     change_tags = {"noscript": "div"}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         change_tags=change_tags,
         base_url="https://www.{}".format(self.name),
     )
     il.add_xpath("content_html", '//div[@id="main-inner"]')
     return il.load_item()
Example #14
 def _parse_article(self, response):
     remove_elems = [
         ".FeatureByline",
         ".GAByline",
         ".Form",
         "form",
         ".MakeALink",
         "br",
     ]
     change_tags = {"div.BigQuote": "blockquote"}
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         remove_elems=remove_elems,
         change_tags=change_tags,
         base_url=f"https://{self.name}",
     )
     text = response.css(".ArticleText").extract_first()
     # Remove 'Log in to post comments'.
     text = re.sub(r'<hr width="60%" align="left">.*to post comments\)',
                   "",
                   text,
                   flags=re.S)
     il.add_css("title", "h1::text")
     il.add_value("content_html", text)
     il.add_css("author_name", ".FeatureByline b ::text")
     il.add_css("author_name", ".GAByline a ::text")
     il.add_css(
         "author_name",
         ".GAByline p ::text",
         re="This article was contributed by (.*)",
     )
     il.add_xpath(
         "updated",
         '//div[@class="FeatureByline"]/text()[preceding-sibling::br]',
         TakeFirst(),
     )
     il.add_xpath("updated", '//div[@class="GAByline"]/p[1]/text()')
     # Last resort if date cannot be extracted and it's a weekly edition.
     if "updated" in response.meta:
         il.add_value("updated", response.meta["updated"])
     if response.css(".MakeALink"):
         # Get subscriber link for paywalled content.
         return scrapy.FormRequest.from_response(
             response,
             formcss=".MakeALink form",
             callback=self._subscriber_link,
             meta={"il": il},
         )
     else:
         il.add_value("link", response.url)
         return il.load_item()
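
The .MakeALink branch submits the form that generates a subscriber link for paywalled articles and leaves finishing the item to self._subscriber_link, which is not shown in this listing. A hypothetical sketch of what such a callback could look like; the XPath for the generated link is a guess, not taken from the source:

def _subscriber_link(self, response):
    # Hypothetical: pick up the loader from meta and use the link produced
    # by the form submission as the entry's link.
    il = response.meta["il"]
    link = response.xpath("//a/@href").extract_first()
    il.add_value("link", link or response.url)
    return il.load_item()
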
Example #15
 def parse_node(self, response, node):
     url = node.xpath("rss:loc/text()").extract_first()
     il = FeedEntryItemLoader(selector=node)
     il.add_value("link", url)
     il.add_xpath("title", "news:news/news:title/text()")
     keywords = node.xpath("news:news/news:keywords/text()").extract_first()
     if keywords:
         il.add_value("category", keywords.split(", "))
     il.add_xpath("updated", "news:news/news:publication_date/text()")
     return scrapy.Request(url,
                           self.parse_item,
                           meta={
                               "il": il,
                               "handle_httpstatus_list": [404]
                           })
Example #16
    def parse_content(self, response):
        parts = self._extract_parts(response)
        il = FeedEntryItemLoader(
            response=response, timezone="Europe/Vienna", dayfirst=True
        )
        il.add_value("path", self._library)
        il.add_value("title", " - ".join(parts[: self._find_first_meta(parts)]))
        il.add_value("link", response.url)
        il.add_xpath("updated", "//td/span/text()", re="In der Bibliothek seit: (.*)")

        _content = ["<ul>"]
        for part in parts:
            _content.append("<li>{}</li>".format(part))
        _content.append("</ul>")
        il.add_value("content_html", "".join(_content))
        return il.load_item()
Example #17
    def parse_node(self, response, node):
        # Reuse most of the existing fields
        il = FeedEntryItemLoader(selector=node, base_url=self.feed_link)
        il.add_xpath("title", "atom:title/text()")
        il.add_xpath("link", "atom:link/@href")
        il.add_xpath("author_name", "atom:author/atom:name/text()")
        il.add_xpath("author_email", "atom:author/atom:email/text()")
        il.add_xpath("updated", "atom:updated/text()")

        # All news items are stored on a single page and may be referred to via
        # an ID. Extract an item's id and use it to subsequently extract the
        # corresponding news text.
        url, news_id = node.xpath("atom:link/@href").extract_first().split("#")
        return scrapy.Request(
            url, self._parse_news, dont_filter=True, meta={"news_id": news_id, "il": il}
        )
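
The handoff in this example relies on two things: Request.meta carries the half-filled loader to the next callback, and constructing a loader with parent= makes later add_* calls collect into the loader created earlier (Example #7 shows the receiving side). A minimal demonstration of the parent= behaviour, using the itemloaders package that recent Scrapy releases build their ItemLoader on; field names are made up, and this appears to be the mechanism FeedEntryItemLoader's parent argument builds on:

from itemloaders import ItemLoader

first = ItemLoader()                       # e.g. built from the feed node
first.add_value("title", "From the feed")

second = ItemLoader(parent=first)          # e.g. built from the linked page
second.add_value("content_html", "<p>Body</p>")

print(second.load_item())
# {'title': ['From the feed'], 'content_html': ['<p>Body</p>']}
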
Example #18
 def parse_node(self, response, node):
     il = FeedEntryItemLoader(response=response,
                              base_url='http://{}'.format(self.name),
                              dayfirst=True)
     il.add_xpath('updated', '//pubDate/text()')
     il.add_value('author_name',
                  node.xpath('//dc:creator/text()').extract_first())
     il.add_xpath('category', '//category/text()')
     title = node.xpath('(//title)[2]/text()').extract()
     if not title:
         # Fallback to the first category if no title is provided
         # (e.g. comic).
         title = response.xpath('//category/text()').extract_first()
     il.add_value('title', title)
     link = node.xpath('(//link)[2]/text()').extract_first()
     il.add_value('link', link)
     return scrapy.Request(link, self._parse_article, meta={'il': il})
Example #19
 def _parse_item(self, response):
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     il.add_value(
         "updated", response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})")
     )
     il.add_css("content_html", ".Content")
     return il.load_item()
Example #20
 def _parse_episode(self, response):
     il = FeedEntryItemLoader(response=response,
                              base_url='http://{}'.format(self.name),
                              timezone=self._timezone,
                              dayfirst=True)
     il.add_value('link', response.url)
     il.add_xpath('title', '//meta[@name="title"]/@content',
                  re=r'(?s)(.*?)(?: vom .*)? - puls4\.com')
     il.add_value('updated', '{} {}'.format(
         response.xpath('//meta[@name="title"]/@content').
         re_first(r'.*vom (\d{2}\.\d{2}\.\d{4}).*'),
         response.meta['time'] or '00:00')
     )
     il.add_value('content_html', '<img src="{}">'.format(
         response.xpath('//meta[@property="og:image"]/@content').
         extract_first()))
     il.add_css('content_html', '.player-video-description-intro::text')
     yield il.load_item()
Example #21
    def parse_content(self, response):
        parts = self._extract_parts(response)
        il = FeedEntryItemLoader(response=response,
                                 timezone="Europe/Vienna",
                                 dayfirst=True)
        il.add_value("path", self._library)
        il.add_value("title", " - ".join(parts[:self._find_first_meta(parts)]))
        il.add_value("link", response.url)
        il.add_xpath("updated",
                     "//td/span/text()",
                     re="In der Bibliothek seit: (.*)")

        _content = ["<ul>"]
        for part in parts:
            _content.append(f"<li>{part}</li>")
        _content.append("</ul>")
        il.add_value("content_html", "".join(_content))
        return il.load_item()
Example #22
    def parse_content(self, response):
        parts = self._extract_parts(response)
        il = FeedEntryItemLoader(response=response,
                                 timezone='Europe/Vienna',
                                 dayfirst=True)
        il.add_value('path', self._library)
        il.add_value('title', ' - '.join(parts[:self._find_first_meta(parts)]))
        il.add_value('link', response.url)
        il.add_xpath('updated',
                     '//td/span/text()',
                     re='In der Bibliothek seit: (.*)')

        _content = ['<ul>']
        for part in parts:
            _content.append('<li>{}</li>'.format(part))
        _content.append('</ul>')
        il.add_value('content_html', ''.join(_content))
        yield il.load_item()
Example #23
 def _parse_item(self, response):
     remove_elems = [
         "h1",
         ".nono",
         ".acceptance_org",
         ".state",
         "script",
         ".gentics-portletreload-position-notvisibleposition",
     ]
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {"abbr": "span"}
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Vienna",
         base_url="https://www.{}".format(self.name),
         remove_elems=remove_elems,
         remove_elems_xpath=remove_elems_xpath,
         change_tags=change_tags,
         dayfirst=True,
     )
     il.add_value("link", response.url)
     il.add_xpath(
         "author_name",
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css("title", "title::text", re=r"HELP.gv.at:\s*(.*)")
     il.add_value("updated",
                  response.css(".state").re_first(r"(\d{2}\.\d{2}\.\d{4})"))
     il.add_css("content_html", ".Content")
     return il.load_item()
Example #24
    def parse_item(self, response):
        il = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[".news-latest-date", ".news-single-rightbox", "hr", "h7"],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )

        il.add_value(
            "title", response.xpath("//head/title/text()").re_first(r"::: (.*)")
        )

        il.add_value("link", response.url)

        il.add_value(
            "updated",
            response.xpath('//div[@class="news-single-rightbox"]').re_first(
                r"(\d{2}\.\d{2}\.\d{4})"
            ),
        )

        il.add_value(
            "author_name",
            response.xpath('//head/meta[@name="publisher"]/@content').re_first(
                "recht.at, (.*);"
            ),
        )
        il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        il.add_value("author_name", self.name)

        il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')

        il.add_css("content_html", ".news-single-item h7 font strong")
        il.add_css("content_html", ".news-single-item")

        return il.load_item()
Example #25
    def parse_item(self, response):
        il = FeedEntryItemLoader(
            response=response,
            base_url="{}/cms/".format(self.feed_link),
            timezone="Europe/Vienna",
            remove_elems=[
                ".news-latest-date", ".news-single-rightbox", "hr", "h7"
            ],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
            dayfirst=True,
        )

        il.add_value(
            "title",
            response.xpath("//head/title/text()").re_first(r"::: (.*)"))

        il.add_value("link", response.url)

        il.add_value(
            "updated",
            response.xpath('//div[@class="news-single-rightbox"]').re_first(
                r"(\d{2}\.\d{2}\.\d{4})"),
        )

        il.add_value(
            "author_name",
            response.xpath('//head/meta[@name="publisher"]/@content').re_first(
                "recht.at, (.*);"),
        )
        il.add_xpath("author_name", '//head/meta[@name="author"]/@content')
        il.add_value("author_name", self.name)

        il.add_xpath("author_email", '//head/meta[@name="reply-to"]/@content')

        il.add_css("content_html", ".news-single-item h7 font strong")
        il.add_css("content_html", ".news-single-item")

        return il.load_item()
Example #26
File: atv_at.py, Project: Lukas0907/feeds
 def parse_program(self, response):
     if not response.css(r".jsb_video\/FlashPlayer"):
         return
     data = json.loads(
         response.css(r".jsb_video\/FlashPlayer").xpath("@data-jsb").extract()[0]
     )
     data = data["config"]["initial_video"]["parts"][0]["tracking"]["nurago"]
     il = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", data["clipurl"])
     il.add_value("title", data["programname"])
     il.add_value("updated", data["airdate"])
     il.add_xpath("content_html", '//p[@class="plot_summary"]')
     item = il.load_item()
     # Only include videos posted in the last 7 days.
     if item["updated"] + self._timerange > datetime.now(timezone.utc):
         return item
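
The final check keeps only recently aired episodes; it assumes item["updated"] is a timezone-aware datetime and self._timerange is a timedelta covering seven days, per the comment. The date arithmetic in isolation (values are made up):

from datetime import datetime, timedelta, timezone

_timerange = timedelta(days=7)                            # assumed definition on the spider
updated = datetime.now(timezone.utc) - timedelta(days=3)  # e.g. aired three days ago
print(updated + _timerange > datetime.now(timezone.utc))  # True: within the last 7 days
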
Example #27
 def _parse_item(self, response):
     remove_elems = [
         'h1', '.nono', '.acceptance_org', '.state', 'script',
         '.gentics-portletreload-position-notvisibleposition'
     ]
     remove_elems_xpath = [
         """
         //div[
             @class='advice' and
             child::div[@class='advice_text' and (
                 contains(., 'nicht die aktuelle Rechtslage') or
                 contains(., 'wird nicht laufend aktualisiert') or
                 contains(., 'Übersicht über bisherige "Themen des Monats"')
             )]
         ]
         """,
         # Remove table of contents.
         "//li[child::a[starts-with(@href, '#')]]",
         "//ul[not(li)]",
     ]
     change_tags = {
         'abbr': 'span',
     }
     il = FeedEntryItemLoader(response=response,
                              timezone=self._timezone,
                              base_url='https://www.{}'.format(self.name),
                              remove_elems=remove_elems,
                              remove_elems_xpath=remove_elems_xpath,
                              change_tags=change_tags,
                              dayfirst=True)
     il.add_value('link', response.url)
     il.add_xpath(
         'author_name',
         '//div[@class="acceptance_org"]/text()[preceding-sibling::br]',
     )
     il.add_css('title', 'title::text', re=r'HELP.gv.at:\s*(.*)')
     il.add_value('updated',
                  response.css('.state').re_first(r'(\d{2}\.\d{2}\.\d{4})'))
     il.add_css('content_html', '.Content')
     yield il.load_item()
Example #28
 def parse_program(self, response):
     if not response.css(r".jsb_video\/FlashPlayer"):
         return
     data = json.loads(
         response.css(r".jsb_video\/FlashPlayer").xpath(
             "@data-jsb").extract()[0])
     data = data["config"]["initial_video"]["parts"][0]["tracking"][
         "nurago"]
     il = FeedEntryItemLoader(
         response=response,
         base_url=f"https://{self.name}",
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", data["clipurl"])
     il.add_value("title", data["programname"])
     il.add_value("updated", data["airdate"])
     il.add_xpath("content_html", '//p[@class="plot_summary"]')
     item = il.load_item()
     # Only include videos posted in the last 7 days.
     if item["updated"] + self._timerange > datetime.now(timezone.utc):
         return item
Example #29
 def parse_program(self, response):
     if not response.css(r'.jsb_video\/FlashPlayer'):
         return
     data = (
         json.loads(response.css(r'.jsb_video\/FlashPlayer').xpath(
             '@data-jsb').extract()[0])
     )
     data = (
         data['config']['initial_video']['parts'][0]['tracking']['nurago']
     )
     il = FeedEntryItemLoader(response=response,
                              base_url='http://{}'.format(self.name),
                              timezone=self._timezone,
                              dayfirst=True)
     il.add_value('link', data['clipurl'])
     il.add_value('title', data['programname'])
     il.add_value('updated', data['airdate'])
     il.add_xpath('content_html', '//p[@class="plot_summary"]')
     item = il.load_item()
     # Only include videos posted in the last 7 days.
     if (item['updated'] + self._timerange >
             delorean.utcnow().shift(self._timezone)):
         yield item
Example #30
 def _parse_article(self, response):
     remove_elems = [
         '.FeatureByline', '.GAByline', '.Form', 'form', '.MakeALink', 'br'
     ]
     il = FeedEntryItemLoader(response=response,
                              parent=response.meta['il'],
                              remove_elems=remove_elems,
                              base_url='https://{}'.format(self.name),
                              dayfirst=True)
     text = response.css('.ArticleText').extract_first()
     # Remove 'Log in to post comments'.
     text = re.sub(r'<hr width="60%" align="left">.*to post comments\)',
                   '',
                   text,
                   flags=re.S)
     il.add_css('title', 'h1::text')
     il.add_value('content_html', text)
     il.add_css('author_name', '.FeatureByline b ::text')
     il.add_xpath(
         'updated',
         '//div[@class="FeatureByline"]/text()[preceding-sibling::br]',
         TakeFirst())
     il.add_xpath('updated', '//div[@class="GAByline"]/p[1]/text()')
     # Last resort if date cannot be extracted and it's a weekly edition.
     if 'updated' in response.meta:
         il.add_value('updated', response.meta['updated'])
     if response.css('.MakeALink'):
         # Get subscriber link for paywalled content.
         yield scrapy.FormRequest.from_response(
             response,
             formcss='.MakeALink form',
             callback=self._subscriber_link,
             meta={'il': il})
     else:
         il.add_value('link', response.url)
         yield il.load_item()
Example #31
 def parse_release_notes(self, response):
     il = FeedEntryItemLoader(
         response=response, timezone="Europe/Berlin", base_url=self._base_url
     )
     il.add_xpath("title", "//h1/text()")
     il.add_value("link", response.url)
     il.add_xpath("updated", '//div[@class="docInfo"]', re="Last modified: (.*) by")
     il.add_value("content_html", "<h1>Release Notes</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return scrapy.Request(
         response.url.replace("notes-", "changelog-"),
         self.parse_release_changelog,
         meta={"il": il},
     )
Example #32
 def parse_release_notes(self, response):
     il = FeedEntryItemLoader(
         response=response,
         timezone="Europe/Berlin",
         base_url=self.feed_link,
         remove_elems=[".cookielaw-banner"],
     )
     il.add_xpath("title", "//h1/text()")
     il.add_value("link", response.url)
     il.add_xpath("updated",
                  '//div[@class="docInfo"]',
                  re="Last modified: (.*) by")
     il.add_value("content_html", "<h1>Release Notes</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return il.load_item()
Example #33
 def parse_release_notes(self, response):
     il = FeedEntryItemLoader(
         response=response,
         timezone=self._timezone,
         base_url=self._base_url,
     )
     il.add_xpath('title', '//h1/text()')
     il.add_value('link', response.url)
     il.add_xpath('updated',
                  '//div[@class="docInfo"]',
                  re='Last modified: (.*) by')
     il.add_value('content_html', '<h1>Release Notes</h1>')
     il.add_xpath('content_html', '//h1/following-sibling::*')
     yield scrapy.Request(response.url.replace('notes-', 'changelog-'),
                          self.parse_release_changelog,
                          meta={'il': il})
Example #34
 def parse_release_notes(self, response):
     il = FeedEntryItemLoader(response=response,
                              timezone="Europe/Berlin",
                              base_url=self._base_url)
     il.add_xpath("title", "//h1/text()")
     il.add_value("link", response.url)
     il.add_xpath("updated",
                  '//div[@class="docInfo"]',
                  re="Last modified: (.*) by")
     il.add_value("content_html", "<h1>Release Notes</h1>")
     il.add_xpath("content_html", "//h1/following-sibling::*")
     return scrapy.Request(
         response.url.replace("notes-", "changelog-"),
         self.parse_release_changelog,
         meta={"il": il},
     )
Example #35
 def parse_item(self, response):
     il = FeedEntryItemLoader(
         selector=response.xpath('//div[@id="maincontentbook"]'),
         base_url=self.feed_link,
     )
     il.add_xpath("title", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("title", '//h3[@class="p_book_title_ebook"]/text()')
     il.add_value("link", response.url)
     il.add_value("author_name", self.feed_title)
     il.add_xpath("content_html", '//h1[@class="p_book_title"]/text()')
     il.add_xpath("content_html", '//h2[@class="p_book_author"]/text()')
     il.add_xpath("content_html", '//p[@class="p_book_publisher"]/text()')
     il.add_xpath("content_html", '//p[@class="p_book_isbn"]/text()')
     il.add_xpath("content_html", '(//span[@class="txt10px"])[1]/text()')
     il.add_xpath("content_html", '(//span[@class="txt10px"])[3]/text()')
     il.add_xpath("content_html", '//div[@class="bookcontent"]//text()')
     il.add_xpath("content_html", '//div[@class="p_book_image"]/img')
     il.add_xpath("content_html", '//span[@style="color:red;"]/b/text()')
     return il.load_item()
Example #36
    def parse_item(self, response):
        il = FeedEntryItemLoader(
            selector=response.xpath('//div[@id="maincontentbook"]'))
        il.add_xpath('title', '//h1[@class="p_book_title"]/text()')
        il.add_xpath('title', '//h3[@class="p_book_title_ebook"]/text()')
        il.add_value('link', response.url)
        il.add_value('author_name', self._title)
        il.add_xpath('content_html', '//h1[@class="p_book_title"]/text()')
        il.add_xpath('content_html', '//h2[@class="p_book_author"]/text()')
        il.add_xpath('content_html', '//p[@class="p_book_publisher"]/text()')
        il.add_xpath('content_html', '//p[@class="p_book_isbn"]/text()')
        il.add_xpath('content_html', '(//span[@class="txt10px"])[1]/text()')
        il.add_xpath('content_html', '(//span[@class="txt10px"])[3]/text()')
        il.add_xpath('content_html', '//div[@class="bookcontent"]//text()')
        il.add_xpath('content_html', '//div[@class="p_book_image"]/img')
        il.add_xpath('content_html', '//span[@style="color:red;"]/b/text()')

        # NOTE: The page does not provide any usable timestamp so we convert
        # the bok_id parameter to unix epoch.
        bok_id = w3lib.url.url_query_parameter(response.url, 'bok_id', '0')
        timestamp = datetime.datetime.utcfromtimestamp(int(bok_id))
        il.add_value('updated', timestamp.isoformat())

        yield il.load_item()
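
The NOTE in this example explains that the bok_id query parameter doubles as a unix timestamp. The conversion in isolation (URL and bok_id value are made up):

import datetime
import w3lib.url

url = "https://example.com/book.php?bok_id=1514764800"
bok_id = w3lib.url.url_query_parameter(url, "bok_id", "0")
print(datetime.datetime.utcfromtimestamp(int(bok_id)).isoformat())
# 2018-01-01T00:00:00
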