示例#1
0
    def parse_item(self, response):
        """Extract a single news entry from an article page."""
        loader = FeedEntryItemLoader(
            response=response,
            base_url='{}/cms/'.format(self._link),
            timezone=self._timezone,
            remove_elems=[
                '.news-latest-date', '.news-single-rightbox', 'hr', 'h7',
            ],
            remove_elems_xpath=[
                '//div[@class="news-single-item"]/b[1]',
                '//div[@class="news-single-item"]/br[1]',
            ],
        )

        # The page title has the form "::: <headline>".
        headline = response.xpath('//head/title/text()').re_first(r'::: (.*)')
        loader.add_value('title', headline)

        loader.add_value('link', response.url)

        # The date (dd.mm.yyyy) is embedded in the right-hand info box.
        date = response.xpath(
            '//div[@class="news-single-rightbox"]'
        ).re_first(r'(\d{2}\.\d{2}\.\d{4})')
        loader.add_value('updated', date)

        # Author fallback chain: publisher meta tag, author meta tag, then
        # the spider name.
        publisher = response.xpath(
            '//head/meta[@name="publisher"]/@content'
        ).re_first('recht.at, (.*);')
        loader.add_value('author_name', publisher)
        loader.add_xpath('author_name', '//head/meta[@name="author"]/@content')
        loader.add_value('author_name', self.name)

        loader.add_xpath('author_email',
                         '//head/meta[@name="reply-to"]/@content')

        loader.add_css('content_html', '.news-single-item h7 font strong')
        loader.add_css('content_html', '.news-single-item')

        yield loader.load_item()
示例#2
0
    def parse_item(self, response):
        """Turn an article page into a feed entry."""
        # Website chrome that should not end up in the feed body.
        strip = [
            "aside",
            "script",
            "h1",
            "source",
            ".breadcrumbs",
            ".author-date",
            ".artikel-social-kommentar",
            ".bild-copyright",
            ".ressortTitleMobile",
            ".article-number",
            ".artikel-kommentarlink",
            ".umfrage-wrapper",
            ".articleIssueInfo",
            "hr",
            "center div[style='padding: 10px; background:#efefef']",
        ]
        entry = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            remove_elems=strip,
        )
        entry.add_value("link", response.url)
        # The byline reads "von <author>"; fall back to the editorial
        # shorthand when no author is given.
        byline = " ".join(response.css(".author-date ::text").extract())
        found = re.search(r"von\s+(.*)", byline)
        entry.add_value("author_name", found.group(1) if found else "Red.")
        entry.add_css("title", 'h1[itemprop="headline"]::text')
        entry.add_value("updated", response.meta["updated"])
        entry.add_css("content_html", "article")
        return entry.load_item()
示例#3
0
    def parse(self, response):
        """Parse one API result page.

        Follows pagination (if any) and, for every embedded item, yields a
        request for the item's profile page with a partially filled loader
        attached in ``meta['item']``.
        """
        json_response = json.loads(response.text)

        # BUG FIX: the original checked for the key 'next' but then read
        # '_links']['nextPage'], raising KeyError whenever 'next' existed
        # without 'nextPage' (and never paginating in the opposite case).
        # Check and access the same key.
        next_page = json_response['_links'].get('nextPage')
        if next_page:
            yield Request(next_page, meta={'dont_cache': True})

        for item in json_response['_embedded']['items']:
            il = FeedEntryItemLoader(response=response,
                                     timezone=self._timezone,
                                     dayfirst=False)
            il.add_value('title', item['title'])
            # Show the preview image above the description.
            il.add_value(
                'content_html',
                '<img src="{}">'.format(item['playlist']['preview_image_url']))
            if item['description']:
                il.add_value('content_html',
                             item['description'].replace('\r\n', '<br>'))
            il.add_value('updated', item['date'])
            # API URLs point at the API host; rewrite to the public site.
            il.add_value(
                'link', item['url'].replace('api-tvthek.orf.at',
                                            'tvthek.orf.at'))
            yield Request(item['_links']['profile']['href'],
                          self._parse_profile,
                          meta={'item': il},
                          dont_filter=True)
示例#4
0
 def parse(self, response):
     """Create one feed entry per article in the JSON listing."""
     for article in json.loads(response.text):
         entry = FeedEntryItemLoader()
         entry.add_value('title', article['title'])
         entry.add_value('link', article['url'])
         if 'thumbnail_url_1_1' in article:
             entry.add_value(
                 'content_html',
                 '<img src="{}">'.format(article['thumbnail_url_1_1']))
         entry.add_value('content_html', article['body'])
         # publish_date is in milliseconds since the epoch.
         entry.add_value('updated',
                         delorean.epoch(article['publish_date'] / 1000))
         contributors = [
             contribution['contributor']['full_name']
             for contribution in article['contributions']
         ]
         entry.add_value('author_name', contributors)
         entry.add_value('category', article['channel']['name'])
         for topic in article['topics'] + [article['primary_topic']]:
             if topic and 'name' in topic:
                 entry.add_value('category', topic['name'].title())
         # Flag "not safe for work/brands" articles via categories.
         if article['nsfw']:
             entry.add_value('category', 'nsfw')
         if article['nsfb']:
             entry.add_value('category', 'nsfb')
         entry.add_value('path', response.meta['locale'])
         yield entry.load_item()
示例#5
0
 def parse(self, response):
     """Yield detail-page requests for the 20 most recent restaurants.

     The listing page embeds its state as JSON inside a script block;
     extract it, sort the restaurants newest-first, and fan out to the
     detail pages with a pre-filled loader in ``meta['il']``.
     """
     # FIX: use a raw string and escape the literal '.', '{' and '}' —
     # the old pattern let '.' match any character and relied on invalid
     # quantifier syntax being treated literally.
     m = re.search(r"window\.DELINSKI, \{listViewEntities: (.*)\}",
                   response.text)
     restaurants = sorted(
         json.loads(m.group(1))["restaurants"]["entities"].values(),
         key=lambda r: int(r["created"]),
         reverse=True,
     )
     for restaurant in restaurants[:20]:
         il = FeedEntryItemLoader(timezone="UTC", base_url=response.url)
         url = response.urljoin(restaurant["url"])
         il.add_value("link", url)
         il.add_value("title", restaurant["name"])
         content = """
         <img src="{image}">
         <ul>
             <li>{address}</li>
             <li>{price_range_human}</li>
             <li>{cuisine_text}</li>
         </ul>
         """
         il.add_value("content_html", content.format(**restaurant))
         # "created" is a Unix timestamp in seconds; the loader is
         # configured with timezone="UTC" for the naive datetime.
         il.add_value("updated",
                      datetime.utcfromtimestamp(int(restaurant["created"])))
         yield scrapy.Request(url, self._parse_restaurant, meta={"il": il})
示例#6
0
 def parse_movies(self, response):
     """Produce one feed entry per movie in the JSON search result."""
     for movie in json.loads(response.text)["hits"]:
         il = FeedEntryItemLoader(response=response,
                                  base_url="https://{}".format(self.name))
         il.add_value("path", "{}".format(response.meta["movies"]))
         il.add_value(
             "link",
             "https://www.{}/kino/{}".format(self.name, movie["prod_id"]))
         il.add_value("title", movie["prod"])
         il.add_value("content_html", movie["comment"])
         # Attach all stills below the comment.
         for image in movie["images"] or []:
             il.add_value(
                 "content_html",
                 '<img src="https://faltercdn2.falter.at/events/1080/{}">'.format(
                     image["filename"]),
             )
         if "stream" in movie:
             il.add_value("content_html",
                          '<a href="{s}">{s}</a>'.format(s=movie["stream"]))
         # Truthy "has_*" / "is_*" flags double as categories.
         for key, value in movie.items():
             if not value:
                 continue
             if key.startswith("has_"):
                 il.add_value("category", key.replace("has_", ""))
             elif key.startswith("is_"):
                 il.add_value("category", key.replace("is_", ""))
         il.add_value("updated", movie["index_date"])
         yield il.load_item()
示例#7
0
    def _parse_article(self, response):
        """Scrape one article page into a feed entry.

        Raises DropResponse when the page has no og:title, which indicates
        a bot-detection page.
        """
        title = response.css(
            'meta[property="og:title"]::attr(content)').extract_first()
        if not title:
            raise DropResponse(
                "Skipping {} because ran into bot detection".format(response.url),
                transient=True,
            )

        il = FeedEntryItemLoader(
            response=response,
            base_url="https://{}".format(self.name),
            # Share lists, ads and other navigation chrome are stripped.
            remove_elems=[
                "meta",
                ".ds-share-list",
                ".advert",
                ".layout-article-links",
                ".ds-chapter-list",
                ".layout-article-meta",
            ],
            # Map site-specific classes onto semantic HTML tags.
            change_tags={
                ".article__lead-image": "figure",
                ".article__description": "h2",
                ".article__footnote": "i",
            },
        )
        il.add_value("link", response.url)
        il.add_value("title", title)
        il.add_css("updated", "time.article__dateline-datetime::attr('datetime')")
        il.add_css("content_html", ".article__lead-image")
        il.add_css("content_html", ".article__description")
        il.add_css("content_html", ".layout-article-body")
        il.add_value("path", response.meta["ressort"])
        return il.load_item()
示例#8
0
    def _parse_video_page(self, response):
        """Parse a servustv.com video page and request its stream manifest."""
        match = re.search(
            r"https?://(?:www\.)?servustv\.com/videos/(?P<id>[aA]{2}-\w+|\d+-\d+)",
            response.url,
        )
        if not match:
            return
        video_id = match.group("id").upper()

        il = FeedEntryItemLoader(response=response)
        il.add_value("link", response.url)
        # Use the section as a title prefix unless it is the generic one.
        section = response.css(
            "meta[property='article:section']::attr('content')").extract_first()
        if section != "Allgemein":
            il.add_value("title", section)
        il.add_css("title", "title::text", re="(.*) - Servus TV")
        image_url = response.css(
            "meta[property='og:image']::attr('content')").extract_first()
        il.add_value("content_html", '<img src="{}">'.format(image_url))
        il.add_css("content_html",
                   "meta[property='og:description']::attr('content')")
        il.add_css("content_html", "#media-asset-content-container")

        # The modification date lives inside embedded JSON.
        modified = re.search(r'"dateModified":\s*"([^"]+)"', response.text)
        if modified:
            il.add_value("updated", modified.group(1))

        stream_url = "https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8" % video_id
        yield Request(stream_url, self._parse_stream, meta={"il": il})
示例#9
0
    def _parse_article(self, response):
        """Extract title and content for an article linked from a feed.

        Falls back to the feed entry's own title/summary when the response
        has no text body (binary content) or when content extraction fails
        or looks implausibly short.
        """
        feed_entry = response.meta["feed_entry"]

        il = FeedEntryItemLoader(parent=response.meta["il"])
        try:
            # Accessing .text raises AttributeError on non-text responses.
            response.text
        except AttributeError:
            # Response is not text (e.g. PDF, ...).
            il.add_value("title", feed_entry.get("title"))
            il.add_value("content_html", feed_entry.get("summary"))
            return il.load_item()

        doc = Document(response.text, url=response.url)
        il.add_value("title", doc.short_title() or feed_entry.get("title"))
        summary = feed_entry.get("summary")
        try:
            content = doc.summary(html_partial=True)
            if summary and len(summary) > len(content):
                # Something probably went wrong if the extracted content is shorter than
                # the summary.
                raise Unparseable
        except Unparseable:
            # Use the feed-provided summary as the content fallback.
            content = summary
        il.add_value("content_html", content)

        return il.load_item()
示例#10
0
 def _parse_episode(self, response):
     """Build a feed entry for a single episode page."""
     il = FeedEntryItemLoader(
         response=response,
         base_url=f"https://{self.name}",
         timezone="Europe/Vienna",
         dayfirst=True,
     )
     il.add_value("link", response.url)
     # Title without the " vom <date>" suffix and site name.
     il.add_xpath(
         "title",
         '//meta[@name="title"]/@content',
         re=r"(?s)(.*?)(?: vom .*)? - puls4\.com",
     )
     # Combine the broadcast date (from the meta title) with the time slot
     # passed along in the request meta (midnight if missing).
     date = response.xpath('//meta[@name="title"]/@content').re_first(
         r".*vom (\d{2}\.\d{2}\.\d{4}).*")
     time = response.meta["time"] or "00:00"
     il.add_value("updated", "{} {}".format(date, time))
     # Preview image followed by the textual description.
     image = response.xpath(
         '//meta[@property="og:image"]/@content').extract_first()
     il.add_value("content_html", '<img src="{}">'.format(image))
     il.add_css("content_html", ".player-video-description-intro::text")
     return il.load_item()
示例#11
0
 def parse_item(self, response):
     """Convert an article page into a feed entry."""
     # Website chrome that should not end up in the feed body.
     strip = [
         'aside',
         'script',
         'h1',
         '.breadcrumbs',
         '.author-date',
         '.artikel-social-kommentar',
         '.bild-copyright',
         '.ressortTitleMobile',
         '.article-number',
         '.artikel-kommentarlink',
         '.umfrage-wrapper',
         '.articleIssueInfo',
     ]
     loader = FeedEntryItemLoader(response=response,
                                  timezone=self._timezone,
                                  base_url='http://{}'.format(self.name),
                                  remove_elems=strip)
     loader.add_value('link', response.url)
     # "Von First Last" in the byline; otherwise the editorial shorthand.
     byline = response.css('.author-date ::text').re(r'(?:Von)?\s*(\w+ \w+)')
     loader.add_value('author_name', byline or 'Red.')
     loader.add_css('title', 'h1[itemprop="headline"]::text')
     # Capture everything before the timezone offset ('+...').
     loader.add_css('updated',
                    'meta[property="article:published_time"]::attr(content)',
                    re='([^+]*)')
     loader.add_css('content_html', 'article')
     yield loader.load_item()
示例#12
0
 def parse_broadcast(self, response):
     """Create a feed entry (with audio enclosure) for one broadcast."""
     broadcast = json.loads(response.text)
     entry = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 dayfirst=False)
     entry.add_value(
         'link',
         'https://{}/programm/{}/{}'.format(self.name,
                                            response.meta['oe1_day'],
                                            broadcast['programKey']))
     entry.add_value('title', broadcast['programTitle'])
     entry.add_value('title', broadcast['title'])
     # Not every broadcast provides an audio stream.
     if broadcast.get('streams'):
         loop_stream_id = broadcast['streams'][0]['loopStreamId']
         entry.add_value(
             'enclosure_iri',
             'http://loopstream01.apa.at/?channel=oe1&id={}'.format(
                 loop_stream_id))
         entry.add_value('enclosure_type', 'audio/mpeg')
     entry.add_value('updated', broadcast['niceTimeISO'])
     if broadcast['subtitle']:
         entry.add_value(
             'content_html',
             '<strong>{}</strong>'.format(broadcast['subtitle']))
     # Each sub-item contributes an optional heading plus description.
     for part in broadcast['items']:
         if 'title' in part:
             entry.add_value('content_html',
                             '<h3>{}</h3>'.format(part['title']))
         entry.add_value('content_html', part.get('description'))
     entry.add_value('content_html', broadcast['description'])
     yield entry.load_item()
示例#13
0
 def parse_node(self, response, node):
     """Parse one RSS node; fetch the full text unless paywalled for us."""
     il = FeedEntryItemLoader(response=response,
                              base_url=f"https://{self.name}")
     updated = dateutil_parse(node.xpath("dc:date/text()").extract_first())
     il.add_value("updated", updated)
     title = node.xpath("rss:title/text()").extract_first()
     # A "[$]" prefix marks subscriber-only articles.
     paywalled = title.startswith("[$]")
     if paywalled:
         title = title.replace("[$] ", "")
         il.add_value("category", "paywalled")
     link = node.xpath("rss:link/text()").extract_first()
     link = link.replace("rss", "").replace("http://", "https://")

     if paywalled and not self._subscribed:
         # Without a subscription only the teaser from the feed is usable.
         il.add_value("title", title)
         il.add_value("author_name",
                      node.xpath("dc:creator/text()").extract_first())
         il.add_value("content_text",
                      node.xpath("rss:description/text()").extract_first())
         il.add_value("link", link)
         return il.load_item()

     meta = {"il": il}
     if "LWN.net Weekly Edition for" in title:
         meta["updated"] = updated
         callback = self._parse_weekly_edition
         link += "bigpage"
     else:
         callback = self._parse_article
     # Don't include link yet, we will use the subscriber link later.
     # So subscriber articles can be shared from the feed reader and
     # read in browser without logging in.
     return scrapy.Request(link, callback, meta=meta)
示例#14
0
    def _parse_article(self, response):
        """Parse an article's print version into a feed entry.

        Yields nothing for deleted articles (HTTP 410). Paywalled articles
        are tagged with the 'paywalled' category.
        """
        if response.status == 410:
            # Article has been deleted.
            return

        remove_elems = [
            '.bildtext .author', 'iframe',
        ]
        change_tags = {
            'h1': 'h2'
        }
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='https://www.{}'.format(self.name),
                                 remove_elems=remove_elems,
                                 change_tags=change_tags,
                                 dayfirst=False,
                                 yearfirst=False)
        if response.css('.payment'):
            il.add_value('category', 'paywalled')
        il.add_css('link', 'link[rel="canonical"]::attr(href)')
        il.add_css('title', 'meta[property="og:title"]::attr(content)')
        # FIX: raw string — '\s' in a normal string literal is an invalid
        # escape sequence (DeprecationWarning, a SyntaxError in future
        # Python versions). The pattern itself is unchanged.
        il.add_css('author_name', '.druckheadline::text',
                   re=r'·\s*(.*)\s*·')
        il.add_css('updated',
                   'meta[http-equiv="last-modified"]::attr(content)')
        il.add_css('content_html', '.druckcontent')
        il.add_value('path', response.meta['ressort'])
        yield il.load_item()
示例#15
0
    def parse_album(self, response):
        """Build a feed entry for an album (product) page."""

        def _replace_track_info(elem):
            # Collapse the per-track markup into "<title> <i>(<duration>)</i>".
            parts = [child.text_content().strip()
                     for child in elem.getchildren()]
            return '<p>{} <i>({})</i></p>'.format(parts[0], parts[1])

        # The last text node of the title block holds the album title.
        title = response.xpath(
            '//h1[@class="c-product-block__title"]//text()'
        ).extract()[-1].strip()
        # The first contributor (up to the first comma) is used as artist.
        artist = response.xpath(
            '//div[contains(@class,"c-product-block__contributors")]/p/text()'
        ).re_first('[^,]+')
        loader = FeedEntryItemLoader(
            response=response,
            base_url="https://{}/".format(self.name),
            remove_elems=[
                '.c-product-block__title', '.c-product__product-purchase',
                '.c-track__format-specific-info', '.c-track__duration',
                '.c-track__details', '.c-tracklist__initial-tracks',
                '.c-tabs-block__tabs-links', 'button'
            ],
            replace_elems={'.c-track__all-format-info': _replace_track_info})
        loader.add_value("title", '{} - {}'.format(artist, title))
        loader.add_value("link", response.url)
        loader.add_value("author_name", 'bot')
        loader.add_css("content_html", 'div.c-page--product')
        return loader.load_item()
示例#16
0
    def _parse_article_url(self, response):
        """Parse an article page, following its print version if available.

        Articles without a print version are treated as paywalled and
        emitted directly with a 'paywalled' category.
        """
        # FIX: extract_first() may return None on unexpected pages, which
        # made `'Fehler' in None` raise TypeError. Default to ''.
        heading = response.css('h2 ::text').extract_first() or ''
        if 'Fehler' in heading:
            self.logger.info('Skipping {} as it returned an error'.format(
                response.url))
            return

        remove_elems = ['div[style="padding-top:10px;"]']
        il = FeedEntryItemLoader(response=response,
                                 timezone=self._timezone,
                                 base_url='http://{}'.format(self.name),
                                 dayfirst=True,
                                 remove_elems=remove_elems)
        il.add_value('link', response.url)
        il.add_value('author_name', 'VKI')
        # FIX: raw string — '\s' is an invalid escape sequence in a normal
        # string literal. The pattern is unchanged.
        date = response.css('.issue').re_first(
            r'veröffentlicht:\s*([0-9]{2}\.[0-9]{2}\.[0-9]{4})')
        il.add_value('updated', date)
        # The print version is opened via window.open() in an onclick handler.
        url = (response.xpath('//a[text()="Druckversion"]/@onclick').re_first(
            r"window\.open\('(.*)'\);"))
        il.add_css('title', 'h1::text')
        if url:
            yield scrapy.Request(response.urljoin(url),
                                 callback=self._parse_article,
                                 meta={'il': il})
        else:
            # No print version available implies the article is paywalled.
            il.add_value('category', 'paywalled')
            il.add_css('content_html', '.primary')
            il.add_css('content_html', 'div[style="padding-top:10px;"] > h3')
            yield il.load_item()
示例#17
0
 def _parse_article(self, response):
     """Append this page's content to the entry; follow pagination."""
     il = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         # Credits, social widgets and the table of contents are stripped.
         remove_elems=[
             ".caption-credit",
             ".gallery-image-credit",
             "#social-left",
             "ul.toc",
             "h3:contains('Table of Contents')",
             "br",
             ".sidebar:contains('Further Reading')",
             ".credit",
         ],
         replace_elems={"div.image": self._div_to_img},
         change_tags={".sidebar": "blockquote", "aside": "blockquote"},
     )
     # Entry metadata is only added once, on the article's first page.
     if response.meta.get("first_page", False):
         il.add_value("link", response.url)
         il.add_css("author_name", ".byline a span ::text")
         il.add_css("content_html", "header h2")
         il.add_value("path", response.meta["path"])
     il.add_css("content_html", ".article-content")
     # No "next" marker: the article is complete.
     if not response.css(".next"):
         return il.load_item()
     # Otherwise follow the last page-number link to the next page.
     return scrapy.Request(
         response.css(".numbers a::attr(href)").extract()[-1],
         self._parse_article,
         meta={"il": il, "path": response.meta["path"]},
     )
示例#18
0
    def _parse_article(self, response):
        """Turn an article page into a feed entry."""

        def _fix_img_src(elem):
            # Lazy-loaded images keep the real URL in "data-original".
            original = elem.attrib.get("data-original")
            if original is not None:
                elem.attrib["src"] = original
            return elem

        il = FeedEntryItemLoader(
            response=response,
            base_url=self._base_url,
            # Credits, caption toggles, ads and newsletter boilerplate are
            # useless in a feed.
            remove_elems=[
                ".credit",
                ".hide-caption",
                ".toggle-caption",
                ".enlarge-options",
                ".enlarge_measure",
                ".enlarge_html",
                ".ad-backstage",
                'p:first-of-type:contains("Editor\'s Note: This is an excerpt of")',
                'p:contains("Did you enjoy this newsletter segment?")',
            ],
            replace_elems={"img": _fix_img_src},
            change_tags={".image": "figure", ".credit-caption": "figcaption"},
        )
        il.add_css("title", "h1 ::text")
        il.add_value("link", response.url)
        il.add_css("content_html", "#storytext")
        il.add_value("path", response.meta["path"])
        il.add_css("updated", '.dateblock time::attr("datetime")')
        il.add_css("author_name", ".byline__name a::text")

        yield il.load_item()
示例#19
0
 def parse_item(self, response):
     """Produce a feed entry for one article page."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url="https://{}".format(self.name),
         # Navigation, social and copyright chrome to drop from the body.
         remove_elems=[
             "aside",
             "script",
             "h1",
             "source",
             ".breadcrumbs",
             ".author-date",
             ".artikel-social-kommentar",
             ".bild-copyright",
             ".ressortTitleMobile",
             ".article-number",
             ".artikel-kommentarlink",
             ".umfrage-wrapper",
             ".articleIssueInfo",
         ],
     )
     loader.add_value("link", response.url)
     # "Von First Last" in the byline, otherwise the editorial shorthand.
     byline = response.css(".author-date ::text").re(r"(?:Von)?\s*(\w+ \w+)")
     loader.add_value("author_name", byline or "Red.")
     loader.add_css("title", 'h1[itemprop="headline"]::text')
     loader.add_value("updated", response.meta["updated"])
     loader.add_css("content_html", "article")
     return loader.load_item()
示例#20
0
 def parse_release_changelog(self, response):
     """Append the detailed changelog to the entry started by the caller."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=self._base_url,
     )
     loader.add_value("content_html", "<h1>Detailed Changelog</h1>")
     # Everything after the first heading is changelog content.
     loader.add_xpath("content_html", "//h1/following-sibling::*")
     return loader.load_item()
示例#21
0
 def parse_archive_search(self, response):
     """Yield text-page requests for all articles of an archive search."""
     hits = json.loads(response.text)["articles"]["hits"]
     for offset, item in enumerate(hits):
         loader = FeedEntryItemLoader(
             response=response,
             base_url="https://{}".format(self.name),
             timezone="Europe/Vienna",
         )
         loader.add_value("path", "magazine")
         link = response.urljoin(item["detail_link"])
         loader.add_value("link", link)
         try:
             # Strip "Von " or a leading "...:" prefix from the author list.
             author = re.sub(
                 r"(?:.*:|Von)\s*(.*)", r"\1", ", ".join(item["authors"]).title()
             )
             loader.add_value("author_name", author)
         except IndexError:
             pass
         loader.add_value("title", item["title"])
         # All articles have the same date.
         # We add an offset so they are sorted in the right order.
         loader.add_value("updated",
                          response.meta["issue_date"] + timedelta(seconds=offset))
         loader.add_value("category", item["ressort"])
         yield scrapy.Request(link, self.parse_item_text, meta={"il": loader})
示例#22
0
 def _parse_article(self, response):
     """Fill the parent entry's content from the article page body."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         base_url='http://{}'.format(self.name),
         # Masthead, heading and footer are not part of the article text.
         remove_elems=['#issue', 'h1', '#slogan', '#logo', '#footer'],
     )
     loader.add_css('content_html', '#page')
     yield loader.load_item()
示例#23
0
 def _parse_article(self, response):
     """Add the article body to the entry created by the caller."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         # Embedded frames and scripts are useless in a feed.
         remove_elems=['iframe', 'script'],
         base_url='http://{}'.format(self.name),
     )
     loader.add_css('content_html', '.entry-content')
     return loader.load_item()
示例#24
0
 def parse_release_changelog(self, response):
     """Extend the parent entry with the detailed changelog section."""
     loader = FeedEntryItemLoader(response=response,
                                  parent=response.meta['il'],
                                  base_url=self._base_url)
     loader.add_value('content_html', '<h1>Detailed Changelog</h1>')
     # Everything following the first heading belongs to the changelog.
     loader.add_xpath('content_html', '//h1/following-sibling::*')
     yield loader.load_item()
示例#25
0
 def parse(self, response):
     """Parse a page's posts JSON into a feed header plus one entry per post.

     A title is derived heuristically: the first message line if it looks
     like a headline, else the link name, else the first ten words of the
     message, else the link itself.
     """
     page = json.loads(response.text)
     yield generate_feed_header(title=page["name"],
                                link=page["link"],
                                path=response.meta["page_id"])
     for entry in page["posts"]["data"]:
         il = FeedEntryItemLoader()
         # updated_time also includes new comments not only updates to the
         # post.
         il.add_value("updated", entry["created_time"])
         # Post ids have the form "<user_id>_<post_id>"; split them into
         # the two URL components.
         il.add_value(
             "link",
             "https://www.{name}/{user_id}/posts/{post_id}".format(
                 name=self.name,
                 **dict(zip(["user_id", "post_id"],
                            entry["id"].split("_")))),
         )
         message = entry.get("message")
         name = entry.get("name")
         link = entry.get("link")
         if message:
             message = message.splitlines()
             title = message[0]
             if len(title.split()) < 10 and not title.startswith("http"):
                 # If the first line has less than ten words, it could be a
                 # title.
                 if title.upper() == title:
                     # All-caps headline: normalize to title case.
                     title = title.title()
                 del message[0]
             elif name and not name.startswith("http"):
                 # Fallback to the name (of the link).
                 title = name
             else:
                 # Fallback to the first ten words of the message.
                 title = " ".join(message[0].split(maxsplit=10)) + " ..."
             # Join the remaining lines as paragraphs and auto-link URLs.
             message = bleach.linkify("</p><p>".join(message))
             il.add_value("content_html", "<p>{}</p>".format(message))
         elif name:
             title = name
         else:
             title = link
         il.add_value("title", title)
         if link and name:
             il.add_value(
                 "content_html",
                 '<p><a href="{link}">{name}</a></p>'.format(link=link,
                                                             name=name),
             )
         picture = entry.get("picture")
         if picture:
             il.add_value(
                 "content_html",
                 '<a href="{link}"><img src="{image}"></a>'.format(
                     link=link, image=picture),
             )
         il.add_value("path", response.meta["page_id"])
         yield il.load_item()
示例#26
0
 def parse_item(self, response):
     """Add the page's main content block to the entry from the caller."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta['il'],
         # Drop the duplicated heading and lazy-image placeholders.
         remove_elems=['h1', '.delayed-image-load'],
         # Rewrite <noscript> wrappers to <div> so their content is kept.
         change_tags={'noscript': 'div'},
         base_url='http://{}'.format(self.name),
     )
     loader.add_xpath('content_html', '//div[@id="main-inner"]')
     yield loader.load_item()
示例#27
0
 def _parse_article(self, response):
     """Attach the article body to the parent entry."""
     loader = FeedEntryItemLoader(
         response=response,
         parent=response.meta["il"],
         base_url=f"https://{self.name}",
         # Masthead and footer elements are not article content.
         remove_elems=["#issue", "h1", "#slogan", "#logo", "#footer"],
     )
     loader.add_css("content_html", "#page")
     return loader.load_item()
示例#28
0
 def parse_letter(self, response):
     """Create a feed entry for a single newsletter issue."""
     account = response.meta["account"]
     loader = FeedEntryItemLoader(
         response=response, base_url=self._links.get(account))
     loader.add_value("path", account)
     loader.add_value("link", response.url)
     loader.add_css("title", "title::text")
     loader.add_css("author_name", "div#message-heading div.by-line a::text")
     loader.add_css("updated", "div#message-heading div.date::text")
     loader.add_css("content_html", "div.message-body")
     yield loader.load_item()
示例#29
0
 def parse_item(self, response):
     """Build a feed entry from the page's main content block."""
     main = response.xpath('//div[@class="main"]')
     loader = FeedEntryItemLoader(selector=main, timezone="Europe/Vienna")
     loader.add_xpath("title", "h1/text()")
     loader.add_value("link", response.url)
     loader.add_xpath("content_html", "h1/following-sibling::*")
     # The last URL path segment starts with the date ("<date>_<slug>").
     slug = response.url.rstrip("/").split("/")[-1]
     loader.add_value("updated", slug.split("_")[0])
     loader.add_value("author_name", self.name)
     return loader.load_item()
示例#30
0
 def _parse_restaurant(self, response):
     """Enrich the parent entry with details from the restaurant page."""
     loader = FeedEntryItemLoader(
         response=response,
         base_url=response.url,
         parent=response.meta["il"],
         # External-link boxes are not part of the description.
         remove_elems=[".external"],
     )
     loader.add_css("content_html", ".content .right p")
     loader.add_css("content_html", ".restaurant-link")
     loader.add_css("category", ".tags a ::text")
     yield loader.load_item()