Python SoccerNewsItem示例，scrapy_news.items.SoccerNewsItem Python示例

示例#1

0

显示文件

    def parse(self, response):
        """Build a SoccerNewsItem from an article page and yield it.

        The date, headline, subhead and author are each taken from the
        first match of their CSS selector.  The body is every text node
        under ``.news-text-content p`` joined with spaces, with the joined
        media-description text removed from it.
        """
        def first(query):
            # First extracted match for the CSS query.
            return response.css(query).extract_first()

        def joined(query):
            # Every extracted match for the CSS query, space-separated.
            return " ".join(response.css(query).extract())

        page_url = response.url
        published = first(".news-date ::attr(datetime)")
        title = first(".news-title ::text")
        summary = first(".news-summary ::text")
        byline = first(".author-name a ::text")

        paragraphs = joined(".news-text-content p ::text")
        captions = joined(
            ".news-text-content .news-media-description p ::text")
        # Media descriptions live inside the article container, so their
        # joined text is cut out of the body wherever it appears verbatim.
        article_text = paragraphs.replace(captions, "")

        yield SoccerNewsItem(headline=title,
                             subhead=summary,
                             author=byline,
                             body_text=article_text,
                             url=page_url,
                             datetime=published,
                             source=self.name)

示例#2

0

显示文件

文件： metro.py 项目： dcaled/FTR-18

    def parse(self, response):
        """Build a SoccerNewsItem from an article page and yield it.

        The date, headline and author come from the first match of their
        CSS selectors; the subhead is always the empty string.  The body is
        every text node under ``.article-body p`` joined with spaces, from
        which the ``.zopo-title span`` text (in groups of three fragments),
        the ``p.vjs-no-js`` text and the ``.mor-link`` text are removed.
        """
        url = response.url
        datetime = response.css(".post-date ::text").extract_first()
        headline = response.css(".post-title ::text").extract_first()
        subhead = ""

        # extract_first() yields None when nothing matches; calling .strip()
        # on it unconditionally would raise AttributeError and lose the item.
        author = response.css(".author ::text").extract_first()
        if author is not None:
            author = author.strip()

        body_text = " ".join(response.css('.article-body p ::text').extract())

        rel_lst = response.css('.zopo-title span ::text').extract()
        vid_text = " ".join(response.css("p.vjs-no-js ::text").extract())
        mor_text = " ".join(response.css(".mor-link ::text").extract())

        # The title fragments are removed three at a time, each group joined
        # with spaces the same way the body text was assembled.
        for start in range(0, len(rel_lst), 3):
            group_text = " ".join(rel_lst[start:start + 3])
            body_text = body_text.replace(group_text, "")

        body_text = body_text.replace(vid_text, "")
        body_text = body_text.replace(mor_text, "")

        # NOTE: text from '.embed-twitter p' and '.instagram-media p' is
        # currently not removed from the body.

        notice = SoccerNewsItem(headline=headline,
                                subhead=subhead,
                                author=author,
                                body_text=body_text,
                                url=url,
                                datetime=datetime,
                                source=self.name)

        yield notice

示例#3

0

显示文件

    def parse(self, response):
        """Build a SoccerNewsItem from an article page and yield it.

        The date, headline and author come from the first match of their
        CSS selectors.  The subhead is every ``h2`` text node, stripped and
        joined with spaces.  The body is every text node under
        ``.editor p`` joined with spaces, minus the first ``.relations p``
        text node (when there is one) and the joined ``.box-left-55 p``
        text.
        """
        select = response.css

        page_url = response.url
        published = select(".date ::attr(datetime)").extract_first()
        title = select("h1 ::text").extract_first()
        h2_fragments = select("h2 ::text").extract()
        byline = select(".author-link ::text").extract_first()
        article_text = " ".join(select('.editor p ::text').extract())

        secondary = " ".join(fragment.strip() for fragment in h2_fragments)

        related = select('.relations p ::text').extract_first()
        if related:
            article_text = article_text.replace(related, "")
        boxed = " ".join(select('.box-left-55 p ::text').extract())
        article_text = article_text.replace(boxed, "")

        # NOTE: text from '.twitter-tweet' and '.instagram-media' is
        # currently not removed from the body.

        yield SoccerNewsItem(headline=title,
                             subhead=secondary,
                             author=byline,
                             body_text=article_text,
                             url=page_url,
                             datetime=published,
                             source=self.name)

示例#4

0

显示文件

    def parse(self, response):
        """Build a SoccerNewsItem from an article page and yield it.

        The date, headline, subhead and author are each taken from the
        first match of their ``.story-leaf-*`` CSS selector.  The body is
        every text node under ``.story-leaf-txt-p p`` joined with spaces,
        with the joined ``p.story-leaf-relatednews-epigraph`` text removed.
        """
        select = response.css

        page_url = response.url
        published = select(
            ".story-leaf-datetime ::attr(datetime)").extract_first()
        title = select(".story-leaf-title ::text").extract_first()
        summary = select(".story-leaf-subtitle ::text").extract_first()
        byline = select(".story-leaf-author-link ::text").extract_first()

        paragraphs = select(".story-leaf-txt-p p ::text").extract()
        related = select('p.story-leaf-relatednews-epigraph ::text').extract()
        # The related-news epigraph text is cut out of the body wherever
        # its joined form appears verbatim.
        article_text = " ".join(paragraphs).replace(" ".join(related), "")

        yield SoccerNewsItem(headline=title,
                             subhead=summary,
                             author=byline,
                             body_text=article_text,
                             url=page_url,
                             datetime=published,
                             source=self.name)

示例#5

0

显示文件

文件： telegraph.py 项目： dcaled/FTR-18

    def parse(self, response):
        """Build a SoccerNewsItem from an article page and yield it.

        The date, headline, subhead and author come from the first match of
        their CSS selectors.  The body is every text node under
        ``.articleBodyText p``, stripped and joined with spaces, with the
        subhead text removed from it when a subhead was found.
        """
        url = response.url
        datetime = response.css(
            ".component-content time::attr(datetime)").extract_first()
        headline = response.css("h1.headline__heading ::text").extract_first()
        subhead = response.css(".lead-asset-caption ::text").extract_first()
        author = response.css(".byline__author-name a ::text").extract_first()
        bt_lst = response.css(".articleBodyText p ::text").extract()

        body_text = " ".join(fragment.strip() for fragment in bt_lst)

        # extract_first() yields None when the page has no lead-asset
        # caption, and str.replace(None, "") raises TypeError, so the
        # subhead is only cut from the body when one was actually found.
        if subhead:
            body_text = body_text.replace(subhead, "")

        notice = SoccerNewsItem(headline=headline,
                                subhead=subhead,
                                author=author,
                                body_text=body_text,
                                url=url,
                                datetime=datetime,
                                source=self.name)

        yield notice