Example #1
    def parse_article(self, response):

        author_url = response.css(".article__byline>a::attr(href)").get()
        if author_url:
            yield scrapy.Request(author_url, callback=self.parse_author)

        hero_image = response.css(".article__featured-image::attr(src)").get()

        # Only image_urls depends on the hero image, so build the item once.
        article_item = ArticleItem(
            title=response.css("h1::text").get(),
            slug=self.extract_slug(response.url),
            author=response.css(".article__byline>a::text").get().strip(),
            subject=response.meta.get("subject"),
            publish_date=self.extract_publish_date(response),
            text=self.extract_text(response),
            image_urls=[hero_image] if hero_image else [],
        )

        yield article_item
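
The image_urls field above follows the naming convention of Scrapy's built-in ImagesPipeline. A minimal sketch of the settings that would activate it, assuming default behavior (the storage path is an example value, not from the original project):

# settings.py -- minimal ImagesPipeline setup (sketch)
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 1,
}
IMAGES_STORE = "images"  # assumed local directory for downloaded files

For the pipeline to record download results, ArticleItem would also need an images field alongside image_urls.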
Example #2
    def parse_article(self, response):
        if response.url not in final_result:
            title = response.xpath(
                '//table/tr//p[@class="naslov"]/text()').get()
            link = response.url
            unparsed = response.xpath('//table//tr//p/text()').getall()
            publisher = 'Vreme'
            author = response.xpath('//table//em/text()').get()

            if author is None:
                author = ",".join(
                    response.xpath('//table/tr/td//p[last()]/text()').getall()
                ).splitlines()

            date = response.xpath(
                '//table/tr/td[@align="right"]/p//text()').get()

            parsed_content = [i.strip() for i in unparsed if i != '\n']
            content = "".join(parsed_content[2:])

            loader = ItemLoader(item=ArticleItem(), response=response)
            loader.add_value('article_title', str(title))
            loader.add_value('article_publisher', str(publisher))
            loader.add_value('article_url', str(link))
            loader.add_value('article_body', content)
            loader.add_value('article_date', str(date))
            loader.add_value('article_author', str("".join(author).lstrip()))
            yield loader.load_item()

            c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                      (publisher, str(link)))
            conn.commit()
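
Examples #2, #3, #5, #11 and #12 rely on module-level state that the snippets leave out: a sqlite3 connection conn, its cursor c, and final_result, the collection of already-stored URLs checked at the top of each callback. A minimal sketch of that assumed setup, with the table layout inferred from the three-value INSERT statements:

import sqlite3

# Assumed module-level setup shared by the deduplicating examples.
conn = sqlite3.connect("articles.db")
c = conn.cursor()

# Three columns to match "INSERT INTO articles VALUES (null, ?, ?)":
# an autoincrement id, the publisher, and the article URL.
c.execute("""CREATE TABLE IF NOT EXISTS articles
             (id INTEGER PRIMARY KEY AUTOINCREMENT,
              publisher TEXT,
              url TEXT)""")

# URLs stored by earlier runs, consulted as `response.url not in final_result`.
final_result = {row[0] for row in c.execute("SELECT url FROM articles")}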
Example #3
    def parse_article(self, response):
        if response.url not in final_result:
            title = response.xpath(
                '//div[@class="article_head"]/h1/text()')[0].get()
            link = response.url
            content = response.xpath(
                '//div[@class="desc_holder cf main--content"]//text()').getall(
                )
            publisher = 'Južne Vesti'
            author = response.xpath(
                '//div[@class="article_head"]/div/span[2]/text()').get()
            date = response.xpath(
                '//p[@class="article--single__date dib color--lgray"]/text()'
            )[1].get().strip()

            loader = ItemLoader(item=ArticleItem(), response=response)
            loader.add_value('article_title', str(title))
            loader.add_value('article_publisher', str(publisher))
            loader.add_value('article_url', str(link))
            loader.add_value('article_body', str("".join(content).lstrip()))
            loader.add_value('article_date', str(date))
            loader.add_value('article_author', str(author))
            yield loader.load_item()

            c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                      (publisher, str(link)))
            conn.commit()

Example #4
    def parse_item(self, response):

        sel = Selector(response)

        article = ArticleItem()

        article['source'] = 'Folha de S.Paulo'

        article['url'] = response.url

        title = sel.xpath(FOLHA_ARTICLE_TITLE).extract()
        article['title'] = title[0] if title else None

        pub_date = sel.xpath(FOLHA_ARTICLE_PUB_DATE).extract()[0]
        article['pub_date'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M")

        content = ' '.join(sel.xpath(FOLHA_ARTICLE_CONTENT).extract())
        article['body'] = content if content else None

        links = sel.xpath('//article//a/@href').extract()
        links = list(set(links))
        # list.remove raises ValueError when the entry is absent.
        try:
            links.remove('javascript:;')
        except ValueError:
            pass

        article['links'] = links

        return article
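
Example #4 depends on module-level XPath constants and a datetime import that the snippet omits. A sketch of the assumed surroundings; the XPath values here are hypothetical placeholders, not the project's real expressions:

from datetime import datetime

from scrapy.selector import Selector

# Hypothetical values for illustration only -- the real constants live
# elsewhere in the original project.
FOLHA_ARTICLE_TITLE = "//h1[@class='news-title']/text()"
FOLHA_ARTICLE_PUB_DATE = "//time[@class='news-date']/@datetime"
FOLHA_ARTICLE_CONTENT = "//div[@class='news-content']//p/text()"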
Example #5
    def parse_article(self, response):
        if response.url not in final_result:
            title = response.xpath(
                '//div[@class="content"]/article/div/h1/span/text()').get()
            link = response.url
            unparsed = response.xpath(
                '//div[@class="content"]/article/div/div[@class="entry"]//text()'
            ).getall()
            publisher = 'Печат'
            author = response.xpath(
                '//div[@class="content"]/article/div/p[@class="post-meta"]/span[@class="post-meta-author"]/a/text()'
            ).get().strip()
            date = response.xpath(
                '//div[@class="content"]/article/div/p[@class="post-meta"]/span[@class="tie-date"]/text()'
            ).get().strip()

            parsed_content = [i.strip() for i in unparsed if i != '\n']
            content = "".join(parsed_content[3:-1])

            loader = ItemLoader(item=ArticleItem(), response=response)
            loader.add_value('article_title', str(title))
            loader.add_value('article_publisher', str(publisher))
            loader.add_value('article_url', str(link))
            loader.add_value('article_body', content.lstrip())
            loader.add_value('article_date', str(date))
            loader.add_value('article_author', str(author))
            yield loader.load_item()

            c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                      (publisher, str(link)))
            conn.commit()

Example #6
 def parse(self, response):
     item = ArticleItem()
     item['website'] = 'news.17173.com'
     item['url'] = response.url
     item['title'] = response.xpath('//h1[@class="gb-final-tit-article"]/text()').extract_first()
     item['content'] = '-'.join(response.xpath('//div[@id="mod_article"]//text()').extract())
     item['category'] = '游戏'
     item['publish_time'] = response.xpath('//div[@class="gb-final-mod-info"]/span[1]/text()').extract_first()
     yield item
Example #7
 def parse_article(self, response):
     al = ItemLoader(item=ArticleItem(), response=response)
     # Use the last path segment of the URL as the article id.
     article_id = response.url.rstrip('/').rsplit('/', 1)[-1]
     al.add_value('url', response.url)
     al.add_value('article_id', article_id)
     al.add_css('title', 'h1.entry-title::text')
     al.add_css('target', 'div.entry-content pre')
     al.add_css('image_urls', 'div.entry-content p img::attr(src)')
     return al.load_item()
Example #8
 def parse(self, response):
     articles = response.xpath("//div[@class='GN-lbox2B']")
     for article in articles:
         url = article.xpath('h1/a/@href').extract_first()
         if url:
             url = 'https:' + url
             item = ArticleItem()
             item['website'] = 'https://gnn.gamer.com.tw'
             item['title'] = article.xpath("h1/a/text()").extract_first()
             yield scrapy.Request(url=url,
                                  callback=self.parse_item,
                                  meta={'item': item})
Example #9
    def parse(self, response):

        item = ArticleItem()
        item['website'] = 'news.sina.com.cn'
        item['url'] = response.url
        item['title'] = response.xpath(
            '//h1[@class="main-title"]/text()').extract_first()
        item['content'] = response.xpath(
            '//div[@class="article"]//p/text()').extract()
        item['category'] = response.xpath(
            '//div[@class="channel-path"]//a/text()').extract_first()
        item['publish_time'] = response.xpath(
            '//span[@class="date"]/text()').extract_first()
        yield item
Example #10
 def parse(self, response):
     self.logger.info('Parse function called on {}'.format(response.url))
     xml = response.body.decode("utf-8")
     # Collect unique story URLs from the raw markup; requires a
     # module-level `import re`.
     urls = set()
     for url in re.findall(r'(https?://[^&"<>]+)', xml):
         if "orf.at/stories" in url and url not in urls:
             urls.add(url)
             article_item = ArticleItem()
             article_item["url"] = url
             yield response.follow(url,
                                   self.parse_article,
                                   meta={'article_item': article_item})
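
Examples #8 and #10 hand a partially built item to the next callback through meta. A sketch of the receiving side under that pattern; the title XPath is a placeholder, since the original downstream callbacks are not shown:

    def parse_article(self, response):
        # Pick up the item started in parse().
        article_item = response.meta['article_item']
        # Hypothetical field fill-in for illustration only.
        article_item['title'] = response.xpath('//h1/text()').get()
        yield article_item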
Example #11
    def parse_article(self, response):
        if response.url not in final_result:
            title = response.xpath('//b/text()').get()
            link = response.url
            unparsed_content = response.xpath('//body//p/text()').getall()
            publisher = 'НИН - Недељне Информативне Новине'
            author = response.xpath('//body/p[last()]/text()').get()

            # Fall back to the last bold element when the final paragraph
            # carries no author line.
            if author is None or author.strip() == '':
                author = response.xpath('//body//b/text()')[-1].get()
            author = author.strip()

            publication = response.xpath('//body/text()').get().strip().split(
                ",")

            if publication == ['']:
                publication = response.xpath(
                    '//head/title/text()').get().split(",")

            pub_num = publication[0]

            if len(publication) > 1:
                pub_date = publication[1]
            else:
                pub_date = ""

            parsed_content = [i.strip() for i in unparsed_content if i != '\n']
            content = "".join(parsed_content[:-1])

            loader = ItemLoader(item=ArticleItem(), response=response)
            loader.add_value('article_title', str(title))
            loader.add_value('article_publisher', str(publisher))
            loader.add_value('article_url', str(link))
            loader.add_value('article_body', content)
            loader.add_value('article_date', pub_date)
            loader.add_value('article_author', str(author[:50]))
            yield loader.load_item()
            c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                      (publisher, str(link)))
            conn.commit()

            #TODO: create publikacija json object and send post request to rsj api
Example #12
    def parse_article(self, response):
        if response.url not in final_result:
            title = response.xpath('//head/title/text()').get()
            link = response.url
            unparsed = response.xpath(
                '//div[@id="mainTextClanak"]//p//text()').getall()
            publisher = 'Vreme'
            author = response.xpath('//div[@class="autor"]//text()').get()

            # .get() may return None; normalize to a clean string.
            if author is None or len(author) < 2:
                author = ""
            else:
                author = author.lstrip()

            date = response.xpath('//span[@class="datum"]/text()').get()

            parsed_content = [i.strip() for i in unparsed if i != '\n']
            content = "".join(parsed_content)

            loader = ItemLoader(item=ArticleItem(), response=response)
            loader.add_value('article_title', str(title))
            loader.add_value('article_publisher', str(publisher))
            loader.add_value('article_url', str(link))
            loader.add_value('article_body', content)
            loader.add_value('article_date', str(date.split("|")[2]))
            loader.add_value('article_author', str(author))
            yield loader.load_item()

            c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                      (publisher, str(link)))
            conn.commit()

Example #13
 def getUrl(self, response):
     # Save item
     item = ArticleItem()
     item['nurl'] = response.url
     item['aid'] = item['nurl'][-10:]
     item['press'] = response.xpath(
         '//div[@class="press_logo"]/a/img/@title').extract()
     item['title'] = response.xpath(
         '//h3[@id="articleTitle"]/text()').extract()
     item['article_body'] = ''.join(
         response.xpath(
             '//div[@id="articleBodyContents"]/text()').extract()).strip()
     item['date'] = response.xpath('//span[@class="t11"]/text()').extract()
     item['purl'] = response.xpath(
         '//div[@class="article_btns"]/a/@href').extract()
     item['nclass'] = response.xpath(
         '//div[@id="snb"]/h2/a/text()').extract()
     item['nclass2'] = response.xpath(
         '//ul[@class="nav"]/li[@class="on"]/a/text()').extract()
     return item
Example #14
    def parse_item(self, response):

        self.logger.info(f"this is the url: {response.url}")

        # Build one CommentItem per comment block; a single shared loader
        # would merge every comment's fields together.
        comments = []
        for comment in response.selector.xpath(
                "//div[@class='comments-wrap']"):
            comment_loader = ItemLoader(item=CommentItem(), selector=comment)
            comment_loader.add_xpath('username',
                                     ".//cite[@class='comment-author']/text()")
            comment_loader.add_xpath(
                'publish', ".//time[@class='comment-published']/@datetime")
            comment_loader.add_xpath('content',
                                     ".//div[@class='comment-content']/p/text()")
            comments.append(comment_loader.load_item())

        body = response.selector.xpath("/html/body")
        article = ItemLoader(item=ArticleItem(),
                             selector=body,
                             response=response)
        article.add_xpath('title', "//span[@class='post-title']/text()")
        article.add_xpath(
            'body',
            "//div[@class='entry-content clearfix single-post-content']")
        article.add_xpath('author',
                          '//span[@class="post-author-name"]/b/text()')
        article.add_xpath(
            'lastmodified',
            '//div[@class="post-meta single-post-meta"]/span/time/@datetime')
        article.add_xpath('readin', '//span[@class="no-line"]/b[1]/text()')
        article.add_xpath(
            'tags', '//div[@class="entry-terms post-tags clearfix "]/a/text()')
        article.add_value('slug', response.request.url.split('/')[-2])
        article.add_value('comments', comments)

        return article.load_item()
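
Example #14 nests comment items inside the article item. A sketch of item definitions consistent with the loader calls above; the classes are inferred from the field names, not taken from the original project:

import scrapy

class CommentItem(scrapy.Item):
    username = scrapy.Field()
    publish = scrapy.Field()
    content = scrapy.Field()

class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    body = scrapy.Field()
    author = scrapy.Field()
    lastmodified = scrapy.Field()
    readin = scrapy.Field()
    tags = scrapy.Field()
    slug = scrapy.Field()
    comments = scrapy.Field()  # list of CommentItem produced above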
Example #15
    def parse_article(self, response):
        title = response.xpath('//head/title/text()')[0].get()
        link = response.url
        content = "".join(
            response.xpath(
                '//article//div[contains(@class, "article-content")]//text()').
            getall()).splitlines()
        publisher = 'Politika'
        author = response.xpath(
            '//article//div[contains(@class, "date-time")]/a[contains(@class,"author-name")]/text()'
        ).get()
        date = response.xpath(
            '//article//div[contains(@class, "date-time")]/text()').get(
            ).strip()

        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_value('article_title', str(title))
        loader.add_value('article_publisher', str(publisher))
        loader.add_value('article_url', str(link))
        loader.add_value('article_body', str("".join(content).lstrip()))
        loader.add_value('article_date', str(date))
        loader.add_value('article_author', str(author))
        yield loader.load_item()
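
Most loaders in these examples wrap every value in str() and strip whitespace by hand. ItemLoader input/output processors can centralize that cleanup; a minimal sketch using the processors bundled with Scrapy (applying them here is a suggestion, not how the original projects are written):

from itemloaders.processors import Join, MapCompose, TakeFirst
from scrapy.loader import ItemLoader

class ArticleLoader(ItemLoader):
    # Strip each extracted string on the way in, keep the first value
    # on the way out.
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    # Glue body fragments into a single string instead of a list.
    article_body_out = Join("")

With such a loader, parse_article can call add_xpath directly and drop the manual str() and strip() calls.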