def parse_article(self, response):
    # Follow the byline link to scrape the author's profile page.
    author_url = response.css(".article__byline>a::attr(href)").get()
    if author_url:
        yield scrapy.Request(author_url, callback=self.parse_author)

    # The if/else branches differed only in image_urls, so the duplicated
    # ArticleItem construction collapses into a single expression.
    hero_image = response.css(".article__featured-image::attr(src)").get()
    yield ArticleItem(
        title=response.css("h1::text").get(),
        slug=self.extract_slug(response.url),
        author=response.css(".article__byline>a::text").get(default="").strip(),
        subject=response.meta.get("subject"),
        publish_date=self.extract_publish_date(response),
        text=self.extract_text(response),
        image_urls=[hero_image] if hero_image else [],
    )
def parse_article(self, response):
    if response.url not in final_result:
        title = response.xpath(
            '//table/tr//p[@class="naslov"]/text()').get()
        link = response.url
        unparsed = response.xpath('//table//tr//p/text()').getall()
        publisher = 'Vreme'
        # Prefer the <em> byline; otherwise fall back to the last paragraph.
        author = response.xpath('//table//em/text()').get()
        if author is None:
            author = ",".join(
                response.xpath('//table/tr/td//p[last()]/text()').getall()
            ).splitlines()
        date = response.xpath(
            '//table/tr/td[@align="right"]/p//text()').get()
        # Drop whitespace-only fragments and skip the two leading header lines.
        parsed_content = [i.strip() for i in unparsed if i != '\n']
        content = "".join(parsed_content[2:])
        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_value('article_title', str(title))
        loader.add_value('article_publisher', publisher)
        loader.add_value('article_url', link)
        loader.add_value('article_body', content)
        loader.add_value('article_date', str(date))
        loader.add_value('article_author', "".join(author).lstrip())
        yield loader.load_item()
        # Remember the URL so this article is not crawled again.
        c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                  (publisher, str(link)))
        conn.commit()
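# Several spiders below reference module-level names (conn, c, final_result)
# that are not shown in the source. A minimal sketch of that setup, assuming
# a local SQLite database and a pre-loaded set of already-crawled URLs
# (everything here apart from conn, c, and final_result is hypothetical):
import sqlite3

conn = sqlite3.connect('articles.db')
c = conn.cursor()
c.execute("CREATE TABLE IF NOT EXISTS articles "
          "(id INTEGER PRIMARY KEY, publisher TEXT, url TEXT)")
# URLs already stored in the database; parse_article skips these.
final_result = {row[0] for row in c.execute("SELECT url FROM articles")}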
def parse_article(self, response): if response.url not in final_result: title = response.xpath( '//div[@class="article_head"]/h1/text()')[0].get() link = response.url content = response.xpath( '//div[@class="desc_holder cf main--content"]//text()').getall( ) publisher = 'Južne Vesti' author = response.xpath( '//div[@class="article_head"]/div/span[2]/text()').get() date = response.xpath( '//p[@class="article--single__date dib color--lgray"]/text()' )[1].get().strip() loader = ItemLoader(item=ArticleItem(), response=response) loader.add_value('article_title', str(title)) loader.add_value('article_publisher', str(publisher)) loader.add_value('article_url', str(link)) loader.add_value('article_body', str("".join(content).lstrip())) loader.add_value('article_date', str(date)) loader.add_value('article_author', str(author)) yield loader.load_item() c.execute("INSERT INTO articles VALUES (null, ?, ?)", (publisher, str(link))) conn.commit() else: pass
def parse_item(self, response):
    sel = Selector(response)
    article = ArticleItem()
    article['source'] = 'Folha de S.Paulo'
    article['url'] = response.url
    title = sel.xpath(FOLHA_ARTICLE_TITLE).extract()
    article['title'] = title[0] if title else None
    pub_date = sel.xpath(FOLHA_ARTICLE_PUB_DATE).extract()[0]
    article['pub_date'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M")
    content = ' '.join(sel.xpath(FOLHA_ARTICLE_CONTENT).extract())
    article['body'] = content if content else None
    # Collect unique outbound links, dropping the javascript pseudo-link.
    links = list(set(sel.xpath('//article//a/@href').extract()))
    if 'javascript:;' in links:
        links.remove('javascript:;')
    article['links'] = links
    return article
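# FOLHA_ARTICLE_TITLE, FOLHA_ARTICLE_PUB_DATE and FOLHA_ARTICLE_CONTENT are
# XPath constants defined elsewhere in the project. The placeholders below
# are purely illustrative (the real expressions are not shown in the source)
# so the function above can be read in isolation:
FOLHA_ARTICLE_TITLE = '//article//h1/text()'
FOLHA_ARTICLE_PUB_DATE = '//article//time/@datetime'
FOLHA_ARTICLE_CONTENT = '//article//p/text()'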
def parse_article(self, response): if response.url not in final_result: title = response.xpath( '//div[@class="content"]/article/div/h1/span/text()').get() link = response.url unparsed = response.xpath( '//div[@class="content"]/article/div/div[@class="entry"]//text()' ).getall() publisher = 'Печат' author = response.xpath( '//div[@class="content"]/article/div/p[@class="post-meta"]/span[@class="post-meta-author"]/a/text()' ).get().strip() date = response.xpath( '//div[@class="content"]/article/div/p[@class="post-meta"]/span[@class="tie-date"]/text()' ).get().strip() parsed_content = [i.strip() for i in unparsed if i != '\n'] content = "".join(parsed_content[3:-1]) loader = ItemLoader(item=ArticleItem(), response=response) loader.add_value('article_title', str(title)) loader.add_value('article_publisher', str(publisher)) loader.add_value('article_url', str(link)) loader.add_value('article_body', str("".join(content).lstrip())) loader.add_value('article_date', str(date)) loader.add_value('article_author', str(author)) yield loader.load_item() c.execute("INSERT INTO articles VALUES (null, ?, ?)", (publisher, str(link))) conn.commit() else: pass
def parse(self, response):
    item = ArticleItem()
    item['website'] = 'news.17173.com'
    item['url'] = response.url
    item['title'] = response.xpath(
        '//h1[@class="gb-final-tit-article"]/text()').extract_first()
    item['content'] = '-'.join(
        response.xpath('//div[@id="mod_article"]//text()').extract())
    item['category'] = '游戏'  # "games"
    item['publish_time'] = response.xpath(
        '//div[@class="gb-final-mod-info"]/span[1]/text()').extract_first()
    yield item
def parse_article(self, response):
    al = ItemLoader(item=ArticleItem(), response=response)
    # Take the last path segment of the URL as the article id; rstrip('/')
    # guards against trailing-slash URLs.
    article_id = response.url.rstrip('/').rsplit('/', 1)[-1]
    al.add_value('url', response.url)
    al.add_value('article_id', article_id)
    al.add_css('title', 'h1.entry-title::text')
    al.add_css('target', 'div.entry-content pre')
    al.add_css('image_urls', 'div.entry-content p img::attr(src)')
    return al.load_item()
def parse(self, response):
    articles = response.xpath("//div[@class='GN-lbox2B']")
    for article in articles:
        url = article.xpath('h1/a/@href').extract_first()
        if url:
            # Hrefs are protocol-relative, so prepend the scheme.
            url = 'https:' + url
            item = ArticleItem()
            item['website'] = 'https://gnn.gamer.com.tw'
            item['title'] = article.xpath("h1/a/text()").extract_first()
            # Hand the partially-filled item to parse_item via meta.
            yield scrapy.Request(url=url, callback=self.parse_item,
                                 meta={'item': item})
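# parse_item is not shown in the source. A minimal sketch of how the
# partially-filled item is typically recovered from response.meta and
# completed on the article page; the 'url' and 'content' fields and the
# body XPath are illustrative assumptions, not taken from the project:
def parse_item(self, response):
    item = response.meta['item']
    item['url'] = response.url
    item['content'] = ''.join(
        response.xpath('//article//text()').extract()).strip()
    yield item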
def parse(self, response):
    item = ArticleItem()
    item['website'] = 'news.sina.com.cn'
    item['url'] = response.url
    item['title'] = response.xpath(
        '//h1[@class="main-title"]/text()').extract_first()
    item['content'] = response.xpath(
        '//div[@class="article"]//p/text()').extract()
    item['category'] = response.xpath(
        '//div[@class="channel-path"]//a/text()').extract_first()
    item['publish_time'] = response.xpath(
        '//span[@class="date"]/text()').extract_first()
    yield item
def parse(self, response):
    self.logger.info('Parse function called on {}'.format(response.url))
    # The feed is XML, so pull every absolute URL out of the raw body and
    # keep only links to ORF story pages, de-duplicated via a set.
    xml = response.body.decode("utf-8")
    urls = set()
    for url in re.findall(r'(https?://[^&"<>]+)', xml):
        if "orf.at/stories" in url and url not in urls:
            urls.add(url)
            article_item = ArticleItem()
            article_item["url"] = url
            yield response.follow(url, self.parse_article,
                                  meta={'article_item': article_item})
def parse_article(self, response):
    if response.url not in final_result:
        title = response.xpath('//b/text()').get()
        link = response.url
        unparsed_content = response.xpath('//body//p/text()').getall()
        publisher = 'НИН - Недељне Информативне Новине'
        # The byline is usually the last paragraph; fall back to the last
        # bold element when that paragraph is empty.
        author = response.xpath('//body/p[last()]/text()').get().strip()
        if author == '':
            author = response.xpath('//body//b/text()')[-1].get()
        # Issue number and date come either from the body text or, failing
        # that, from the <title> tag.
        publication = response.xpath(
            '//body/text()').get().strip().split(",")
        if publication == ['']:
            publication = response.xpath(
                '//head/title/text()').get().split(",")
        pub_num = publication[0]
        pub_date = publication[1] if len(publication) > 1 else ""
        parsed_content = [i.strip() for i in unparsed_content if i != '\n']
        content = "".join(parsed_content[:-1])
        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_value('article_title', str(title))
        loader.add_value('article_publisher', publisher)
        loader.add_value('article_url', link)
        loader.add_value('article_body', content)
        loader.add_value('article_date', pub_date)
        loader.add_value('article_author', str(author)[:50])
        yield loader.load_item()
        c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                  (publisher, str(link)))
        conn.commit()
        # TODO: create publikacija json object and send post request to rsj api
def parse_article(self, response):
    if response.url not in final_result:
        title = response.xpath('//head/title/text()').get()
        link = response.url
        unparsed = response.xpath(
            '//div[@id="mainTextClanak"]//p//text()').getall()
        publisher = 'Vreme'
        author = response.xpath('//div[@class="autor"]//text()').get()
        if author is None or len(author) < 2:
            author = ""
        else:
            author = author.lstrip()
        date = response.xpath('//span[@class="datum"]/text()').get()
        parsed_content = [i.strip() for i in unparsed if i != '\n']
        content = "".join(parsed_content)
        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_value('article_title', str(title))
        loader.add_value('article_publisher', publisher)
        loader.add_value('article_url', link)
        loader.add_value('article_body', content)
        # The date field is pipe-delimited; keep its third segment.
        loader.add_value('article_date', str(date).split("|")[2])
        loader.add_value('article_author', author)
        yield loader.load_item()
        c.execute("INSERT INTO articles VALUES (null, ?, ?)",
                  (publisher, str(link)))
        conn.commit()
def getUrl(self, response):
    # Save item
    item = ArticleItem()
    item['nurl'] = response.url
    item['aid'] = item['nurl'][-10:]
    item['press'] = response.xpath(
        '//div[@class="press_logo"]/a/img/@title').extract()
    item['title'] = response.xpath(
        '//h3[@id="articleTitle"]/text()').extract()
    item['article_body'] = ''.join(response.xpath(
        '//div[@id="articleBodyContents"]/text()').extract()).strip()
    item['date'] = response.xpath('//span[@class="t11"]/text()').extract()
    item['purl'] = response.xpath(
        '//div[@class="article_btns"]/a/@href').extract()
    item['nclass'] = response.xpath('//div[@id="snb"]/h2/a/text()').extract()
    item['nclass2'] = response.xpath(
        '//ul[@class="nav"]/li[@class="on"]/a/text()').extract()
    return item
def parse_item(self, response):
    self.logger.info(f"this is the url: {response.url}")
    # One loader collects every comment field from the comments block; the
    # relative XPaths gather values across all comments at once, so no
    # per-comment loop is needed (the old loop re-added the same values
    # once per comment).
    comments = ItemLoader(
        item=CommentItem(),
        selector=response.selector.xpath("//div[@class='comments-wrap']"),
        response=response)
    comments.add_xpath('username', ".//cite[@class='comment-author']/text()")
    comments.add_xpath('publish', ".//time[@class='comment-published']/@datetime")
    comments.add_xpath('content', ".//div[@class='comment-content']/p/text()")

    body = response.selector.xpath("/html/body")
    article = ItemLoader(item=ArticleItem(), selector=body, response=response)
    article.add_xpath('title', "//span[@class='post-title']/text()")
    article.add_xpath(
        'body', "//div[@class='entry-content clearfix single-post-content']")
    article.add_xpath('author', '//span[@class="post-author-name"]/b/text()')
    article.add_xpath(
        'lastmodified',
        '//div[@class="post-meta single-post-meta"]/span/time/@datetime')
    article.add_xpath('readin', '//span[@class="no-line"]/b[1]/text()')
    article.add_xpath(
        'tags', '//div[@class="entry-terms post-tags clearfix "]/a/text()')
    article.add_value('slug', response.request.url.split('/')[-2])
    article.add_value('comments', comments.load_item())
    return article.load_item()
def parse_article(self, response):
    title = response.xpath('//head/title/text()')[0].get()
    link = response.url
    content = "".join(response.xpath(
        '//article//div[contains(@class, "article-content")]//text()'
    ).getall()).splitlines()
    publisher = 'Politika'
    author = response.xpath(
        '//article//div[contains(@class, "date-time")]'
        '/a[contains(@class,"author-name")]/text()').get()
    date = response.xpath(
        '//article//div[contains(@class, "date-time")]/text()').get().strip()
    loader = ItemLoader(item=ArticleItem(), response=response)
    loader.add_value('article_title', str(title))
    loader.add_value('article_publisher', publisher)
    loader.add_value('article_url', link)
    loader.add_value('article_body', "".join(content).lstrip())
    loader.add_value('article_date', date)
    loader.add_value('article_author', str(author))
    yield loader.load_item()
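# The ArticleItem used by the ItemLoader-based spiders is not shown. A minimal
# sketch consistent with the fields they populate; the TakeFirst output
# processors are an assumption, chosen so each add_value call yields a single
# string rather than a list:
import scrapy
from itemloaders.processors import TakeFirst

class ArticleItem(scrapy.Item):
    article_title = scrapy.Field(output_processor=TakeFirst())
    article_publisher = scrapy.Field(output_processor=TakeFirst())
    article_url = scrapy.Field(output_processor=TakeFirst())
    article_body = scrapy.Field(output_processor=TakeFirst())
    article_date = scrapy.Field(output_processor=TakeFirst())
    article_author = scrapy.Field(output_processor=TakeFirst())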