def parse_news_interest(self, response):
    """
    parse_news_interest

    :param response: response object
    :return:
    """
    title = response.xpath("//title/text()").get()
    posted_at = response.xpath(
        "//div[@id='page-title']//strong/text()").get()
    content = " ".join(
        response.xpath("//div[@class='meat']//text()").getall())
    content = filte(content)
    images_urls = response.xpath("//div[@class='meat']//img/@src").getall()
    video_urls = response.xpath('//p[@align="center"]//iframe/@src').get("")
    author = "".join(
        response.xpath('//div[@id="page-title"]/text()').getall())
    # The "by <author>" line may be missing; fall back to an empty author
    # instead of calling .group() on None.
    author_match = re.search(r".*?by.*?(.*)\n", author)
    author = author_match.group(1) if author_match else ""
    description = response.xpath(
        '//meta[@name="description"]/@content').get("")
    url = response.url
    news_type = response.xpath("//div[@class='sub-title']/text()").get("")

    item = AnimenewsItem()
    item['title'] = title
    item['content'] = content
    item['images_urls'] = images_urls
    item['video_urls'] = video_urls
    item['description'] = description
    item['posted_at'] = posted_at
    item['author'] = author
    item['source_url'] = url
    item['type'] = news_type
    item['crawl_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                       time.localtime())
    yield item
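Every parse method in this section pipes its joined text nodes through filte, a helper defined elsewhere in the project. A minimal sketch of what such a cleanup helper typically does, assuming it only normalizes whitespace (the project's actual implementation may strip more):

import re

def filte(text):
    """Hypothetical sketch of the project's text-cleanup helper:
    collapse the whitespace runs left over from joining XPath text
    nodes and trim the ends (assumed behaviour)."""
    return re.sub(r"\s+", " ", text).strip()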
def parse_detail(self, response):  # pylint: disable=R0201
    '''parse_detail method'''
    title = response.xpath('//div[@class="related"]/h2/a/text()').get("")
    content = "".join(
        response.xpath('//div[@class="contents"][1]//text()').getall())
    content = filte(content)
    images = response.xpath(
        '//div[@class="contents"][1]//img/@src').getall()
    videos = ""
    description = response.xpath(
        '//div[@class="showcrunchynews_article white-wrapper"]'
        '/h2/text()').get("")
    author = response.xpath(
        '//div[@class="byline"]/a[@class="text-link"]/text()').get()
    posted_on = response.xpath('//span[@class="post-date"]/text()').get()
    crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    source_url = response.url

    item = CrunchyrollItem()
    item['title'] = title
    item['content'] = content
    item['images'] = images
    item['videos'] = videos
    item['description'] = description
    item['author'] = author
    item['posted_on'] = posted_on
    item['crawl_time'] = crawl_time
    item['source_url'] = source_url
    yield item
def parse_detail(self, response):
    """parse an article detail page into a CartoonbrewItem"""
    try:
        spider_name = self.name
        content = filte("".join(response.xpath(
            "//div[@class='entry-content']//p//text()").getall()))
        images_urls = response.xpath(
            "//div[@class='entry-content']//img/@src").getall()
        # Drop inline base64 / data URIs; a comprehension avoids popping
        # from the list while indexing over it, which skips entries and
        # can raise IndexError.
        images_urls = [
            url for url in images_urls
            if "==" not in url and "data" not in url
        ]
        video_urls = response.xpath('//iframe/@src').getall()
        description = response.xpath(
            "//meta[@property='og:description']/@content").get()
        posted_at = response.xpath(
            '//div[@class="post-inner"]/header'
            '//time[@class="updated"]/text()').get()
        # The timestamp is rendered as "MM/DD/YYYY ...".
        month, day, year = posted_at.split(" ")[0].split("/")
        posted_at = switch_time(month=month, day=day, year=year)
        author = response.xpath(
            '//span[@class="author"]/a[@rel="author"]/text()').get()
        crawl_time = time.strftime("%Y-%m-%d", time.localtime())
        source_url = response.url
        title = response.xpath("//h1[@class='entry-title']//text()").get()
        tags = ",".join(response.xpath(
            "//header/a[@class='category-slug']/text()").getall())
        item = CartoonbrewItem(
            spider_name=spider_name, content=content,
            images_urls=images_urls, video_urls=video_urls,
            description=description, posted_at=posted_at,
            author=author, crawl_time=crawl_time,
            source_url=source_url, title=title, tags=tags)
        yield item
    except Exception as e:
        self.logger.debug("Exception: %s" % (e.args,))
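switch_time, also defined elsewhere in the project, converts the scraped month/day/year fragments into a single date string. A hedged sketch that matches the keyword call above; the ISO output format is an assumption:

from datetime import datetime

def switch_time(month, day, year):
    """Hypothetical sketch of the project's date normaliser: accept a
    numeric or English month plus day/year fragments and return an ISO
    date string (the output format here is an assumption)."""
    month = str(month).strip().rstrip(".")
    # Try numeric ("10"), abbreviated ("Oct"), then full ("October").
    for fmt in ("%m", "%b", "%B"):
        try:
            month_num = datetime.strptime(month, fmt).month
            break
        except ValueError:
            continue
    else:
        raise ValueError("unrecognised month: %r" % month)
    return "%04d-%02d-%02d" % (int(year), month_num, int(day))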
def parse_news(self, response):
    """
    parse news

    :param response: response object
    :return:
    """
    title = response.xpath(
        "//div[@class='c-docs--single-column']"
        "/h1[@class='p-article__title']/text()").get()
    content = "".join(response.xpath(
        "//div[@class='p-article__body c-docs--normalize']"
        "//text()").getall())
    content = filte(content)
    images_urls = response.xpath(
        "//div[@class='p-article__body c-docs--normalize']"
        "//img/@src").getall()
    header_image_url = response.xpath(
        '//div[@class="p-article__figure-inner"]/img/@src').get()
    # The header image is optional; do not append None to the list.
    if header_image_url:
        images_urls.append(header_image_url)
    video_urls = response.xpath(
        '//div[@class="p-article__figure-inner"]'
        '//iframe[@src]/@src').getall()
    description = response.xpath(
        '//meta[@name="description"]/@content').get()
    posted_at = response.xpath(
        '//time[@class="p-article__time"]/@datetime').get()
    crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    source_url = response.url
    anime_type = response.xpath(
        '//ul[@class="list--inline u-float-left"]'
        '/li/a[@class="p-article__category"]/text()').get()
    tags = ",".join(response.xpath(
        '//a[@class="c-btn c-btn--sm c-btn--icon-left c-btn--tag"]'
        '//text()').getall())

    item = OtakumodeItem()
    item['title'] = title
    item['content'] = content
    item['images_urls'] = images_urls
    item['video_urls'] = video_urls
    item['description'] = description
    item['posted_at'] = posted_at
    item['crawl_time'] = crawl_time
    item['source_url'] = source_url
    item['type'] = anime_type
    item['tags'] = tags
    yield item
def parse_item(self, response):
    """parse an article page into a ThevergeItem"""
    try:
        spider_name = self.name
        content = filte("".join(response.xpath(
            "//div[@class='c-entry-content']//text()").getall()))
        image = response.xpath(
            "//picture[@class='c-picture']//img/@src").get()
        # Pull the inline article images from their @src attribute.
        images_urls = response.xpath(
            "//div[@class='c-entry-content']//img/@src").getall()
        # The lead picture is optional; do not append None.
        if image:
            images_urls.append(image)
        video_urls = response.xpath(
            "//div[@class='c-entry-content']//iframe/@src").getall()
        description = response.xpath(
            "//meta[@name='description']/@content").get()
        posted_at = response.xpath(
            '//time[@class="c-byline__item"]//text()').get().replace(
                "\n", "").replace(" ", "")
        # After stripping whitespace the byline reads
        # "<Month><day>,<year>,..."; capture the month name, the one- or
        # two-digit day, and the year.
        month, day, year = re.findall(
            r"(.*?)(\d{1,2}),(\d+),.*?", posted_at)[0]
        posted_at = switch_time(month, day, year)
        author = response.xpath(
            '//span[@class="c-byline__item"]/a/text()').get()
        source_url = response.url
        title = response.xpath('//h1[@class="c-page-title"]/text()').get()
        tags = ",".join(response.xpath(
            '//li[@class="c-entry-group-labels__item"]'
            '//a/span/text()').getall())
        crawl_time = time.strftime("%Y-%m-%d", time.localtime())
        item = ThevergeItem(
            spider_name=spider_name, content=content,
            images_urls=images_urls, video_urls=video_urls,
            description=description, posted_at=posted_at,
            author=author, source_url=source_url, title=title,
            tags=tags, crawl_time=crawl_time)
        yield item
    except Exception as e:
        self.logger.debug("Exception: %s" % (e.args,))
def parse_detail(self, response):
    """parse an article page into an AwnItem"""
    try:
        spider_name = self.name
        content = filte("".join(response.xpath(
            "//div[@class='field-items']/div[@class='field-item even']"
            "//text()").getall()))
        images_urls = response.xpath(
            "//div[@class='field-items']//img/@src").getall()
        video_urls = response.xpath(
            "//div[@class='field-items']//iframe/@src").getall()
        description = response.xpath(
            "//meta[@name='description']/@content").get()
        posted_at = response.xpath(
            "//footer[@class='submitted']//text()").getall()[2]
        # The submitted line holds the date between the first "," after
        # "|" and the word "at".
        posted_at = re.findall(r"\|.*?,(.*?)at.*?", posted_at)[0].strip()
        month, day, year = re.findall(
            r"(.*?)\s+(\d{1,2}),\s+(\d+)", posted_at)[0]
        posted_at = switch_time(month, day, year)
        author = response.xpath("//a[@class='username']/text()").get()
        crawl_time = time.strftime("%Y-%m-%d", time.localtime())
        source_url = response.url
        title = response.xpath(
            "//h1[@id='page-title']//text()").get().strip()
        tags = ",".join(response.xpath(
            "//div[@class='field-items']//div//a//text()").getall())
        item = AwnItem(
            spider_name=spider_name, content=content,
            images_urls=images_urls, video_urls=video_urls,
            description=description, posted_at=posted_at,
            author=author, source_url=source_url, title=title,
            tags=tags, crawl_time=crawl_time)
        yield item
    except Exception as e:
        self.logger.debug("Exception: %s" % (e.args,))
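The regex-driven byline parsing in the last three methods is fragile against layout changes. If adding a dependency is acceptable, python-dateutil's fuzzy parser is a more tolerant option; a sketch of a hypothetical alternative, not what the project currently does:

from dateutil import parser

def parse_byline_date(byline):
    """Hypothetical alternative to the regex pipeline: let dateutil
    find the date tokens in free-form byline text such as
    '... | <Month> <day>, <year> at <time>'."""
    # fuzzy=True skips the words around the date instead of failing.
    return parser.parse(byline, fuzzy=True).strftime("%Y-%m-%d")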