Example #1
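All six snippets below call a shared filte text-cleaning helper that is not included in these excerpts, and each assumes re, time, and the relevant Item class are imported at module level. A minimal sketch of what filte plausibly does, inferred only from how it is called (the real implementation may differ):

import re

def filte(text):
    # Hypothetical sketch of the shared helper: collapse whitespace runs and
    # strip leading/trailing blanks from the joined XPath text nodes.
    return re.sub(r"\s+", " ", text).strip()
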
 def parse_news_interest(self, response):
     """
     parse_news_interest
     :param response: response object
     :return:
     """
     title = response.xpath("//title/text()").get()
     posted_at = response.xpath(
         "//div[@id='page-title']//strong/text()").get()
     content = " ".join(
         response.xpath("//div[@class='meat']//text()").getall())
     content = filte(content)
     images_urls = response.xpath("//div[@class='meat']//img/@src").getall()
     video_urls = response.xpath(
         '//p[@align="center"]//iframe/@src').get("")  # first match only
     author = "".join(
         response.xpath('//div[@id="page-title"]/text()').getall())
     match = re.search(r".*?by.*?(.*)\n", author)
     author = match.group(1).strip() if match else ""
     description = response.xpath(
         '//meta[@name="description"]/@content').get("")
     url = response.url
     anime_type = response.xpath("//div[@class='sub-title']/text()").get("")
     item = AnimenewsItem()
     item['title'] = title
     item['content'] = content
     item['images_urls'] = images_urls
     item['video_urls'] = video_urls
     item['description'] = description
     item['posted_at'] = posted_at
     item['author'] = author
     item['source_url'] = url
     item['type'] = anime_type
     item['crawl_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                        time.localtime())
     yield item
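
The AnimenewsItem class itself is not part of this excerpt; its field list can be read off the assignments above. A sketch under that assumption:

import scrapy

class AnimenewsItem(scrapy.Item):
    # One declared Field per key parse_news_interest fills in.
    title = scrapy.Field()
    content = scrapy.Field()
    images_urls = scrapy.Field()
    video_urls = scrapy.Field()
    description = scrapy.Field()
    posted_at = scrapy.Field()
    author = scrapy.Field()
    source_url = scrapy.Field()
    type = scrapy.Field()
    crawl_time = scrapy.Field()

The other examples' items (CrunchyrollItem, CartoonbrewItem, OtakumodeItem, ThevergeItem, AwnItem) would follow the same pattern with their own field names.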
Example #2
 def parse_detail(self, response):  # pylint: disable=R0201
     '''Parse a Crunchyroll article page and yield a CrunchyrollItem.'''
     title = response.xpath('//div[@class="related"]/h2/a/text()').get("")
     content = "".join(
         response.xpath('//div[@class="contents"][1]//text()').getall())
     content = filte(content)
     images = response.xpath(
         '//div[@class="contents"][1]//img/@src').getall()
     videos = ""
     # The original backslash continuation leaked the next line's indentation
     # into the class value; adjacent string literals join cleanly instead.
     description = response.xpath(
         '//div[@class="showcrunchynews_article white-wrapper"]'
         '/h2/text()').get("")
     author = response.xpath(
         '//div[@class="byline"]/a[@class="text-link"]/text()').get()
     posted_on = response.xpath('//span[@class="post-date"]/text()').get()
     crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     source_url = response.url
     item = CrunchyrollItem()
     item['title'] = title
     item['content'] = content
     item['images'] = images
     item['videos'] = videos
     item['description'] = description
     item['author'] = author
     item['posted_on'] = posted_on
     item['crawl_time'] = crawl_time
     item['source_url'] = source_url
     yield item
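
Each parse method in these excerpts is a detail-page callback; the listing-page side that schedules it is not shown. A hypothetical sketch of the wiring, with the spider name, start URL, and link XPath all placeholders:

import scrapy

class CrunchyrollSpider(scrapy.Spider):
    name = "crunchyroll"  # placeholder; the real spider class is not shown
    start_urls = ["https://www.crunchyroll.com/news"]  # placeholder URL

    def parse(self, response):
        # Follow each article link on the listing page into the detail
        # callback; the link XPath is a placeholder guess.
        for href in response.xpath("//article//a/@href").getall():
            yield response.follow(href, callback=self.parse_detail)

    def parse_detail(self, response):
        ...  # the Example #2 method body goes here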
Example #3
    def parse_detail(self, response):
        try:
            spider_name = self.name
            content = filte("".join(response.xpath('//div[@class="entry-content"]//p//text()').getall()))
            images_urls = response.xpath("//div[@class='entry-content']//img/@src").getall()
            # Filter into a new list: pop() inside a range loop skips the
            # element after each removal and can raise IndexError.
            images_urls = [
                url for url in images_urls
                if "==" not in url and "data" not in url
            ]
            video_urls = response.xpath('//iframe/@src').getall()
            description = response.xpath("//meta[@property='og:description']/@content").get()
            posted_at = response.xpath('//div[@class="post-inner"]/header//time[@class="updated"]/text()').get()
            month, day, year = posted_at.split(" ")[0].split("/")
            posted_at = switch_time(month=month, day=day, year=year)
            author = response.xpath('//span[@class="author"]/a[@rel="author"]/text()').get()
            crawl_time = time.strftime("%Y-%m-%d", time.localtime())
            source_url = response.url
            title = response.xpath("//h1[@class='entry-title']//text()").get()
            tags = ",".join(response.xpath("//header/a[@class='category-slug']/text()").getall())

            items = CartoonbrewItem(
                spider_name=spider_name, content=content,
                images_urls=images_urls, video_urls=video_urls,
                description=description, posted_at=posted_at, author=author,
                crawl_time=crawl_time, source_url=source_url, title=title,
                tags=tags)
            yield items
        except Exception as e:
            # Pass the exception to the logger; "%s" % e.args breaks when
            # args holds more than one value.
            self.logger.debug("Exception: %s", e)
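
switch_time, first used here and again in Examples #5 and #6, is also missing from these excerpts. Its call sites pass a month (numeric here, a name or abbreviation in the later examples), a day, and a year; a plausible sketch assuming it returns a YYYY-MM-DD string:

import calendar

def switch_time(month, day, year):
    # Hypothetical sketch: accept the month as a number ("07"), a full name
    # ("July"), or an abbreviation ("Jul") and normalize to "YYYY-MM-DD".
    names = {m.lower(): i for i, m in enumerate(calendar.month_name) if m}
    names.update(
        {m.lower(): i for i, m in enumerate(calendar.month_abbr) if m})
    month = names.get(str(month).strip().lower(), month)
    return "%04d-%02d-%02d" % (int(year), int(month), int(day))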
Example #4
 def parse_news(self, response):
     """
     parse news
     :param response:response object
     :return:
     """
     title = response.xpath(
         "//div[@class='c-docs--single-column']"
         "/h1[@class='p-article__title']/text()").get()
     content = "".join(response.xpath(
         "//div[@class='p-article__body c-docs--normalize']"
         "//text()").getall())
     content = filte(content)
     images_urls = response.xpath(
         "//div[@class='p-article__body c-docs--normalize']"
         "//img/@src").getall()
     header_image_url = response.xpath(
         '//div[@class="p-article__figure-inner"]/img/@src').get()
     if header_image_url:  # avoid appending None when no header image exists
         images_urls.append(header_image_url)
     video_urls = response.xpath(
         '//div[@class="p-article__figure-inner"]'
         '//iframe[@src]/@src').getall()
     description = response.xpath(
         '//meta[@name="description"]/@content').get()
     posted_at = response.xpath(
         '//time[@class="p-article__time"]/@datetime').get()
     crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
     source_url = response.url
     anime_type = response.xpath(
         '//ul[@class="list--inline u-float-left"]'
         '/li/a[@class="p-article__category"]/text()').get()
     tags = ",".join(response.xpath(
         '//a[@class="c-btn c-btn--sm c-btn--icon-left c-btn--tag"]'
         '//text()').getall())
     item = OtakumodeItem()
     item['title'] = title
     item['content'] = content
     item['images_urls'] = images_urls
     item['video_urls'] = video_urls
     item['description'] = description
     item['posted_at'] = posted_at
     item['crawl_time'] = crawl_time
     item['source_url'] = source_url
     item['type'] = anime_type
     item['tags'] = tags
     yield item
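
The image URLs above are stored exactly as scraped; if the site emits relative src values, Scrapy's response.urljoin can absolutize them before the item is filled. A hedged one-liner that could precede the assignments (not present in the original):

     images_urls = [response.urljoin(u) for u in images_urls]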
Example #5
    def parse_item(self, response):
        try:
            spider_name = self.name
            content = filte("".join(
                response.xpath(
                    "//div[@class='c-entry-content']//text()").getall()))
            image = response.xpath(
                "//picture[@class='c-picture']//img/@src").get()
            # @src, not @img: the original attribute name was a typo that
            # always returned an empty list.
            images_urls = response.xpath(
                "//div[@class='c-entry-content']//img/@src").getall()
            if image:  # skip appending None when the lead picture is absent
                images_urls.append(image)
            video_urls = response.xpath(
                "//div[@class='c-entry-content']//iframe/@src").getall()
            description = response.xpath(
                "//meta[@name='description']/@content").get()
            posted_at = response.xpath(
                '//time[@class="c-byline__item"]//text()').get().replace(
                    "\n", "").replace(" ", "")
            # (\d{1,2}) captures the whole day; the original (\d){1,2} kept
            # only the day's last digit.
            month, day, year = re.findall(r"(.*?)(\d{1,2}),(\d+)",
                                          posted_at)[0]
            posted_at = switch_time(month, day, year)
            author = response.xpath(
                '//span[@class="c-byline__item"]/a/text()').get()
            source_url = response.url
            title = response.xpath('//h1[@class="c-page-title"]/text()').get()
            tags = ",".join(
                response.xpath(
                    '//li[@class="c-entry-group-labels__item"]//a/span/text()'
                ).getall())
            crawl_time = time.strftime("%Y-%m-%d", time.localtime())

            item = ThevergeItem(spider_name=spider_name,
                                content=content,
                                images_urls=images_urls,
                                video_urls=video_urls,
                                description=description,
                                posted_at=posted_at,
                                author=author,
                                source_url=source_url,
                                title=title,
                                tags=tags,
                                crawl_time=crawl_time)
            yield item
        except Exception as e:
            self.logger.debug("Exception: %s", e)
Example #6
    def parse_detail(self, response):
        try:
            spider_name = self.name
            content = filte("".join(response.xpath("//div[@class='field-items']/div[@class='field-item even']//text()").getall()))
            images_urls = response.xpath("//div[@class='field-items']//img/@src").getall()
            video_urls = response.xpath("//div[@class='field-items']//iframe/@src").getall()
            description = response.xpath("//meta[@name='description']/@content").get()
            posted_at = response.xpath("//footer[@class='submitted']//text()").getall()[2]
            posted_at = re.findall(r"\|.*?,(.*?)at", posted_at)[0].strip()
            # (\d{1,2}) captures the whole day; the original (\d+){1,2} kept
            # only the last repetition.
            month, day, year = re.findall(r"(.*?)\s+(\d{1,2}),\s+(\d+)",
                                          posted_at)[0]
            posted_at = switch_time(month, day, year)
            author = response.xpath("//a[@class='username']/text()").get()
            crawl_time = time.strftime("%Y-%m-%d", time.localtime())
            source_url = response.url
            title = response.xpath("//h1[@id='page-title']//text()").get().strip()
            tags = ",".join(response.xpath("//div[@class='field-items']//div//a//text()").getall())

            item = AwnItem(spider_name=spider_name, content=content,
                           images_urls=images_urls, video_urls=video_urls,
                           description=description, posted_at=posted_at,
                           author=author, source_url=source_url, title=title,
                           tags=tags, crawl_time=crawl_time)
            yield item
        except Exception as e:
            self.logger.debug("Exception: %s", e)
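
All six spiders stop at yielding an Item and leave persistence to Scrapy's item pipelines, which these excerpts omit. A minimal hypothetical pipeline showing where such items would be validated before storage (the class and its settings entry are assumptions, not part of the project shown):

from scrapy.exceptions import DropItem

class NewsValidationPipeline:
    # Hypothetical downstream stage; enabled via ITEM_PIPELINES in settings.py.
    def process_item(self, item, spider):
        if not item.get("title"):
            raise DropItem("missing title from %s" % spider.name)
        return item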