def parse(self, response):
        """Yield an ``ImageScraperItem`` for the image in ``div.img-container``.

        The first extracted ``src`` value of an ``<img>`` below the container
        becomes the item's only ``file_urls`` entry; the response itself is
        attached to the item as well.  Nothing is yielded when no ``src`` is
        found.
        """
        src = (
            response.css('div.img-container')
            .xpath('.//img')
            .xpath('@src')
            .extract_first()
        )
        if src is None:
            # No image on this page: finish the generator without an item.
            return

        yield ImageScraperItem(file_urls=[src], files=[], response=response)
Пример #2
0
        def _parse(response):
            """Yield an item for the image inside ``div.first-image``.

            Nothing is yielded when the ``src`` attribute is missing or when
            ``self.__should_ignore`` rejects it.  The yielded item is tagged
            with ``category`` from the enclosing scope and carries the
            ``src`` resolved against ``response.url``.
            """
            src = response.xpath(
                '//div[@class="first-image"]/img/@src').extract_first()
            if src is None or self.__should_ignore(src):
                return

            yield ImageScraperItem(
                tags=[category],
                file_urls=[urllib.parse.urljoin(response.url, src)],
                files=[],
            )
Пример #3
0
        def _parse(response):
            """Yield an item for the ``<img>`` whose class contains ``item_main``.

            Nothing is yielded when the ``src`` attribute is missing or when
            ``self.__should_ignore`` rejects it.  The yielded item is tagged
            with ``category`` from the enclosing scope and carries the
            ``src`` resolved against ``response.url``.
            """
            src = response.xpath(
                '//img[contains(@class, "item_main")]/@src').extract_first()
            if src is None or self.__should_ignore(src):
                return

            yield ImageScraperItem(
                tags=[category],
                file_urls=[urllib.parse.urljoin(response.url, src)],
                files=[],
            )
Пример #4
0
    def parse(self, response):
        """Yield one item per ``<img>`` found in the ``innerList`` image wraps.

        Images without a ``src`` attribute, or whose ``src`` is rejected by
        ``self.__should_ignore``, are skipped.  Each item is tagged with the
        ``url_tag_map`` entry for ``response.url`` and carries the ``src``
        exactly as extracted.
        """
        for node in response.xpath(
                '//ul[@class="innerList"]//div[@class="imgWrap"]//img'):
            src = node.xpath('@src').extract_first()
            if src is None or self.__should_ignore(src):
                continue

            yield ImageScraperItem(
                tags=[url_tag_map[response.url]],
                file_urls=[src],
                files=[],
            )
Пример #5
0
        def _parse(response):
            """Yield an item for the image named in the ``entitledItem_*`` JSON.

            The text of the ``<div>`` whose id starts with ``entitledItem_``
            is parsed as JSON, and ``[0]["ItemImage"]`` of the result is used
            as the image path.  Nothing is yielded when the div text is
            missing, when the path is ``None``, or when
            ``self.__should_ignore`` rejects it.  The yielded item is tagged
            with ``category`` from the enclosing scope and carries the path
            resolved against ``response.url``.
            """
            raw_json = response.xpath(
                '//div[starts-with(@id, "entitledItem_")]/text()'
            ).extract_first()

            image_path = (
                None if raw_json is None
                else json.loads(raw_json)[0]["ItemImage"]
            )
            if image_path is None or self.__should_ignore(image_path):
                return

            yield ImageScraperItem(
                tags=[category],
                file_urls=[urllib.parse.urljoin(response.url, image_path)],
                files=[],
            )
Пример #6
0
    def parse(self, response):
        """Yield one ``ImageScraperItem`` per ``<post>`` element in the response.

        Each post's ``file_url`` attribute becomes the item's single download
        URL and its space-separated ``tags`` attribute becomes the item's tag
        list (empty fragments removed).  Posts without a ``file_url``
        attribute, or whose URL is rejected by ``self.__should_ignore``, are
        skipped.  A missing ``tags`` attribute results in an empty tag list.
        """
        for post in response.xpath('//post'):
            raw_url = post.xpath('@file_url').extract_first()
            if raw_url is None:
                # Check before concatenating: 'http:' + None raises TypeError
                # and would abort the remaining posts as well.
                continue

            # The attribute is presumably protocol-relative ("//host/path"),
            # hence the bare scheme prefix.  A URL that already carries a
            # scheme is left alone so it is not turned into "http:http://...".
            if raw_url.startswith(('http:', 'https:')):
                file_url = raw_url
            else:
                file_url = 'http:' + raw_url

            if self.__should_ignore(file_url):
                continue

            # Treat a missing attribute as "no tags" rather than crashing on
            # None.split().
            raw_tags = post.xpath('@tags').extract_first() or ''
            tags = [tag for tag in raw_tags.split(' ') if tag]

            yield ImageScraperItem(
                tags=tags,
                file_urls=[file_url],
                files=[],
            )