Exemplo n.º 1
0
    def parse(self, rsp):
        """ Parse CG items from pixiv illust page

        @url
        @returns items 1 200
        @scraps crawled_from site_pk large_file_url file_url source
        """
        # Pull the date/pk path components that pixiv embeds in the page wrapper.
        page_wrapper = rsp.xpath('//div[@id="wrapper"]').extract()[0]
        path_parts = extract_pixiv_path(page_wrapper)
        path_parts.pop('order')

        url_template = 'https://i.pximg.net/img-original/img/'\
                       '{year}/{month}/{day}/{hour}/{minute}/{second}/{pk}_p{order}.jpg'
        page_index = 0
        while True:
            image_url = url_template.format(order=page_index, **path_parts)
            # Probe each page with a HEAD request; a 404 means we have
            # walked past the last image of this illust.
            probe = requests.head(
                image_url, headers={'Referer': 'https://www.pixiv.net/'}
            )
            if probe.status_code == 404:
                break
            yield CG(
                crawled_from='pixiv.net',
                site_pk=path_parts['pk'],
                large_file_url=image_url,
                file_url=image_url,
                source=rsp.url,
            )
            page_index += 1
Exemplo n.º 2
0
 def parse_json_result(self, art):
     """Build a CG item from one DeviantART search-result HTML snippet.

     Returns None when the snippet carries no full-size image URL.
     """
     match = re.search(r'data-super-full-img="(.*?)"', art)
     if match is None:
         # No full-size image attribute in this result; skip it.
         return None
     image_url = match.group(1)
     page_url = re.findall(r'href="(.*?)"', art)[0]
     pk = page_url.split('/')[4].split('-')[-1]
     return CG(
         crawled_from='deviantart.com',
         site_pk=pk,
         large_file_url=image_url,
         file_url=image_url,
         source=page_url,
     )
Exemplo n.º 3
0
 def parse_image(self, rsp):
     """ Parse CG items from image page
     @url
     @returns items 1
     @scraps crawled_from site site_pk large_file_url file_url source
     """
     page_url = rsp.url
     # The slug's trailing segment after the last '-' is the site's numeric pk.
     pk = page_url.split('/')[4].split('-')[-1]
     image_url = rsp.xpath(
         '//img[@class="dev-content-full "]/@src'
     ).extract_first()
     return CG(
         crawled_from='deviantart.com',
         site_pk=pk,
         large_file_url=image_url,
         file_url=image_url,
         source=page_url,
     )
Exemplo n.º 4
0
    def parse(self, rsp):
        """ Parse CG items from xml

        @url http://danbooru.donmai.us/explore/posts/popular.xml
        @returns items 1 100
        @scraps crawled_from site_pk large_file_url file_url source md5
            pixiv_id donmai_uploader_id rating fav_count score
            character_tags general_tags copyright_tags artist_tags
        """
        # Emit one CG per <post> node; skip any node the extractor rejects.
        for post in rsp.xpath('//posts/post'):
            fields = extract_donmai_rss(post)
            if fields:
                yield CG(**fields)

        # Keep paginating through the feed.
        yield Request(self.get_next_url(rsp), callback=self.parse)
Exemplo n.º 5
0
    def parse_illust(self, rsp):
        """ Parse CG items from gallery page

        @url
        @returns items 1 200
        @scraps crawled_from site_pk large_file_url file_url source
        """
        # Pull the date/pk path components embedded in the page wrapper.
        page_wrapper = rsp.xpath('//div[@id="wrapper"]').extract()[0]
        path_parts = extract_pixiv_path(page_wrapper)
        path_parts.pop('order')

        url_template = 'https://i.pximg.net/img-original/img/'\
                       '{year}/{month}/{day}/{hour}/{minute}/{second}/{pk}_p{order}.jpg'
        page_index = 0
        while True:
            image_url = url_template.format(order=page_index, **path_parts)
            # A 404 on the HEAD probe marks the end of this multi-page illust.
            probe = requests.head(
                image_url, headers={'Referer': 'https://www.pixiv.net/'}
            )
            if probe.status_code == 404:
                break
            yield CG(
                crawled_from='pixiv.net',
                site_pk=path_parts['pk'],
                large_file_url=image_url,
                file_url=image_url,
                source=rsp.url,
            )
            page_index += 1

        # Queue the next work in the member's gallery, when one exists.
        works_section = rsp.xpath('//div[@id="wrapper"]//section[@class="works"]')[0]
        sibling_links = works_section.xpath(
            'ul/li[contains(@class, "selected_works")]/following-sibling::li/a/@href'
        )
        if sibling_links:
            next_pk = sibling_links.extract_first().rsplit('=')[-1]
            yield Request(
                'https://www.pixiv.net/member_illust.php?mode=medium&illust_id=%s' % next_pk,
                callback=self.parse_illust,
            )
Exemplo n.º 6
0
    def parse(self, rsp):
        """ Parse CG items from gallery page

        @url
        @returns items 1 200
        @scraps crawled_from site_pk large_file_url file_url source
        """
        url_template = 'https://i.pximg.net/img-original/img/'\
                       '{year}/{month}/{day}/{hour}/{minute}/{second}/{pk}_p{order}.jpg'
        source_template = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={pk}'

        thumbnails = rsp.xpath(
            '//div[@class="newindex"]//ul[contains(@class, "ui-brick")]/li//img/@src'
        ).extract()
        for thumbnail in thumbnails:
            # Thumbnail src carries the same date/pk path as the original image.
            path_parts = extract_pixiv_path(thumbnail)
            image_url = url_template.format(**path_parts)
            yield CG(
                crawled_from='pixiv.net',
                site_pk=path_parts['pk'],
                large_file_url=image_url,
                file_url=image_url,
                source=source_template.format(**path_parts),
            )