Example #1
class Pixiv(DummySite):
    def __init__(self, url, username, password, proxy=None):
        proxies = get_proxy(proxy)
        requests_kwargs = {
            "timeout": (3, 10),
        }
        requests_kwargs.update(proxies)
        self.api = AppPixivAPI(**requests_kwargs)
        self._fetcher = PixivFetcher(**proxies)
        self.api.login(username, password)
        self._user_id = int(url.split("/")[-1])
        self._dir_name = None
        self._total_illustrations = 0
        self._fetch_user_detail()

    @property
    def fetcher(self):
        return self._fetcher

    @property
    def dir_name(self):
        assert self._dir_name is not None
        return self._dir_name

    def _fetch_user_detail(self):
        assert self._user_id is not None
        profile = self.api.user_detail(self._user_id)
        user = profile['user']
        self._dir_name = "-".join([
            user['name'],
            user['account'],
            str(user['id']),
        ])
        self._dir_name = normalize_filename(self._dir_name)
        self._total_illustrations = profile['profile']['total_illusts']
        return self.dir_name

    def _fetch_image_list(self):
        ret = self.api.user_illusts(self._user_id)
        while True:
            for illustration in ret.illusts:
                yield from parse_image_urls(illustration)
            if ret.next_url is None:
                break
            ret = self.api.user_illusts(**self.api.parse_qs(ret.next_url))

    def _fetch_single_image_url(self, illustration_id):
        json_result = self.api.illust_detail(illustration_id)
        illustration_info = json_result.illust
        return illustration_info.image_urls['large']

    @property
    def tasks(self):
        yield from self._fetch_image_list()
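A minimal usage sketch for this wrapper, assuming DummySite, get_proxy, PixivFetcher, parse_image_urls, and normalize_filename are provided elsewhere in the project; the URL, the credentials, and the fetcher.download() call are placeholders for illustration only, not part of this example:

# Hypothetical usage: build the site wrapper, then iterate its download tasks.
site = Pixiv(
    "https://www.pixiv.net/users/12345",  # placeholder URL; the last path segment is the user id
    "username", "password",               # placeholder credentials
    proxy=None,
)
print(site.dir_name)      # "<name>-<account>-<id>", normalized for use as a directory name
for url in site.tasks:    # image URLs parsed from each illustration
    site.fetcher.download(url, site.dir_name)  # assumed PixivFetcher method, not shown above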
Example #2
File: pixie.py Project: Xdynix/PixivPixie
class PixivPixie:
    """Pixiv API interface.

    Remember to call login() before using other methods.

    Attributes:
        auto_re_login: If true, PixivPixie will automatically re-login when the
            login token has expired.
    """
    def __init__(self, auto_re_login=True, **requests_kwargs):
        self.auto_re_login = auto_re_login
        self._requests_kwargs = requests_kwargs

        self._papi = PixivAPI(**requests_kwargs)
        self._aapi = AppPixivAPI(**requests_kwargs)

        self._has_auth = False
        self._last_login = None
        self._check_auth_lock = Lock()

        self._username = None
        self._password = None

    @property
    def requests_kwargs(self):
        """Parameters that will be passed to requests."""
        return self._requests_kwargs

    @requests_kwargs.setter
    def requests_kwargs(self, requests_kwargs):
        self._requests_kwargs = requests_kwargs
        self._papi.requests_kwargs = requests_kwargs
        self._aapi.requests_kwargs = requests_kwargs

    @property
    def has_auth(self):
        """Whether the pixie has login."""
        return self._has_auth

    @property
    def last_login(self):
        """Last login time. Will be a datetime object or None if haven't login
        yet."""
        return self._last_login

    def login(self, username, password):
        """Login Pixiv account.

        Notice: The access token will expire after about 1 hour. So if you are
            dealing with a long time quest, remember to re-login every some
            time.

        Args:
            username: Your Pixiv account's username.
            password: Your Pixiv account's password.

        Returns:
            None.

        Raises:
            LoginFailed: An error occurred if the username and password is not
                match.
        """
        if not username or not password:
            raise LoginFailed

        try:
            self._papi.login(username, password)

            # self._aapi.login(username, password)
            self._aapi.access_token = self._papi.access_token
            self._aapi.user_id = self._papi.user_id
            self._aapi.refresh_token = self._papi.refresh_token
        except PixivError:
            raise LoginFailed
        else:
            self._has_auth = True
            self._username = username
            self._password = password
            self._last_login = datetime.datetime.now()

        return self

    def check_auth(self, auto_re_login=False):
        """Raise error if the pixie doesn't has auth.

        Args:
            auto_re_login: If true, the PixivPixie will try to re-login when the
                login token has expired.

        Raises:
            NoAuth: If the PixivPixie hasn't logged in first.
            LoginFailed: If re-login failed.
        """
        with self._check_auth_lock:
            if not self.has_auth:
                raise NoAuth
            if datetime.datetime.now() - self.last_login >= TOKEN_LIFETIME:
                # Token expired
                if auto_re_login:
                    self.login(self._username, self._password)
                else:
                    raise NoAuth

    @_need_auth
    def illust(self, illust_id):
        """Gets a single illust.

        Args:
            illust_id: An integer.

        Returns:
            A PixivIllust object.

        Raises:
            Any exceptions check_auth() will raise.
            IllustError: If the illust_id is invalid or the illust is blocked by
                the Pixiv account setting.
        """
        json_result = Json(self._papi.works(illust_id))
        if json_result.status != 'success':
            error_code = json_result.errors.system.get('code')
            error_message = {
                206: 'Target illust not found.',
                229: 'Illust browsing restricted.',
            }
            raise IllustError(illust_id, error_message.get(error_code))
        return PixivIllust.from_papi(self, json_result.response[0])

    @classmethod
    def _papi_call(cls,
                   call_func,
                   page=1,
                   per_page=30,
                   iter_target=None,
                   extra_yield=None,
                   **kwargs):
        current_page = page
        while current_page:
            json_result = Json(
                call_func(page=current_page, per_page=per_page, **kwargs))

            if json_result.status != 'success':
                raise APIError(call_func, json_result.errors)

            if iter_target is None:
                target = json_result.response
            else:
                target = iter_target(json_result.response)

            for item in target:
                if extra_yield is None:
                    yield item
                else:
                    yield item, extra_yield(json_result.response)

            current_page = json_result.pagination.next

    def _aapi_call(self, call_func, **kwargs):
        req_auth = True

        while True:
            try:
                if int(kwargs['offset']) >= 5000:
                    break
            except (KeyError, ValueError):
                pass
            json_result = Json(call_func(**kwargs, req_auth=req_auth))

            if 'error' in json_result:
                raise APIError(call_func, json_result.error)

            yield from json_result.illusts

            if json_result.next_url is None:
                break
            kwargs = self._aapi.parse_qs(json_result.next_url)

    @query_set
    @_need_auth
    def my_following_illusts(self, until=None):
        """Fetch new illusts of following users.

        Fetch new illusts of following users.

        A normal user can only fetch the first 2000 illusts, while a Premium
        user can fetch the first 5000.

        If you didn't turn off the browsing restriction in your account
        settings, R-18(G) illusts will be excluded.

        Args:
            until: Could be:
                [default] None: No limit.
                A string or datetime object corresponding to the earliest
                    creation time of illusts to fetch.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        if isinstance(until, str):
            until = dateutil.parser.parse(until)
        for json_result in self._papi_call(self._papi.me_following_works):
            illust = PixivIllust.from_papi(self, json_result)
            if until is not None and illust.creation_time < until:
                return
            yield illust

    @query_set
    @_need_auth
    def user_illusts(self, user_id):
        """Fetch a user's illusts.

        Fetch a user's illusts.
        If you didn't turn off the browsing restriction in your account
        settings, R-18(G) illusts will be excluded.

        Args:
            user_id: An integer.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
            PAPIError: If the user_id is invalid.
        """
        for json_result in self._papi_call(
                self._papi.users_works,
                author_id=user_id,
        ):
            yield PixivIllust.from_papi(self, json_result)

    @query_set
    @_need_auth
    def ranking(
        self,
        mode=RankingMode.DAY,
        date=None,
    ):
        """Fetch all ranking illusts.

        Fetch all ranking illusts and yield them from high rank to low.
        If you didn't turn off the browsing restriction in your account
        settings, R-18(G) illusts will be excluded.

        Args:
            mode: Could be:
                [default] DAY
                WEEK
                MONTH
                DAY_MALE
                DAY_FEMALE
                WEEK_ORIGINAL
                WEEK_ROOKIE
                DAY_MANGA
                DAY_R18
                DAY_MALE_R18
                DAY_FEMALE_R18
                WEEK_R18
                WEEK_R18G

                These constants are defined in
                    pixiv_pixie.constants.RankingMode.
            date: Could be:
                [default] None: Will fetch the latest ranking.
                A date or datetime object.
                A string in the format of '%Y-%m-%d', e.g., '2017-08-01'.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        if isinstance(date, (datetime.date, datetime.datetime)):
            date = date.strftime('%Y-%m-%d')

        # The response of PAPI does not contain metadata, so AAPI is used instead.
        for rank, json_result in enumerate(
                self._aapi_call(
                    self._aapi.illust_ranking,
                    mode=mode.value,
                    date=date,
                ),
                start=1,
        ):
            illust = PixivIllust.from_aapi(self, json_result)
            illust.rank = rank
            yield illust

    @query_set
    @_need_auth
    def search(
        self,
        query,
        mode=SearchMode.TAG,
        period=SearchPeriod.ALL,
        order=SearchOrder.DESC,
    ):
        """Search illusts.

        Search illusts.

        Args:
            query: Query keywords. You can separate multiple keywords with spaces.
            mode: Could be:
                TEXT: Search in title and caption.
                [default] TAG: Search in tags.
                EXACT_TAG: Search in tags. Only exactly matched tag is
                    acceptable.
                CAPTION: Search in caption.

                These constants are defined in pixiv_pixie.constants.SearchMode.
            period: Could be:
                [default] ALL
                DAY
                WEEK
                MONTH

                This parameter is only applied when order is ASC.
                These constants are defined in
                    pixiv_pixie.constants.SearchPeriod.
            order: Could be:
                [default] DESC: The output will be from new to old.
                ASC: The output will be from old to new.

                These constants are defined in
                    pixiv_pixie.constants.SearchOrder.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        for json_result in self._papi_call(
                self._papi.search_works,
                query=query,
                mode=mode.value,
                period=period.value,
                order=order.value,
        ):
            yield PixivIllust.from_papi(self, json_result)

    @query_set
    @_need_auth
    def related_illusts(self, illust_id, limit=None):
        """Fetch all related illusts.

        Fetch all related illusts of a provided illust.

        Args:
            illust_id: An integer.
            limit: Max number of illusts to yield. If limit=None, there will be
                no limit.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        for cnt, json_result in enumerate(
                self._aapi_call(
                    self._aapi.illust_related,
                    illust_id=illust_id,
                ),
                start=1,
        ):
            if limit is not None and cnt > limit:
                break

            yield PixivIllust.from_aapi(self, json_result)

    @classmethod
    def convert_zip_to_gif(
        cls,
        input_file,
        frame_delays=None,
        output_file=None,
        use_pil=False,
    ):
        """Convert a zip file that contains all frames into gif.

        Convert a zip file that contains all frames into gif.

        Args:
            input_file: The input file. May be str or a file-like object.
            frame_delays: A list of frame delay durations in milliseconds.
            output_file: The output file. May be str or a file-like object.
            use_pil: Whether to use the Pillow library to create the GIF file.
                By default the FreeImage library will be used. FreeImage creates
                better quality and smaller files, but requires an external
                .dll/.so and may crash for unknown reasons.
        """
        if frame_delays is None:
            if isinstance(input_file, str):
                frame_info = os.path.splitext(input_file)[0] + '.txt'
                with open(frame_info, 'rt', encoding='utf-8') as f:
                    frame_delays = [int(line) for line in f if line.strip()]
            else:
                raise ValueError('Could not get frame delays.')

        if output_file is None:
            if isinstance(input_file, str):
                output_file = os.path.splitext(input_file)[0] + '.gif'
            else:
                raise ValueError('Could not determine output filename.')

        dir_name = os.path.dirname(output_file)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        images = []
        with ZipFile(input_file) as zip_file:
            for name in sorted(zip_file.namelist()):
                with zip_file.open(name) as input_file:
                    images.append(imageio.imread(io.BytesIO(
                        input_file.read())))
        frame_delays = [delay / 1000 for delay in frame_delays]

        if not use_pil:
            save_format = 'GIF-FI'
        else:
            save_format = 'GIF-PIL'
        imageio.mimwrite(
            output_file,
            images,
            format=save_format,
            duration=frame_delays,
        )
        del images

    @classmethod
    def _get_file_path(
        cls,
        illust,
        page,
        url,
        convert_ugoira,
        directory,
        name,
        addition_naming_info,
    ):
        original_name = os.path.basename(url)
        root, ext = os.path.splitext(original_name)

        if convert_ugoira and ext == '.zip':
            ext = '.gif'
            original_name = root + ext

        if name:
            naming_info = {
                'illust': illust,
                'page': page,
                'original_name': original_name,
                'root': root,
                'ext': ext,
            }
            if addition_naming_info:
                naming_info.update(addition_naming_info)
            filename = name.format(**naming_info)
        else:
            filename = original_name

        file_path = os.path.join(directory, filename)

        return file_path

    @classmethod
    def _try_remove_file(cls, path):
        if not isinstance(path, str) or not path:
            return

        try:
            os.remove(path)
        except OSError:
            pass

    @classmethod
    def _check_exist(cls, path, checklist):
        basename = os.path.basename(path)

        for folder in checklist:
            if os.path.exists(os.path.join(folder, basename)):
                return True

        return False

    def _download_illust_to_file(self, url, file):
        requests_kwargs = self.requests_kwargs.copy()
        requests_kwargs['stream'] = True
        requests_kwargs['headers'] = ILLUST_DOWNLOAD_HEADERS

        try:
            wrote_size = 0
            total_size = None

            for wrote_size, total_size in download(
                    file,
                    url,
                    **requests_kwargs,
            ):
                pass

            if total_size is not None and wrote_size < total_size:
                raise APIError(
                    self.download,
                    'Unexpected connection interruption.',
                )

        except requests.HTTPError as e:
            raise APIError(self.download, e.response.text) from e

    def _download_one_url(
        self,
        illust,
        url,
        path,
        convert_ugoira,
        replace,
        check_exists,
        max_tries,
        fake_download,
        use_pil,
    ):
        if not replace and os.path.exists(path):
            return False

        if self._check_exist(path, check_exists):
            return False

        if fake_download:
            return False

        dir_name = os.path.dirname(path)

        frame_path = None

        for tries in count(start=1):
            try:
                buffer = io.BytesIO()
                self._download_illust_to_file(url, buffer)
                buffer.seek(0)

                if illust.type == IllustType.UGOIRA and convert_ugoira:
                    self.convert_zip_to_gif(
                        buffer,
                        illust.frame_delays,
                        path,
                        use_pil,
                    )
                else:
                    if dir_name:
                        os.makedirs(dir_name, exist_ok=True)

                    with open(path, 'wb') as f:
                        copyfileobj(buffer, f)

                    if illust.type == IllustType.UGOIRA:
                        frame_path = os.path.splitext(path)[0] + '.txt'
                        with open(frame_path, 'wt') as f:
                            for frame_delay in illust.frame_delays:
                                print(frame_delay, file=f)

                return True
            except Exception as e:
                self._try_remove_file(path)
                self._try_remove_file(frame_path)

                if max_tries is None or tries < max_tries:
                    continue

                raise DownloadError(illust, e) from e

    def _download_multiple_urls(
        self,
        illust,
        target,
        convert_ugoira,
        replace,
        check_exists,
        max_tries,
        fake_download,
        use_pil,
    ):
        result = []

        for url, path in target:
            result.append((url, path,
                           self._download_one_url(
                               illust,
                               url,
                               path,
                               convert_ugoira=convert_ugoira,
                               replace=replace,
                               check_exists=check_exists,
                               max_tries=max_tries,
                               fake_download=fake_download,
                               use_pil=use_pil,
                           )))

        return result

    @_need_auth
    def download(
        self,
        illust,
        directory=os.path.curdir,
        name=None,
        addition_naming_info=None,
        convert_ugoira=True,
        replace=False,
        check_exists=None,
        max_tries=5,
        fake_download=False,
        use_pil=False,
    ):
        """Download illust.

        Download illust.

        Args:
            illust: The illust or illust_id to be downloaded.
            directory: Directory.
            name: If set, the downloaded file will be renamed. May contain
                format string syntax.
                e.g. name='{illust.user_id}_{original_name}'
                The following information is provided:
                    illust: The illust object.
                    page: 0-based page number.
                    original_name: The default filename.
                    root: The root part of original_name. e.g. 'foo' in
                        'foo.bar'.
                    ext: The extension part of original_name. e.g. '.bar' in
                        'foo.bar'.
            addition_naming_info: Additional dict that will be used when
                formatting name.
            convert_ugoira: Whether to download ugoira as gif. If false, a zip
                file will be downloaded instead, along with a txt file
                containing the frame durations.
            replace: If true, already existing file(s) will be replaced.
            check_exists: Additional path(s) used to check whether the illust
                already exists (by name). Could be a path string, a list of
                path strings, or None.
            max_tries: Max number of attempts when a download fails. If
                max_tries=None, it will retry until finished.
            fake_download: If True, no file will actually be downloaded.
            use_pil: Whether to use the Pillow library to create the GIF file.
                See the doc of PixivPixie.convert_zip_to_gif().

        Returns:
            A list of download results, one per page. Each result is a tuple of
                (url, path, downloaded).

        Raises:
            Any exceptions check_auth() will raise.
            DownloadError.
        """
        if isinstance(illust, int):
            illust = self.illust(illust)

        if check_exists is None:
            check_exists = []
        elif isinstance(check_exists, str):
            check_exists = [check_exists]

        download_target = []
        for tries in count(start=1):
            try:
                download_target = [(
                    url,
                    self._get_file_path(
                        illust,
                        page,
                        url,
                        convert_ugoira,
                        directory,
                        name,
                        addition_naming_info,
                    ),
                ) for page, url in enumerate(illust.image_urls)]
                break
            except Exception as e:
                if max_tries is None or tries < max_tries:
                    continue

                raise DownloadError(illust, e) from e

        return self._download_multiple_urls(
            illust,
            download_target,
            convert_ugoira=convert_ugoira,
            replace=replace,
            check_exists=check_exists,
            max_tries=max_tries,
            fake_download=fake_download,
            use_pil=use_pil,
        )
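A minimal sketch of how PixivPixie is meant to be driven, using only the methods shown above; the credentials, user id, timeout, and naming pattern are placeholders, and iteration over the returned QuerySet is assumed to yield PixivIllust objects as documented:

# Hypothetical usage of PixivPixie: login, query a user's illusts, download them.
pixie = PixivPixie(timeout=(3, 10))         # extra kwargs are forwarded to requests
try:
    pixie.login("username", "password")     # placeholder credentials
except LoginFailed:
    raise SystemExit("Pixiv login failed")

for illust in pixie.user_illusts(12345):    # placeholder user id
    pixie.download(
        illust,
        directory="download",
        name="{illust.user_id}_{original_name}",  # naming fields documented in download()
        max_tries=3,
    )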
Example #3
class PixivSpider:
    def __init__(self):
        """
        Init PixivSpider
        """
        self.api = AppPixivAPI()
        self.directory = 'download'
        if not os.path.exists('info.json'):
            self.data = {'illusts': []}
            self.count = 0
            print("Create new info.json file")
        else:
            with open('info.json', 'r') as f:
                self.data = json.load(f)
                self.count = len(self.data['illusts'])
                print("Load existing info.json file")
                print("Existed illusts count: %d" % self.count)
        self.illusts_names = set()
        for illust in self.data['illusts']:
            self.illusts_names.add(illust['name'])

    def login(self):
        """
        Login pixiv.net
        """
        with open('login.json') as f:
            login = json.load(f)
            self.api.login(login["username"], login["password"])
            print("Login pixiv.net with user %s.", login["username"])

    def exit(self):
        """
        Stop spider and print logs
        """
        with open('info.json', 'w') as f:
            json.dump(self.data, f, indent=2)
        print("Finish! Total downloaded illusts number: %d" % self.count)

    def create_download_folder(self):
        """
        Setup image download directory
        """
        if not os.path.exists(self.directory):
            os.makedirs(self.directory)

    def download_illusts(self, illusts=None):
        """
        Download illusts
        """
        for illust in illusts:
            image_url = illust.meta_single_page.get('original_image_url',
                                                    illust.image_urls.large)
            print(u"👀  Found illust: %s (%s)" % (illust.title, image_url))
            url_basename = os.path.basename(image_url)
            extension = os.path.splitext(url_basename)[1]
            name = "%d_%s%s" % (illust.id, illust.title, extension)
            name = name.replace('/', ':')
            if name not in self.illusts_names:
                self.count += 1
                self.data['illusts'].append({
                    'id': self.count,
                    'name': name,
                    'illust_id': illust.id,
                    'illustrator_id': illust.user.id,
                    'source_url': image_url
                })
                self.illusts_names.add(name)
                name = "%d_" % self.count + name
                try:
                    self.api.download(image_url,
                                      path=self.directory,
                                      name=name)
                except PixivError:
                    print(u"😢  PixivError!!! Skip this illust")
                    continue
                print(u"✅  Download illust: %s (%s)" %
                      (illust.title, image_url))
            else:
                print(u"✨  Already download: %s: %s" %
                      (illust.title, image_url))

    def get_user_ids_from_illusts(self, illusts):
        """
        Get user ids by illusts
        """
        user_ids = []
        for illust in illusts:
            user_ids.append(illust.user.id)
        return user_ids

    def get_top_ranking_illusts(self,
                                count=DEFAULT_DOWNLOAD_TOP_RANKING_COUNT,
                                ranking_type=RankingType.DAY,
                                date=None,
                                download=False):
        """
        Get top ranking illusts
        :count: the number of illusts that we want to download
        :ranking_type: ranking type
        :date: date string in '%Y-%m-%d' format; defaults to today when None
        :download: download flag
        """
        # Resolve the default date at call time; a value in the signature would
        # be evaluated only once, when the function is defined.
        if date is None:
            date = datetime.today().strftime("%Y-%m-%d")
        json_result = self.api.illust_ranking(ranking_type, date=date)
        illusts = self.get_illusts_from_all_pages(json_result,
                                                  json_result.illusts, count,
                                                  download)
        return illusts[:count]

    def get_recommended_illusts(self,
                                count=DEFAULT_DOWNLOAD_RECOMMENDED_COUNT,
                                content_type=ContentType.ILLUST,
                                download=False):
        """
        Get recommended illusts
        :count: the number of illusts that we want to download
        :content_type: content type
        :download: download flag
        """
        json_result = self.api.illust_recommended(content_type)
        illusts = self.get_illusts_from_all_pages(json_result,
                                                  json_result.illusts, count,
                                                  download)
        return illusts[:count]

    def get_illusts_by_user_ids(self,
                                user_ids,
                                count=DEFAULT_DOWNLOAD_EACH_USER_COUNT,
                                content_type=ContentType.ILLUST,
                                download=False):
        """
        Get illusts by user id
        """
        ret = {}
        for user_id in user_ids:
            json_result = self.api.user_illusts(user_id=user_id,
                                                type=content_type)
            illusts = self.get_illusts_from_all_pages(json_result,
                                                      json_result.illusts,
                                                      count, download)
            ret[user_id] = illusts[:count]
        return ret

    def get_illusts_from_all_pages(self,
                                   json_result,
                                   illusts,
                                   count,
                                   download=False):
        """
        Get illusts from all pages
        """
        while len(json_result) != 0 and len(illusts) < count:
            next_qs = self.api.parse_qs(json_result.next_url)
            if next_qs is None:
                break
            try:
                # Next pages are always requested through illust_ranking(); if
                # the next_url query string belongs to another endpoint, the
                # unexpected keyword arguments raise TypeError and paging stops.
                json_result = self.api.illust_ranking(**next_qs)
            except TypeError:
                break
            illusts += json_result.illusts

        if download:
            count = min(count, len(illusts))
            self.download_illusts(illusts=illusts[:count])

        return illusts
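A short driver sketch for this spider, assuming a login.json file with "username" and "password" keys exists next to the script; the counts below are placeholders:

# Hypothetical driver for PixivSpider: grab today's top-ranked illusts, then a
# few illusts from each of the authors found in the ranking.
spider = PixivSpider()
spider.login()
spider.create_download_folder()
illusts = spider.get_top_ranking_illusts(count=10, download=True)
user_ids = spider.get_user_ids_from_illusts(illusts)
spider.get_illusts_by_user_ids(user_ids, count=3, download=True)
spider.exit()  # write info.json and print the total downloaded count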