# PixivHandler turns Pixiv feeds into post dicts. `Config` is a project-local
# TOML config helper.
from pathlib import Path

import requests
from pixivpy3 import PixivAPI


class PixivHandler:
    def __init__(self, name, app_config=None):
        app_config = app_config or {}  # avoid a shared mutable default argument
        config_path = Path(app_config.get('handlers_config_dir', '.')) / 'pixiv.toml'
        data_path = Path(app_config.get('data_dir', './data/')) / '{}.toml'.format(name)
        self.config = Config(config_path, write_defaults=True, defaults={
            'refresh': 'xxxx',
        })
        self.config.save()
        self.data = Config(data_path)
        self.age_filter = None
        self.api = PixivAPI()
        if self.config.get('refresh'):
            print('logging in to Pixiv...')
            login_response = self.api.auth(refresh_token=self.config['refresh'])
            print('logged in to account {0.name} ({0.account}) [{0.id}]'.format(
                login_response['response']['user']))

    def set_age_filter(self, filter):
        self.age_filter = filter

    def handle(self, feed):
        if feed == 'followings':
            data = self.api.me_following_works(
                image_sizes=['large', 'medium'], include_stats=False)
        elif feed == 'bookmarks':
            data = self.api.me_favorite_works()
        else:
            return []
        if data['status'] != 'success':
            print('invalid response')
            print('got:')
            print(data)
            return []
        results = data['response']
        save_data = self.data.get(feed, {'last_id': 0})
        print('latest id: {}'.format(save_data.get('last_id')))
        # Keep only entries newer than the last one we handled
        results = list(filter(lambda x: x['id'] > save_data.get('last_id'), results))
        if len(results) == 0:
            return []
        save_data['last_id'] = results[0]['id']
        self.data[feed] = save_data
        self.data.save()
        ret = []
        for entry in results:
            print('Handling pixiv entry {}'.format(entry['id']))
            if self.age_filter is not None:
                if entry['age_limit'] in ['r18', 'r18-g'] and self.age_filter == 'safe':
                    print('skipping because currently in safe mode')
                    continue
                if entry['age_limit'] == 'all-age' and self.age_filter == 'r18':
                    print('skipping because currently in r18 mode')
                    continue
            content = '<https://www.pixiv.net/artworks/{}>'.format(entry['id'])
            content += '\n{} by {} ({})'.format(
                entry['title'], entry['user']['name'], entry['user']['account'])
            content += '\nTags: {}'.format(' '.join(entry['tags']))
            if entry['is_manga']:
                print("it's a manga")
                work = self.api.works(entry['id'])
                if work['status'] != 'success':
                    continue
                work = work['response']
                if len(work) == 0:
                    continue
                work = work[0]
                urls = [x['image_urls']['medium'] for x in work['metadata']['pages']]
                if len(urls) > 4:
                    content += '\n{} more pictures not shown here'.format(len(urls) - 4)
                    urls = urls[:4]
            else:
                if entry['width'] > 2000 or entry['height'] > 2000:
                    content += '\n(not displaying full resolution because it is too large)'
                    urls = [entry['image_urls']['medium']]
                else:
                    urls = [entry['image_urls']['large']]
            files = []
            index = 0
            for url in urls:
                print('downloading picture...')
                response = requests.get(url, headers={'referer': 'https://pixiv.net'})
                if response.status_code != 200:
                    continue
                ext = Path(url).suffix
                files.append({'data': response.content,
                              'name': 'page{}{}'.format(index, ext)})
                index += 1
            ret.append({'content': content, 'files': files})
        ret.reverse()
        return ret
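# A minimal usage sketch for PixivHandler. The config/data directories and the
# feed name below are placeholders; pixiv.toml must hold a valid refresh token.
def demo_pixiv_handler():
    handler = PixivHandler('demo', {'handlers_config_dir': './conf',
                                    'data_dir': './data'})
    handler.set_age_filter('safe')  # skip r18/r18-g entries
    for post in handler.handle('followings'):
        print(post['content'])
        print('{} file(s) attached'.format(len(post['files'])))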
# PixivClip watches the clipboard for Pixiv illust URLs. `PixivUtils` and
# `ClipWatcher` are project-local helpers; PixivError lives in pixivpy3.utils.
from queue import Queue

from pixivpy3 import PixivAPI, utils


class PixivClip:
    def __init__(self, _user, _pass, default_path=r'F:/PIXIV'):
        self._user, self._pass = (_user, _pass)
        self.api = PixivAPI()
        self.pixiv_utils = PixivUtils()
        self.pastes = Queue()
        self.default_path = default_path
        self.local_pixiv_ids = []

    def login(self):
        self.api.login(self._user, self._pass)

    def get_illust(self, illust_id):
        if not self.api.access_token:
            self.login()
        json_result = None  # defined up front so the except blocks can reference it
        try:
            json_result = self.api.works(illust_id)
            json_result = json_result.response
        except AttributeError:
            raise AttributeError(json_result)
        except utils.PixivError as e:
            # Token probably expired: re-login and retry once more
            self.login()
            print(e, self.api.access_token, json_result)
            return self.get_illust(illust_id)
        else:
            json_result = self.pixiv_utils.is_single_array(json_result)
        return json_result

    def work(self, illust_id):
        try:
            illust = self.get_illust(illust_id)
        except AttributeError as json_result:
            # print(json_result.has_error)
            # print(json_result.status)
            print(json_result)
            print(dir(json_result))
            return
        illust_type = illust.type
        print(illust)

    def refresh_local_pixiv_ids(self):
        for i in self.pixiv_utils.list_imgs_pixiv_ids_in_dir(self.default_path):
            self.local_pixiv_ids.append(i)

    def callback(self, url):
        illust_id = self.pixiv_utils.parse_url_for_id(url)
        if illust_id in self.pastes.queue:
            print(illust_id, "Already In Queue!")
            return
        self.pastes.put(illust_id)

    def watch(self):
        watcher = ClipWatcher(self.pixiv_utils.is_pixiv_illust_url, self.callback)
        try:
            for i in watcher.start():
                print('PIXIV LINKS:', self.pastes.qsize(), i, flush=True, end='\r')
        except (KeyboardInterrupt, Exception):
            watcher.stop()
        return watcher

    def print_pastes_queue(self):
        while not self.pastes.empty():
            illust_id = self.pastes.get()
            print(illust_id)
            input("PAUSED!!!")

    def begin(self):
        watcher = self.watch()
        self.print_pastes_queue()
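# Usage sketch for PixivClip: start the clipboard watcher, then review the
# queued illust IDs. Credentials are placeholders (Pixiv has since disabled
# username/password login, so a token-based flow may be required in practice).
def demo_pixiv_clip():
    clip = PixivClip('user@example.com', 'hunter2', default_path=r'F:/PIXIV')
    clip.begin()  # blocks until the watcher stops, then prints the queue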
# PixivCrawler downloads works by tag or ranking. `PixivCursor` is a
# project-local database helper.
import json
import logging
import os
import shutil

from pixivpy3 import PixivAPI

_logger = logging.getLogger(__name__)


class PixivCrawler:
    KKRTAG = ['弦巻こころ']

    def __init__(self, auth, work_path=os.path.abspath('../pixiv/')):
        self._api = PixivAPI()
        self._api.login(*auth)
        self._wd = work_path

    def fetch_work(self, work_id, tag):
        got = False
        ri = self._api.works(work_id)
        try:
            r = ri.response[0]
        except (AttributeError, IndexError):
            r = None
        if not r:
            return got
        url_list = []
        if r.metadata:
            for p in r.metadata.pages:
                url_list.append(p.image_urls.large)
        else:
            url_list.append(r.image_urls.large)
        # Group downloads by creation date, e.g. '20170801'
        created_time = r.created_time[:10].replace('-', '')
        wd = os.path.join(self._wd, created_time)
        if not os.path.isdir(wd):
            os.mkdir(wd)
        fns = []
        for url in url_list:
            fn = os.path.basename(url)
            final_fn = os.path.join(created_time, fn)
            _logger.info('getting %s to %s', url, wd)
            try:
                if self._api.download(url, fname=fn, path=wd):
                    got = True
                    shutil.move(os.path.join(wd, fn),
                                os.path.join(wd, fn + '.download'))
                    fns.append(final_fn)
            except Exception:
                import sys
                sys.excepthook(*sys.exc_info())
        if fns:
            meta = json.dumps(r)
            dmeta = {
                'work_id': work_id,
                'mode': tag,
                'user': r.user.id,
                'fn': fns,
                'meta': meta,
            }
            PixivCursor.insert_update_one(dmeta)
        return got

    def get_by_tag(self, search_tag='', filter_tag=None, num=30, save_tag=''):
        filter_tag = filter_tag or []  # avoid a mutable default argument
        if not search_tag and not filter_tag:
            return None
        if filter_tag:
            filter_tag = [x.strip().lower() for x in filter_tag]
            if not search_tag:
                # Use the first filter tag as the search keyword
                search_tag = filter_tag[0]
                filter_tag = filter_tag[1:]
        if not save_tag:
            save_tag = search_tag
        filter_tag = set(filter_tag)
        _logger.info('search: %s filter: %s', search_tag, filter_tag)
        ret = 0
        page = 1
        while ret < num:
            r = self._api.search_works(search_tag, mode='tag', page=page, per_page=30)
            try:
                l = r.response
            except AttributeError:
                l = None
            if not l:
                break
            _logger.info('get %d illusts', len(l))
            for i in l:
                if i.type != 'illustration':
                    continue
                # Only keep works carrying every requested filter tag
                tt = set(x.strip().lower() for x in i.tags)
                if len(tt & filter_tag) != len(filter_tag):
                    continue
                if self.fetch_work(i.id, save_tag):
                    ret += 1
                if ret > num:
                    break
            page += 1
        return ret

    def get_rank(self, mode='daily', num=30):
        ret = 0
        page = 1
        while ret < num:
            r = self._api.ranking_all(mode=mode, page=page, per_page=30)
            try:
                l = r.response[0].works
            except (AttributeError, IndexError):
                l = None
            if not l:
                break
            _logger.info('get %d ranking illusts', len(l))
            for i in l:
                if i.work.type != 'illustration':
                    continue
                if self.fetch_work(i.work.id, mode):
                    ret += 1
                if ret >= num:
                    break
            page += 1
        return ret
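# Usage sketch for PixivCrawler: credentials are placeholders. Downloads
# today's top 30 illustrations, then 10 works matching the KKRTAG tag.
def demo_pixiv_crawler():
    crawler = PixivCrawler(('user@example.com', 'hunter2'))
    crawler.get_rank(mode='daily', num=30)
    crawler.get_by_tag(filter_tag=PixivCrawler.KKRTAG, num=10)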
# PixivPixie wraps both the public API (PAPI) and the app API (AAPI) from
# pixivpy3. Helper names not defined here (Json, query_set, _need_auth,
# PixivIllust, IllustType, the error classes, download(), TOKEN_LIFETIME,
# ILLUST_DOWNLOAD_HEADERS, and the constants modules) come from the
# surrounding pixiv_pixie package.
import datetime
import io
import os
from itertools import count
from shutil import copyfileobj
from threading import Lock
from zipfile import ZipFile

import dateutil.parser
import imageio
import requests
from pixivpy3 import AppPixivAPI, PixivAPI, PixivError


class PixivPixie:
    """Pixiv API interface.

    Remember to call login() before using other methods.

    Attributes:
        auto_re_login: If true, PixivPixie will auto re-login when the login
            token has expired.
    """

    def __init__(self, auto_re_login=True, **requests_kwargs):
        self.auto_re_login = auto_re_login

        self._requests_kwargs = requests_kwargs

        self._papi = PixivAPI(**requests_kwargs)
        self._aapi = AppPixivAPI(**requests_kwargs)

        self._has_auth = False
        self._last_login = None
        self._check_auth_lock = Lock()

        self._username = None
        self._password = None

    @property
    def requests_kwargs(self):
        """Parameters that will be passed to requests."""
        return self._requests_kwargs

    @requests_kwargs.setter
    def requests_kwargs(self, requests_kwargs):
        self._requests_kwargs = requests_kwargs
        self._papi.requests_kwargs = requests_kwargs
        self._aapi.requests_kwargs = requests_kwargs

    @property
    def has_auth(self):
        """Whether the pixie has logged in."""
        return self._has_auth

    @property
    def last_login(self):
        """Last login time.

        Will be a datetime object, or None if not logged in yet.
        """
        return self._last_login

    def login(self, username, password):
        """Log in to a Pixiv account.

        Notice: The access token will expire after about 1 hour, so if you
        are running a long task, remember to re-login periodically.

        Args:
            username: Your Pixiv account's username.
            password: Your Pixiv account's password.

        Returns:
            self.

        Raises:
            LoginFailed: If the username and password do not match.
        """
        if not username or not password:
            raise LoginFailed

        try:
            self._papi.login(username, password)

            # self._aapi.login(username, password)
            self._aapi.access_token = self._papi.access_token
            self._aapi.user_id = self._papi.user_id
            self._aapi.refresh_token = self._papi.refresh_token
        except PixivError:
            raise LoginFailed
        else:
            self._has_auth = True
            self._username = username
            self._password = password
            self._last_login = datetime.datetime.now()

        return self

    def check_auth(self, auto_re_login=False):
        """Raise an error if the pixie doesn't have auth.

        Args:
            auto_re_login: If true, PixivPixie will try to re-login when the
                login token has expired.

        Raises:
            NoAuth: If the PixivPixie hasn't logged in first.
            LoginFailed: If re-login failed.
        """
        with self._check_auth_lock:
            if not self.has_auth:
                raise NoAuth
            if datetime.datetime.now() - self.last_login >= TOKEN_LIFETIME:
                # Token expired
                if auto_re_login:
                    self.login(self._username, self._password)
                else:
                    raise NoAuth

    @_need_auth
    def illust(self, illust_id):
        """Gets a single illust.

        Args:
            illust_id: An integer.

        Returns:
            A PixivIllust object.

        Raises:
            Any exceptions check_auth() will raise.
            IllustError: If the illust_id is invalid or the illust is blocked
                by the Pixiv account setting.
        """
        json_result = Json(self._papi.works(illust_id))
        if json_result.status != 'success':
            error_code = json_result.errors.system.get('code')
            error_message = {
                206: 'Target illust not found.',
                229: 'Illust browsing restricted.',
            }
            raise IllustError(illust_id, error_message.get(error_code))
        return PixivIllust.from_papi(self, json_result.response[0])

    @classmethod
    def _papi_call(cls, call_func, page=1, per_page=30,
                   iter_target=None, extra_yield=None, **kwargs):
        current_page = page
        while current_page:
            json_result = Json(
                call_func(page=current_page, per_page=per_page, **kwargs))
            if json_result.status != 'success':
                raise APIError(call_func, json_result.errors)

            if iter_target is None:
                target = json_result.response
            else:
                target = iter_target(json_result.response)

            for item in target:
                if extra_yield is None:
                    yield item
                else:
                    yield item, extra_yield(json_result.response)

            current_page = json_result.pagination.next

    def _aapi_call(self, call_func, **kwargs):
        req_auth = True
        while True:
            try:
                # The app API refuses offsets of 5000 or more
                if int(kwargs['offset']) >= 5000:
                    break
            except (KeyError, ValueError):
                pass

            json_result = Json(call_func(**kwargs, req_auth=req_auth))
            if 'error' in json_result:
                raise APIError(call_func, json_result.error)

            yield from json_result.illusts

            if json_result.next_url is None:
                break
            kwargs = self._aapi.parse_qs(json_result.next_url)

    @query_set
    @_need_auth
    def my_following_illusts(self, until=None):
        """Fetch new illusts of followed users.

        A normal user can only fetch the first 2000 illusts, while a Premium
        user can fetch the first 5000. If you didn't turn off the browsing
        restriction in the account settings, R-18(G) illusts will be
        excluded.

        Args:
            until: Could be:
                [default] None: No limit.
                A string or datetime object corresponding to the earliest
                    creation time of the illusts.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        if isinstance(until, str):
            until = dateutil.parser.parse(until)
        for json_result in self._papi_call(self._papi.me_following_works):
            illust = PixivIllust.from_papi(self, json_result)
            if until is not None and illust.creation_time < until:
                return
            yield illust

    @query_set
    @_need_auth
    def user_illusts(self, user_id):
        """Fetch a user's illusts.

        If you didn't turn off the browsing restriction in the account
        settings, R-18(G) illusts will be excluded.

        Args:
            user_id: An integer.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
            PAPIError: If the user_id is invalid.
        """
        for json_result in self._papi_call(
                self._papi.users_works,
                author_id=user_id,
        ):
            yield PixivIllust.from_papi(self, json_result)

    @query_set
    @_need_auth
    def ranking(
            self,
            mode=RankingMode.DAY,
            date=None,
    ):
        """Fetch all ranking illusts.

        Fetch all ranking illusts and return them from high rank to low. If
        you didn't turn off the browsing restriction in the account settings,
        R-18(G) illusts will be excluded.

        Args:
            mode: Could be:
                [default] DAY
                WEEK
                MONTH
                DAY_MALE
                DAY_FEMALE
                WEEK_ORIGINAL
                WEEK_ROOKIE
                DAY_MANGA
                DAY_R18
                DAY_MALE_R18
                DAY_FEMALE_R18
                WEEK_R18
                WEEK_R18G
                These constants are defined in
                pixiv_pixie.constants.RankingMode.
            date: Could be:
                [default] None: Will fetch the latest ranking.
                A date or datetime object.
                A string in the format of '%Y-%m-%d', e.g. '2017-08-01'.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        if isinstance(date, (datetime.date, datetime.datetime)):
            date = date.strftime('%Y-%m-%d')

        # The response of PAPI does not contain metadata, so AAPI is used.
        for rank, json_result in enumerate(self._aapi_call(
                self._aapi.illust_ranking, mode=mode.value, date=date,
        ), start=1):
            illust = PixivIllust.from_aapi(self, json_result)
            illust.rank = rank
            yield illust

    @query_set
    @_need_auth
    def search(
            self, query,
            mode=SearchMode.TAG,
            period=SearchPeriod.ALL,
            order=SearchOrder.DESC,
    ):
        """Search illusts.

        Args:
            query: Query keyword. You can separate multiple keywords by
                spaces.
            mode: Could be:
                TEXT: Search in title and caption.
                [default] TAG: Search in tags.
                EXACT_TAG: Search in tags. Only an exactly matched tag is
                    acceptable.
                CAPTION: Search in caption.
                These constants are defined in
                pixiv_pixie.constants.SearchMode.
            period: Could be:
                [default] ALL
                DAY
                WEEK
                MONTH
                This parameter is only applied when order is ASC.
                These constants are defined in
                pixiv_pixie.constants.SearchPeriod.
            order: Could be:
                [default] DESC: The output will be from new to old.
                ASC: The output will be from old to new.
                These constants are defined in
                pixiv_pixie.constants.SearchOrder.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        for json_result in self._papi_call(
                self._papi.search_works,
                query=query,
                mode=mode.value,
                period=period.value,
                order=order.value,
        ):
            yield PixivIllust.from_papi(self, json_result)

    @query_set
    @_need_auth
    def related_illusts(self, illust_id, limit=None):
        """Fetch all related illusts of a provided illust.

        Args:
            illust_id: An integer.
            limit: Max number of illusts to be yielded. If limit=None, there
                will be no limit.

        Returns:
            A QuerySet that yields PixivIllust objects.

        Raises:
            Any exceptions check_auth() will raise.
        """
        for cnt, json_result in enumerate(self._aapi_call(
                self._aapi.illust_related, illust_id=illust_id,
        ), start=1):
            if limit is not None and cnt > limit:
                break
            yield PixivIllust.from_aapi(self, json_result)

    @classmethod
    def convert_zip_to_gif(
            cls, input_file, frame_delays=None, output_file=None,
            use_pil=False,
    ):
        """Convert a zip file that contains all frames into a gif.

        Args:
            input_file: The input file. May be a str or a file-like object.
            frame_delays: A list of delay durations in milliseconds.
            output_file: The output file. May be a str or a file-like object.
            use_pil: Whether to use the Pillow library to create the GIF
                file. By default the FreeImage library will be used.
                FreeImage creates better-quality and smaller files, but
                requires an external .dll/.so and may crash for unknown
                reasons.
        """
        if frame_delays is None:
            if isinstance(input_file, str):
                # Frame delays live in a sibling .txt file, one per line
                frame_info = os.path.splitext(input_file)[0] + '.txt'
                with open(frame_info, 'rt', encoding='utf-8') as f:
                    frame_delays = [int(line) for line in f if line.strip()]
            else:
                raise ValueError('Could not get frame delays.')

        if output_file is None:
            if isinstance(input_file, str):
                output_file = os.path.splitext(input_file)[0] + '.gif'
            else:
                raise ValueError('Could not determine output filename.')
        dir_name = os.path.dirname(output_file)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        images = []
        with ZipFile(input_file) as zip_file:
            for name in sorted(zip_file.namelist()):
                with zip_file.open(name) as frame_file:
                    images.append(imageio.imread(io.BytesIO(frame_file.read())))

        # imageio expects durations in seconds
        frame_delays = [delay / 1000 for delay in frame_delays]

        if not use_pil:
            save_format = 'GIF-FI'
        else:
            save_format = 'GIF-PIL'

        imageio.mimwrite(
            output_file, images,
            format=save_format, duration=frame_delays,
        )
        del images

    @classmethod
    def _get_file_path(
            cls, illust, page, url,
            convert_ugoira,
            directory, name,
            addition_naming_info,
    ):
        original_name = os.path.basename(url)
        root, ext = os.path.splitext(original_name)
        if convert_ugoira and ext == '.zip':
            ext = '.gif'
            original_name = root + ext
        if name:
            naming_info = {
                'illust': illust,
                'page': page,
                'original_name': original_name,
                'root': root,
                'ext': ext,
            }
            if addition_naming_info:
                naming_info.update(addition_naming_info)
            filename = name.format(**naming_info)
        else:
            filename = original_name
        file_path = os.path.join(directory, filename)
        return file_path

    @classmethod
    def _try_remove_file(cls, path):
        if not isinstance(path, str) or not path:
            return
        try:
            os.remove(path)
        except OSError:
            pass

    @classmethod
    def _check_exist(cls, path, checklist):
        basename = os.path.basename(path)
        for folder in checklist:
            if os.path.exists(os.path.join(folder, basename)):
                return True
        return False

    def _download_illust_to_file(self, url, file):
        requests_kwargs = self.requests_kwargs.copy()
        requests_kwargs['stream'] = True
        requests_kwargs['headers'] = ILLUST_DOWNLOAD_HEADERS
        try:
            wrote_size = 0
            total_size = None
            for wrote_size, total_size in download(
                    file, url, **requests_kwargs,
            ):
                pass
            if total_size is not None and wrote_size < total_size:
                raise APIError(
                    self.download,
                    'Unexpected connection interruption.',
                )
        except requests.HTTPError as e:
            raise APIError(self.download, e.response.text) from e

    def _download_one_url(
            self, illust, url, path,
            convert_ugoira, replace, check_exists,
            max_tries, fake_download, use_pil,
    ):
        if not replace and os.path.exists(path):
            return False
        if self._check_exist(path, check_exists):
            return False
        if fake_download:
            return False
        dir_name = os.path.dirname(path)
        frame_path = None
        for tries in count(start=1):
            try:
                buffer = io.BytesIO()
                self._download_illust_to_file(url, buffer)
                buffer.seek(0)

                if illust.type == IllustType.UGOIRA and convert_ugoira:
                    self.convert_zip_to_gif(
                        buffer, illust.frame_delays, path, use_pil,
                    )
                else:
                    if dir_name:
                        os.makedirs(dir_name, exist_ok=True)
                    with open(path, 'wb') as f:
                        copyfileobj(buffer, f)
                    if illust.type == IllustType.UGOIRA:
                        # Keep frame durations next to the zip file
                        frame_path = os.path.splitext(path)[0] + '.txt'
                        with open(frame_path, 'wt') as f:
                            for frame_delay in illust.frame_delays:
                                print(frame_delay, file=f)
                return True
            except Exception as e:
                self._try_remove_file(path)
                self._try_remove_file(frame_path)
                if max_tries is None or tries < max_tries:
                    continue
                raise DownloadError(illust, e) from e

    def _download_multiple_urls(
            self, illust, target,
            convert_ugoira, replace, check_exists,
            max_tries, fake_download, use_pil,
    ):
        result = []
        for url, path in target:
            result.append((url, path, self._download_one_url(
                illust, url, path,
                convert_ugoira=convert_ugoira,
                replace=replace,
                check_exists=check_exists,
                max_tries=max_tries,
                fake_download=fake_download,
                use_pil=use_pil,
            )))
        return result

    @_need_auth
    def download(
            self, illust,
            directory=os.path.curdir,
            name=None,
            addition_naming_info=None,
            convert_ugoira=True,
            replace=False,
            check_exists=None,
            max_tries=5,
            fake_download=False,
            use_pil=False,
    ):
        """Download illust.

        Args:
            illust: The illust or illust_id to be downloaded.
            directory: Directory.
            name: If set, the downloaded file will be renamed. Can contain
                format-string syntax, e.g.
                name='{illust.user_id}_{original_name}'.
                The following information is provided:
                    illust: The illust object.
                    page: 0-based page number.
                    original_name: The default filename.
                    root: The root part of original_name, e.g. 'foo' in
                        'foo.bar'.
                    ext: The extension part of original_name, e.g. '.bar' in
                        'foo.bar'.
            addition_naming_info: Additional dict that will be used when
                formatting name.
            convert_ugoira: Whether to download ugoira as gif. If false, a
                zip file will be downloaded instead, and a txt file
                containing frame durations will be created.
            replace: If true, will replace already existing file(s).
            check_exists: Additional path(s) to check whether the illust
                already exists (by name). Could be a path string, a list of
                path strings, or None.
            max_tries: Max number of tries when a download fails. If
                max_tries=None, it will retry infinitely until finished.
            fake_download: If true, no file will actually be downloaded.
            use_pil: Whether to use the Pillow library to create the GIF
                file. Refer to the doc of PixivPixie.convert_zip_to_gif().

        Returns:
            A list of download results, one per page. Each result is a tuple
            of (url, path, downloaded).

        Raises:
            Any exceptions check_auth() will raise.
            DownloadError.
        """
        if isinstance(illust, int):
            illust = self.illust(illust)
        if check_exists is None:
            check_exists = []
        elif isinstance(check_exists, str):
            check_exists = [check_exists]
        download_target = []
        for tries in count(start=1):
            try:
                download_target = [(
                    url,
                    self._get_file_path(
                        illust, page, url,
                        convert_ugoira,
                        directory, name,
                        addition_naming_info,
                    ),
                ) for page, url in enumerate(illust.image_urls)]
                break
            except Exception as e:
                if max_tries is None or tries < max_tries:
                    continue
                raise DownloadError(illust, e) from e
        return self._download_multiple_urls(
            illust, download_target,
            convert_ugoira=convert_ugoira,
            replace=replace,
            check_exists=check_exists,
            max_tries=max_tries,
            fake_download=fake_download,
            use_pil=use_pil,
        )
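# Usage sketch for PixivPixie: credentials and the output directory are
# placeholders; RankingMode comes from pixiv_pixie.constants, per the
# docstrings above.
def demo_pixiv_pixie():
    from itertools import islice
    from pixiv_pixie.constants import RankingMode

    pixie = PixivPixie()
    pixie.login('user@example.com', 'hunter2')

    illust = pixie.illust(46363414)  # arbitrary example ID
    pixie.download(illust, directory='downloads',
                   name='{illust.user_id}_{original_name}')

    # Download the top 10 of today's ranking
    for top in islice(pixie.ranking(mode=RankingMode.DAY), 10):
        pixie.download(top, directory=os.path.join('downloads', 'ranking'))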
import time

from pixivpy3 import PixivAPI

from create_database import configs

_REQUESTS_KWARGS = {
    'proxies': {
        'https': configs.proxy,
    },
    # PAPI uses https; an easy workaround for proxy issues is to disable
    # requests' SSL verification (set this to False).
    'verify': True,
}

start_time = time.time()

api = PixivAPI(**_REQUESTS_KWARGS)
api.set_auth(configs.pixiv.access_token, configs.pixiv.refresh_token)
# api.login(configs.pixiv.user, configs.pixiv.passwd)

# json_result = api.illust_detail(59580629)
# illust = json_result.illust
# print(">>> origin url: %s" % illust.image_urls['large'])

# api.auth(configs.pixiv.user, configs.pixiv.passwd, configs.pixiv.refresh_token)
json_result = api.works(46363414)
print(json_result)
illust = json_result.response[0]
print(">>> %s, origin url: %s" % (illust.caption, illust.image_urls['large']))

end_time = time.time()
print(end_time - start_time, 's')
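# If the stored access token has expired, the client can usually be
# re-authorized with the refresh token alone, without a username/password
# login (a sketch using pixivpy3's auth(), reusing the configs above):
def refresh_auth():
    api.auth(refresh_token=configs.pixiv.refresh_token)
    return api.access_token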
# pixivImage wraps a single illust fetched through pixivpy. `pixivLogin` is a
# project-local dict holding 'pixivusername' and 'pixivpassword'.
import os

from pixivpy3 import AppPixivAPI, PixivAPI


class pixivImage:
    # Takes URL or ID as argument
    def __init__(self, *args):
        baseURL = "https://www.pixiv.net/member_illust.php?mode=medium&illust_id="
        self.image_URLs = []
        self.directories = []
        self.caption = ""
        self.userTags = []
        self.userImported = False
        for arg in args:
            length = len(str(arg))
            # If it is an ID, it is 8 digits long and an int
            if length == 8:
                self.ID = int(arg)
                self.URL = baseURL + str(arg)
            # If it's a URL, it's the base URL plus the 8-digit ID
            # (the original `&` bitwise operator silently broke this check)
            elif isinstance(arg, str) and length == len(baseURL) + 8:
                self.URL = arg
                marker = "&illust_id="
                try:
                    self.ID = int(self.URL[self.URL.find(marker) + len(marker):])
                    # Fix minor bad URL by rebuilding the canonical form
                    self.URL = baseURL + str(self.ID)
                except ValueError:
                    print("URL is malformed")
            else:
                print("URL OR ID is wrong or in bad format")

    # Resolves a missing attribute by importing the illust JSON, then the user
    # JSON, before giving up. (The original used a misapplied __get__
    # descriptor; __getattr__ matches the apparent intent.)
    def __getattr__(self, name):
        try:
            self.importIllustJSON()
            return self.__dict__[name]
        except KeyError:
            try:
                self.importUserJSON()
                return self.__dict__[name]
            except (AttributeError, KeyError):
                raise AttributeError("Image does not have attribute " + name)

    def setCustomTags(self, tags):
        self.userTags = tags

    def setCaption(self, caption):
        self.caption = caption

    # Import illust info from the pixivAPI JSON into the class
    def importIllustJSON(self):
        # Login to Pixiv API
        self.api = PixivAPI()
        self.api.login(pixivLogin["pixivusername"], pixivLogin["pixivpassword"])
        userURL = "https://www.pixiv.net/member_id="
        self.JSON = self.api.works(self.ID)['response'][0]
        self.manga = self.JSON['is_manga']
        self.account = self.JSON['user']['account']
        self.name = self.JSON['user']['name']
        self.user_ID = self.JSON['user']['id']
        self.user_URL = userURL + str(self.user_ID)
        self.title = self.JSON['title']
        self.tags = self.JSON['tags']
        self.pages = self.JSON['page_count']
        if self.pages > 1:
            # range(self.pages), so the last page is included (the original
            # range(self.pages - 1) dropped it)
            for page in range(self.pages):
                self.image_URLs.append(
                    self.JSON['metadata']["pages"][page]["image_urls"]['large'])
        else:
            self.image_URLs.append(self.JSON['image_urls']['large'])

    # Imports JSON with user information (importIllustJSON must run first so
    # user_ID is known)
    def importUserJSON(self):
        # Non-authenticated API login
        aapi = AppPixivAPI()
        self.userJSON = aapi.user_detail(self.user_ID)
        self.webpage = self.userJSON['profile']['webpage']
        self.twitter_name = self.userJSON['profile']['twitter_account']
        self.twitter_URL = self.userJSON['profile']['twitter_url']
        self.pawoo_URL = self.userJSON['profile']['pawoo_url']
        self.userImported = True

    # Manually import JSON information
    def importJSON(self):
        self.importIllustJSON()
        self.importUserJSON()

    # Downloads images to a directory (defaults to a temp folder next to this
    # file). The original mutated `directory` inside the loop and re-downloaded
    # each URL into the working directory; this keeps one target directory.
    def download(self, directory=None):
        if directory is None:
            directory = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "temp")
        if not os.path.exists(directory):
            os.makedirs(directory)
        for URL in self.image_URLs:
            self.api.download(URL, path=directory)
            self.directories.append(os.path.join(directory, os.path.basename(URL)))
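# Usage sketch for pixivImage: build from an 8-digit illust ID, import the
# metadata, then download the pages. Requires the pixivLogin dict noted above.
def demo_pixiv_image():
    image = pixivImage(46363414)
    image.importJSON()  # fills both illust and user attributes
    print(image.title, 'by', image.name)
    image.download()  # saves pages to the default temp directory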
# Pixiv feed loader: records followed users' works into dbDict and downloads
# images, avatars, and ugoira (rendered to webm with ffmpeg).
import datetime
import itertools
import os
import subprocess
import zipfile

import pytz
import requests
from pixivpy3 import PixivAPI


class Pixiv(object):
    def __init__(self, dbDict, config):
        self.config = config
        self.dbDict = dbDict
        self.username = config['PIXIV_USERNAME']
        self.password = config['PIXIV_PASSWORD']
        self.imageDirectory = os.path.join(config['PIXIV_DOWNLOAD_DIRECTORY'], 'images')
        self.ugoiraDirectory = os.path.join(config['PIXIV_DOWNLOAD_DIRECTORY'], 'ugoira')
        self.avatarDirectory = os.path.join(config['PIXIV_DOWNLOAD_DIRECTORY'], 'avatars')
        os.makedirs(self.imageDirectory, exist_ok=True)
        os.makedirs(self.ugoiraDirectory, exist_ok=True)
        os.makedirs(self.avatarDirectory, exist_ok=True)
        self.api = PixivAPI()
        self.authorize()

    def authorize(self):
        self.api.login(self.username, self.password)

    def loadWorks(self):
        print('Retrieving Pixiv works')
        self.authorize()
        apiWorks = self.api.me_following_works(1, self.config['MAX_WORKS_ON_PAGE'])
        for workDict in apiWorks['response']:
            self._getImageData(workDict)

    def loadExtraWorkInfo(self):
        updates = []
        worksToUpdate = [
            work for work in self.dbDict['works'].values()
            if work['website'] == 'Pixiv' and not work.get('imageUrls')
        ]
        if worksToUpdate:
            print("Found {} new Pixiv works".format(len(worksToUpdate)))
        for work in worksToUpdate:
            imageDict = work['pixivMeta']
            extraInfo = {
                'authorAvatarUrl': self._getAvatarUrl(
                    str(imageDict.get('user').get('profile_image_urls').get(
                        'px_50x50'))),
                'imageUrls': self._getImageUrls(imageDict),
                'pixivMeta': '',
            }
            updates.append((work['identifier'], extraInfo))
        for identifier, extraInfo in updates:
            self.dbDict['works'][identifier].update(extraInfo)

    def _getImageData(self, imageDict):
        identifier = str(imageDict.get('id'))
        if identifier not in self.dbDict['works']:  # Skip images we've already loaded
            user = imageDict.get('user') or {}
            imageData = {
                'identifier': identifier,
                'authorName': str(user.get('name')),
                'authorHandle': str(user.get('account')),
                'authorAvatarUrl': None,
                'profileUrl':
                    'http://www.pixiv.net/member.php?id=' + str(user.get('id')),
                'website': 'Pixiv',
                'imageTitle': str(imageDict.get('title')),
                'imageUrls': None,
                'imagePageUrl':
                    'http://www.pixiv.net/member_illust.php?mode=medium&illust_id='
                    + str(imageDict.get('id')),
                'imageTimestamp': self._parseTime(imageDict),
                'imageType': str(imageDict.get('type')),
                'nsfw': str(imageDict.get('age_limit') != 'all-age'),
                'width': str(imageDict.get('width')) or '500',
                'height': str(imageDict.get('height')) or '500',
                'success': str(imageDict.get('status') == 'success'),
                'error': str(imageDict.get('errors')),
                # Stores the pixiv API info to facilitate later download of images
                'pixivMeta': imageDict,
            }
            self.dbDict['works'][identifier] = imageData

    def _parseTime(self, imageDict):
        # 'reuploaded_time' (fixed from the original misspelling, which never
        # matched) wins over 'created_time' when both are present
        s = max(imageDict.get('created_time', ''),
                imageDict.get('reuploaded_time', ''))
        return datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S').replace(
            tzinfo=pytz.timezone("Asia/Tokyo")).astimezone(pytz.utc).isoformat()

    def _getAvatarUrl(self, remoteUrl):
        return self._downloadImage(remoteUrl, self.avatarDirectory)

    def _getImageUrls(self, imageDict):
        workType = imageDict.get('type')
        if imageDict.get('is_manga'):
            response = self.api.works(imageDict['id'])
            response = response.get('response')[0] or {}
            metadata = response.get('metadata') or {}
            pages = metadata.get('pages') or []

            def getMangaUrl(d):
                urld = d.get('image_urls')
                return self._generateImageUrl(
                    urld.get('small') or urld.get('medium') or urld.get('large'))

            urls = [getMangaUrl(item) for item in pages]
        # Weird special case: the "type" field in Pixiv JSON can be "manga"
        # while "is_manga" is False. In this case there is only a single image
        # URL and the JSON is formatted like an illustration.
        elif workType == 'illustration' or (
                workType == 'manga' and not imageDict.get('is_manga')):
            urlDict = imageDict.get('image_urls') or {}
            urls = [
                self._generateImageUrl(
                    urlDict.get('small') or urlDict.get('medium')
                    or urlDict.get('large'))
            ]
        elif workType == 'ugoira':
            return self._constructUgoira(imageDict.get('id'))
        else:
            # Default case; all response types seem to have at least something
            # in image_urls
            urlDict = imageDict.get('image_urls') or {}
            urls = [
                urlDict.get('small') or urlDict.get('medium')
                or urlDict.get('large')
            ]
        urls = [self._downloadImage(url, self.imageDirectory) for url in urls]
        return urls

    def _generateImageUrl(self, url):
        # Construct the URL for the full-res image. Super brittle; entirely
        # dependent on Pixiv never changing anything.
        leftSide = url[:url[8:].find('/') + 9]  # Split on first slash after https://
        rightSide = url[url.find('/img/'):].replace('_master1200', '')
        return leftSide + 'img-original' + rightSide

    def _downloadImage(self, url, directory):
        name = url[url.rfind('/') + 1:url.rfind('.')]
        extant = {
            fileName.split('.')[0]: os.path.join(directory, fileName)
            for fileName in os.listdir(directory)
        }
        if extant.get(name):
            print('Already downloaded {}'.format(url))
            return extant.get(name)
        print('Downloading ' + url)

        def attemptDownload(attemptUrl, suffix):
            attemptUrl = '.'.join((attemptUrl.rpartition('.')[0], suffix))
            return requests.get(
                attemptUrl,
                headers={'referer': attemptUrl[:attemptUrl.find('/img')]},
                stream=True)

        # The original extension is unknown, so try png, then jpg, then gif
        r = attemptDownload(url, 'png')
        if r.status_code == 404:
            r = attemptDownload(url, 'jpg')
        if r.status_code == 404:
            r = attemptDownload(url, 'gif')
        if r.status_code == 200:
            filename = url.split('/')[-1]
            filepath = os.path.join(directory, filename)
            with open(filepath, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
            return '/'.join((directory, filename))
        else:
            # status_code is an int, so format rather than concatenate
            return '{} {}'.format(r.status_code, url)

    def _constructUgoira(self, identifier):
        directory = os.path.join(self.ugoiraDirectory, str(identifier))
        os.makedirs(directory, exist_ok=True)
        response = self.api.works(identifier)
        response = response.get('response')[0] or {}
        metadata = response.get('metadata') or {}
        frameTimes = [
            'duration {}'.format(delay['delay_msec'] / 1000)
            for delay in metadata.get('frames')
        ]
        # I don't think zip_urls will ever be longer than 1, but ??
        zipUrl = sorted(metadata['zip_urls'].items())[-1][1]
        zipPath = self._downloadUgoiraZip(zipUrl, directory)
        with zipfile.ZipFile(zipPath, 'r') as zap:
            zap.extractall(directory)
        imagePaths = [
            "file '{}'".format(fileName)
            for fileName in os.listdir(directory)
            if not fileName.endswith('.zip')
        ]
        # Interleave file and duration lines for ffmpeg's concat demuxer
        frameData = '\n'.join(itertools.chain(*zip(imagePaths, frameTimes)))
        concatFile = os.path.join(directory, 'concat.txt')
        print('Writing frame data to: {}'.format(concatFile))
        with open(concatFile, 'w') as f:
            f.write(frameData)
        concatFile = os.path.abspath(os.path.join(os.getcwd(), concatFile))
        workingDirectory = os.path.abspath(os.path.join(os.getcwd(), directory))
        outFile = os.path.join(directory, '{}.webm'.format(identifier))
        ffmpeg = 'ffmpeg -n -f concat -i {} -c:v libvpx -crf 10 -b:v 2M {}.webm'.format(
            concatFile, identifier)
        print('Rendering video to {}'.format(outFile))
        subprocess.run(ffmpeg, shell=True, cwd=workingDirectory)
        print('Finished rendering')
        return [outFile]

    def _downloadUgoiraZip(self, url, directory):
        print('Downloading ugoira zip: {}'.format(url))
        path = os.path.join(directory, url.split('/')[-1])
        if os.path.exists(path):
            print('Zip already downloaded; skipping')
        else:
            r = requests.get(url, headers={'referer': url[:url.find('/img')]},
                             stream=True)
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
        return path
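# Usage sketch for the Pixiv loader: dbDict is a plain dict-of-dicts store;
# the config keys mirror the constructor above and the values are placeholders.
def demo_pixiv_loader():
    dbDict = {'works': {}}
    config = {
        'PIXIV_USERNAME': 'user@example.com',
        'PIXIV_PASSWORD': 'hunter2',
        'PIXIV_DOWNLOAD_DIRECTORY': './pixiv_downloads',
        'MAX_WORKS_ON_PAGE': 50,
    }
    pixiv = Pixiv(dbDict, config)
    pixiv.loadWorks()           # record metadata for newly seen works
    pixiv.loadExtraWorkInfo()   # download images/avatars for those works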