def download_image_byid(cls, id):
    # Download a single illustration by its id via an API detail lookup.
    if id:
        detail = PixivApi.illust_detail(id)
        print(detail)
        if detail:
            download_url = ImageDownload.get_image_url(None, detail)
            if download_url:
                PixivApi.download(download_url)
            else:
                print("Download by id failed: can't find download url")
        else:
            print("Can't get detail, id: " + str(id))
def download_all_by_id(illust_id, path, limit_p=True):
    detail = PixivApi.illust_detail(illust_id)
    if detail:
        try:
            detail = detail.illust
            # Single-page illustration
            if detail.page_count == 1:
                try:
                    url = detail.meta_single_page.original_image_url
                except AttributeError:
                    url = detail.image_urls.large
                extension = os.path.splitext(url)[1]
                save_path = path + "/p_%s%s" % (illust_id, extension)
                print("Downloading:" + save_path)
                path = PixivApi.download(url, path=save_path)
            # Multi-page illustration
            else:
                if detail.page_count > P_LIMIT and limit_p:
                    # Page count exceeds the limit, skip the download
                    print("Pixiv id:%s P>limit,Skip download" % (illust_id,))
                    return
                urls = detail.meta_pages  # fetch all pages
                if len(urls) > 1:
                    # Put all pages of one work into their own folder
                    path += "/p_%s" % illust_id
                    if not os.path.exists(path):
                        os.mkdir(path)
                    for index in range(len(urls)):
                        try:
                            url = urls[index].image_urls.original \
                                if "original" in urls[index].image_urls \
                                else urls[index].image_urls.large
                            extension = os.path.splitext(url)[1]
                            save_path = path + "/p_%s_%d%s" % (illust_id, index, extension)
                            print("Downloading:" + save_path)
                            PixivApi.download(url, path=save_path)
                        except Exception:
                            continue
                    path = path + "/"
                else:
                    # Couldn't get the page list, fall back to the large image
                    url = detail.image_urls.large
                    path = PixivApi.download(url, prefix=path)
            return path
        except Exception as e:
            error_log("Download fail: " + str(illust_id))
            error_log(e)
    else:
        print("Can't get detail, id: " + str(illust_id))
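# A minimal usage sketch for download_all_by_id, assuming PixivApi is already
# configured; the illustration id and target directory below are illustrative.
def example_download_all_by_id():
    target_dir = "downloads"
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    # limit_p=False ignores the P_LIMIT page-count cap
    result = download_all_by_id(59580629, target_dir, limit_p=False)
    print("Saved to: " + str(result))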
def download_illustration(illu, path, auth_api):
    """
    illu      dict with the illustration's details
    path      save path
    auth_api  authenticated download client
    """
    if "url" in illu and "title" in illu:
        illust_id = CommonUtils.get_url_param(illu.url, "illust_id")
        detail = PixivApi.illust_detail(illust_id)
        if detail:
            try:
                detail = detail.illust
                # Single-page illustration
                if detail.page_count == 1:
                    try:
                        url = detail.meta_single_page.original_image_url
                    except AttributeError:
                        url = detail.image_urls.large
                    download(illust_id, illu.title, path, url, auth_api)
                # Multi-page illustration
                else:
                    if detail.page_count > P_LIMIT:
                        # Page count exceeds the limit, skip the download
                        print("Pixiv id:%s,name:%s P>limit,Skip download" % (illust_id, illu.title))
                        return
                    urls = detail.meta_pages  # fetch all pages
                    if len(urls) > 1:
                        # Put all pages of one work into their own folder
                        path += "/p_%s" % illust_id
                        if not os.path.exists(path):
                            os.mkdir(path)
                        for index in range(len(urls)):
                            try:
                                url = urls[index].image_urls.original \
                                    if "original" in urls[index].image_urls \
                                    else urls[index].image_urls.large
                                extension = os.path.splitext(url)[1]
                                if IMAGE_USE_ORG_NAME:
                                    save_path = path + "/p_%s_%s_%d%s" % (
                                        illust_id, CommonUtils.filter_dir_name(illu.title), index, extension)
                                else:
                                    save_path = path + "/p_%s_%d%s" % (illust_id, index, extension)
                                print(save_path)
                                auth_api.download(url, path=save_path)
                            except Exception:
                                continue
                    else:
                        # Couldn't get the page list, fall back to the large image
                        url = detail.image_urls.large
                        download(illust_id, illu.title, path, url, auth_api)
            except Exception as e:
                error_log("Download fail:")
                error_log(e)
        else:
            print(illu.title + " can't get detail id :" + illust_id)
def consumer_download_work(queue, save_path):
    while True:
        try:
            illust = queue.get()
            if illust.page_count == 1:
                try:
                    url = illust.meta_single_page.original_image_url
                except AttributeError:
                    url = illust.image_urls.large
            else:
                url = illust.image_urls.large
            extension = os.path.splitext(url)[1]
            image_save_path = save_path + "/p_%s%s" % (illust.id, extension)
            PixivApi.download(url, path=image_save_path)
            print("download " + image_save_path + "\n")
        except Exception as e:
            print("Download failed, id: " + str(illust.id))
            print(e)
            continue
        finally:
            # Mark the task done so the producer's queue.join() can return
            queue.task_done()
def download_topics(cls, url, path, quality=1):
    html = HtmlDownloader.download(url)
    illu_list = HtmlDownloader.parse_illustration(html)
    title_des = HtmlDownloader.get_title(html)
    if title_des and illu_list:
        title_des["size"] = len(illu_list)
        CommonUtils.write_topic_des(path + "/topic.txt", title_des)
    if not illu_list:
        return
    for illu in illu_list:
        try:
            filename = CommonUtils.filter_dir_name(illu.title)
            extension = os.path.splitext(illu.image)[1]
            id = CommonUtils.get_url_param(illu.image_page, "illust_id")
            if quality == 1:
                # Fetch the original image url through the API and download it
                detail = PixivApi.illust_detail(id)
                if detail:
                    download_url = ImageDownload.get_image_url(illu, detail)
                    if IMAGE_USE_ORG_NAME:
                        save_path = path + "/p_%s_%s%s" % (id, filename, extension)
                    else:
                        save_path = path + "/p_%s%s" % (id, extension)
                    print(save_path)
                    PixivApi.download(download_url, path=save_path)
                else:
                    print(illu.title + " can't get detail id :" + id)
            else:
                # Download the pixivision preview image directly
                print(path + "/p_%s_%s%s" % (id, filename, extension))
                PixivApi.download(illu.image, path=path + "/p_%s_%s%s" % (id, filename, extension))
        except Exception as e:
            error_log("Download Illu Fail:" + " Illustration :" + str(illu))
            error_log(e)
            continue
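# A minimal usage sketch for download_topics, assuming it lives on the
# ImageDownload class as the cls parameter suggests; the pixivision article
# URL below is illustrative, not a real article.
def example_download_topics():
    topic_url = "https://www.pixivision.net/zh/a/1234"  # hypothetical article URL
    # quality=1 resolves originals through the Pixiv API
    ImageDownload.download_topics(topic_url, "topics", quality=1)
    # any other quality value takes the pixivision preview images directly
    ImageDownload.download_topics(topic_url, "topics_preview", quality=0)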
def relate_illust(seed):
    queue = Queue()
    r = redis.Redis(REDIS_IP, REDIS_PORT)
    i_filter = RedisFilter(r, 5, "setFilter2:PixivRelated")
    save_path = "E:/imageDownLoad/related_%s" % str(seed)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Start the consumer downloaders
    for i in range(3):
        t = Thread(target=consumer_download_work, args=(queue, save_path, i_filter))
        t.daemon = True
        t.start()
    related = PixivApi.illust_related(seed)
    # Parse the returned json and put the download work onto the queue
    producer_put_work(related, queue, i_filter)
    if "next_url" in related:
        url = related.next_url
    else:
        print("There is no next URL (no related works found)")
        return
    count = 1
    while True:
        # Throttle interval
        # time.sleep(2)
        resp = HtmlDownloader.download(url)
        related2 = parse_json(resp)
        if "next_url" in related2:
            url = related2.next_url
        else:
            print("There is no next URL (no related works found)")
            break
        print("Depth :" + str(count) + " Associated illust:" + str(len(related2.illusts)))
        print("Next URL:" + related2.next_url)
        producer_put_work(related2, queue, i_filter)
        # Stop once the target depth is reached
        if count == 2:
            print("producer completed!")
            break
        count += 1
    queue.join()
def relate_illust(seed, depth=2, image_path='imageDownload'):
    queue = Queue()
    save_path = (image_path + "/related_%s") % str(seed)
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    # Start the consumer downloaders
    for i in range(3):
        t = Thread(target=consumer_download_work, args=(queue, save_path))
        t.daemon = True
        t.start()
    related = PixivApi.illust_related(seed)
    # Parse the returned json and put the download work onto the queue
    producer_put_work(related, queue)
    if "next_url" in related:
        url = related.next_url
    else:
        print("There is no next URL (no related works found)")
        return
    count = 1
    while True:
        # Throttle interval
        # time.sleep(2)
        resp = HtmlDownloader.download(url)
        related2 = parse_json(resp)
        if "next_url" in related2:
            url = related2.next_url
        else:
            print("There is no next URL (no related works found)")
            break
        print("Depth :" + str(count) + " Associated illust:" + str(len(related2.illusts)))
        print("Next URL:" + related2.next_url)
        producer_put_work(related2, queue)
        # Stop once the target depth is reached
        if count == depth:
            print("producer completed!")
            break
        count += 1
    queue.join()
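# producer_put_work is called by both relate_illust variants but not shown in
# this section; a minimal sketch, assuming `related.illusts` is the list
# returned by PixivApi.illust_related and that the optional RedisFilter exposes
# an add() returning False for already-seen ids (that interface is an assumption):
def producer_put_work(related, queue, i_filter=None):
    for illust in related.illusts:
        # Skip illustrations the dedup filter has already seen (assumed semantics)
        if i_filter is not None and not i_filter.add(str(illust.id)):
            continue
        queue.put(illust)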
def test_api():
    detail = PixivApi.illust_detail(54809586)
    print(detail.illust)
    related = PixivApi.illust_related(54809586)
    print(related)
def download_test(url):
    print("start download:" + str(time.time()))
    PixivApi.download(url)
    # Record when the last url finished downloading
    print("url:" + url + " end:" + str(time.time()))
# When twisted is not installed, fall back to sequential threaded downloads.
def run_by_list():
    error_log("start:" + str(time.time()))
    # Crawl illustrations from the whole Pixivision site
    urls = [LINK_URL % n for n in range(1, PAGE_NUM + 1)]
    # Step size: each round launches 2 * 20 image downloads. Tune the thread
    # count to your machine, though speed depends far more on bandwidth than
    # on CPU or memory.
    step = 2
    length = len(urls)
    start_index = 0
    while start_index < length:
        launchers = []
        for url in urls[start_index:(start_index + step)]:
            print("Start " + url)
            launchers.append(PixivisionLauncher(url, IMAGE_SAVE_BASEPATH))
        for launcher in launchers:
            launcher.start()
        for launcher in launchers:
            launcher.join()
        start_index += step
    error_log("end:" + str(time.time()))


if __name__ == '__main__':
    PixivApi.check_api()
    try:
        from twisted.python.threadpool import ThreadPool
    except Exception:
        run_by_list()
    else:
        run_by_pool()
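# run_by_pool is called above but not defined in this section; a minimal
# sketch, assuming twisted's ThreadPool drives the same PixivisionLauncher
# tasks concurrently (pool sizes are illustrative, and in the real file this
# would sit above the __main__ block):
def run_by_pool():
    from twisted.python.threadpool import ThreadPool
    pool = ThreadPool(minthreads=2, maxthreads=10)
    pool.start()
    for n in range(1, PAGE_NUM + 1):
        # PixivisionLauncher is a Thread subclass, so run its task body
        # directly in a pool thread instead of spawning a nested thread
        pool.callInThread(PixivisionLauncher(LINK_URL % n, IMAGE_SAVE_BASEPATH).run)
    pool.stop()  # shut the pool down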
def download_all_by_url(url, prefix):
    illust_id = CommonUtils.get_url_param(url, "illust_id")
    if illust_id:
        return download_all_by_id(illust_id, prefix)
    else:
        return PixivApi.download(url.strip(), prefix=prefix)
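# A minimal usage sketch: a pixiv work URL is routed through
# download_all_by_id, any other URL is downloaded directly (the URL below
# is illustrative).
def example_download_all_by_url():
    work_url = "https://www.pixiv.net/member_illust.php?mode=medium&illust_id=59580629"
    saved = download_all_by_url(work_url, "downloads")
    print(saved)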
def download_byurl(cls, url):
    illust_id = CommonUtils.get_url_param(url, "illust_id")
    if illust_id:
        ImageDownload.download_image_byid(illust_id)
    else:
        PixivApi.download(url.strip())
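# A minimal usage sketch for the classmethod pair above: a pixiv work URL is
# resolved to its illust_id and fetched via download_image_byid, any other URL
# is downloaded directly (both URLs below are illustrative).
def example_download_byurl():
    ImageDownload.download_byurl("https://www.pixiv.net/member_illust.php?mode=medium&illust_id=52819443")
    ImageDownload.download_byurl("https://i.pximg.net/img-original/img/example.jpg")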
from pixivapi.PixivApi import PixivApi
from utils import CommonUtils
# The following imports are assumed from the project layout referenced below:
import PixivDataDownloader
from pixivapi.AuthPixivApi import AuthPixivApi
from pixiv_config import *  # USERNAME, PASSWORD, PIXIV_COOKIES, SEARCH_* constants

if __name__ == '__main__':
    type = raw_input("Please choose run mode. 1. Use pixiv_config file to search. 2. Enter the parameters manually:\n")
    if type == "1":
        username = USERNAME
        password = PASSWORD
        print("Loading")
        # PixivDataDownloader.PixivDataHandler() can also crawl data without
        # logging in, but without a login there are no popular recommended works
        # and the quality of the crawled illustrations is much lower, so login
        # is simply made mandatory.
        if len(PIXIV_COOKIES) >= 3:
            data_handler = PixivDataDownloader.PixivDataHandler(cookies=PIXIV_COOKIES)
        else:
            data_handler = PixivDataDownloader.PixivDataHandler(username, password)
        # Two APIs can be used for downloading here: AuthPixivApi and PixivApi.
        # AuthPixivApi requires a login but can download more restricted
        # illustrations; normally PixivApi is enough.
        auth_api = PixivApi()
        print("Login success!!!!")
        download_threshold = DOWNLOAD_THRESHOLD
        path = SEARCH_SAVE_PATH
        page = SEARCH_PAGE
        keyword = SEARCH_KEYWORD
    else:
        username = raw_input("Please enter your pixiv account email or pixiv ID\n")
        password = raw_input('Enter password:\n')
        print("Loading")
        data_handler = PixivDataDownloader.PixivDataHandler(username, password)
        auth_api = AuthPixivApi(username, password)
        print("Login success!!!!")
        path = raw_input("Please input illustration save path:\n")
        page = int(raw_input("Please enter the total number of pages you want to crawl:\n"))
        download_threshold = int(raw_input("Please enter the minimum number of illustration's bookmarks:\n"))
def test_api():
    detail = PixivApi.illust_detail(52819443)
    print(detail.illust)