def run(self):
    """Fetch one page of video posts for this thread's blog and store them.

    Loads the blog through ``self._load_blog()``, logs the thread name, blog
    URL and offset, then requests ``posts/video`` at ``self.offset`` and
    hands the returned posts to ``video_posts_handler``.

    Returns:
        Whatever the ``update_key_use``-decorated inner function returns.
        The undecorated function itself always returns ``None``.
    """
    self._load_blog()
    spider_log.info("Thread:{} 开始获取视频!Blog:{} Offset:{}".format(
        self.getName(), self.blog.url, self.offset))

    # NOTE(review): update_key_use presumably records that this key was
    # used for a request -- confirm against its definition.
    @update_key_use(self.key)
    def do():
        api = None
        try:
            api = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
            resp = api.get('posts/video',
                           blog_url=self.blog.url,
                           params={"offset": self.offset})
            # A video that no longer exists leaves the post's url field
            # empty; video_posts_handler skips such posts.
            posts = resp.get('posts')
            video_posts_handler(posts, self.blog)
        except TumblpyRateLimitError:
            spider_log.info("Key调用次数达到上限,本线程退出")
            return
        except TumblpyError as e:
            # Only a 404 is acted upon; other API errors are ignored here.
            if e.error_code == 404:
                mark_dead_blog(self.blog)
        finally:
            # Close the HTTP client on every path (the original only closed
            # it after a fully successful request), and always release the
            # thread's database session afterwards.
            try:
                if api is not None:
                    api.client.close()
            finally:
                session.remove()

    return do()
def load_new_video(self):
    """Crawl the not-yet-loaded video blocks of every alive blog.

    For each blog returned by ``load_alive_blog_list()`` the range passed to
    ``self._load_video`` starts at block 0 and ends at
    ``ceil(posts / 20 - loaded_video)``.
    """
    spider_log.info("开始执行爬取新视频")
    for alive_blog in load_alive_blog_list():
        # NOTE(review): 20 is presumably the number of posts per API page,
        # and loaded_video the count of blocks already crawled -- confirm.
        remaining = alive_blog.posts / 20 - alive_blog.loaded_video
        last_block = int(ceil(remaining))
        self._load_video(alive_blog, 0, last_block)
    spider_log.info("爬取新视频完成")
def load_all_image(self):
    """Crawl the image blocks of every alive blog.

    For each blog returned by ``load_alive_blog_list()`` the range passed to
    ``self._load_image`` runs from ``blog.loaded_image`` up to
    ``ceil(posts / 20)``.
    """
    spider_log.info("开始执行爬取所有图片")
    for alive_blog in load_alive_blog_list():
        # NOTE(review): 20 is presumably the number of posts per API page.
        total_blocks = int(ceil(alive_blog.posts / 20))
        self._load_image(alive_blog, alive_blog.loaded_image, total_blocks)
    spider_log.info("爬取所有图片完成")
def select_item(selection):
    """Run the action mapped to ``selection``, then log a completion banner.

    Args:
        selection: Menu key ("1" through "9"). Any key that is not in the
            table falls back to the builtin ``exit``.
    """
    actions = {
        "1": check_key,
        "2": import_blog,
        "3": update_blog,
        "4": load_all_image,
        "5": load_new_image,
        "6": load_all_video,
        "7": load_new_video,
        "8": download_image,
        "9": download_video,
    }
    chosen = actions.get(selection, exit)
    chosen()
    spider_log.info("*" * 15 + " 执行完毕 " + "*" * 15)
def post_handler(posts, blog):
    """Store every photo found in ``posts`` as an ``Image`` row for ``blog``.

    Args:
        posts: Iterable of post mappings; each is read for its ``photos``
            and ``date`` entries.
        blog: Object whose ``id`` is attached to every created ``Image``.
    """
    for post in posts:
        photos = post.get('photos')
        # Only the first 19 characters of the date string are kept.
        # NOTE(review): presumably "YYYY-MM-DD HH:MM:SS" without the
        # timezone suffix -- confirm against the API response.
        release_time = post.get('date')[:19]
        for photo in photos:
            # The first entry of alt_sizes supplies the URL that is stored.
            first_size = photo.get('alt_sizes')[0]
            image = Image(first_size.get('url'), blog.id, release_time)
            if same_item_count(Image, image) > 0:
                spider_log.info("Image:{} already exist.".format(image.url))
                continue
            add_item(image)
            spider_log.info("Image:{} add to database successful.".format(
                image.url))
def do():
    """Request one page of video posts and hand them to the video handler.

    Reads ``self.key``, ``self.blog`` and ``self.offset`` from the enclosing
    scope. Returns ``None`` on every path.
    """
    api = None
    try:
        api = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
        resp = api.get('posts/video',
                       blog_url=self.blog.url,
                       params={"offset": self.offset})
        # A video that no longer exists leaves the post's url field empty;
        # video_posts_handler skips such posts.
        posts = resp.get('posts')
        video_posts_handler(posts, self.blog)
    except TumblpyRateLimitError:
        spider_log.info("Key调用次数达到上限,本线程退出")
        return
    except TumblpyError as e:
        # Only a 404 is acted upon; other API errors are ignored here.
        if e.error_code == 404:
            mark_dead_blog(self.blog)
    finally:
        # Close the HTTP client on every path (the original only closed it
        # after a fully successful request), and always release the
        # thread's database session afterwards.
        try:
            if api is not None:
                api.client.close()
        finally:
            session.remove()
def do():
    """Request one page of photo posts and hand them to ``post_handler``.

    Reads ``self.key``, ``self.blog`` and ``self.offset`` from the enclosing
    scope. Returns ``None`` on every path.
    """
    api = None
    try:
        api = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
        # Earlier experiment, left disabled in the original:
        # api.client.headers = {'Connection': 'close'}
        resp = api.get('posts/photo',
                       blog_url=self.blog.url,
                       params={"offset": self.offset})
        posts = resp.get('posts')
        post_handler(posts, self.blog)
    except TumblpyRateLimitError:
        spider_log.info("Key调用次数达到上限,本线程退出")
        return
    except TumblpyError as e:
        # Only a 404 is acted upon; other API errors are ignored here.
        if e.error_code == 404:
            mark_dead_blog(self.blog)
    finally:
        # Close the HTTP client on every path (the original only closed it
        # after a fully successful request), and always release the
        # thread's database session afterwards.
        try:
            if api is not None:
                api.client.close()
        finally:
            session.remove()
def do_flush_key(key):
    """Request fresh OAuth tokens for ``key`` and persist them.

    Stores the returned ``oauth_token`` / ``oauth_token_secret`` on the key,
    logs the authorization URL for the operator to open, stamps
    ``key.UpdateTime`` with the current local time and commits the session.

    Args:
        key: Key record exposing ``id``, ``ConsumerKey`` and
            ``ConsumerSecret``; its ``Token``, ``TokenSecret`` and
            ``UpdateTime`` attributes are overwritten.
    """
    spider_log.info("正在刷新Key ID:{}".format(key.id))
    api = Tumblpy(key.ConsumerKey, key.ConsumerSecret)
    try:
        auth_props = api.get_authentication_tokens()
    finally:
        # Close the HTTP client even when the token request fails; the
        # original leaked it on that path.
        api.client.close()

    key.Token = auth_props.get("oauth_token")
    key.TokenSecret = auth_props.get("oauth_token_secret")
    spider_log.info("请打开下面的链接执行授权")
    spider_log.info(auth_props.get("auth_url"))
    key.UpdateTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    session.commit()
    spider_log.info("刷新Key ID:{} 完成".format(key.id))
def run(self):
    """Download this thread's item with ``wget`` and report the outcome.

    Loads the item and its blog, derives the destination path, runs
    ``wget -cq <url> -O <file_path>`` and then calls
    ``self.success_callback(file_path)`` when wget exits with status 0 or
    ``self.fail_callback()`` otherwise. The database session is released
    when the method finishes, including when it raises.
    """
    try:
        self.item = get_item(self.item_class, self.item_id)
        self.blog = get_item_blog(self.item)
        kind_folder = "images" if self.item_class is Image else "videos"
        folder_path = get_file_folder_path(self.blog, kind_folder)
        file_path = get_file_path(self.item.url, folder_path)

        # Download over plain http. Only the scheme is rewritten: a blanket
        # str.replace("https", "http") would also mangle any "https" that
        # happens to appear later in the host or path.
        url = self.item.url
        if url.startswith("https://"):
            url = "http://" + url[len("https://"):]

        # eg. wget https://vtt.tumblr.com/tumblr_o6cjopNCcN1vt349a.mp4
        #          -O /mnt/storage/tumblr_o6cjopNCcN1vt349a.mp4
        # Passed as an argument list, so no shell is involved.
        cmd = ["wget", "-cq", url, "-O", file_path]
        spider_log.info("下载 Id:{} 命令:{}".format(self.item.id, cmd))

        exit_code = call(cmd)
        # Compare by value: "is 0" tests object identity, which only works
        # by accident of CPython's small-int cache and triggers a
        # SyntaxWarning on Python 3.8+.
        success = exit_code == 0
        spider_log.info("Id:{} 结果为 {}".format(self.item.id, str(success)))

        if success:
            self.success_callback(file_path)
        else:
            self.fail_callback()
    finally:
        # Release the session even if loading, wget or a callback raised.
        session.remove()
def import_blog():
    """Import blog URLs from ``./blog_data.txt``, then refresh blog info.

    Every line longer than one character becomes a ``Blog`` (with newline
    characters removed) that is passed to ``add_item``. Afterwards the input
    file is renamed to ``./import_finished<timestamp>.txt`` and
    ``update_blog()`` is called.
    """
    source_path = "./blog_data.txt"

    spider_log.info("开始导入")
    with open(source_path, "r", encoding="utf-8") as source:
        for raw_line in source:
            # Lines of length <= 1 (empty, or just a newline) are skipped.
            if len(raw_line) <= 1:
                continue
            add_item(Blog(url=raw_line.replace("\n", "")))

    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    new_file_name = "./import_finished{}.txt".format(stamp)
    rename(source_path, new_file_name)
    spider_log.info("导入完毕,旧文件被重命名为 {}".format(new_file_name))

    spider_log.info("开始更新博主信息")
    update_blog()
def do():
    """Refresh name, URL and post count of every blog from the Tumblr API.

    Reads ``self.key`` from the enclosing scope. For each blog returned by
    ``load_all_blog()`` the ``info`` endpoint is queried with the blog URL's
    network location and the result is copied onto the blog. Changes are
    committed after every blog; a 404 marks the blog as dead.

    Processing stops at the first rate-limit error, because further requests
    with the same exhausted key would fail as well.
    """
    spider_log.info("开始获取博客信息!")
    blogs = load_all_blog()
    spider_log.info("加载Blog列表完成!")
    try:
        for blog in blogs:
            api = None
            try:
                api = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
                resp = api.get('info', blog_url=urlparse(blog.url).netloc)
                info = resp.get("blog")
                blog.name = info.get("name")
                blog.url = info.get("url")
                blog.posts = info.get("posts")
                spider_log.info("BlogId:{} 已更新".format(blog.id))
            except TumblpyRateLimitError:
                spider_log.info("Key达到上限,本线程退出")
                # The message announces that the thread exits, so actually
                # stop instead of hitting the API again for every remaining
                # blog (the original fell through and kept looping).
                break
            except TumblpyError as e:
                # Only a 404 is acted upon; other API errors are ignored.
                if e.error_code == 404:
                    mark_dead_blog(blog)
            finally:
                # Close the HTTP client on every path, then persist whatever
                # was changed for this blog.
                if api is not None:
                    api.client.close()
                session.commit()
    finally:
        # Release the session exactly once, after all blogs are processed,
        # so the remaining blog objects stay attached while looping.
        session.remove()
def video_posts_handler(posts, blog):
    """Store every video found in ``posts`` as a ``Video`` row for ``blog``.

    Args:
        posts: Iterable of post mappings; each is read for its ``video_url``
            and ``date`` entries.
        blog: Object whose ``id`` is attached to every created ``Video``.
    """
    for post in posts:
        video_url = post.get('video_url')
        # Only the first 19 characters of the date string are kept.
        release_time = post.get('date')[:19]

        # A post without a video_url is logged and skipped.
        if video_url is None:
            spider_log.info("Video has been removed jump to next.")
            continue

        video = Video(video_url, blog.id, release_time)
        if same_item_count(Video, video) > 0:
            spider_log.info("Video:{} already exist.".format(video.url))
            continue

        add_item(video)
        spider_log.info("Video:{} add to database successful.".format(
            video.url))
def _load_key(self):
    """Store the result of ``get_key()`` on ``self.key`` and log its id."""
    self.key = get_key()
    message = "加载Key完成!KeyId:{}".format(self.key.id)
    spider_log.info(message)
def download_image():
    """Download every ``Image`` item returned by ``load_download_item``."""
    spider_log.info("下载图片开始运行")
    pending_images = load_download_item(Image)
    download_item(pending_images)
    spider_log.info("下载图片结束")
def _load_blog(self):
    """Store ``get_blog(self.blog_id)`` on ``self.blog`` and log its id."""
    self.blog = get_blog(self.blog_id)
    message = "加载Blog完成!BlogId:{}".format(self.blog.id)
    spider_log.info(message)
def download_video():
    """Download every ``Video`` item returned by ``load_download_item``."""
    spider_log.info("下载视频开始运行")
    pending_videos = load_download_item(Video)
    download_item(pending_videos)
    spider_log.info("下载视频结束")
def check_key():
    """Refresh all keys when ``key_is_valid()`` reports a problem."""
    spider_log.info("检查Key中...")
    if key_is_valid():
        # Nothing to do while the keys are still valid.
        return
    flush_key(get_all_key())
def flush_key(key_list):
    """Refresh each key in ``key_list`` in order via ``do_flush_key``.

    Args:
        key_list: Iterable of key records to refresh.
    """
    spider_log.info("刷新Key中...")
    for api_key in key_list:
        do_flush_key(api_key)
    spider_log.info("刷新Key全部完成")