Пример #1
0
    def run(self):
        """Fetch one page of video posts for this thread's blog and store them.

        Loads the blog via ``_load_blog``, then requests ``posts/video`` at
        ``self.offset`` and hands the returned posts to
        ``video_posts_handler``. The request runs inside a closure decorated
        with ``update_key_use(self.key)``.

        Returns:
            Whatever the decorated closure returns (the undecorated closure
            always returns None).
        """
        self._load_blog()
        spider_log.info("Thread:{} 开始获取视频!Blog:{} Offset:{}".format(
            self.getName(), self.blog.url, self.offset))

        @update_key_use(self.key)
        def do():
            try:
                t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
                try:
                    resp = t.get('posts/video',
                                 blog_url=self.blog.url,
                                 params={"offset": self.offset})
                    # A video that no longer exists leaves the url field empty.
                    posts = resp.get('posts')
                    video_posts_handler(posts, self.blog)
                finally:
                    # Close the HTTP client on the error path too, so a failed
                    # request does not leak the connection.
                    t.client.close()
            except TumblpyRateLimitError:
                spider_log.info("Key调用次数达到上限,本线程退出")
                return
            except TumblpyError as e:
                if e.error_code == 404:
                    mark_dead_blog(self.blog)
                else:
                    # Other API errors are non-fatal, but should be visible.
                    spider_log.info("Blog:{} request failed: {}".format(
                        self.blog.url, e))
            finally:
                session.remove()

        return do()
Пример #2
0
 def load_new_video(self):
     """Crawl the not-yet-loaded video blocks of every alive blog.

     For each blog returned by ``load_alive_blog_list`` the crawl starts at
     block 0 and ends at ``ceil(posts / 20 - loaded_video)``.
     NOTE(review): 20 is presumably the number of posts per API page; confirm.
     """
     spider_log.info("开始执行爬取新视频")
     for alive_blog in load_alive_blog_list():
         pending_blocks = alive_blog.posts / 20 - alive_blog.loaded_video
         self._load_video(alive_blog, 0, int(ceil(pending_blocks)))
     spider_log.info("爬取新视频完成")
Пример #3
0
 def load_all_image(self):
     """Crawl image blocks of every alive blog, resuming where it left off.

     For each blog returned by ``load_alive_blog_list`` the crawl starts at
     ``blog.loaded_image`` and ends at ``ceil(posts / 20)``.
     NOTE(review): 20 is presumably the number of posts per API page; confirm.
     """
     spider_log.info("开始执行爬取所有图片")
     for alive_blog in load_alive_blog_list():
         total_blocks = int(ceil(alive_blog.posts / 20))
         self._load_image(alive_blog, alive_blog.loaded_image, total_blocks)
     spider_log.info("爬取所有图片完成")
Пример #4
0
def select_item(selection):
    """Run the menu action matching ``selection`` and log completion.

    Args:
        selection: Menu key, one of the strings "1" to "9". Any other value
            falls back to calling ``exit``.
    """
    # Position in this tuple (1-based) is the menu key of the action.
    handlers = (
        check_key,
        import_blog,
        update_blog,
        load_all_image,
        load_new_image,
        load_all_video,
        load_new_video,
        download_image,
        download_video,
    )
    actions = {str(number): fn for number, fn in enumerate(handlers, start=1)}
    chosen = actions.get(selection, exit)
    chosen()
    stars = "*" * 15
    spider_log.info(stars + " 执行完毕 " + stars)
Пример #5
0
def post_handler(posts, blog):
    """Store every photo of the given posts as an ``Image`` row.

    For each photo the URL of its first ``alt_sizes`` entry is used. An image
    is added only when ``same_item_count`` reports no existing match.

    Args:
        posts: Iterable of post dicts, or None/empty when the API response
            carried no posts (then nothing is done).
        blog: Blog object whose ``id`` is attached to each image.
    """
    # Callers pass resp.get('posts') straight through, so None is possible.
    for post in posts or ():
        # Keep the first 19 characters of the date string.
        # NOTE(review): presumably "YYYY-MM-DD HH:MM:SS" without the
        # trailing timezone; confirm against the API response.
        release_time = post.get('date')[:19]
        for photo in post.get('photos') or ():
            alt_sizes = photo.get('alt_sizes')
            if not alt_sizes:
                # No size variants means no URL to store; skip this photo.
                continue
            photo_url = alt_sizes[0].get('url')
            image = Image(photo_url, blog.id, release_time)
            if same_item_count(Image, image) > 0:
                spider_log.info("Image:{} already exist.".format(image.url))
            else:
                add_item(image)
                spider_log.info("Image:{} add to database successful.".format(
                    image.url))
Пример #6
0
 def do():
     """Fetch one page of video posts for ``self.blog`` and store them.

     Closure over ``self`` from the enclosing scope. Requests ``posts/video``
     at ``self.offset`` and passes the result to ``video_posts_handler``.
     A rate-limit error ends the call, a 404 marks the blog as dead, and the
     scoped session is always removed at the end.
     """
     try:
         t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
         try:
             resp = t.get('posts/video',
                          blog_url=self.blog.url,
                          params={"offset": self.offset})
             # A video that no longer exists leaves the url field empty.
             posts = resp.get('posts')
             video_posts_handler(posts, self.blog)
         finally:
             # Close the HTTP client on the error path too, so a failed
             # request does not leak the connection.
             t.client.close()
     except TumblpyRateLimitError:
         spider_log.info("Key调用次数达到上限,本线程退出")
         return
     except TumblpyError as e:
         if e.error_code == 404:
             mark_dead_blog(self.blog)
         else:
             # Other API errors are non-fatal, but should be visible.
             spider_log.info("Blog:{} request failed: {}".format(
                 self.blog.url, e))
     finally:
         session.remove()
Пример #7
0
 def do():
     """Fetch one page of photo posts for ``self.blog`` and store them.

     Closure over ``self`` from the enclosing scope. Requests ``posts/photo``
     at ``self.offset`` and passes the result to ``post_handler``.
     A rate-limit error ends the call, a 404 marks the blog as dead, and the
     scoped session is always removed at the end.
     """
     try:
         t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
         try:
             resp = t.get('posts/photo',
                          blog_url=self.blog.url,
                          params={"offset": self.offset})
             posts = resp.get('posts')
             post_handler(posts, self.blog)
         finally:
             # Close the HTTP client on the error path too, so a failed
             # request does not leak the connection.
             t.client.close()
     except TumblpyRateLimitError:
         spider_log.info("Key调用次数达到上限,本线程退出")
         return
     except TumblpyError as e:
         if e.error_code == 404:
             mark_dead_blog(self.blog)
         else:
             # Other API errors are non-fatal, but should be visible.
             spider_log.info("Blog:{} request failed: {}".format(
                 self.blog.url, e))
     finally:
         session.remove()
Пример #8
0
def do_flush_key(key):
    """Request fresh OAuth tokens for ``key`` and persist them.

    Stores the returned ``oauth_token`` / ``oauth_token_secret`` on the key,
    logs the authorization URL for the operator to open, stamps
    ``key.UpdateTime`` with the current local time and commits the session.

    Args:
        key: Key object exposing ``id``, ``ConsumerKey`` and
            ``ConsumerSecret``; its ``Token``, ``TokenSecret`` and
            ``UpdateTime`` attributes are overwritten.
    """
    spider_log.info("正在刷新Key ID:{}".format(key.id))
    t = Tumblpy(key.ConsumerKey, key.ConsumerSecret)
    try:
        auth_props = t.get_authentication_tokens()
    finally:
        # Release the HTTP client even when the token request fails.
        t.client.close()
    key.Token = auth_props.get("oauth_token")
    key.TokenSecret = auth_props.get("oauth_token_secret")
    spider_log.info("请打开下面的链接执行授权")
    spider_log.info(auth_props.get("auth_url"))
    key.UpdateTime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    session.commit()
    spider_log.info("刷新Key ID:{} 完成".format(key.id))
Пример #9
0
 def run(self):
     """Download this thread's item with ``wget`` and report the outcome.

     Resolves the item and its blog, builds the target path under an
     "images" or "videos" folder, runs ``wget`` and then calls
     ``success_callback(file_path)`` on exit code 0 or ``fail_callback()``
     otherwise. The scoped session is removed even if a step raises.
     """
     try:
         self.item = get_item(self.item_class, self.item_id)
         self.blog = get_item_blog(self.item)
         folder_path = get_file_folder_path(
             self.blog, "images" if self.item_class is Image else "videos")
         file_path = get_file_path(self.item.url, folder_path)
         # Downgrade only the scheme; a blanket str.replace would also
         # rewrite any "https" occurring later in the URL.
         # NOTE(review): reason for preferring plain http is not visible
         # here; confirm it is still wanted.
         url = self.item.url
         if url.startswith("https://"):
             url = "http://" + url[len("https://"):]
         # e.g. wget -cq http://vtt.tumblr.com/tumblr_o6cjopNCcN1vt349a.mp4 -O /mnt/storage/tumblr_o6cjopNCcN1vt349a.mp4
         # -c resumes a partial download, -q silences output, -O sets the
         # output file.
         cmd = ["wget", "-cq", url, "-O", file_path]
         spider_log.info("下载 Id:{} 命令:{}".format(self.item.id, cmd))
         return_code = call(cmd)
         # Compare by value: identity checks on ints are unreliable.
         success = return_code == 0
         spider_log.info("Id:{} 结果为 {}".format(self.item.id, str(success)))
         if success:
             self.success_callback(file_path)
         else:
             self.fail_callback()
     finally:
         session.remove()
Пример #10
0
def import_blog():
    """Import blog URLs from ``./blog_data.txt`` and refresh blog info.

    Each non-blank line of the file becomes a ``Blog`` added via
    ``add_item``. The file is then renamed to
    ``./import_finished<timestamp>.txt`` and ``update_blog`` is called.
    """
    spider_log.info("开始导入")
    with open(r"./blog_data.txt", "r", encoding="utf-8") as f:
        for line in f:
            # Strip surrounding whitespace so blank or space-only lines are
            # skipped and URLs carry no trailing whitespace.
            url = line.strip()
            if url:
                add_item(Blog(url=url))

    new_file_name = "./import_finished{}.txt".format(
        datetime.now().strftime("%Y%m%d%H%M%S"))
    rename("./blog_data.txt", new_file_name)
    spider_log.info("导入完毕,旧文件被重命名为 {}".format(new_file_name))
    spider_log.info("开始更新博主信息")
    update_blog()
Пример #11
0
 def do():
     """Refresh name, url and post count of every blog from the API.

     Closure over ``self`` from the enclosing scope. For each blog returned
     by ``load_all_blog`` it requests ``info`` for the blog's host name and
     copies ``name``, ``url`` and ``posts`` onto the blog. A 404 marks the
     blog as dead. A rate-limit error stops the loop. The session is
     committed after each blog and removed at the end.
     """
     spider_log.info("开始获取博客信息!")
     blogs = load_all_blog()
     spider_log.info("加载Blog列表完成!")
     for blog in blogs:
         try:
             t = Tumblpy(self.key.ConsumerKey, self.key.ConsumerSecret)
             try:
                 resp = t.get('info', blog_url=urlparse(blog.url).netloc)
             finally:
                 # Close the HTTP client on the error path too.
                 t.client.close()
             b = resp.get("blog")
             blog.name = b.get("name")
             blog.url = b.get("url")
             blog.posts = b.get("posts")
             spider_log.info("BlogId:{} 已更新".format(blog.id))
         except TumblpyRateLimitError:
             spider_log.info("Key达到上限,本线程退出")
             # Stop here: every further request with this key would hit
             # the same limit.
             break
         except TumblpyError as e:
             if e.error_code == 404:
                 mark_dead_blog(blog)
             else:
                 # Other API errors are non-fatal, but should be visible.
                 spider_log.info("BlogId:{} request failed: {}".format(
                     blog.id, e))
         finally:
             session.commit()
     session.remove()
Пример #12
0
def video_posts_handler(posts, blog):
    """Store the video of every given post as a ``Video`` row.

    Posts without a ``video_url`` are logged and skipped. A video is added
    only when ``same_item_count`` reports no existing match.

    Args:
        posts: Iterable of post dicts, or None/empty when the API response
            carried no posts (then nothing is done).
        blog: Blog object whose ``id`` is attached to each video.
    """
    # Callers pass resp.get('posts') straight through, so None is possible.
    for post in posts or ():
        url = post.get('video_url')
        if url is None:
            spider_log.info("Video has been removed jump to next.")
            continue
        # Read the date only for posts that are kept, so a removed post
        # lacking a date is skipped rather than crashing the loop.
        # NOTE(review): the first 19 characters are presumably
        # "YYYY-MM-DD HH:MM:SS" without the timezone; confirm.
        release_time = post.get('date')[:19]
        video = Video(url, blog.id, release_time)
        if same_item_count(Video, video) > 0:
            spider_log.info("Video:{} already exist.".format(video.url))
        else:
            add_item(video)
            spider_log.info("Video:{} add to database successful.".format(
                video.url))
Пример #13
0
 def _load_key(self):
     """Store the result of ``get_key()`` on ``self.key`` and log its id."""
     self.key = get_key()
     message = "加载Key完成!KeyId:{}".format(self.key.id)
     spider_log.info(message)
Пример #14
0
def download_image():
    """Download every pending ``Image`` item, logging start and end."""
    spider_log.info("下载图片开始运行")
    download_item(load_download_item(Image))
    spider_log.info("下载图片结束")
Пример #15
0
 def _load_blog(self):
     """Store ``get_blog(self.blog_id)`` on ``self.blog`` and log its id."""
     self.blog = get_blog(self.blog_id)
     message = "加载Blog完成!BlogId:{}".format(self.blog.id)
     spider_log.info(message)
Пример #16
0
def download_video():
    """Download every pending ``Video`` item, logging start and end."""
    spider_log.info("下载视频开始运行")
    download_item(load_download_item(Video))
    spider_log.info("下载视频结束")
Пример #17
0
def check_key():
    """Refresh all keys when ``key_is_valid()`` reports they are not valid."""
    spider_log.info("检查Key中...")
    if key_is_valid():
        return
    flush_key(get_all_key())
Пример #18
0
def flush_key(key_list):
    """Refresh each key in ``key_list`` in order via ``do_flush_key``.

    Args:
        key_list: Iterable of key objects accepted by ``do_flush_key``.
    """
    spider_log.info("刷新Key中...")
    for current_key in key_list:
        do_flush_key(current_key)
    spider_log.info("刷新Key全部完成")