Example #1
    def __download_pic(self, pic, path):
        """
        Download a single picture.
        :param pic: picture data
        :param path: destination directory
        :return: bool download result, str download path
        """
        path = os.path.join(path, self.__make_photo_name(pic))
        if not os.path.exists(path):
            url = WeiboApi.make_large_url(pic)
            response = WeiboApi.get(url, timeout=60)
            with open(path, 'wb') as fp:
                fp.write(response.content)
            return True, path
        return False, path

    def __init__(self, target_url, MAX_PAGE):
        self.logger = logging.getLogger(__name__)
        self.MAX_PAGE = MAX_PAGE

        # Target data
        self.logger.info(Fore.GREEN + target_url)
        self.target = WeiboApi.fetch_user_info(target_url)
        self.page_id = self.target['page_id']
        self.title_value = self.target['title_value']

        # Checkpoint data used to resume an interrupted crawl
        self.cpdir = './checkpoints/'
        if not os.path.isdir(self.cpdir):
            os.makedirs(self.cpdir)
        self.filename = self.page_id + '_resume.pkl.gz'
        self.sleeptime = settings.SLEEPTIME
        if os.path.isfile(os.path.join(self.cpdir, self.filename)):
            with gzip.open(os.path.join(self.cpdir, self.filename), 'rb') as f:
                self.since_id, self.page = pickle.load(f)
            self.logger.info(Fore.GREEN +
                             'Resuming from {0} with page={1}'.format(
                                 self.filename, self.page))
        else:
            self.since_id = ''
            self.page = 1  # Page numbering starts from 1
            self.logger.info(
                Fore.GREEN +
                'Starting from the beginning with page={0}'.format(self.page))
        self.root = self.__init_folder()
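The constructor above resumes an interrupted crawl by unpickling since_id and page from a gzip-compressed checkpoint file. The snippet below is a minimal standalone sketch of that save/load round trip; the file name and the values are made up for illustration and are not part of the project.

    import gzip
    import os
    import pickle

    CHECKPOINT = './checkpoints/demo_resume.pkl.gz'  # hypothetical path

    def save_checkpoint(since_id, page):
        # Persist the current crawl position
        os.makedirs(os.path.dirname(CHECKPOINT), exist_ok=True)
        with gzip.open(CHECKPOINT, 'wb') as f:
            pickle.dump(obj=[since_id, page], file=f)

    def load_checkpoint():
        # Return (since_id, page), falling back to a fresh start
        if os.path.isfile(CHECKPOINT):
            with gzip.open(CHECKPOINT, 'rb') as f:
                since_id, page = pickle.load(f)
            return since_id, page
        return '', 1

    save_checkpoint('4567', 3)
    print(load_checkpoint())  # ('4567', 3)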
Example #3
    def __download_album(self, album):
        """
        Download a single album.
        :param album: album data
        :return: None
        """
        # IDs of all photos in the album
        all_photo_ids = WeiboApi.fetch_photo_ids(self.uid, album['album_id'],
                                                 album['type'])
        self.logger.info(Fore.BLUE + 'Found %d photos' % len(all_photo_ids))

        # Data for all large-size pictures in the album
        all_large_pics = self.__fetch_large_pics(album, all_photo_ids)
        total = len(all_large_pics)

        # Download all large pictures concurrently
        with concurrent.futures.ThreadPoolExecutor() as executor:
            album_path = self.__make_album_path(album)

            future_to_large = {
                executor.submit(self.__download_pic, large, album_path): large
                for large in all_large_pics
            }

            for i, future in enumerate(
                    concurrent.futures.as_completed(future_to_large)):
                large = future_to_large[future]
                count_msg = '%d/%d ' % (i + 1, total)
                try:
                    result, path = future.result()
                except Exception as exc:
                    err = '%s raised an exception: %s' % (
                        WeiboApi.make_large_url(large), exc)
                    self.logger.error(''.join([Fore.RED, count_msg, err]))
                else:
                    style = Style.NORMAL if result else Style.DIM
                    self.logger.info(''.join(
                        [Fore.GREEN, style, count_msg, path]))

            self.logger.info(Fore.BLUE +
                             'Album "%s" finished' % album['caption'])
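__download_album fans the per-picture downloads out to a thread pool and logs each result as it completes. Below is a minimal sketch of the same submit/as_completed pattern with a stand-in download function; the function and URLs are placeholders, not part of the project.

    import concurrent.futures

    def download(url):
        # Stand-in for the real downloader: pretend URLs of even length succeed
        return len(url) % 2 == 0, url

    urls = ['https://example.com/a.jpg', 'https://example.com/bb.jpg']

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(download, u): u for u in urls}
        for i, future in enumerate(concurrent.futures.as_completed(future_to_url)):
            url = future_to_url[future]
            try:
                ok, path = future.result()
            except Exception as exc:
                print('%d/%d %s raised: %s' % (i + 1, len(urls), url, exc))
            else:
                print('%d/%d %s %s' % (i + 1, len(urls),
                                       'ok' if ok else 'skipped', path))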
Example #4
    def __init__(self, target_url):
        """
        初始化
        :param target_url: 目标微博主页url
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        # Target data
        self.logger.info(Fore.BLUE + target_url)
        self.target = WeiboApi.fetch_user_info(target_url)
        self.uid, self.name = self.target['oid'], self.target['onick']

        # Local preprocessing
        self.root = self.__init_folder()
Example #5
    def start(self):
        """
        依次下载每一个相册
        :return: None
        """
        self.logger.info(Fore.BLUE + Style.BRIGHT + '开始下载 "%s" 的微博相册' % self.name)

        # 获取每一页的相册列表
        page_size, album_count = 20, 0
        for page in itertools.count(1):
            total, album_list = WeiboApi.fetch_album_list(self.uid, page, page_size)
            if not album_list:
                break

            for album in album_list:
                album_count += 1
                msg = 'Downloading Weibo album %d / %d: "%s"' % (
                    album_count, total, album['caption'])
                self.logger.info(Fore.BLUE + msg)
                self.__download_album(album)
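start pages through the album list with itertools.count(1) and stops at the first empty page. The loop below shows that pattern in isolation; fetch_page and DATA are stand-ins for WeiboApi.fetch_album_list and its results, used purely for illustration.

    import itertools

    DATA = [['album 1', 'album 2'], ['album 3'], []]  # hypothetical pages

    def fetch_page(page, page_size=20):
        # Stand-in for WeiboApi.fetch_album_list: returns [] past the last page
        return DATA[page - 1] if page <= len(DATA) else []

    for page in itertools.count(1):
        albums = fetch_page(page)
        if not albums:
            break
        for album in albums:
            print('page %d: %s' % (page, album))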
Example #6
    def __init__(self, target_url):
        """
        初始化
        :param target_url: 目标微博主页url
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        # Target data
        self.logger.info(Fore.BLUE + target_url)
        while True:
            try:
                self.target = WeiboApi.fetch_user_info(target_url)
                self.uid, self.name = self.target['oid'], self.target['onick']
            except KeyError as err:
                wait = 60
                message = ('Failed to fetch %s; the Cookies may have expired '
                           'or the API may have changed. Retrying in %ds.' %
                           (err, wait))
                self.logger.error(Fore.RED + message)
                sleep(wait)
            else:
                break

        # Local preprocessing
        self.root = self.__init_folder()

    def start(self):
        """Download both the thumbnail and the large version of every picture.
        """
        flag = True
        while flag:
            self.logger.info(
                Fore.YELLOW +
                'since_id={0}, page={1}'.format(self.since_id, str(self.page)))
            try:  # parsing occasionally fails
                r = WeiboApi.chaohua_img_info(self.since_id, self.page_id,
                                              self.page)
                #  print(r.url)
                b = BeautifulSoup(r.json()['data'], features="lxml")
            except Exception as e:
                self.logger.error(Fore.RED +
                                  'since_id={0}, page={1}, err={2}'.format(
                                      self.since_id, self.page, e))
                with gzip.open(os.path.join(self.cpdir, self.filename),
                               'wb') as f:
                    pickle.dump(obj=[self.since_id, self.page], file=f)  # save checkpoint
                sys.exit('Bye! Use `python main.py` to continue downloading.')
            for i in b.findAll('img'):
                uri = i.attrs['src']
                if 'http' not in uri:
                    uri = 'https:' + uri  # some URIs come without a scheme; prepend it
                #  print(uri)
                try:
                    uri_pair = {
                        'thumbnail': uri,
                        'large': uri.replace('thumb300', 'large')
                    }
                    for t in uri_pair:
                        time.sleep(self.sleeptime)
                        uri = uri_pair[t]
                        filename = os.path.join(
                            self.root, t + '/' + uri[uri.rfind('/') + 1:])
                        if os.path.isfile(filename):
                            self.logger.info(Fore.BLUE + Style.DIM + filename)
                            continue
                        else:
                            urllib.request.urlretrieve(uri, filename)
                            self.logger.info(Fore.BLUE + filename)
                except KeyboardInterrupt:
                    self.logger.error(
                        Fore.RED +
                        'since_id={0}, page={1}, err=KeyboardInterrupt'.format(
                            self.since_id, self.page))
                    with gzip.open(os.path.join(self.cpdir, self.filename),
                                   'wb') as f:
                        pickle.dump(obj=[self.since_id, self.page],
                                    file=f)  # save checkpoint
                    sys.exit(
                        'Bye! Use `python main.py` to continue downloading.')
                except Exception as e:
                    self.logger.error(Fore.RED +
                                      'Img={0}, err={1}'.format(uri, e))

            # Advance to the next page (simulated pagination)
            flag = False
            for div in reversed(b.findAll('div')):
                # the since_id inside action-data determines the next page
                if 'action-data' in div.attrs:
                    tmp = div.attrs['action-data']
                    # split on the first '=' only, so values may contain '='
                    dic = dict(j.split('=', 1) for j in tmp.split('&'))
                    if 'since_id' in dic and self.page < self.MAX_PAGE:
                        flag = True
                        self.since_id = urllib.parse.unquote(dic['since_id'])
                        self.page += 1
                        break
            if not flag:
                with gzip.open(os.path.join(self.cpdir, self.filename),
                               'wb') as f:
                    pickle.dump(obj=[self.since_id, self.page], file=f)  # save final state
                self.logger.info(Fore.BLUE + 'Crawler finished')
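The pagination in start hinges on extracting since_id from the action-data attribute, which is a URL-encoded query string. The sketch below runs that parsing step on a made-up attribute value; urllib.parse.parse_qs is shown as an equivalent, slightly more robust alternative to the manual split.

    import urllib.parse

    # Hypothetical action-data value taken from a paging <div>
    action_data = 'type=photo&since_id=4123%3A456&page=2'

    # Manual split, as in the crawler above
    dic = dict(j.split('=', 1) for j in action_data.split('&'))
    print(urllib.parse.unquote(dic['since_id']))  # 4123:456

    # Equivalent result with the standard library query-string parser
    parsed = urllib.parse.parse_qs(action_data)
    print(parsed['since_id'][0])  # 4123:456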