Example #1
    def test_100_crawl(self):
        # Crawl Zhihu Daily and dump the result as JSON to build/crawl.json
        rc = entry.ZhihuDaily().crawl()
        data = json.dumps(rc, ensure_ascii=False)
        utils.mkdirp(_build_dir)
        with open(os.path.join(_build_dir, 'crawl.json'), 'w') as fh:
            fh.write(data)
        log.debug(rc)
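
All of these examples lean on utils.mkdirp, whose source is not part of this listing. A minimal sketch of what it presumably does, assuming Python 3 and that it returns the created path (Example #6 below relies on the return value):

import os

def mkdirp(path):
    # Assumed behavior of utils.mkdirp: create the directory and any
    # missing parents, ignore it if it already exists, and return the
    # path so calls can be chained; the real helper may differ
    os.makedirs(path, exist_ok=True)
    return path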
Example #2
    def test_100_gray_image(self):
        '''Test the grayscale image processing utility'''
        raw_path = os.path.join(_assets_dir, 'raw.jpg')
        with open(raw_path, 'rb') as fh:
            raw_img_buf = fh.read()
        data = img.gray_image(raw_img_buf)
        output_path = os.path.join(_build_dir, 'gray.jpg')
        utils.mkdirp(_build_dir)
        with open(output_path, 'wb') as fh:
            fh.write(data)
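
img.gray_image is only called here, not defined. A plausible Pillow-based sketch, assuming it takes raw image bytes and returns grayscale JPEG bytes:

import io

from PIL import Image

def gray_image(img_buf):
    # Hypothetical stand-in for img.gray_image: decode the raw bytes,
    # convert to 8-bit grayscale ('L' mode), and re-encode as JPEG
    with Image.open(io.BytesIO(img_buf)) as im:
        out = io.BytesIO()
        im.convert('L').save(out, format='JPEG')
    return out.getvalue()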
Example #3
    def test_000_rescale_image(self):
        """Test the image rescaling utility"""
        raw_path = os.path.join(_assets_dir, 'raw.jpg')
        with open(raw_path, 'rb') as fh:
            raw_img_buf = fh.read()
        data = img.rescale_image(raw_img_buf,
                                 dimen=config.img_max_thumb_dimen,
                                 maxsizeb=config.img_max_thumb_size)
        output_path = os.path.join(_build_dir, 'rescale.jpg')
        utils.mkdirp(_build_dir)
        with open(output_path, 'wb') as fh:
            fh.write(data)
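
img.rescale_image is likewise not shown. A sketch of one reasonable reading of its signature, assuming dimen caps the longest edge and maxsizeb caps the encoded size in bytes:

import io

from PIL import Image

def rescale_image(img_buf, dimen, maxsizeb):
    # Hypothetical sketch: shrink the image to fit within dimen x dimen,
    # then re-encode as JPEG, stepping the quality down until the result
    # fits under maxsizeb bytes; the real implementation may differ
    with Image.open(io.BytesIO(img_buf)) as im:
        im.thumbnail((dimen, dimen))
        for quality in range(85, 15, -10):
            out = io.BytesIO()
            im.convert('RGB').save(out, format='JPEG', quality=quality)
            if out.tell() <= maxsizeb:
                break
    return out.getvalue()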
Example #4
    def __init__(self, date=None, *args, **kwargs):
        """
        知乎日报爬虫类,用于爬取&解析知乎日报页面&相关协议

        :param str date: 爬取日期,命令行参数,默认为空,即爬取当日最新,内容格式:``yyyymmdd``
        :param str output_file: (可选,关键字参数)结果输出文件,
            用以将最终爬取到的数据写入到指定文件中,默认为 ``moear_spider_zhihudaily``
            下的 ``build`` 路径,建议仅作为测试时使用
        """
        super(ZhihuDailySpider, self).__init__(*args, **kwargs)

        self.start_urls = ['http://news-at.zhihu.com/api/4/news/latest']

        # Zhihu Daily's protocol fetches the news published the day
        # before the given date, so the date accepted by the spider must
        # be shifted forward by one day to build the crawl parameter
        if date is not None:
            self.logger.info('Crawl parameter given: date={}'.format(date))
            try:
                spider_date = datetime.datetime.strptime(date, '%Y%m%d')
                spider_date += datetime.timedelta(days=1)
                spider_date_str = spider_date.strftime('%Y%m%d')
                self.logger.info(
                    'Formatted Zhihu crawl date parameter: {}'.format(
                        spider_date_str))
                self.start_urls = [
                    'http://news-at.zhihu.com/api/4/news/before/{}'.format(
                        spider_date_str)
                ]
            except ValueError:
                self.logger.error(
                    'Invalid crawl date given (yyyymmdd): {}'.format(date))
                self.start_urls = []

        self.item_list = []
        self.output_file = kwargs.get('output_file', None)
        if not self.output_file:
            # Output to a default file for testing when running
            # ``scrapy crawl zhihu_daily`` from the command line
            _base_dir = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            _output_file_default = os.path.join(_base_dir, 'build',
                                                'output.json')
            utils.mkdirp(os.path.dirname(_output_file_default))
            self.output_file = _output_file_default
        self.logger.info('Output file path: {}'.format(self.output_file))
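
The one-day shift is easy to sanity-check in isolation: because the "before" endpoint returns the news of the day preceding the given date, asking the spider for date=20180101 must request the URL for 20180102.

import datetime

date = '20180101'  # the issue the caller actually wants
spider_date = datetime.datetime.strptime(date, '%Y%m%d')
spider_date += datetime.timedelta(days=1)
print('http://news-at.zhihu.com/api/4/news/before/{}'.format(
    spider_date.strftime('%Y%m%d')))
# -> http://news-at.zhihu.com/api/4/news/before/20180102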
Example #5
    def parse(self, response):
        """
        从 self.data 中将文章信息格式化为 :class:`.MoearPackageMobiItem`
        """
        # Working & output paths
        self.template_dir = self.settings.get('TEMPLATE_DIR')
        shutil.rmtree(self.settings.get('BUILD_SOURCE_DIR'),
                      ignore_errors=True)
        self.build_source_dir = utils.mkdirp(
            self.settings.get('BUILD_SOURCE_DIR'))

        # Get the post template object
        template_post_path = os.path.join(self.template_dir, 'post.html')
        with open(template_post_path, 'r') as f:
            self.template_post = Template(f.read())

        self._logger.info('Build working dir => {0}'.format(
            self.build_source_dir))

        image_filter = self.options.get('image_filter', '')
        common_image_filter = self.options.get('common_image_filter', [])
        for sections in self.data.values():
            for p in sections:
                item = MoearPackageMobiItem()
                pmeta = p.get('meta', {})
                item['url'] = p.get('origin_url', '')
                item['title'] = p.get('title', '')
                item['cover_image'] = pmeta.get('moear.cover_image_slug')
                item['content'] = p.get('content', '')

                # Prepare the data for the image persistence pipeline
                item['image_urls'] = [item['cover_image']] \
                    if item['cover_image'] is not None else []
                item['image_urls'] += \
                    self._populated_image_urls_with_content(item['content'])
                self._logger.debug(
                    'Image URLs to process (before filtering): {}'.format(
                        item['image_urls']))
                item['image_urls'], item['image_urls_removed'] = \
                    self.filter_images_urls(
                        item['image_urls'], image_filter, common_image_filter)
                self._logger.debug(
                    'Image URLs to process: {}'.format(item['image_urls']))

                yield item
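
self._populated_image_urls_with_content is referenced but not defined in this excerpt. A minimal sketch, assuming it pulls the src of every <img> tag out of the article HTML (a real implementation might well use an HTML parser instead of a regex):

import re

def _populated_image_urls_with_content(self, content):
    # Hypothetical body for the helper used above: collect the src
    # attribute of every <img> tag in the rendered article HTML
    return re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', content)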
Example #6
# Internationalization / language support
LANGUAGES = (
    ('en', gettext_noop('English')),
    ('zh-hans', gettext_noop('Simplified Chinese')),
)
LOCALE_PATHS = [
    os.path.join(BASE_DIR, 'locale'),
]

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/2.0/howto/static-files/
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(RUNTIME_DIR, 'static')

# Static site generator
SITE_PAGES_DIR = utils.mkdirp(os.path.join(BASE_DIR, 'templates', 'pages'))

# Remap template message tag names
MESSAGE_TAGS = {
    messages.DEBUG: 'dark',
    messages.INFO: 'info',
    messages.SUCCESS: 'success',
    messages.WARNING: 'warning',
    messages.ERROR: 'danger',
}

# Datetime display format
DATETIME_FORMAT = 'Y/m/d H:i:s'

# Email
EMAIL_HOST = _get_config('EMAIL_HOST', '')
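
_get_config is defined elsewhere in this settings module. A minimal sketch, assuming it reads configuration from environment variables with a fallback default:

import os

def _get_config(name, default=None):
    # Assumed helper: pull the setting from the environment, falling
    # back to the supplied default; the real helper may also consult a
    # config file
    return os.environ.get(name, default)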
Example #7
    def closed(self, reason):
        '''
        After the asynchronous crawl and localization finish, use the
        result data to render the output files; once rendering is done,
        call :meth:`.MobiSpider.generate_mobi_file` to generate the
        target ``mobi`` file
        '''
        # Copy the cover & masthead image files
        utils.mkdirp(os.path.join(self.build_source_dir, 'images'))
        self._logger.info(self.options)
        shutil.copy(self.options.get('img_cover'),
                    os.path.join(self.build_source_dir, 'images', 'cover.jpg'))
        shutil.copy(
            self.options.get('img_masthead'),
            os.path.join(self.build_source_dir, 'images', 'masthead.gif'))

        # Copy the CSS files
        css_base_path = self.options.get('css_base')
        css_package_path = self.options.get('css_package')
        css_extra = self.options.get('extra_css', '')
        css_output_dir = os.path.join(self.build_source_dir, 'css')
        utils.mkdirp(css_output_dir)
        if css_base_path:
            shutil.copy(css_base_path, os.path.join(css_output_dir,
                                                    'base.css'))
        if css_package_path:
            shutil.copy(css_package_path,
                        os.path.join(css_output_dir, 'package.css'))
        if css_extra:
            with codecs.open(os.path.join(css_output_dir, 'custom.css'), 'wb',
                             'utf-8') as fh:
                fh.write(css_extra)

        # Copy the icons directory
        icons_path = self.options.get('icons_path')
        icons_output_dir = os.path.join(self.build_source_dir, 'icons')
        shutil.rmtree(icons_output_dir, ignore_errors=True)
        if icons_path:
            shutil.copytree(icons_path, icons_output_dir)

        # Get the content template object
        template_content_path = os.path.join(self.template_dir, 'OEBPS',
                                             'content.opf')
        with open(template_content_path, 'r') as fh:
            template_content = Template(fh.read())

        # Render the content target file
        content_path = os.path.join(self.build_source_dir, 'moear.opf')
        with codecs.open(content_path, 'wb', 'utf-8') as fh:
            fh.write(
                template_content.render(data=self.data,
                                        spider=self.spider,
                                        options=self.options))

        # Get the toc.ncx template object
        template_toc_path = os.path.join(self.template_dir, 'OEBPS', 'toc.ncx')
        with open(template_toc_path, 'r') as fh:
            template_toc = Template(fh.read())

        # Render the toc.ncx target file
        toc_path = os.path.join(self.build_source_dir, 'misc', 'toc.ncx')
        utils.mkdirp(os.path.dirname(toc_path))
        with codecs.open(toc_path, 'wb', 'utf-8') as fh:
            fh.write(
                template_toc.render(data=self.data,
                                    spider=self.spider,
                                    options=self.options))

        # Get the toc.html template object
        template_toc_path = os.path.join(self.template_dir, 'OEBPS',
                                         'toc.html')
        with open(template_toc_path, 'r') as fh:
            template_toc = Template(fh.read())

        # Render the toc.html target file
        toc_path = os.path.join(self.build_source_dir, 'html', 'toc.html')
        utils.mkdirp(os.path.dirname(toc_path))
        with codecs.open(toc_path, 'wb', 'utf-8') as fh:
            fh.write(template_toc.render(data=self.data, options=self.options))

        # Generate the mobi file into mobi_dir
        self.generate_mobi_file()
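
generate_mobi_file itself is not part of this excerpt. Since the rendered tree is anchored on moear.opf, one plausible sketch is a kindlegen invocation over that file (the command name, flags, and output name are all assumptions, not the confirmed MobiSpider implementation):

import os
import subprocess

def generate_mobi_file(self):
    # Hypothetical sketch: run kindlegen against the rendered OPF to
    # produce the target mobi inside the build directory
    opf_path = os.path.join(self.build_source_dir, 'moear.opf')
    subprocess.call(['kindlegen', opf_path, '-o', 'moear.mobi'])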