def test_100_crawl(self):
    """Run the Zhihu Daily crawler and dump its result as JSON into the build dir."""
    result = entry.ZhihuDaily().crawl()
    serialized = json.dumps(result, ensure_ascii=False)
    utils.mkdirp(_build_dir)
    target = os.path.join(_build_dir, 'crawl.json')
    with open(target, 'w') as fh:
        fh.write(serialized)
    log.debug(result)
def test_100_gray_image(self):
    '''测试灰度图片处理的工具方法'''
    # Read the raw fixture image as bytes.
    with open(os.path.join(_assets_dir, 'raw.jpg'), 'rb') as fh:
        source_buf = fh.read()
    gray_buf = img.gray_image(source_buf)
    # Persist the grayscale result under the build directory.
    utils.mkdirp(_build_dir)
    with open(os.path.join(_build_dir, 'gray.jpg'), 'wb') as fh:
        fh.write(gray_buf)
def test_000_rescale_image(self):
    """测试调整图片工具方法"""
    # Read the raw fixture image as bytes.
    with open(os.path.join(_assets_dir, 'raw.jpg'), 'rb') as fh:
        source_buf = fh.read()
    # Rescale to the configured thumbnail dimension/size limits.
    rescaled_buf = img.rescale_image(
        source_buf,
        dimen=config.img_max_thumb_dimen,
        maxsizeb=config.img_max_thumb_size)
    # Persist the rescaled result under the build directory.
    utils.mkdirp(_build_dir)
    with open(os.path.join(_build_dir, 'rescale.jpg'), 'wb') as fh:
        fh.write(rescaled_buf)
def __init__(self, date=None, *args, **kwargs):
    """
    Zhihu Daily spider: crawls & parses Zhihu Daily pages and their API.

    :param str date: crawl date (command-line argument). Defaults to
        ``None``, which crawls the latest issue. Format: ``yyyymmdd``.
    :param str output_file: (optional, keyword argument) output file the
        final crawled data is written to; defaults to the ``build``
        directory under ``moear_spider_zhihudaily``. Recommended for
        testing only.
    """
    super(ZhihuDailySpider, self).__init__(*args, **kwargs)
    self.start_urls = ['http://news-at.zhihu.com/api/4/news/latest']

    # The Zhihu "before" endpoint returns the news of the *previous*
    # day, so the requested date must be shifted forward by one day
    # before being used as the crawl parameter.
    if date is not None:
        self.logger.info('指定爬取参数:date={}'.format(date))
        try:
            spider_date = datetime.datetime.strptime(date, '%Y%m%d')
            spider_date += datetime.timedelta(days=1)
            spider_date_str = spider_date.strftime('%Y%m%d')
            self.logger.info('格式化后的知乎爬取日期参数:{}'.format(spider_date_str))
            # FIX: host must match the "latest" URL above —
            # 'news.at.zhihu.com' (dot instead of dash) was a typo.
            self.start_urls = [
                'http://news-at.zhihu.com/api/4/news/before/{}'.format(
                    spider_date_str)
            ]
        except ValueError:
            # FIX: format hint read 'yyymmdd' (three y's) — corrected.
            self.logger.error('指定的爬取日期错误(yyyymmdd):{}'.format(date))
            self.start_urls = []

    self.item_list = []
    self.output_file = kwargs.get('output_file', None)
    if not self.output_file:
        # Fallback used when running ``scrapy crawl zhihu_daily`` from
        # the command line during testing.
        _base_dir = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
        _output_file_default = os.path.join(_base_dir, 'build', 'output.json')
        utils.mkdirp(os.path.dirname(_output_file_default))
        self.output_file = _output_file_default
    self.logger.info('输出文件路径: {}'.format(self.output_file))
def parse(self, response):
    """
    从 self.data 中将文章信息格式化为 :class:`.MoearPackageMobiItem`
    """
    # Prepare the working/output directories: wipe any previous build
    # output, then recreate the source dir.
    self.template_dir = self.settings.get('TEMPLATE_DIR')
    shutil.rmtree(self.settings.get('BUILD_SOURCE_DIR'), ignore_errors=True)
    self.build_source_dir = utils.mkdirp(
        self.settings.get('BUILD_SOURCE_DIR'))

    # Load the per-post template object.
    with open(os.path.join(self.template_dir, 'post.html'), 'r') as f:
        self.template_post = Template(f.read())
    self._logger.info('构建处理路径 => {0}'.format(self.build_source_dir))

    image_filter = self.options.get('image_filter', '')
    common_image_filter = self.options.get('common_image_filter', [])

    for sections in self.data.values():
        for post in sections:
            item = MoearPackageMobiItem()
            post_meta = post.get('meta', {})
            item['url'] = post.get('origin_url', '')
            item['title'] = post.get('title', '')
            item['cover_image'] = post_meta.get('moear.cover_image_slug')
            item['content'] = post.get('content', '')

            # Gather image URLs so the image-persistence pipeline can
            # download them later.
            pending_urls = []
            if item['cover_image'] is not None:
                pending_urls.append(item['cover_image'])
            pending_urls += self._populated_image_urls_with_content(
                item['content'])
            item['image_urls'] = pending_urls
            self._logger.debug('待处理的图片url(过滤前): {}'.format(
                item['image_urls']))
            item['image_urls'], item['image_urls_removed'] = \
                self.filter_images_urls(
                    item['image_urls'], image_filter, common_image_filter)
            self._logger.debug('待处理的图片url: {}'.format(item['image_urls']))
            yield item
# 国际化语言支持 LANGUAGES = ( ('en', gettext_noop('English')), ('zh-hans', gettext_noop('Simplified Chinese')), ) LOCALE_PATHS = [ os.path.join(BASE_DIR, 'locale'), ] # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/2.0/howto/static-files/ STATIC_URL = '/static/' STATIC_ROOT = os.path.join(RUNTIME_DIR, 'static') # 静态站点生成器 SITE_PAGES_DIR = utils.mkdirp(os.path.join(BASE_DIR, 'templates', 'pages')) # 模板消息Tags名称重定义 MESSAGE_TAGS = { messages.DEBUG: 'dark', messages.INFO: 'info', messages.SUCCESS: 'success', messages.WARNING: 'warning', messages.ERROR: 'danger', } # 设置日期时间显示格式 DATETIME_FORMAT = 'Y/m/d H:i:s' # Email EMAIL_HOST = _get_config('EMAIL_HOST', '')
def closed(self, reason):
    '''
    Called when the asynchronous crawl/localization has finished: renders
    the output files from the collected data, then calls
    :meth:`.MobiSpider.generate_mobi_file` to produce the target ``mobi``
    file.
    '''
    # Copy the cover & masthead image files into the build tree.
    utils.mkdirp(os.path.join(self.build_source_dir, 'images'))
    self._logger.info(self.options)
    shutil.copy(self.options.get('img_cover'),
                os.path.join(self.build_source_dir, 'images', 'cover.jpg'))
    shutil.copy(
        self.options.get('img_masthead'),
        os.path.join(self.build_source_dir, 'images', 'masthead.gif'))

    # Copy the CSS files (base and package styles are optional; extra
    # CSS, if provided, is written out as custom.css).
    css_base_path = self.options.get('css_base')
    css_package_path = self.options.get('css_package')
    css_extra = self.options.get('extra_css', '')
    css_output_dir = os.path.join(self.build_source_dir, 'css')
    utils.mkdirp(css_output_dir)
    if css_base_path:
        shutil.copy(css_base_path,
                    os.path.join(css_output_dir, 'base.css'))
    if css_package_path:
        shutil.copy(css_package_path,
                    os.path.join(css_output_dir, 'package.css'))
    if css_extra:
        with codecs.open(os.path.join(css_output_dir, 'custom.css'),
                         'wb', 'utf-8') as fh:
            fh.write(css_extra)

    # Copy the icons directory; remove any stale copy first since
    # copytree requires the destination not to exist.
    icons_path = self.options.get('icons_path')
    icons_output_dir = os.path.join(self.build_source_dir, 'icons')
    shutil.rmtree(icons_output_dir, ignore_errors=True)
    if icons_path:
        shutil.copytree(icons_path, icons_output_dir)

    # Load the content (OPF) template object.
    template_content_path = os.path.join(self.template_dir, 'OEBPS',
                                         'content.opf')
    with open(template_content_path, 'r') as fh:
        template_content = Template(fh.read())

    # Render the content target file.
    content_path = os.path.join(self.build_source_dir, 'moear.opf')
    with codecs.open(content_path, 'wb', 'utf-8') as fh:
        fh.write(
            template_content.render(data=self.data, spider=self.spider,
                                    options=self.options))

    # Load the toc.ncx template object.
    template_toc_path = os.path.join(self.template_dir, 'OEBPS', 'toc.ncx')
    with open(template_toc_path, 'r') as fh:
        template_toc = Template(fh.read())

    # Render the toc.ncx target file.
    toc_path = os.path.join(self.build_source_dir, 'misc', 'toc.ncx')
    utils.mkdirp(os.path.dirname(toc_path))
    with codecs.open(toc_path, 'wb', 'utf-8') as fh:
        fh.write(
            template_toc.render(data=self.data, spider=self.spider,
                                options=self.options))

    # Load the toc.html template object.
    template_toc_path = os.path.join(self.template_dir, 'OEBPS', 'toc.html')
    with open(template_toc_path, 'r') as fh:
        template_toc = Template(fh.read())

    # Render the toc.html target file (note: unlike toc.ncx, this
    # template is not passed the spider object).
    toc_path = os.path.join(self.build_source_dir, 'html', 'toc.html')
    utils.mkdirp(os.path.dirname(toc_path))
    with codecs.open(toc_path, 'wb', 'utf-8') as fh:
        fh.write(template_toc.render(data=self.data, options=self.options))

    # Generate the mobi file into mobi_dir.
    self.generate_mobi_file()