def create_book(self, command, counter):
    u"""Build the *counter*-th e-book described by one read-list command.

    Pipeline: parse command -> crawl web pages -> extract from the
    database -> download images -> auto-split by size -> render each
    volume as an epub.
    """
    Path.reset_path()
    Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
    Debug.logger.info(u"对记录 {} 进行分析".format(command))
    task_list = CommandParser.get_task_list(command)  # parse the command
    if not task_list:
        return
    # In create-book debug mode the crawl is skipped and whatever is
    # already in the database gets reused.
    if not Config.debug_for_create_book:
        for crawl_task in task_list:
            Worker.distribute(crawl_task)
    Debug.logger.info(u"网页信息抓取完毕")
    task_result_list = []
    for crawl_task in task_list:
        result = TaskResult(crawl_task)
        result.extract_data()
        task_result_list.append(result)
    Debug.logger.info(u"数据库信息获取完毕")
    # fetch every image referenced by the collected content
    for result in task_result_list:
        result.download_img()
    Debug.logger.info(u"所有任务图片获取完毕")
    # auto-split by size, then render html && pack each volume
    whole_book = Book(task_result_list)
    for volume in whole_book.auto_split(Config.max_book_size_mb * 1024):
        volume.create_book()
    return
def create_book(self, book_package):
    u"""Render *book_package* into a single epub file.

    Bails out when the package has no title: an empty title would later
    expand into a dangerous path (the dreaded『rm -rf /』scenario).
    """
    book_package.image_container.set_save_path(Path.image_pool_path)
    book_package.image_container.start_download()
    title = book_package.get_title()
    if not title:
        return
    Path.chdir(Path.base_path + u'/知乎电子书临时资源库/')
    epub = Epub(title)
    html_dir = Path.html_pool_path + u'/'
    image_dir = Path.image_pool_path + u'/'
    epub.set_creator(u'ZhihuHelp1.7.0')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    for book in book_package.book_list:
        # the first page opens a chapter, the rest are appended to it
        head = book.page_list[0]
        with open(html_dir + head.filename, u'w') as html:
            html.write(head.content)
        epub.create_chapter(html_dir + head.filename, head.title)
        for page in book.page_list[1:]:
            with open(html_dir + page.filename, u'w') as html:
                html.write(page.content)
            epub.add_html(html_dir + page.filename, page.title)
        epub.finish_chapter()
    for image in book_package.image_list:
        epub.add_image(image_dir + image['filename'])
    epub.create()
    Path.reset_path()
    return
def create(self):
    u"""Download all queued images, then build one epub from book_list.

    Fix: added the empty-title guard present in every sibling builder —
    without it an empty combined title produced a nameless output path.

    NOTE(review): this variant iterates ``self.book_list`` but reads
    images from ``self.book["image_list"]`` — confirm ``self.book`` is
    the intended aggregate container.
    """
    self.image_container.set_save_path(Path.image_pool_path)
    self.image_container.start_download()
    title = "_".join([book.property.epub.title for book in self.book_list])
    title = title.strip()
    if not title:
        # empty e-book title -> skip, otherwise a dangerous path results
        return
    Path.chdir(Path.base_path + u"/知乎电子书临时资源库/")
    epub = Book(title, 27149527)
    html_tmp_path = Path.html_pool_path + "/"
    image_tmp_path = Path.image_pool_path + "/"
    for book in self.book_list:
        page = book.page_list[0]
        with open(html_tmp_path + page.filename, "w") as html:
            html.write(page.content)
        epub.createChapter(html_tmp_path + page.filename, ExtraTools.get_time(), page.title)
        for page in book.page_list[1:]:
            with open(html_tmp_path + page.filename, "w") as html:
                html.write(page.content)
            epub.addHtml(html_tmp_path + page.filename, page.title)
    for image in self.book["image_list"]:
        epub.addImg(image_tmp_path + image["filename"])
    epub.addLanguage("zh-cn")
    epub.addCreator("ZhihuHelp1.7.0")
    epub.addDesc(u"该电子书由知乎助手生成,知乎助手是姚泽源为知友制作的仅供个人使用的简易电子书制作工具,源代码遵循WTFPL,希望大家能认真领会该协议的真谛,为飞面事业做出自己的贡献 XD")
    epub.addRight("CC")
    epub.addPublisher("ZhihuHelp")
    Debug.logger.debug(u"当前目录为")
    Path.pwd()
    epub.addCss(Path.base_path + u"/epubResource/markdown.css")
    epub.addCss(Path.base_path + u"/epubResource/front.css")
    epub.buildingEpub()
    return
def add_index_html(self, src, title):
    u"""Copy *src* into the epub html/ directory and register it in the
    opf manifest and toc — without adding a directory entry (for index
    pages only)."""
    Path.copy(src, EpubPath.html_path)
    new_src = u'html/' + Path.get_filename(src)
    resource_id = self.opf.add_html(new_src)
    self.toc.add_item(resource_id, new_src, title)
def filepath(answertmp=None):
    u"""Return the on-disk txt path for one answer record, creating the
    per-question directory on demand.

    Fix: the default argument was a *mutable* ``[]`` (shared between
    calls, and indexed like a dict); it is now ``None``.  As before, a
    falsy argument falls through and returns ``None``.

    :param answertmp: dict with 'question_id' and 'answer_id' keys
    """
    if not answertmp:
        return None
    answerpath = Path.answer_path + '/Answer_qid_aid/' + '{0}'.format(answertmp['question_id'])
    if not Path.is_dir(answerpath):
        Path.mkdirs(answerpath)
    filename = '{0}'.format(answertmp['question_id']) + '_' + '{0}'.format(answertmp['answer_id']) + '.txt'
    return Path.join_dir(answerpath, filename)
def __init__(self, recipe_kind='Notset', read_list='ReadList.txt', url=None, debug=False):
    u"""Set up paths, logging and the database for one crawl session.

    Config entries are separated with ``$``; entries on the same line are
    merged into a single e-book.

    :param recipe_kind: website/recipe type identifier
    :param read_list: read-list file name (default ``ReadList.txt``)
    :param url: optional single url to process
    :param debug: enable DEBUG-level logging when True
    """
    import logging
    self.recipe_kind = recipe_kind
    self.read_list = read_list
    self.url = url
    log.warning_log(u"website type: " + str(self.recipe_kind) + '\n')
    level = logging.DEBUG if debug is True else logging.INFO
    Debug.logger.setLevel(level)
    Debug.logger.debug(u"read_list: " + str(self.read_list))
    Debug.logger.debug(u"url: " + str(self.url))
    Debug.logger.debug(u"recipe type:" + str(recipe_kind))
    Path.init_base_path(recipe_kind)  # resolve base paths
    Path.init_work_directory()        # create the directory tree
    self.init_database()              # open / initialise the database
    Config._load()
def create_chapter(self, src, title):
    u"""Copy *src* into html/ and open a new chapter rooted at it,
    registering the page in opf, directory and toc."""
    Path.copy(src, EpubPath.html_path)
    new_src = u'html/' + Path.get_filename(src)
    resource_id = self.opf.add_title_page_html(new_src)
    self.directory.create_chapter(new_src, title)
    self.toc.create_chapter(resource_id, new_src, title)
def __init__(self):
    u"""Initialise the database, the base paths and the configuration.

    Config entries are separated with ``$``; entries on the same line
    are merged into a single e-book.
    """
    init.init_database()   # database first: config loading may need it
    Path.init_base_path()
    Config._load()
def __init__(self):
    u"""Bootstrap the application: directory tree, database link and
    configuration."""
    Path.init_base_path()       # resolve base paths
    Path.init_work_directory()  # create the directory tree
    DB.init_database()          # open the database connection
    Config.init_config()        # load configuration
def __init__(self):
    u"""Prepare paths, working directories, database and configuration.

    Config entries are separated with ``$``; entries on the same line
    are merged into a single e-book.
    """
    Path.init_base_path()       # resolve base paths
    Path.init_work_directory()  # create the directory tree
    self.init_database()        # open / initialise the database
    Config._load()
def create_book(self):
    u"""Render this (possibly split) volume into an epub file."""
    # resolve the output file name; split volumes get a 卷N suffix
    title = Match.fix_filename(self.book_title)
    if self.is_split:
        title = self.book_title + u'_卷{}'.format(self.chapter_no)
    # work inside the e-book scratch directory
    Path.chdir(Path.book_pool_path)
    epub = Epub(title)
    # task type -> builder for its chapter info page
    # (single answers reuse the question info page)
    info_page_builder = {
        Type.question: self.generate_question_info_page,
        Type.answer: self.generate_question_info_page,
        Type.collection: self.generate_collection_info_page,
        Type.topic: self.generate_topic_info_page,
        Type.author: self.generate_author_info_page,
        Type.column: self.generate_column_info_page,
        Type.article: self.generate_article_info_page,
    }
    for task_result in self.task_result_list:
        chapter_src = ''
        builder = info_page_builder.get(task_result.task.task_type)
        if builder is not None:
            chapter_src = builder(task_result.info_page)
        epub.create_chapter(chapter_src, task_result.get_title())
        for question in task_result.question_list:
            # register the question's image files
            for filename in question.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            question_src = self.generate_question_page(question)
            epub.add_html(question_src, question.question_info.title)
        for column in task_result.column_list:
            # register the column's image files
            for filename in column.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            for article in column.article_list:
                article_src = self.generate_article_page(article)
                epub.add_html(article_src, article.title)
        epub.finish_chapter()
    epub.set_creator(u'ZhihuHelp1.8.0')
    epub.set_language(u'zh-cn')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
    epub.create()
    Path.reset_path()
    return
def create_book(command, counter):
    u"""Crawl and build the *counter*-th e-book described by *command*."""
    Path.reset_path()
    Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
    Debug.logger.info(u"对记录 {} 进行分析".format(command))
    task_package = ReadListParser.get_task(command)  # parse the command
    if not task_package.is_work_list_empty():
        worker_factory(task_package.work_list)  # run the crawlers
        Debug.logger.info(u"网页信息抓取完毕")
    if not task_package.is_book_list_empty():
        Debug.logger.info(u"开始从数据库中生成电子书")
        Book(task_package.book_list).create()
    return
def download_img(self):
    u"""Download every image referenced by this article, rewriting the
    html to point at the local copies; the cover image and the author
    avatar are fetched as well.  Accumulates total size in KB."""
    from src.container.image_container import ImageContainer
    img_container = ImageContainer()
    self.img_filename_list = []
    src_by_element = Match.match_img_with_src_dict(self.content)
    for element in src_by_element:
        filename = img_container.add(src_by_element[element])
        self.img_filename_list.append(filename)
        self.content = self.content.replace(
            element, Match.create_img_element_with_file_name(filename))
    # article cover image
    filename = img_container.add(self.image_url)
    self.img_filename_list.append(filename)
    self.image_url = Match.create_local_img_src(filename)
    # author avatar
    filename = img_container.add(self.author_avatar_url)
    self.img_filename_list.append(filename)
    self.author_avatar_url = Match.create_local_img_src(filename)
    img_container.start_download()
    # with the files on disk, account for the downloaded size
    for filename in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
    return
def download_img(self):
    u"""Download answer images and the author avatar, rewriting the html
    to local file references, then accumulate the downloaded size in KB.

    Fix: removed the leftover ``print`` debug statement that dumped the
    running size to stdout (the sibling download_img() has no such
    output).
    """
    from src.container.image_container import ImageContainer
    img_container = ImageContainer()
    img_src_dict = Match.match_img_with_src_dict(self.content)
    self.img_filename_list = []
    for img in img_src_dict:
        src = img_src_dict[img]
        filename = img_container.add(src)
        self.img_filename_list.append(filename)
        self.content = self.content.replace(
            img, Match.create_img_element_with_file_name(filename))
    # the answer author's avatar is downloaded as well
    filename = img_container.add(self.author_avatar_url)
    self.img_filename_list.append(filename)
    self.author_avatar_url = Match.create_local_img_src(filename)
    img_container.start_download()
    # with the files on disk, account for the downloaded size
    for filename in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
    return
def init_database():
    u"""Open the sqlite connection; on first run (no db file yet) the
    schema script is executed to create the tables."""
    is_fresh = not Path.is_file(Path.db_path)
    # connecting also creates the file when it does not exist yet
    DB.set_conn(sqlite3.connect(Path.db_path))
    if is_fresh:
        with open(Path.sql_path) as sql_script:
            DB.cursor.executescript(sql_script.read())
        DB.commit()
def download_img(self):
    # Localise every image referenced by this article's html: queue each
    # remote src for download and rewrite the markup to the local copy.
    from src.container.image_container import ImageContainer
    if str(self.content).__contains__('<div class="duokan-image-single">'):
        # Special case: duokan single-image wrappers are flattened into
        # plain <img> tags pointing at a hard-coded local folder.
        # NOTE(review): the file:///Users/ink/Desktop/images/ prefix is a
        # machine-specific path — confirm this is intentional.
        xtep = str(self.content)
        xxsoup = BeautifulSoup(xtep, 'lxml')
        list_tiezhi_tit = xxsoup.find_all('div', class_="duokan-image-single")
        for x in list_tiezhi_tit:
            list_pcyc_li = x.find_all('img')
            for li in list_pcyc_li:
                src = li.get('src')
                # keep only the path component after '/images/'
                st = str(src).split('/images/')[-1]
                newT = u'<img class="ke_img" src="file:///Users/ink/Desktop/images/{}" />'.format(st)
                # replace only the first occurrence of this wrapper div
                xtep = xtep.replace(str(x), newT, 1)
        self.content = xtep
    img_container = ImageContainer()
    img_src_dict = Match.match_img_with_src_dict(self.content)
    self.img_filename_list = []
    for img in img_src_dict:
        src = img_src_dict[img]
        filename = img_container.add(src)
        self.img_filename_list.append(filename)
        if str(img).__contains__(u"class=\"avatar\""):
            # avatar images get their own element template
            self.content = self.content.replace(img, Match.avatar_create_img_element_with_file_name(filename))
        else:
            self.content = self.content.replace(img, Match.create_img_element_with_file_name(filename))
    # article cover image
    filename = img_container.add(self.image_url)
    self.img_filename_list.append(filename)
    self.image_url = Match.create_local_img_src(filename)
    # author avatar
    filename = img_container.add(self.author_avatar_url)
    self.img_filename_list.append(filename)
    self.author_avatar_url = Match.create_local_img_src(filename)
    img_container.start_download()
    # with the files on disk, account for the downloaded size (KB)
    for filename in self.img_filename_list:
        self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
    return
def add_html(self, src, title):
    u"""Append *src* as a regular page of the current chapter.

    add_index_html is the same operation minus the directory entry, so
    it is reused here.
    """
    self.add_index_html(src, title)
    new_src = u'html/' + Path.get_filename(src)
    self.directory.add_html(new_src, title)
def getAnswerContentFromFile(answertmp=None):
    u"""Load a previously saved answer body back into answertmp['content'].

    Fix: the default argument was a *mutable* ``{}`` shared between
    calls; it is now ``None``.  Behaviour is unchanged — a falsy input
    still returns immediately.

    :param answertmp: dict describing the answer (question_id/answer_id)
    """
    if not answertmp:
        return
    filepath_name = Ans2File.filepath(answertmp)
    if Path.is_file(filepath_name):
        with open(filepath_name, 'r') as f:
            fileContent = f.read()
        answertmp['content'] = fileContent
    return
def create_book(self, command, counter):
    u"""Build the *counter*-th e-book for one read-list *command*.

    Pipeline: parse command -> crawl (unless debugging) -> extract from
    the database -> download images -> auto-split by size -> render each
    volume as an epub.
    """
    Path.reset_path()
    Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
    Debug.logger.info(u"对记录 {} 进行分析".format(command))
    task_list = CommandParser.get_task_list(command)  # parse the command
    if len(task_list) == 0:
        return
    for task in task_list:
        if Config.debug_for_create_book:
            # debug mode: skip the crawl and reuse what is in the db
            pass
        else:
            Worker.distribute(task)
    Debug.logger.info(u"网页信息抓取完毕")
    task_result_list = []
    # Task types that get re-routed through the generic ColumnTask
    # extractor before their data is pulled from the database.
    toTo_list = [
        Type.wechat,
        Type.huxiu,
        Type.huawei,
        Type.xueqiu,
        Type.sina,
        Type.zhengshitang,
        Type.jinwankansa,
        Type.wuxia,
        Type.doc360,
        Type.todo,
        Type.todo1,
        Type.todo2,
        Type.fiel,
        Type.taoguba_article
    ]
    for task in task_list:
        if task.get_task_type() in toTo_list:
            # NOTE(review): replaces the task with a ColumnTask built
            # from its account id — confirm ColumnTask covers all the
            # listed types.
            task = ColumnTask(task.account_id)
        task_result = TaskResult(task)
        task_result.extract_data()
        task_result_list.append(task_result)
    Debug.logger.info(u"数据库信息获取完毕")
    # download every referenced image
    for task_result in task_result_list:
        task_result.download_img()
    Debug.logger.info(u"所有任务图片获取完毕")
    # auto-split by size, then render html && pack each volume as epub
    book = Book(task_result_list)
    book_list = book.auto_split(Config.max_book_size_mb * 1024)
    for chapter in book_list:
        chapter.create_book()
    return
def create_chapter(self, src, title):
    u"""Append a directory entry for a new chapter.

    The very first chapter additionally wraps the entry in the 目录
    (table of contents) chapter template.
    """
    entry_tpl = self.get_template('directory', 'item_root')
    entry = entry_tpl.format(href=Path.get_filename(src), title=title)
    if self.chapter_deep == 0:
        wrapper_tpl = self.get_template('directory', 'chapter')
        entry = wrapper_tpl.format(item=entry, title=u'目录')
    self.content += entry
    self.chapter_deep += 1
def reloadtocatch(): rootdir = "/Volumes/MacintoshHD/TC/todo" # 指明被遍历的文件夹 filelist = os.listdir(rootdir) ff = open('TGB_List.txt', 'w') for file in filelist: print file # if file.startswith('TGB'): article_id = (str(file).split('_')[-1]).strip(')') print article_id # tagrId = u"http://www.taoguba.com.cn/Article/{}/0\n".format(article_id) tagrId = u"https://www.huxiu.com/{}\n".format(article_id) ff.write(tagrId) # 图片文件夹 rootdir = "/Volumes/MacintoshHD/TC/todo" # 指明被遍历的文件夹 picdir = "/Volumes/MacintoshHD/TC/picture" filelist = os.listdir(rootdir) ff = open('TGB_List.txt', 'w') for parent, dirnames, filenames in os.walk( rootdir): # 三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 # for dirname in dirnames: # 输出文件夹信息 # # print "parent is: " + parent # print "dirname is: " + dirname for filename in filenames: # 输出文件信息 print "parent is :" + parent print "filename is:" + filename print "the full name of the file is:" + os.path.join( parent, filename) # 输出文件路径信息 if filename.endswith('jpg'): Path.copy(os.path.join(parent, filename), picdir) # for parent, dirnames, filenames in os.walk(rootdir): # 三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字 # for dirname in dirnames: # 输出文件夹信息 # # # print "parent is: " + parent # print "dirname is: " + dirname # # # for filename in filenames: # 输出文件信息 # # print "parent is :" + parent # # print "filename is:" + filename # # print "the full name of the file is:" + os.path.join(parent, filename) # 输出文件路径信息 ff.close()
def init_database():
    u"""Connect to sqlite, creating and initialising the database file on
    first run."""
    if Path.is_file(Path.db_path):
        # existing database: just connect
        Debug.logger.debug(u"Connect to the database...")
        Debug.logger.debug(u"db_path: " + str(Path.db_path))
        DB.set_conn(sqlite3.connect(Path.db_path))
        return
    # first run: connecting creates the file, then run the schema script
    Debug.logger.debug(u"Create db file...")
    DB.set_conn(sqlite3.connect(Path.db_path))
    with open(Path.sql_path) as sql_script:
        DB.cursor.executescript(sql_script.read())
    DB.commit()
def create_book(self, book_package):
    u"""Render *book_package* into one epub under the temporary resource
    directory; empty titles are skipped (a nameless book would expand to
    a dangerous output path)."""
    book_package.image_container.set_save_path(Path.image_pool_path)
    book_package.image_container.start_download()
    title = book_package.get_title()
    Debug.logger.debug(u"title of the e-book:" + str(title))
    if not title:
        return
    Path.chdir(Path.in_base_path + u'/e-books_tmp_source')
    epub = Epub(title)
    html_dir = Path.html_pool_path + u'/'
    image_dir = Path.image_pool_path + u'/'
    epub.set_creator(u'EEBookV0-1')
    epub.set_language(u'zh')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.in_base_path + u'/www/css/markdown.css')
    epub.add_css(Path.in_base_path + u'/www/css/customer.css')
    epub.add_css(Path.in_base_path + u'/www/css/normalize.css')
    epub.add_css(Path.in_base_path + u'/www/css/bootstrap.css')
    for book in book_package.book_list:
        # first page opens a chapter, the remaining pages join it
        head = book.page_list[0]
        with open(html_dir + head.filename, 'w') as html:
            html.write(head.content)
        if '_' in head.title:
            head.title = ''.join(head.title.split('_')[1:])  # strip chapter prefix
        epub.create_chapter(html_dir + head.filename, head.title)
        for page in book.page_list[1:]:
            with open(html_dir + page.filename, 'w') as html:
                html.write(page.content)
            epub.add_html(html_dir + page.filename, page.title)
        epub.finish_chapter()
    for image in book_package.image_list:
        epub.add_image(image_dir + image['filename'])
    epub.create()
    Path.reset_path()
    return
def create_book(command, counter):
    u"""Crawl *command*, build an epub from the database, and return a
    short human-readable file-name tag (or an error string)."""
    Path.reset_path()
    Debug.logger.info(u"Ready to make No.{} e-book".format(counter))
    Debug.logger.info(u"Analysis {} ".format(command))
    task_package = UrlParser.get_task(command)  # parse the command
    Debug.logger.debug(u"#Debug:#task_package是:" + str(task_package))
    if not task_package.is_work_list_empty():
        worker_factory(task_package.work_list)  # run the crawlers
        Debug.logger.info(u"Complete fetching from web")
    file_name_set = None
    if not task_package.is_book_list_empty():
        Debug.logger.info(u"Start generating e-book from the database")
        file_name_set = Book(task_package.book_list).create()
    if file_name_set is not None:
        # join the first three fragments into a short name
        return '-'.join(list(file_name_set)[0:3])
    return u"Oops! no epub file produced"
def create_book(self, book_package):
    u"""Render *book_package* into one epub (Sina-blog flavour); empty
    titles are skipped to avoid a dangerous nameless output path."""
    book_package.image_container.set_save_path(Path.image_pool_path)
    book_package.image_container.start_download()
    title = book_package.get_title()
    Debug.logger.debug(u"电子书的名称是???" + str(title))
    if not title:
        return
    Path.chdir(Path.base_path + u'/电子书临时资源库')
    epub = Epub(title)
    html_dir = Path.html_pool_path + u'/'
    image_dir = Path.image_pool_path + u'/'
    epub.set_creator(u'SinaBlogV01')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
    for book in book_package.book_list:
        # first page opens a chapter, the remaining pages join it
        head = book.page_list[0]
        with open(html_dir + head.filename, u'w') as html:
            html.write(head.content)
        if '_' in head.title:
            head.title = ''.join(head.title.split('_')[1:])  # strip chapter prefix
        epub.create_chapter(html_dir + head.filename, head.title)
        for page in book.page_list[1:]:
            with open(html_dir + page.filename, u'w') as html:
                html.write(page.content)
            epub.add_html(html_dir + page.filename, page.title)
        epub.finish_chapter()
    for image in book_package.image_list:
        epub.add_image(image_dir + image['filename'])
    epub.create()
    Path.reset_path()
    return
def add_book(self):
    u"""Import an existing .epub from the file system into the library.

    Fix: ``str(book_path) is ''`` compared by *identity*; for a freshly
    built string that is normally False even when the text is empty, so
    cancelling the dialog was never detected.  A plain truth test is
    used instead.
    """
    # native file dialog on mac; shows only .epub files
    book_path = QtGui.QFileDialog.getOpenFileName(self, u'打开Epub格式电子书', ".", "(*.epub)")
    if not str(book_path):
        # no book selected
        return
    if os.path.dirname(str(book_path)) + os.sep != str(LIBRARY_DIR):
        shutil.copy(str(book_path), LIBRARY_DIR)
    file_name = os.path.basename(str(book_path))
    book_id = file_name.split('.epub')[0]
    bookdata_book_catalog = LIBRARY_DIR + book_id
    Path.mkdir(bookdata_book_catalog)
    Debug.logger.debug(u"移入bookdata中的是:" + str(LIBRARY_DIR + file_name))
    Debug.logger.debug(u"bookdata中的书:" + str(bookdata_book_catalog))
    Debug.logger.debug(u"book_path:" + os.path.dirname(str(book_path)))
    if os.path.dirname(str(book_path)) != bookdata_book_catalog:
        try:
            shutil.move(LIBRARY_DIR + file_name, bookdata_book_catalog)
        except shutil.Error:
            # the book was imported before; keep the existing copy
            Debug.logger.debug(u"TODO:添加过这个书,删除原来的书")
            pass
    else:
        # same folder: the file added was already inside bookdata
        Debug.logger.debug(u"是相同文件夹, 添加的是bookdata中的书")
        os.remove(LIBRARY_DIR + file_name)
    book = Book(book_id)
    book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
    insert_library(book)
    self.update_library()
def add_book(self):
    u"""Import an existing .epub from the file system into the library.

    Fix: ``str(book_path) is ''`` compared by *identity*; for a freshly
    built string that is normally False even when the text is empty, so
    cancelling the dialog was never detected.  A plain truth test is
    used instead.
    """
    # native file dialog on mac; shows only .epub files
    book_path = QtGui.QFileDialog.getOpenFileName(self, u'打开Epub格式电子书', ".", "(*.epub)")
    if not str(book_path):
        # no book selected
        return
    if os.path.dirname(str(book_path)) + os.sep != str(LIBRARY_DIR):
        shutil.copy(str(book_path), LIBRARY_DIR)
    file_name = os.path.basename(str(book_path))
    book_id = file_name.split('.epub')[0]
    bookdata_book_catalog = LIBRARY_DIR + book_id
    Path.mkdir(bookdata_book_catalog)
    Debug.logger.debug(u"移入bookdata中的是:" + str(LIBRARY_DIR + file_name))
    Debug.logger.debug(u"bookdata中的书:" + str(bookdata_book_catalog))
    Debug.logger.debug(u"book_path:" + os.path.dirname(str(book_path)))
    if os.path.dirname(str(book_path)) != bookdata_book_catalog:
        try:
            shutil.move(LIBRARY_DIR + file_name, bookdata_book_catalog)
        except shutil.Error:
            # the book was imported before; keep the existing copy
            Debug.logger.debug(u"TODO:添加过这个书,删除原来的书")
            pass
    else:
        # same folder: the file added was already inside bookdata
        Debug.logger.debug(u"是相同文件夹, 添加的是bookdata中的书")
        os.remove(LIBRARY_DIR + file_name)
    book = Book(book_id)
    book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
    insert_library(book)
    self.update_library()
def create(self):
    u"""Build one epub from every book in book_list.

    The joined title is truncated to 128 characters (windows path-length
    limit) and stripped of special characters; an empty result aborts
    the build (an empty name would expand to a dangerous path).
    """
    self.image_container.set_save_path(Path.image_pool_path)
    self.image_container.start_download()
    title = '_'.join([book.epub.title for book in self.book_list])
    title = title.strip()[:128]               # windows filename cap
    title = ExtraTools.fix_filename(title)    # drop special characters
    if not title:
        return
    Path.chdir(Path.base_path + u'/知乎电子书临时资源库/')
    epub = Book(title, 27149527)
    html_tmp = Path.html_pool_path + '/'
    image_tmp = Path.image_pool_path + '/'
    for book in self.book_list:
        # the first page becomes the info page, the rest plain html
        head = book.page_list[0]
        with open(html_tmp + head.filename, 'w') as html:
            html.write(head.content)
        epub.addInfoPage(html_tmp + head.filename, head.title)
        for page in book.page_list[1:]:
            with open(html_tmp + page.filename, 'w') as html:
                html.write(page.content)
            epub.addHtml(html_tmp + page.filename, page.title)
    for image in self.book.image_list:
        epub.addImg(image_tmp + image['filename'])
    epub.addLanguage('zh-cn')
    epub.addCreator('ZhihuHelp1.7.0')
    epub.addDesc(u'该电子书由知乎助手生成,知乎助手是姚泽源为知友制作的仅供个人使用的简易电子书制作工具,源代码遵循WTFPL,希望大家能认真领会该协议的真谛,为飞面事业做出自己的贡献 XD')
    epub.addRight('CC')
    epub.addPublisher('ZhihuHelp')
    epub.addCss(Path.base_path + u'/www/css/markdown.css')
    epub.addCss(Path.base_path + u'/www/css/front.css')
    epub.buildingEpub()
    Path.reset_path()
    return
def svContent2File(filepath_name, contentInHttp):
    # Persist an answer body to disk.  When the newly fetched content no
    # longer contains the previously saved text, the old version is
    # appended below an "answer updated" divider so history is kept.
    contentBefore = ''
    if Path.is_file(filepath_name):
        with open(filepath_name, 'r') as f:
            contentBefore = f.read()

    def calcContent(contentBefore, contentSave):
        # Decide what to write: the new content, plus the old version
        # under a divider when the old text is not already part of it.
        # NOTE(review): the [10:] slice presumably skips a volatile
        # prefix of the stored file — confirm against addFileSave().
        strContentSave = contentSave
        # tolerate legacy files with broken encodings
        contentBefore = unicode(contentBefore, errors='ignore')
        if (contentSave.find(contentBefore[10:]) == -1):
            strContentSave += '\n -----------------------------------------答案更新分割线------------------\n'
            strContentSave += contentBefore
        return strContentSave
    content2SV = calcContent(contentBefore, contentInHttp)
    with open(filepath_name, 'w') as f:
        f.write(content2SV)
    return
def download_img_in_question_content(self):
    u"""Localise the images embedded in the question detail html and,
    once downloaded, accumulate their size in KB."""
    from src.container.image_container import ImageContainer
    img_container = ImageContainer()
    self.question_content_img_filename_list = []
    src_by_element = Match.match_img_with_src_dict(self.question_info.detail)
    for element in src_by_element:
        filename = img_container.add(src_by_element[element])
        self.question_content_img_filename_list.append(filename)
        self.question_info.detail = self.question_info.detail.replace(
            element, Match.create_img_element_with_file_name(filename))
    img_container.start_download()
    # size bookkeeping once the files exist on disk
    for filename in self.question_content_img_filename_list:
        self.question_content_img_size += Path.get_img_size_by_filename_kb(filename)
    return
def addFileSave(answertmp=None):
    u"""Replace answertmp['content'] with its md5, saving the full text
    to disk when it changed since the last crawl.

    Fix: the default argument was a shared *mutable* ``{}``; it is now
    ``None`` with an early return (the old empty default would have
    raised KeyError immediately anyway).
    """
    if not answertmp:
        return
    save_content = hashlib.md5(answertmp['content']).hexdigest()
    filepath_name = Ans2File.filepath(answertmp)

    def isUpdateContent(save_content, questionID, answerID):
        # True when the md5 stored in the db equals the new digest.
        answerHref = 'http://www.zhihu.com/question/{0}/answer/{1}'.format(questionID, answerID)
        Var = DB.cursor.execute(
            "select content from Answer where href = ?", (answerHref,)).fetchone()
        if not Var:
            Debug.logger.debug(u'读答案md5失败')
            return False  # nothing stored yet -> force a save
        return Var[0] == save_content

    bsaved = isUpdateContent(save_content, answertmp['question_id'], answertmp['answer_id'])
    if not Path.is_file(filepath_name):
        bsaved = False  # db row exists but the file is missing
    if not bsaved:
        Ans2File.svContent2File(filepath_name, answertmp['content'])
    answertmp['content'] = str(save_content)
    return
def start(self):
    u"""Entry point: check for updates, log in, then build every book
    listed in ReadList.txt."""
    self.check_update()
    login = Login()
    Worker.set_zhihu_client(login.get_login_client())
    Debug.logger.info(u"开始读取ReadList.txt设置信息")
    if not Path.is_file(u'./ReadList.txt'):
        # create an empty ReadList.txt so the user has a file to fill in
        with open(u'./ReadList.txt', u'w') as read_list:
            read_list.close()
        print
        Debug.logger.info(u"ReadList.txt 内容为空,自动退出")
        return
    book_counter = self.read_list()
    Debug.logger.info(u"所有书籍制作完成。")
    Debug.logger.info(u"本次共制作书籍{0}本".format(book_counter))
    Debug.logger.info(u"感谢您的使用")
    Debug.logger.info(u"点按任意键退出")
    return
def start(self): print 'start 东财股吧 研报' stockList = [] file_name = 'annual.txt' with open(file_name, 'r') as read_list: read_list = read_list.readlines() resultsL = read_list.__len__() for x in range(0, resultsL): line = read_list[x] splits = line.split('#') code = (str)(splits[0]) fieName = (str)(splits[1]).strip() print fieName stockList.append({'URL': code, 'NAME': fieName}) for xx in stockList: for raw_front_page_index in range(1, 3): fileN = str(xx['NAME']).strip() uux = xx['URL'] sdPath = '/ink/work/62/ink/{}'.format(fileN) Path.mkdir(sdPath) burl = u"http://guba.eastmoney.com/list,{},2,f_{}.html" content = Http.get_content(burl.format(uux, raw_front_page_index)) xxsoup = BeautifulSoup(content, 'html.parser') tagrt = xxsoup.find_all('div', id='articlelistnew')[0] ols = tagrt.find_all('div', class_='articleh normal_post') olss = tagrt.find_all('div', class_='articleh normal_post odd') splicy = [] for xxos in ols: splicy.append(xxos) for xx in olss: splicy.append(xxos) for inkl in splicy: try: inklinkl = BeautifulSoup(str(inkl), 'html.parser') spp = inklinkl.find_all('span', class_='l3')[0] list_pcyc_li = spp.find_all('a') for li in list_pcyc_li: ttt = li.get('href') print ttt destU = u'http://guba.eastmoney.com{}'.format(ttt) result = Http.get_content(destU) # result = unicode(result, 'GBK').encode('UTF-8') xxsoup = BeautifulSoup(result, 'html.parser') title_tationl = xxsoup.find_all('div', id='zwconttbt') tt = str(title_tationl[0].text).strip() print tt title = Match.replace_specile_chars(tt) title = title.replace('/', '', 100) title = title.replace('查看原文', '') ttime = xxsoup.find_all('p', class_='publishdate')[0] tttttime = str(ttime.text)[-10:] print tttttime date_time = datetime.datetime.strptime(tttttime, '%Y-%m-%d') # print date_time.strftime('%Y-%m-%d') ttime = date_time.strftime('%Y-%m-%d') fileName = u"{}_{}.pdf".format(ttime, title) # 时间 券商 名称 author print fileName basePath = '{}/{}'.format(sdPath, fileName) # print basePath # 创建文件夹 # spx = 
xxsoup.find_all('span', class_='zwtitlepdf')[0] pdfu = spx.find_all('a') for li in pdfu: ttt = li.get('href') print ttt Path.mkdirAndPath(basePath) Debug.print_in_single_line(u'开始下载 {}'.format(ttt)) if ttt: content = Http.get_content(url=ttt, timeout=180) if not content: Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt)) content = '' else: Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt)) else: # 当下载地址为空的时候,就没必要再去下载了 content = '' if content.__len__() > 10: with open(basePath, "wb") as pdf: pdf.write(content) except Exception as e: print('next')
def create_book(self):
    u"""Render this (possibly split) volume to an epub, using the first
    task's info-page image as the cover when it can be downloaded."""
    # resolve the output file name; split volumes get a 卷N suffix
    title = Match.fix_filename(self.book_title)
    if self.is_split:
        title = self.book_title + u'_卷{}'.format(self.chapter_no)
    # switch into the e-book scratch directory first
    Path.chdir(Path.book_pool_path)
    epub = Epub(title)
    for task_result in self.task_result_list:
        chapter_src = ''
        # build the chapter info page according to the task type
        if task_result.task.task_type == Type.question:
            chapter_src = self.generate_question_info_page(
                task_result.info_page)
        elif task_result.task.task_type == Type.answer:
            # single answers reuse the question info page
            chapter_src = self.generate_question_info_page(
                task_result.info_page)
        elif task_result.task.task_type == Type.collection:
            chapter_src = self.generate_collection_info_page(
                task_result.info_page)
        elif task_result.task.task_type == Type.topic:
            chapter_src = self.generate_topic_info_page(
                task_result.info_page)
        elif task_result.task.task_type == Type.author:
            chapter_src = self.generate_author_info_page(
                task_result.info_page)
        elif task_result.task.task_type == Type.column:
            # refresh the article count before rendering the column page
            task_result.info_page.article_count = (
                task_result.column_list[0].article_list).__len__()
            chapter_src = self.generate_column_info_page(
                task_result.info_page)
        elif task_result.task.task_type == Type.article:
            chapter_src = self.generate_article_info_page(
                task_result.info_page)
        epub.create_chapter(chapter_src, task_result.get_title())
        for question in task_result.question_list:
            # register the question's image files
            for filename in question.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            question_src = self.generate_question_page(question)
            epub.add_html(question_src, question.question_info.title)
        for column in task_result.column_list:
            # register the column's image files
            for filename in column.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + filename)
            for article in column.article_list:
                article_src = self.generate_article_page(article)
                epub.add_html(article_src, article.title)
        epub.finish_chapter()
    # try to use the first task's info-page image as the cover
    href = self.task_result_list[0].info_page.image_url
    if len(href) > 0:
        print href
    if href:
        content = Http.get_content(
            url=href, timeout=Config.timeout_download_picture)
        if not content:
            Debug.logger.debug(u'图片『{}』下载失败'.format(href))
            content = ''
        else:
            Debug.print_in_single_line(u'图片{}下载完成'.format(href))
    else:
        # empty url -> nothing to download
        content = ''
    if content.__len__() > 10:
        filename = Path.image_pool_path + '/' + 'cover.jpg'
        with open(filename, 'wb') as image:
            image.write(content)
        epub.add_cover_image(filename)
    else:
        # NOTE(review): machine-specific fallback cover path — confirm
        # this file exists on the build machine.
        epub.add_cover_image('/Users/ex-liyan010/Desktop/cover.png')
    epub.set_creator(u'macbookpro2100')
    epub.set_language(u'zh-cn')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
    epub.create()
    Path.reset_path()
    return
def add_cover_image(self, src):
    u"""Copy *src* into the epub images/ directory and register it as the
    cover in the opf manifest.

    Fix: dropped the unused ``resource_id`` local — the return value of
    ``opf.add_cover_image`` was never consumed.
    """
    Path.copy(src, EpubPath.image_path)
    filename = Path.get_filename(src)
    new_src = u'images/' + filename
    self.opf.add_cover_image(new_src)
    return
def add_css(self, src):
    u"""Copy *src* into the epub style/ directory and register it in the
    opf manifest.

    Fix: dropped the unused ``resource_id`` local — the return value of
    ``opf.add_css`` was never consumed.
    """
    Path.copy(src, EpubPath.style_path)
    filename = Path.get_filename(src)
    new_src = u'style/' + filename
    self.opf.add_css(new_src)
    return
def start(self):
    """Crawl istock.jrj.com.cn research-report listings for every stock in
    annual.txt and download each linked pdf to a per-stock directory."""
    print 'start JRJ_Report'
    stockList = []
    file_name = 'annual.txt'
    # Each line of annual.txt is "code#name"; build the work list from it.
    with open(file_name, 'r') as read_list:
        read_list = read_list.readlines()
        resultsL = read_list.__len__()
        for x in range(0, resultsL):
            line = read_list[x]
            splits = line.split('#')
            code = (str)(splits[0])
            fieName = (str)(splits[1]).strip()
            print fieName
            stockList.append({'URL': code, 'NAME': fieName})
    for xx in stockList:
        # Walk the first 7 listing pages for this stock.
        for raw_front_page_index in range(1, 8):
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            # NOTE(review): hard-coded output root — only valid on the
            # author's machine; confirm before reuse.
            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)
            url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"
            request_url = url.format(uux, raw_front_page_index)
            content = Http.get_content(request_url)
            soup = BeautifulSoup(content, 'html.parser')
            # Each report row lives in a left-aligned table cell.
            list_p_list = soup.find_all('td', class_="left")
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                for li in list_pcyc_li:
                    xxurl = li.get('href')
                    # print xxurl
                    # Skip the anchor that just points back at the listing index.
                    if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:
                        time.sleep(1)  # throttle: one detail request per second
                        result = Http.get_content(xxurl)
                        # Pages are served as GBK; re-encode to UTF-8 before parsing.
                        result = unicode(str(result), 'GBK').encode('UTF-8')
                        xxsoup = BeautifulSoup(result, 'html.parser')
                        # title_tationl = xxsoup.find_all('h1')
                        # tt = str(title_tationl[0].text).strip()
                        xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                        xxlist_ds = xxsoup.find_all('span', class_='fr')[0]
                        # Strip the floated span out of the title paragraph so
                        # only the date text remains.
                        realu = str(xxlist_p_list).replace(
                            str(xxlist_ds), '', 1)
                        realuxsoup = BeautifulSoup(realu, 'html.parser')
                        sp = str(realuxsoup.text).split(' ')
                        ttime = sp[1]
                        if ttime.__contains__('发表于'):
                            # Layout variant: the date sits one token later.
                            ttime = sp[2]
                        # print (sp[2]).text
                        # print (sp[3]).text
                        # print ttime
                        all_main = xxsoup.find_all('div', class_='main')[0]
                        realuxsoup = BeautifulSoup(str(all_main), 'html.parser')
                        reaupp = realuxsoup.find_all('p')
                        for pp in reaupp:
                            list_pcyc_li = pp.find_all('a')
                            for li in list_pcyc_li:
                                print li.text
                                ttt = li.get('href')
                                print ttt
                                # Target name "<date>_<link text>.pdf";
                                # '/' removed so it stays a single path segment.
                                fileName = u"{}_{}.pdf".format(
                                    ttime, str(li.text).replace('/', ""))
                                print fileName
                                basePath = '/ink/work/62/ink/{}/{}'.format(
                                    fileN, fileName)
                                # Make sure the directory chain exists
                                # before writing the pdf.
                                Path.mkdirAndPath(basePath)
                                Debug.print_in_single_line(
                                    u'开始下载 {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(
                                            u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(
                                            u'pdf {} 下载完成'.format(ttt))
                                else:
                                    # No download url, so skip the fetch entirely.
                                    content = ''
                                # Anything shorter than ~10 bytes is an
                                # error page, not a real pdf.
                                if content.__len__() > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
def add_html(self, src, title): template = self.get_template('directory', 'item_leaf') self.content += template.format(href=Path.get_filename(src), title=title) return
def create_book(self):
    """Render every task result into chapters and package them as an epub.

    Uses self.book_title / self.is_split / self.chapter_no for naming and
    self.task_result_list for content; output lands in Path.result_path.
    """
    # Resolve the output file name; split volumes get a numbered suffix.
    title = Match.fix_filename(self.book_title)
    if self.is_split:
        title = self.book_title + u'_卷{}'.format(self.chapter_no)
    # Work inside the temporary book resource directory.
    Path.chdir(Path.book_pool_path)
    epub = Epub(title)
    # Map each task type onto the renderer for its chapter info page.
    # Answers reuse the question renderer.
    info_page_renderer = {
        Type.question: self.generate_question_info_page,
        Type.answer: self.generate_question_info_page,
        Type.collection: self.generate_collection_info_page,
        Type.topic: self.generate_topic_info_page,
        Type.author: self.generate_author_info_page,
        Type.column: self.generate_column_info_page,
        Type.article: self.generate_article_info_page,
    }
    for task_result in self.task_result_list:
        chapter_src = ''
        renderer = info_page_renderer.get(task_result.task.task_type)
        if renderer is not None:
            chapter_src = renderer(task_result.info_page)
        epub.create_chapter(chapter_src, task_result.get_title())
        for question in task_result.question_list:
            # Register every image the question references.
            for img_name in question.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + img_name)
            epub.add_html(self.generate_question_page(question),
                          question.question_info.title)
        for column in task_result.column_list:
            # Register every image the column references.
            for img_name in column.img_filename_list:
                epub.add_image(Path.image_pool_path + '/' + img_name)
            for article in column.article_list:
                epub.add_html(self.generate_article_page(article),
                              article.title)
        epub.finish_chapter()
    epub.set_creator(u'ZhihuHelp1.8.0')
    epub.set_language(u'zh-cn')
    epub.set_book_id()
    epub.set_output_path(Path.result_path)
    epub.add_css(Path.base_path + u'/www/css/markdown.css')
    epub.add_css(Path.base_path + u'/www/css/customer.css')
    epub.add_css(Path.base_path + u'/www/css/normalize.css')
    epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
    epub.create()
    Path.reset_path()
    return
def reset_path(): Path.chdir(EpubPath.work_path) return
def download_button_clicked(self):
    """Handle the download button: persist the url list, crawl the selected
    site into epub(s), then move each result into the library and tag it."""
    tags = str(self.custom_tags.text())
    # url_id = self.recipes.model.data(1, QtCore.Qt.UserRole)
    # TODO: fetch the recipes actually selected in the list view
    url_id = str(self.row_clicked(self.recipes.currentIndex()))
    if url_id == 'None':
        QtGui.QMessageBox.information(self, u"Error", u"选择需要爬取的网站!")
        return
    readlist_content = self.plainTextEdit.toPlainText()
    if readlist_content == '':
        QtGui.QMessageBox.information(self, u"Error", u"请在文本框中输入网址")
        return
    # Persist the url list so the crawler can read it back from disk.
    read_list_path = Path.read_list_path
    readList_file = open(read_list_path, 'w')
    readList_file.write(readlist_content)
    readList_file.close()
    game = EEBook(recipe_kind=url_id)
    progress_dlg = QProgressDialog(self)  # TODO: tune dialog size / placement
    progress_dlg.setWindowModality(Qt.WindowModal)
    progress_dlg.setMinimumDuration(5)
    progress_dlg.setWindowTitle(u"请等待")
    progress_dlg.setLabelText(u"制作中...请稍候")
    progress_dlg.setCancelButtonText(u"取消")
    progress_dlg.resize(350, 250)
    progress_dlg.show()
    progress_dlg.setRange(0, 20)
    # NOTE(review): the progress bar is cosmetic — it advances on a timer,
    # not on actual crawl progress.
    for i in range(0, 15):
        progress_dlg.setValue(i)
        QThread.msleep(100)
    for i in range(15, 20):
        progress_dlg.setValue(i)
        QThread.msleep(100)
    if progress_dlg.wasCanceled():
        QtGui.QMessageBox.information(self, u"Error", u"电子书制作失败, 请重新操作")
        return
    try:
        filename = game.begin()  # TODO: only one book can be produced per run
    except TypeError:
        # begin() raising TypeError is treated as "not logged in yet".
        QtGui.QMessageBox.information(self, u"Error", u"第一次使用请登录")
        progress_dlg.close()
        return
    progress_dlg.close()
    info_filename = ','.join(filename)
    QtGui.QMessageBox.information(self, u"info", u"电子书"+str(info_filename)+u"制作成功")
    # Move each produced epub into the library and record its metadata.
    for item in filename:
        file_path = EPUBSTOR_DIR + '/' + item
        Path.copy(str(file_path+'.epub'), LIBRARY_DIR)
        file_name = os.path.basename(str(file_path))
        book_id = file_name.split('.epub')[0]
        # Each book gets its own directory inside the library.
        Path.mkdir(LIBRARY_DIR + book_id)
        shutil.move(LIBRARY_DIR+book_id+'.epub', LIBRARY_DIR+book_id)
        book = Book(str(book_id))
        book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
        # Accumulate tags: user tags, source url, optionally the title.
        book.tags += tags.replace(' ', '')
        book.tags += ','+str(self.now_url)
        if self.add_title_tag.isChecked():
            book.tags += ','+str(book.title)
        insert_library(book)
    return
def init_epub_path(work_path): """ 设置工作地址,根据该路径进行创建文件夹,生成epub,压缩等操作 """ EpubPath.set_work_path(work_path) Path.mkdir(EpubPath.meta_inf_path) Path.mkdir(EpubPath.oebps_path) Path.chdir(EpubPath.oebps_path) Path.mkdir(EpubPath.html_path) Path.mkdir(EpubPath.image_path) Path.mkdir(EpubPath.style_path) return
def start(self):
    """Crawl the eastmoney strategy-report feed (sty=CLBG) page by page,
    filter reports by title keyword, and download each linked pdf."""
    print ' 中文研报 '
    stockList = []
    # Walk the first 250 feed pages.
    for raw_front_page_index in range(1, 251):
        fileN = '策略'
        # NOTE(review): hard-coded output root — only valid on the
        # author's machine; confirm before reuse.
        sdPath = '/ink/work/62/ink/{}'.format(fileN)
        Path.mkdir(sdPath)
        # http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p=2&js=var%20UxmjGoYW={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&
        burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=CLBG&cmd=4&code=&ps=50&p="
        # burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p="
        uu = u"&js=var%20GdYXcAjX={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&"
        url = '%s%s%s' % (burl, str(raw_front_page_index), uu)
        # print url
        content = Http.get_content(url)
        if content:
            # Any parse/network hiccup on one page skips to the next page.
            try:
                # The endpoint returns "var XXX={json}"; keep only the json part.
                jsonD = str(content).split('=')[-1]
                jdata = json.loads(jsonD)
                articles = jdata['data']
                for article in articles:
                    # Each entry is a comma-joined record:
                    # index 0 = timestamp, 1 = info code, 5 = title.
                    xxxs = str(article).split(',')
                    rticlet = xxxs[0]
                    preTitle = xxxs[5]
                    if str(preTitle).__contains__('川财') or str(preTitle).__contains__('或'):
                        continue
                    # if str(preTitle).__contains__('历史') or str(preTitle).__contains__('周期')or str(preTitle).__contains__('成长'):
                    # if str(preTitle).__contains__('政治') or str(preTitle).__contains__('中央经济')or str(preTitle).__contains__('贸易战'):
                    if str(preTitle).__contains__('日本'):
                        print preTitle
                        date_time = datetime.datetime.strptime(rticlet, '%Y/%m/%d %H:%M:%S')
                        infoCode = xxxs[1]
                        # Build the report detail page url from date + info code.
                        destU = u"http://data.eastmoney.com/report/{}/cl,{}.html ".format(
                            date_time.strftime('%Y%m%d'), infoCode)
                        print destU
                        result = Http.get_content(destU)
                        # Pages are served as GBK; re-encode to UTF-8 before parsing.
                        result = unicode(result, 'GBK').encode('UTF-8')
                        xxsoup = BeautifulSoup(result, 'html.parser')
                        title_tationl = xxsoup.find_all('h1')
                        tt = str(title_tationl[0].text).strip()
                        xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]
                        sp = xxlist_p_list.find_all('span')
                        # Second span carries the publication timestamp.
                        ttime = str((sp[1]).text)
                        date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')
                        # print date_time.strftime('%Y-%m-%d')
                        ttime = date_time.strftime('%Y-%m-%d')
                        # print (sp[2]).text
                        # print (sp[3]).text
                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)
                        # File name layout: date_broker_title_author.pdf
                        fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                        print fileName
                        urlsp = sp[-1]
                        basePath = '{}/{}'.format(sdPath, fileName)
                        # print basePath
                        # Last span holds the actual pdf download anchor(s).
                        list_pcyc_li = urlsp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')
                            # Ensure the directory chain exists before writing.
                            Path.mkdirAndPath(basePath)
                            print ttt
                            Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                            else:
                                # No download url, so skip the fetch entirely.
                                content = ''
                            # Anything shorter than ~10 bytes is an error
                            # page, not a real pdf.
                            if content.__len__() > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
            except Exception as e:
                print('next')
def download_button_clicked(self):
    """Handle the download button: persist the url list, crawl the selected
    site into epub(s), then move each result into the library and tag it."""
    tags = str(self.custom_tags.text())
    # url_id = self.recipes.model.data(1, QtCore.Qt.UserRole)
    # TODO: fetch the recipes actually selected in the list view
    url_id = str(self.row_clicked(self.recipes.currentIndex()))
    if url_id == 'None':
        QtGui.QMessageBox.information(self, u"Error", u"选择需要爬取的网站!")
        return
    readlist_content = self.plainTextEdit.toPlainText()
    if readlist_content == '':
        QtGui.QMessageBox.information(self, u"Error", u"请在文本框中输入网址")
        return
    # Persist the url list so the crawler can read it back from disk.
    read_list_path = Path.read_list_path
    readList_file = open(read_list_path, 'w')
    readList_file.write(readlist_content)
    readList_file.close()
    game = EEBook(recipe_kind=url_id)
    progress_dlg = QProgressDialog(self)  # TODO: tune dialog size / placement
    progress_dlg.setWindowModality(Qt.WindowModal)
    progress_dlg.setMinimumDuration(5)
    progress_dlg.setWindowTitle(u"请等待")
    progress_dlg.setLabelText(u"制作中...请稍候")
    progress_dlg.setCancelButtonText(u"取消")
    progress_dlg.resize(350, 250)
    progress_dlg.show()
    progress_dlg.setRange(0, 20)
    # NOTE(review): the progress bar is cosmetic — it advances on a timer,
    # not on actual crawl progress.
    for i in range(0, 15):
        progress_dlg.setValue(i)
        QThread.msleep(100)
    for i in range(15, 20):
        progress_dlg.setValue(i)
        QThread.msleep(100)
    if progress_dlg.wasCanceled():
        QtGui.QMessageBox.information(self, u"Error", u"电子书制作失败, 请重新操作")
        return
    try:
        filename = game.begin()  # TODO: only one book can be produced per run
    except TypeError:
        # begin() raising TypeError is treated as "not logged in yet".
        QtGui.QMessageBox.information(self, u"Error", u"第一次使用请登录")
        progress_dlg.close()
        return
    progress_dlg.close()
    info_filename = ','.join(filename)
    QtGui.QMessageBox.information(
        self, u"info", u"电子书" + str(info_filename) + u"制作成功")
    # Move each produced epub into the library and record its metadata.
    for item in filename:
        file_path = EPUBSTOR_DIR + '/' + item
        Path.copy(str(file_path + '.epub'), LIBRARY_DIR)
        file_name = os.path.basename(str(file_path))
        book_id = file_name.split('.epub')[0]
        # Each book gets its own directory inside the library.
        Path.mkdir(LIBRARY_DIR + book_id)
        shutil.move(LIBRARY_DIR + book_id + '.epub', LIBRARY_DIR + book_id)
        book = Book(str(book_id))
        book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
        # Accumulate tags: user tags, source url, optionally the title.
        book.tags += tags.replace(' ', '')
        book.tags += ',' + str(self.now_url)
        if self.add_title_tag.isChecked():
            book.tags += ',' + str(book.title)
        insert_library(book)
    return
def create_single_html_book(self, book_package): title = book_package.get_title() if not title: # 电子书题目为空时自动跳过 # 否则会发生『rm -rf / 』的惨剧 return Path.reset_path() Path.chdir(Path.result_path) Path.rmdir(u'./' + title) Path.mkdir(u'./' + title) Path.chdir(u'./' + title) page = [] for book in book_package.book_list: page += book.page_list content = u' \r\n '.join([Match.html_body(x.content) for x in page]).replace(u'../images/', u'./images/') with open(TemplateConfig.content_base_uri) as html: content = html.read().format(title=title, body=content).replace(u'../style/', u'./') with open(title + u'.html', 'w') as html: html.write(content) Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images') Path.copy(Path.www_css + u'/customer.css', u'./customer.css') Path.copy(Path.www_css + u'/markdown.css', u'./markdown.css') Path.copy(Path.www_css + u'/normalize.css', u'./normalize.css') Path.reset_path() return
def start(self):
    """Crawl eastmoney per-stock research reports (sty=GGSR feed) for every
    stock listed in annual.txt and download each linked pdf."""
    print 'start 东财研报'
    stockList = []
    file_name = 'annual.txt'
    # Each line of annual.txt is "code#name"; build the work list from it.
    with open(file_name, 'r') as read_list:
        read_list = read_list.readlines()
        resultsL = read_list.__len__()
        for x in range(0, resultsL):
            line = read_list[x]
            splits = line.split('#')
            code = (str)(splits[0])
            fieName = (str)(splits[1]).strip()
            print fieName
            stockList.append({'URL': code, 'NAME': fieName})
    for xx in stockList:
        # Walk the first 4 feed pages for this stock.
        for raw_front_page_index in range(1, 5):
            fileN = str(xx['NAME']).strip()
            uux = xx['URL']
            # NOTE(review): hard-coded output root — only valid on the
            # author's machine; confirm before reuse.
            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)
            # url = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&p=1&code=000333&rt=51734025"
            burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&"
            uu = u"p={0}&code={1}&rt="
            url = '%s%s' % (burl, uu.format(raw_front_page_index, uux))
            content = Http.get_content(url)
            if content:
                # The endpoint returns "var XXX={json}"; keep only the json part.
                jsonD = str(content).split('=')[-1]
                jdata = json.loads(jsonD)
                articles = jdata['data']
                for article in articles:
                    rticlet = article['datetime']
                    date_time = datetime.datetime.strptime(rticlet, '%Y-%m-%dT%H:%M:%S')
                    # Build the report detail page url from date + info code.
                    destU = u"http://data.eastmoney.com/report/{}/{}.html ".format(
                        date_time.strftime('%Y%m%d'), article['infoCode'])
                    result = Http.get_content(destU)
                    # Pages are served as GBK; re-encode to UTF-8 before parsing.
                    result = unicode(result, 'GBK').encode('UTF-8')
                    xxsoup = BeautifulSoup(result, 'html.parser')
                    title_tationl = xxsoup.find_all('h1')
                    tt = str(title_tationl[0].text).strip()
                    xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]
                    sp = xxlist_p_list.find_all('span')
                    # Second span carries the publication timestamp.
                    ttime = str((sp[1]).text)
                    date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')
                    # print date_time.strftime('%Y-%m-%d')
                    ttime = date_time.strftime('%Y-%m-%d')
                    # print (sp[2]).text
                    # print (sp[3]).text
                    title = Match.replace_specile_chars(tt)
                    title = title.replace('/', '', 100)
                    # File name layout: date_broker_title_author.pdf
                    fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                    print fileName
                    urlsp = sp[-1]
                    basePath = '{}/{}'.format(sdPath, fileName)
                    # print basePath
                    # Last span holds the actual pdf download anchor(s).
                    list_pcyc_li = urlsp.find_all('a')
                    for li in list_pcyc_li:
                        ttt = li.get('href')
                        # Ensure the directory chain exists before writing.
                        Path.mkdirAndPath(basePath)
                        print ttt
                        Debug.print_in_single_line(u'开始下载 {}'.format(ttt))
                        if ttt:
                            content = Http.get_content(url=ttt, timeout=180)
                            if not content:
                                Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                content = ''
                            else:
                                Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                        else:
                            # No download url, so skip the fetch entirely.
                            content = ''
                        # Anything shorter than ~10 bytes is an error page,
                        # not a real pdf.
                        if content.__len__() > 10:
                            with open(basePath, "wb") as pdf:
                                pdf.write(content)