Example #1
File: main.py Project: zhejia/ZhihuHelp
    def create_book(self, command, counter):
        Path.reset_path()
        Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
        Debug.logger.info(u"对记录 {} 进行分析".format(command))
        task_list = CommandParser.get_task_list(command)  # 分析命令

        if len(task_list) == 0:
            return

        for task in task_list:
            if Config.debug_for_create_book:
                pass
            else:
                Worker.distribute(task)
        Debug.logger.info(u"网页信息抓取完毕")

        task_result_list = []
        for task in task_list:
            task_result = TaskResult(task)
            task_result.extract_data()
            task_result_list.append(task_result)
        Debug.logger.info(u"数据库信息获取完毕")

        #   下载图片
        for task_result in task_result_list:
            task_result.download_img()
        Debug.logger.info(u"所有任务图片获取完毕")

        #   按体积自动分卷
        #   渲染html && 压缩为电子书
        book = Book(task_result_list)
        book_list = book.auto_split(Config.max_book_size_mb * 1024)
        for chapter in book_list:
            chapter.create_book()
        return
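The `auto_split` call above receives a size budget in KB (`Config.max_book_size_mb * 1024`) and breaks the collected task results into volumes, each of which is rendered as its own epub. A plausible minimal sketch of that grouping, assuming each task result exposes its accumulated image size through a `total_img_size_kb` attribute (as the `download_img` examples further down suggest):

def auto_split(task_result_list, max_size_kb):
    # Greedily pack task results into volumes so that no volume exceeds max_size_kb.
    volumes, current, current_size = [], [], 0
    for result in task_result_list:
        size = getattr(result, 'total_img_size_kb', 0)
        if current and current_size + size > max_size_kb:
            volumes.append(current)
            current, current_size = [], 0
        current.append(result)
        current_size += size
    if current:
        volumes.append(current)
    return volumes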
Example #2
File: book.py Project: zuiwan/ZhihuHelp
 def create_book(self, book_package):
     book_package.image_container.set_save_path(Path.image_pool_path)
     book_package.image_container.start_download()
     title = book_package.get_title()
     if not title:
         # 电子书题目为空时自动跳过
         # 否则会发生『rm -rf / 』的惨剧
         return
     Path.chdir(Path.base_path + u'/知乎电子书临时资源库/')
     epub = Epub(title)
     html_tmp_path = Path.html_pool_path + u'/'
     image_tmp_path = Path.image_pool_path + u'/'
     epub.set_creator(u'ZhihuHelp1.7.0')
     epub.set_book_id()
     epub.set_output_path(Path.result_path)
     epub.add_css(Path.base_path + u'/www/css/markdown.css')
     epub.add_css(Path.base_path + u'/www/css/customer.css')
     epub.add_css(Path.base_path + u'/www/css/normalize.css')
     for book in book_package.book_list:
         page = book.page_list[0]
         with open(html_tmp_path + page.filename, u'w') as html:
             html.write(page.content)
         epub.create_chapter(html_tmp_path + page.filename, page.title)
         for page in book.page_list[1:]:
             with open(html_tmp_path + page.filename, u'w') as html:
                 html.write(page.content)
             epub.add_html(html_tmp_path + page.filename, page.title)
         epub.finish_chapter()
     for image in book_package.image_list:
         epub.add_image(image_tmp_path + image['filename'])
     epub.create()
     Path.reset_path()
     return
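For context on what `epub.create()` (and `buildingEpub()` in later examples) ultimately has to produce: an EPUB is a ZIP archive whose first entry is an uncompressed `mimetype` file, plus a `META-INF/container.xml` that points at the OPF package file. A minimal packaging sketch using only the standard library; the archive paths and the content mapping are illustrative, not the project's actual output:

import zipfile

CONTAINER_XML = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">\n'
    '  <rootfiles>\n'
    '    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>\n'
    '  </rootfiles>\n'
    '</container>\n'
)

def pack_epub(output_path, oebps_files):
    # oebps_files maps archive paths such as 'OEBPS/html/page1.html' to their content.
    with zipfile.ZipFile(output_path, 'w') as z:
        # The mimetype entry must come first and must be stored uncompressed.
        z.writestr('mimetype', 'application/epub+zip', zipfile.ZIP_STORED)
        z.writestr('META-INF/container.xml', CONTAINER_XML, zipfile.ZIP_DEFLATED)
        for name, content in oebps_files.items():
            z.writestr(name, content, zipfile.ZIP_DEFLATED)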
Example #3
    def create(self):
        self.image_container.set_save_path(Path.image_pool_path)
        self.image_container.start_download()
        title = "_".join([book.property.epub.title for book in self.book_list])
        title = title.strip()
        Path.chdir(Path.base_path + u"/知乎电子书临时资源库/")
        epub = Book(title, 27149527)
        html_tmp_path = Path.html_pool_path + "/"
        image_tmp_path = Path.image_pool_path + "/"
        for book in self.book_list:
            page = book.page_list[0]
            with open(html_tmp_path + page.filename, "w") as html:
                html.write(page.content)
            epub.createChapter(html_tmp_path + page.filename, ExtraTools.get_time(), page.title)

            for page in book.page_list[1:]:
                with open(html_tmp_path + page.filename, "w") as html:
                    html.write(page.content)
                epub.addHtml(html_tmp_path + page.filename, page.title)
        for image in self.book["image_list"]:
            epub.addImg(image_tmp_path + image["filename"])
        epub.addLanguage("zh-cn")
        epub.addCreator("ZhihuHelp1.7.0")
        epub.addDesc(u"该电子书由知乎助手生成,知乎助手是姚泽源为知友制作的仅供个人使用的简易电子书制作工具,源代码遵循WTFPL,希望大家能认真领会该协议的真谛,为飞面事业做出自己的贡献 XD")
        epub.addRight("CC")
        epub.addPublisher("ZhihuHelp")
        Debug.logger.debug(u"当前目录为")
        Path.pwd()
        epub.addCss(Path.base_path + u"/epubResource/markdown.css")
        epub.addCss(Path.base_path + u"/epubResource/front.css")
        epub.buildingEpub()
        return
Example #4
 def add_index_html(self, src, title):
     Path.copy(src, EpubPath.html_path)
     filename = Path.get_filename(src)
     new_src = u'html/' + filename
     resource_id = self.opf.add_html(new_src)
     self.toc.add_item(resource_id, new_src, title)
     return
Example #5
File: db.py Project: HiltonWei/zhihu
 def filepath(answertmp={}):
     if not answertmp:
         # return early so answerpath is never referenced before assignment
         return None
     answerpath = Path.answer_path + '/Answer_qid_aid/' + '{0}'.format(answertmp['question_id'])
     if not Path.is_dir(answerpath):
         Path.mkdirs(answerpath)
     filename = '{0}_{1}.txt'.format(answertmp['question_id'], answertmp['answer_id'])
     return Path.join_dir(answerpath, filename)
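The directory layout this helper assumes (an `Answer_qid_aid` folder per question, one `<question_id>_<answer_id>.txt` file per answer) can be reproduced with the standard library alone; a rough equivalent, with `base_path` standing in for `Path.answer_path`:

import os

def answer_file_path(base_path, answer):
    # answer is expected to carry 'question_id' and 'answer_id' keys.
    folder = os.path.join(base_path, 'Answer_qid_aid', str(answer['question_id']))
    if not os.path.isdir(folder):
        os.makedirs(folder)
    filename = '{0}_{1}.txt'.format(answer['question_id'], answer['answer_id'])
    return os.path.join(folder, filename)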
Example #6
    def __init__(self, recipe_kind='Notset', read_list='ReadList.txt', url=None, debug=False):
        u"""
        配置文件使用$符区隔,同一行内的配置文件归并至一本电子书内
        :param recipe_kind:
        :param read_list: default value: ReadList.txt
        :param url:
        :param debug:
        :return:
        """
        self.recipe_kind = recipe_kind
        self.read_list = read_list
        self.url = url
        log.warning_log(u"website type: " + str(self.recipe_kind) + '\n')
        import logging
        if debug is True:
            Debug.logger.setLevel(logging.DEBUG)
        else:
            Debug.logger.setLevel(logging.INFO)

        Debug.logger.debug(u"read_list: " + str(self.read_list))
        Debug.logger.debug(u"url: " + str(self.url))
        Debug.logger.debug(u"recipe type:" + str(recipe_kind))

        Path.init_base_path(recipe_kind)        # 设置路径
        Path.init_work_directory()              # 创建路径
        self.init_database()                    # 初始化数据库
        Config._load()
        return
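The `debug` flag in this constructor only switches the log level; `Debug.logger` itself is presumably an ordinary module-level `logging` logger. A minimal sketch of such a wrapper, assuming nothing beyond the standard `logging` module (the logger name is arbitrary):

import logging
import sys

class Debug(object):
    logger = logging.getLogger('book_maker')  # name chosen for illustration only

def init_logger(debug=False):
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    Debug.logger.addHandler(handler)
    Debug.logger.setLevel(logging.DEBUG if debug else logging.INFO)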
Example #7
 def add_index_html(self, src, title):
     Path.copy(src, EpubPath.html_path)
     filename = Path.get_filename(src)
     new_src = u'html/' + filename
     resource_id = self.opf.add_html(new_src)
     self.toc.add_item(resource_id, new_src, title)
     return
Example #8
 def create_chapter(self, src, title):
     Path.copy(src, EpubPath.html_path)
     filename = Path.get_filename(src)
     new_src = u'html/' + filename
     resource_id = self.opf.add_title_page_html(new_src)
     self.directory.create_chapter(new_src, title)
     self.toc.create_chapter(resource_id, new_src, title)
     return
Example #9
 def create_chapter(self, src, title):
     Path.copy(src, EpubPath.html_path)
     filename = Path.get_filename(src)
     new_src = u'html/' + filename
     resource_id = self.opf.add_title_page_html(new_src)
     self.directory.create_chapter(new_src, title)
     self.toc.create_chapter(resource_id, new_src, title)
     return
Example #10
 def __init__(self):
     u"""
     配置文件使用$符区隔,同一行内的配置文件归并至一本电子书内
     """
     init.init_database()
     Path.init_base_path()
     Config._load()
     return
Example #11
File: main.py Project: zhejia/ZhihuHelp
 def __init__(self):
     #   初始化目录结构
     Path.init_base_path()
     Path.init_work_directory()
     #   初始化数据库链接
     DB.init_database()
     #   初始化配置
     Config.init_config()
     return
Example #12
 def __init__(self):
     u"""
     配置文件使用$符区隔,同一行内的配置文件归并至一本电子书内
     """
     Path.init_base_path()  # 设置路径
     Path.init_work_directory()  # 创建路径
     self.init_database()  # 初始化数据库
     Config._load()
     return
Example #13
 def __init__(self):
     u"""
     配置文件使用$符区隔,同一行内的配置文件归并至一本电子书内
     """
     Path.init_base_path()       # 设置路径
     Path.init_work_directory()  # 创建路径
     self.init_database()        # 初始化数据库
     Config._load()
     return
Example #14
    def create_book(self):
        #   确定文件信息
        title = Match.fix_filename(self.book_title)
        if self.is_split:
            title = self.book_title + u'_卷{}'.format(self.chapter_no)

        #   先切换到电子书临时资源目录下
        Path.chdir(Path.book_pool_path)
        epub = Epub(title)
        for task_result in self.task_result_list:
            chapter_src = ''
            # info_page
            if task_result.task.task_type == Type.question:
                chapter_src = self.generate_question_info_page(task_result.info_page)
            elif task_result.task.task_type == Type.answer:
                chapter_src = self.generate_question_info_page(task_result.info_page)
            elif task_result.task.task_type == Type.collection:
                chapter_src = self.generate_collection_info_page(task_result.info_page)
            elif task_result.task.task_type == Type.topic:
                chapter_src = self.generate_topic_info_page(task_result.info_page)
            elif task_result.task.task_type == Type.author:
                chapter_src = self.generate_author_info_page(task_result.info_page)
            elif task_result.task.task_type == Type.column:
                chapter_src = self.generate_column_info_page(task_result.info_page)
            elif task_result.task.task_type == Type.article:
                chapter_src = self.generate_article_info_page(task_result.info_page)
            epub.create_chapter(chapter_src, task_result.get_title())
            for question in task_result.question_list:
                #   添加图片文件
                for filename in question.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                question_src = self.generate_question_page(question)
                epub.add_html(question_src, question.question_info.title)

            for column in task_result.column_list:
                #   添加图片文件
                for filename in column.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                for article in column.article_list:
                    article_src = self.generate_article_page(article)
                    epub.add_html(article_src, article.title)
            epub.finish_chapter()

        epub.set_creator(u'ZhihuHelp1.8.0')
        epub.set_language(u'zh-cn')
        epub.set_book_id()
        epub.set_output_path(Path.result_path)
        epub.add_css(Path.base_path + u'/www/css/markdown.css')
        epub.add_css(Path.base_path + u'/www/css/customer.css')
        epub.add_css(Path.base_path + u'/www/css/normalize.css')
        epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
        epub.create()

        Path.reset_path()
        return
Example #15
    def create_book(command, counter):
        Path.reset_path()

        Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
        Debug.logger.info(u"对记录 {} 进行分析".format(command))
        task_package = ReadListParser.get_task(command)  # 分析命令

        if not task_package.is_work_list_empty():
            worker_factory(task_package.work_list)  # 执行抓取程序
            Debug.logger.info(u"网页信息抓取完毕")

        if not task_package.is_book_list_empty():
            Debug.logger.info(u"开始从数据库中生成电子书")
            book = Book(task_package.book_list)
            book.create()
        return
Example #16
    def download_img(self):
        from src.container.image_container import ImageContainer
        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.content)
        self.img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)
            self.img_filename_list.append(filename)
            self.content = self.content.replace(img, Match.create_img_element_with_file_name(filename))

        #   下载文章封面图像
        filename = img_container.add(self.image_url)
        self.img_filename_list.append(filename)
        self.image_url = Match.create_local_img_src(filename)

        #   下载用户头像
        filename = img_container.add(self.author_avatar_url)
        self.img_filename_list.append(filename)
        self.author_avatar_url = Match.create_local_img_src(filename)

        img_container.start_download()

        #   下载完成后,更新图片大小
        for filename in self.img_filename_list:
            self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
        return
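`Match.match_img_with_src_dict` evidently maps every full `<img>` tag in the content to its `src` attribute so that the tag can later be swapped for one pointing at the downloaded local file. A rough standalone equivalent with `re`; the regex and the replacement markup are assumptions, not the project's actual templates:

import re

IMG_TAG_RE = re.compile(r'<img[^>]*?src="([^"]+)"[^>]*>', re.IGNORECASE)

def match_img_with_src_dict(html):
    # Map each complete <img ...> tag to the value of its src attribute.
    return dict((m.group(0), m.group(1)) for m in IMG_TAG_RE.finditer(html))

def replace_with_local_images(html, filename_by_src):
    # filename_by_src: remote src -> local filename chosen by the image container.
    for tag, src in match_img_with_src_dict(html).items():
        local = filename_by_src.get(src)
        if local:
            html = html.replace(tag, '<img src="../images/{0}"/>'.format(local))
    return html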
Example #17
    def create_book(command, counter):
        Path.reset_path()

        Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
        Debug.logger.info(u"对记录 {} 进行分析".format(command))
        task_package = ReadListParser.get_task(command)  # 分析命令

        if not task_package.is_work_list_empty():
            worker_factory(task_package.work_list)  # 执行抓取程序
            Debug.logger.info(u"网页信息抓取完毕")

        if not task_package.is_book_list_empty():
            Debug.logger.info(u"开始从数据库中生成电子书")
            book = Book(task_package.book_list)
            book.create()
        return
Example #18
    def download_img(self):
        from src.container.image_container import ImageContainer
        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.content)
        self.img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)
            self.img_filename_list.append(filename)
            self.content = self.content.replace(
                img, Match.create_img_element_with_file_name(filename))

        #   答案作者的头像也要下载
        filename = img_container.add(self.author_avatar_url)
        self.img_filename_list.append(filename)
        self.author_avatar_url = Match.create_local_img_src(filename)

        img_container.start_download()

        #   下载完成后,更新图片大小
        for filename in self.img_filename_list:
            self.total_img_size_kb += Path.get_img_size_by_filename_kb(
                filename)

            print self.total_img_size_kb
        return
Example #19
 def init_database():
     if Path.is_file(Path.db_path):
         DB.set_conn(sqlite3.connect(Path.db_path))
     else:
         DB.set_conn(sqlite3.connect(Path.db_path))
         with open(Path.sql_path) as sql_script:
             DB.cursor.executescript(sql_script.read())
         DB.commit()
Example #20
 def init_database():
     if Path.is_file(Path.db_path):
         DB.set_conn(sqlite3.connect(Path.db_path))
     else:
         DB.set_conn(sqlite3.connect(Path.db_path))
         with open(Path.sql_path) as sql_script:
             DB.cursor.executescript(sql_script.read())
         DB.commit()
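Examples #19 and #20 (and #28 below, which adds logging) share one pattern: connect to SQLite, and run the schema script only when the database file did not exist beforehand. A self-contained sketch of that pattern with plain `sqlite3`; the file names are placeholders:

import os
import sqlite3

def init_database(db_path='zhihu.db', sql_path='create_table.sql'):
    first_run = not os.path.isfile(db_path)
    conn = sqlite3.connect(db_path)  # creates the file if it is missing
    if first_run:
        with open(sql_path) as sql_script:
            conn.executescript(sql_script.read())  # build the schema exactly once
        conn.commit()
    return conn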
Example #21
    def download_img(self):
        from src.container.image_container import ImageContainer

        if str(self.content).__contains__('<div class="duokan-image-single">'):
            # print img_src_dict

            xtep = str(self.content)
            xxsoup = BeautifulSoup(xtep, 'lxml')
            list_tiezhi_tit = xxsoup.find_all('div', class_="duokan-image-single")
            for x in list_tiezhi_tit:

                list_pcyc_li = x.find_all('img')
                for li in list_pcyc_li:
                    # print li
                    src = li.get('src')

                    st = str(src).split('/images/')[-1]

                    newT = u'<img class="ke_img" src="file:///Users/ink/Desktop/images/{}"  />'.format(st)

                    xtep = xtep.replace(str(x), newT, 1)

            self.content = xtep

            # print xtep

        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.content)

        self.img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)

            # print 'src:' + src + '  and filename  ' + filename

            self.img_filename_list.append(filename)
            if str(img).__contains__(u"class=\"avatar\""):
                self.content = self.content.replace(img, Match.avatar_create_img_element_with_file_name(filename))
            else:
                self.content = self.content.replace(img, Match.create_img_element_with_file_name(filename))
        # 下载文章封面图像
        filename = img_container.add(self.image_url)
        self.img_filename_list.append(filename)
        self.image_url = Match.create_local_img_src(filename)

        #   下载用户头像
        filename = img_container.add(self.author_avatar_url)
        self.img_filename_list.append(filename)
        self.author_avatar_url = Match.create_local_img_src(filename)

        img_container.start_download()

        #   下载完成后,更新图片大小
        for filename in self.img_filename_list:
            self.total_img_size_kb += Path.get_img_size_by_filename_kb(filename)
        return
Example #22
 def add_html(self, src, title):
     u"""
         add_index为add_html不需要添加文件时的特殊情况
     """
     self.add_index_html(src, title)
     filename = Path.get_filename(src)
     new_src = u'html/' + filename
     self.directory.add_html(new_src, title)
     return
Example #23
 def add_html(self, src, title):
     u"""
         add_index为add_html不需要添加文件时的特殊情况
     """
     self.add_index_html(src, title)
     filename = Path.get_filename(src)
     new_src = u'html/' + filename
     self.directory.add_html(new_src, title)
     return
Example #24
File: db.py Project: HiltonWei/zhihu
 def getAnswerContentFromFile( answertmp = {}):
     if not answertmp:
         return
     filepath_name = Ans2File.filepath(answertmp)
     if Path.is_file(filepath_name):
         with open(filepath_name, 'r') as f:
             fileContent = f.read()
             answertmp['content'] = fileContent
     return
Example #25
    def create_book(self, command, counter):
        Path.reset_path()
        Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
        Debug.logger.info(u"对记录 {} 进行分析".format(command))
        task_list = CommandParser.get_task_list(command)  # 分析命令

        if len(task_list) == 0:
            return

        for task in task_list:
            if Config.debug_for_create_book:
                pass
            else:
                Worker.distribute(task)
        Debug.logger.info(u"网页信息抓取完毕")

        task_result_list = []
        toTo_list = [
            Type.wechat, Type.huxiu, Type.huawei, Type.xueqiu, Type.sina,
            Type.zhengshitang, Type.jinwankansa, Type.wuxia, Type.doc360,
            Type.todo, Type.todo1, Type.todo2, Type.fiel, Type.taoguba_article
        ]
        for task in task_list:
            if task.get_task_type() in toTo_list:
                task = ColumnTask(task.account_id)
            task_result = TaskResult(task)
            task_result.extract_data()
            task_result_list.append(task_result)
        Debug.logger.info(u"数据库信息获取完毕")

        #   下载图片
        for task_result in task_result_list:
            task_result.download_img()
            # print '所有任务图片获取完毕'
        Debug.logger.info(u"所有任务图片获取完毕")

        #   按体积自动分卷
        #   渲染html && 压缩为电子书
        book = Book(task_result_list)
        book_list = book.auto_split(Config.max_book_size_mb * 1024)
        for chapter in book_list:
            chapter.create_book()

        return
Example #26
    def create_chapter(self, src, title):
        template = self.get_template('directory', 'item_root')
        item = template.format(href=Path.get_filename(src), title=title)
        if self.chapter_deep == 0:
            template = self.get_template('directory', 'chapter')
            item = template.format(item=item, title=u'目录')
        self.content += item

        self.chapter_deep += 1
        return
Example #27
def reloadtocatch():
    rootdir = "/Volumes/MacintoshHD/TC/todo"  # 指明被遍历的文件夹
    filelist = os.listdir(rootdir)
    ff = open('TGB_List.txt', 'w')
    for file in filelist:
        print file
        # if file.startswith('TGB'):
        article_id = (str(file).split('_')[-1]).strip(')')
        print article_id
        # tagrId = u"http://www.taoguba.com.cn/Article/{}/0\n".format(article_id)
        tagrId = u"https://www.huxiu.com/{}\n".format(article_id)
        ff.write(tagrId)
    ff.close()  # close the list before the next step; reopening it with 'w' would wipe what was just written

    # 图片文件夹
    picdir = "/Volumes/MacintoshHD/TC/picture"
    for parent, dirnames, filenames in os.walk(
            rootdir):  # 三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字
        # for dirname in dirnames:  # 输出文件夹信息
        #     # print "parent is: " + parent
        #     print  "dirname is: " + dirname

        for filename in filenames:  # 输出文件信息
            print "parent is :" + parent
            print "filename is:" + filename
            print "the full name of the file is:" + os.path.join(
                parent, filename)  # 输出文件路径信息

            if filename.endswith('jpg'):
                Path.copy(os.path.join(parent, filename), picdir)

    # for parent, dirnames, filenames in os.walk(rootdir):  # 三个参数:分别返回1.父目录 2.所有文件夹名字(不含路径) 3.所有文件名字
    #     for dirname in dirnames:  # 输出文件夹信息
    #     #     # print "parent is: " + parent
    #         print  "dirname is: " + dirname
    #
    #     # for filename in filenames:  # 输出文件信息
    #     #     print "parent is :" + parent
    #     #     print "filename is:" + filename
    #     #     print "the full name of the file is:" + os.path.join(parent, filename)  # 输出文件路径信息
    ff.close()
Example #28
 def init_database():
     if Path.is_file(Path.db_path):
         Debug.logger.debug(u"Connect to the database...")
         Debug.logger.debug(u"db_path: " + str(Path.db_path))
         DB.set_conn(sqlite3.connect(Path.db_path))
     else:
         Debug.logger.debug(u"Create db file...")
         DB.set_conn(sqlite3.connect(Path.db_path))
         with open(Path.sql_path) as sql_script:
             DB.cursor.executescript(sql_script.read())
         DB.commit()
Example #29
File: book.py Project: mozii/EE-Book
 def create_book(self, book_package):
     book_package.image_container.set_save_path(Path.image_pool_path)
     book_package.image_container.start_download()
     title = book_package.get_title()
     Debug.logger.debug(u"title of the e-book:" + str(title))
     if not title:
         # 电子书题目为空时自动跳过
         # 否则会发生『rm -rf / 』的惨剧
         return
     Path.chdir(Path.in_base_path + u'/e-books_tmp_source')
     epub = Epub(title)
     html_tmp_path = Path.html_pool_path + u'/'
     image_tmp_path = Path.image_pool_path + u'/'
     epub.set_creator(u'EEBookV0-1')
     epub.set_language(u'zh')
     epub.set_book_id()
     epub.set_output_path(Path.result_path)
     epub.add_css(Path.in_base_path + u'/www/css/markdown.css')
     epub.add_css(Path.in_base_path + u'/www/css/customer.css')
     epub.add_css(Path.in_base_path + u'/www/css/normalize.css')
     epub.add_css(Path.in_base_path + u'/www/css/bootstrap.css')
     # epub.add_css(Path.in_base_path + u'/www/css/article.css')    # TODO: 来自新浪,需要精简
     for book in book_package.book_list:
         page = book.page_list[0]
         with open(html_tmp_path + page.filename, 'w') as html:
             html.write(page.content)
         if '_' in page.title:
             page.title = ''.join(page.title.split('_')[1:])  # 删除章节前缀
         epub.create_chapter(html_tmp_path + page.filename, page.title)
         for page in book.page_list[1:]:
             with open(html_tmp_path + page.filename, 'w') as html:
                 html.write(page.content)
             epub.add_html(html_tmp_path + page.filename, page.title)
         epub.finish_chapter()
     for image in book_package.image_list:
         epub.add_image(image_tmp_path + image['filename'])
     epub.create()
     Path.reset_path()
     return
Example #30
    def create_book(command, counter):
        Path.reset_path()

        Debug.logger.info(u"Ready to make No.{} e-book".format(counter))
        Debug.logger.info(u"Analysis {} ".format(command))
        task_package = UrlParser.get_task(command)  # 分析命令

        Debug.logger.debug(u"#Debug:#task_package是:" + str(task_package))
        if not task_package.is_work_list_empty():
            worker_factory(task_package.work_list)  # 执行抓取程序
            Debug.logger.info(u"Complete fetching from web")

        file_name_set = None
        if not task_package.is_book_list_empty():
            Debug.logger.info(u"Start generating e-book from the database")
            book = Book(task_package.book_list)
            file_name_set = book.create()
        if file_name_set is not None:
            file_name_set2list = list(file_name_set)
            file_name = '-'.join(file_name_set2list[0:3])
            return file_name
        return u"Oops! no epub file produced"
Example #31
 def create_book(self, book_package):
     book_package.image_container.set_save_path(Path.image_pool_path)
     book_package.image_container.start_download()
     title = book_package.get_title()
     Debug.logger.debug(u"电子书的名称是???" + str(title))
     if not title:
         # 电子书题目为空时自动跳过
         # 否则会发生『rm -rf / 』的惨剧
         return
     Path.chdir(Path.base_path + u'/电子书临时资源库')
     epub = Epub(title)
     html_tmp_path = Path.html_pool_path + u'/'
     image_tmp_path = Path.image_pool_path + u'/'
     epub.set_creator(u'SinaBlogV01')
     epub.set_book_id()
     epub.set_output_path(Path.result_path)
     epub.add_css(Path.base_path + u'/www/css/markdown.css')
     epub.add_css(Path.base_path + u'/www/css/customer.css')
     epub.add_css(Path.base_path + u'/www/css/normalize.css')
     epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
     # epub.add_css(Path.base_path + u'/www/css/article.css')    # TODO: 来自新浪,需要精简
     for book in book_package.book_list:
         page = book.page_list[0]
         with open(html_tmp_path + page.filename, u'w') as html:
             html.write(page.content)
         if '_' in page.title:
             page.title = ''.join(page.title.split('_')[1:])  # 删除章节前缀
         epub.create_chapter(html_tmp_path + page.filename, page.title)
         for page in book.page_list[1:]:
             with open(html_tmp_path + page.filename, u'w') as html:
                 html.write(page.content)
             epub.add_html(html_tmp_path + page.filename, page.title)
         epub.finish_chapter()
     for image in book_package.image_list:
         epub.add_image(image_tmp_path + image['filename'])
     epub.create()
     Path.reset_path()
     return
Example #32
    def add_book(self):
        u"""
        打开已经在文件系统的电子书到电子书管理器中
        :return:
        """
        # Get filename and show only .epub files    Mac 系统下返回的是native fiel dialog
        book_path = QtGui.QFileDialog.getOpenFileName(self, u'打开Epub格式电子书',
                                                      ".", "(*.epub)")

        if str(book_path) == '':
            # 没有选中电子书
            return

        if os.path.dirname(str(book_path)) + os.sep != str(LIBRARY_DIR):
            shutil.copy(str(book_path), LIBRARY_DIR)

        file_name = os.path.basename(str(book_path))
        book_id = file_name.split('.epub')[0]
        bookdata_book_catalog = LIBRARY_DIR + book_id

        Path.mkdir(bookdata_book_catalog)

        Debug.logger.debug(u"移入bookdata中的是:" + str(LIBRARY_DIR + file_name))
        Debug.logger.debug(u"bookdata中的书:" + str(bookdata_book_catalog))
        Debug.logger.debug(u"book_path:" + os.path.dirname(str(book_path)))
        if os.path.dirname(str(book_path)) != bookdata_book_catalog:
            try:
                shutil.move(LIBRARY_DIR + file_name, bookdata_book_catalog)
            except shutil.Error:
                Debug.logger.debug(u"TODO:添加过这个书,删除原来的书")
                pass
        else:
            Debug.logger.debug(u"是相同文件夹, 添加的是bookdata中的书")
        os.remove(LIBRARY_DIR + file_name)
        book = Book(book_id)
        book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
        insert_library(book)
        self.update_library()
Example #33
File: ui.py Project: bindx/EE-Book
    def add_book(self):
        u"""
        打开已经在文件系统的电子书到电子书管理器中
        :return:
        """
        # Get filename and show only .epub files    Mac 系统下返回的是native fiel dialog
        book_path = QtGui.QFileDialog.getOpenFileName(self, u'打开Epub格式电子书', ".", "(*.epub)")

        if str(book_path) == '':
            # 没有选中电子书
            return

        if os.path.dirname(str(book_path))+os.sep != str(LIBRARY_DIR):
            shutil.copy(str(book_path), LIBRARY_DIR)

        file_name = os.path.basename(str(book_path))
        book_id = file_name.split('.epub')[0]
        bookdata_book_catalog = LIBRARY_DIR+book_id

        Path.mkdir(bookdata_book_catalog)

        Debug.logger.debug(u"移入bookdata中的是:" + str(LIBRARY_DIR+file_name))
        Debug.logger.debug(u"bookdata中的书:" + str(bookdata_book_catalog))
        Debug.logger.debug(u"book_path:" + os.path.dirname(str(book_path)))
        if os.path.dirname(str(book_path)) != bookdata_book_catalog:
            try:
                shutil.move(LIBRARY_DIR+file_name, bookdata_book_catalog)
            except shutil.Error:
                Debug.logger.debug(u"TODO:添加过这个书,删除原来的书")
                pass
        else:
            Debug.logger.debug(u"是相同文件夹, 添加的是bookdata中的书")
        os.remove(LIBRARY_DIR+file_name)
        book = Book(book_id)
        book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
        insert_library(book)
        self.update_library()
Example #34
 def create(self):
     self.image_container.set_save_path(Path.image_pool_path)
     self.image_container.start_download()
     title = '_'.join([book.epub.title for book in self.book_list])
     title = title.strip()[:128] # 避开window文件名长度限制
     title = ExtraTools.fix_filename(title) # 移除特殊字符
     if not title:
         # 电子书题目为空时自动跳过
         # 否则会发生『rm -rf / 』的惨剧。。。
         return
     Path.chdir(Path.base_path + u'/知乎电子书临时资源库/')
     epub = Book(title, 27149527)
     html_tmp_path = Path.html_pool_path + '/'
     image_tmp_path = Path.image_pool_path + '/'
     for book in self.book_list:
         page = book.page_list[0]
         with open(html_tmp_path + page.filename, 'w') as html:
             html.write(page.content)
         #epub.createChapter(html_tmp_path + page.filename, ExtraTools.get_time(), page.title)
         epub.addInfoPage(html_tmp_path + page.filename, page.title)
         for page in book.page_list[1:]:
             with open(html_tmp_path + page.filename, 'w') as html:
                 html.write(page.content)
             epub.addHtml(html_tmp_path + page.filename, page.title)
     for image in self.book.image_list:
         epub.addImg(image_tmp_path + image['filename'])
     epub.addLanguage('zh-cn')
     epub.addCreator('ZhihuHelp1.7.0')
     epub.addDesc(u'该电子书由知乎助手生成,知乎助手是姚泽源为知友制作的仅供个人使用的简易电子书制作工具,源代码遵循WTFPL,希望大家能认真领会该协议的真谛,为飞面事业做出自己的贡献 XD')
     epub.addRight('CC')
     epub.addPublisher('ZhihuHelp')
     epub.addCss(Path.base_path + u'/www/css/markdown.css')
     epub.addCss(Path.base_path + u'/www/css/front.css')
     epub.buildingEpub()
     Path.reset_path()
     return
Example #35
File: db.py Project: HiltonWei/zhihu
    def svContent2File( filepath_name , contentInHttp):
        contentBefore=''
        if Path.is_file(filepath_name):
            with open(filepath_name ,'r') as f:
                contentBefore = f.read()

        def calcContent(contentBefore ,contentSave):#计算内容 是否保存
            strContentSave = contentSave
            contentBefore = unicode(contentBefore, errors='ignore')
            if (contentSave.find(contentBefore[10:]) == -1):
                strContentSave += '\n -----------------------------------------答案更新分割线------------------\n'
                strContentSave += contentBefore
            return strContentSave
            
        content2SV = calcContent(contentBefore , contentInHttp)
        with open(filepath_name ,'w') as f:
            f.write(content2SV)
        return
Example #36
    def download_img_in_question_content(self):
        #   下载问题详情中的图片,同时更新
        from src.container.image_container import ImageContainer
        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.question_info.detail)
        self.question_content_img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)
            self.question_content_img_filename_list.append(filename)
            self.question_info.detail = self.question_info.detail.replace(img, Match.create_img_element_with_file_name(filename))

        img_container.start_download()

        #   下载完成后,更新图片大小
        for filename in self.question_content_img_filename_list:
            self.question_content_img_size += Path.get_img_size_by_filename_kb(filename)
        return
Example #37
    def download_img_in_question_content(self):
        #   下载问题详情中的图片,同时更新
        from src.container.image_container import ImageContainer
        img_container = ImageContainer()
        img_src_dict = Match.match_img_with_src_dict(self.question_info.detail)
        self.question_content_img_filename_list = []
        for img in img_src_dict:
            src = img_src_dict[img]
            filename = img_container.add(src)
            self.question_content_img_filename_list.append(filename)
            self.question_info.detail = self.question_info.detail.replace(img, Match.create_img_element_with_file_name(filename))

        img_container.start_download()

        #   下载完成后,更新图片大小
        for filename in self.question_content_img_filename_list:
            self.question_content_img_size += Path.get_img_size_by_filename_kb(filename)
        return
Example #38
File: db.py Project: HiltonWei/zhihu
    def addFileSave(answertmp={}):
        save_content = hashlib.md5(answertmp['content']).hexdigest()
        filepath_name = Ans2File.filepath(answertmp)

        def isUpdateContent( save_content ,questionID , answerID):#是否更答案文本内容
            answerHref = 'http://www.zhihu.com/question/{0}/answer/{1}'.format(questionID, answerID)
            Var = DB.cursor.execute(
                    "select content from Answer  where href = ?", (answerHref,)).fetchone()
            if not Var:
                Debug.logger.debug(u'读答案md5失败')
                return False #初始为空 则默认插入
            return Var[0] == save_content

        bsaved = isUpdateContent(save_content , answertmp['question_id'] , answertmp['answer_id'])
        if not Path.is_file(filepath_name):
            bsaved = False#数据库中有,文件没有
        if not bsaved :
            Ans2File.svContent2File(filepath_name , answertmp['content'])
        answertmp['content'] = str(save_content)
        return
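Example #38 stores the md5 of an answer's content and only rewrites the text file when the digest no longer matches. The underlying check (skip the write when nothing has changed) can be expressed without the project's DB layer:

import hashlib
import os

def content_changed(filepath, new_content_bytes):
    # A missing file always counts as changed; otherwise compare md5 digests.
    if not os.path.isfile(filepath):
        return True
    with open(filepath, 'rb') as f:
        old_digest = hashlib.md5(f.read()).hexdigest()
    return hashlib.md5(new_content_bytes).hexdigest() != old_digest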
Example #39
File: main.py Project: zhejia/ZhihuHelp
    def start(self):
        #   检查更新
        self.check_update()

        #   登录
        login = Login()
        zhihu_client = login.get_login_client()
        Worker.set_zhihu_client(zhihu_client)

        Debug.logger.info(u"开始读取ReadList.txt设置信息")

        if not Path.is_file(u'./ReadList.txt'):
            #  当ReadList不存在的时候自动创建之
            with open(u'./ReadList.txt', u'w') as read_list:
                pass  # create an empty ReadList.txt
            Debug.logger.info(u"ReadList.txt 内容为空,自动退出")
            return
        book_counter = self.read_list()

        Debug.logger.info(u"所有书籍制作完成。")
        Debug.logger.info(u"本次共制作书籍{0}本".format(book_counter))
        Debug.logger.info(u"感谢您的使用")
        Debug.logger.info(u"点按任意键退出")
        return
Example #40
    def start(self):
        print 'start 东财股吧 研报'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            read_list = read_list.readlines()

            resultsL = read_list.__len__()
            for x in range(0, resultsL):
                line = read_list[x]
                splits = line.split('#')
                code = (str)(splits[0])
                fieName = (str)(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 3):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)

                burl = u"http://guba.eastmoney.com/list,{},2,f_{}.html"

                content = Http.get_content(burl.format(uux, raw_front_page_index))

                xxsoup = BeautifulSoup(content, 'html.parser')

                tagrt = xxsoup.find_all('div', id='articlelistnew')[0]

                ols = tagrt.find_all('div', class_='articleh normal_post')
                olss = tagrt.find_all('div', class_='articleh normal_post odd')

                splicy = []

                for xxos in ols:
                    splicy.append(xxos)
                for xxo in olss:
                    splicy.append(xxo)

                for inkl in splicy:

                    try:

                        inklinkl = BeautifulSoup(str(inkl), 'html.parser')

                        spp = inklinkl.find_all('span', class_='l3')[0]

                        list_pcyc_li = spp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')

                            print ttt

                            destU = u'http://guba.eastmoney.com{}'.format(ttt)

                            result = Http.get_content(destU)
                            # result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('div', id='zwconttbt')
                            tt = str(title_tationl[0].text).strip()
                            print tt

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            title = title.replace('查看原文', '')

                            ttime = xxsoup.find_all('p', class_='publishdate')[0]

                            tttttime = str(ttime.text)[-10:]

                            print  tttttime

                            date_time = datetime.datetime.strptime(tttttime, '%Y-%m-%d')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            fileName = u"{}_{}.pdf".format(ttime, title)
                            # 时间 券商 名称  author

                            print fileName

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # 创建文件夹
                            #

                            spx = xxsoup.find_all('span', class_='zwtitlepdf')[0]

                            pdfu = spx.find_all('a')
                            for li in pdfu:
                                ttt = li.get('href')

                                print ttt
                                Path.mkdirAndPath(basePath)

                                Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    #   当下载地址为空的时候,就没必要再去下载了
                                    content = ''
                                if content.__len__() > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
                    except Exception as e:
                        print('next')
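The download step at the end of this scraper, and of the similar report scrapers in Examples #45 and #51, boils down to: fetch the PDF bytes, treat tiny or empty responses as failures, and write the rest out in binary mode. A compact stand-in that uses `requests` instead of the project's `Http.get_content` helper:

import os
import requests

def save_pdf(url, dest_path, timeout=180):
    if not url:
        return False  # nothing to download
    resp = requests.get(url, timeout=timeout)
    if not resp.ok or len(resp.content) <= 10:
        return False  # treat failed or suspiciously small responses as a miss
    folder = os.path.dirname(dest_path)
    if folder and not os.path.isdir(folder):
        os.makedirs(folder)
    with open(dest_path, 'wb') as pdf:
        pdf.write(resp.content)
    return True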
Example #41
    def create_book(self):
        #   确定文件信息
        title = Match.fix_filename(self.book_title)
        if self.is_split:
            title = self.book_title + u'_卷{}'.format(self.chapter_no)

        #   先切换到电子书临时资源目录下
        Path.chdir(Path.book_pool_path)
        epub = Epub(title)
        for task_result in self.task_result_list:
            chapter_src = ''
            # info_page
            if task_result.task.task_type == Type.question:
                chapter_src = self.generate_question_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.answer:
                chapter_src = self.generate_question_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.collection:
                chapter_src = self.generate_collection_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.topic:
                chapter_src = self.generate_topic_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.author:
                chapter_src = self.generate_author_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.column:

                task_result.info_page.article_count = (
                    task_result.column_list[0].article_list).__len__()

                chapter_src = self.generate_column_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.article:
                chapter_src = self.generate_article_info_page(
                    task_result.info_page)
            epub.create_chapter(chapter_src, task_result.get_title())
            for question in task_result.question_list:
                #   添加图片文件
                for filename in question.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                question_src = self.generate_question_page(question)
                epub.add_html(question_src, question.question_info.title)

            for column in task_result.column_list:
                #   添加图片文件
                for filename in column.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                for article in column.article_list:
                    article_src = self.generate_article_page(article)
                    epub.add_html(article_src, article.title)
            epub.finish_chapter()

        href = self.task_result_list[0].info_page.image_url
        if len(href) > 0:
            print href

            if href:
                content = Http.get_content(
                    url=href, timeout=Config.timeout_download_picture)
                if not content:
                    Debug.logger.debug(u'图片『{}』下载失败'.format(href))
                    content = ''
                else:
                    Debug.print_in_single_line(u'图片{}下载完成'.format(href))
            else:
                #   当下载地址为空的时候,就没必要再去下载了
                content = ''
            if content.__len__() > 10:
                filename = Path.image_pool_path + '/' + 'cover.jpg'
                with open(filename, 'wb') as image:
                    image.write(content)

                epub.add_cover_image(filename)

        else:
            epub.add_cover_image('/Users/ex-liyan010/Desktop/cover.png')
            # epub.add_cover_image('/Users/ex-liyan010/Desktop/cover.png')

        epub.set_creator(u'macbookpro2100')
        epub.set_language(u'zh-cn')
        epub.set_book_id()
        epub.set_output_path(Path.result_path)
        epub.add_css(Path.base_path + u'/www/css/markdown.css')
        epub.add_css(Path.base_path + u'/www/css/customer.css')
        epub.add_css(Path.base_path + u'/www/css/normalize.css')
        epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
        epub.create()

        Path.reset_path()
        return
Example #42
 def add_cover_image(self, src):
     Path.copy(src, EpubPath.image_path)
     filename = Path.get_filename(src)
     new_src = u'images/' + filename
     resource_id = self.opf.add_cover_image(new_src)
     return
Example #43
 def add_css(self, src):
     Path.copy(src, EpubPath.style_path)
     filename = Path.get_filename(src)
     new_src = u'style/' + filename
     resource_id = self.opf.add_css(new_src)
     return
Example #44
 def add_cover_image(self, src):
     Path.copy(src, EpubPath.image_path)
     filename = Path.get_filename(src)
     new_src = u'images/' + filename
     resource_id = self.opf.add_cover_image(new_src)
     return
Example #45
    def start(self):
        print 'start JRJ_Report'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            read_list = read_list.readlines()

            resultsL = read_list.__len__()
            for x in range(0, resultsL):
                line = read_list[x]
                splits = line.split('#')
                code = (str)(splits[0])
                fieName = (str)(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:

            for raw_front_page_index in range(1, 8):

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('td', class_="left")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:

                            time.sleep(1)
                            result = Http.get_content(xxurl)
                            result = unicode(str(result),
                                             'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            # title_tationl = xxsoup.find_all('h1')
                            # tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('p',
                                                            class_='title')[0]
                            xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                            realu = str(xxlist_p_list).replace(
                                str(xxlist_ds), '', 1)

                            realuxsoup = BeautifulSoup(realu, 'html.parser')

                            sp = str(realuxsoup.text).split(' ')

                            ttime = sp[1]

                            if ttime.__contains__('发表于'):
                                ttime = sp[2]

                            # print (sp[2]).text
                            # print (sp[3]).text

                            # print ttime

                            all_main = xxsoup.find_all('div', class_='main')[0]

                            realuxsoup = BeautifulSoup(str(all_main),
                                                       'html.parser')

                            reaupp = realuxsoup.find_all('p')

                            for pp in reaupp:
                                list_pcyc_li = pp.find_all('a')

                                for li in list_pcyc_li:
                                    print li.text
                                    ttt = li.get('href')

                                    print ttt

                                    fileName = u"{}_{}.pdf".format(
                                        ttime,
                                        str(li.text).replace('/', ""))

                                    print fileName

                                    basePath = '/ink/work/62/ink/{}/{}'.format(
                                        fileN, fileName)

                                    Path.mkdirAndPath(basePath)

                                    Debug.print_in_single_line(
                                        u'开始下载   {}'.format(ttt))
                                    if ttt:
                                        content = Http.get_content(url=ttt,
                                                                   timeout=180)
                                        if not content:
                                            Debug.logger.debug(
                                                u'pdf『{}』下载失败'.format(ttt))
                                            content = ''
                                        else:
                                            Debug.print_in_single_line(
                                                u'pdf {} 下载完成'.format(ttt))
                                    else:
                                        #   当下载地址为空的时候,就没必要再去下载了
                                        content = ''
                                    if content.__len__() > 10:
                                        with open(basePath, "wb") as pdf:
                                            pdf.write(content)
Example #46
 def add_html(self, src, title):
     template = self.get_template('directory', 'item_leaf')
     self.content += template.format(href=Path.get_filename(src), title=title)
     return
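The directory/TOC in Example #46 is assembled from small string templates that are looked up by name and filled with `str.format`. A reduced sketch of that mechanism; the template text here is invented for illustration:

TEMPLATES = {
    ('directory', 'item_leaf'): u'<li><a href="{href}">{title}</a></li>',
    ('directory', 'item_root'): u'<li class="root"><a href="{href}">{title}</a></li>',
}

class Directory(object):
    def __init__(self):
        self.content = u''

    def get_template(self, group, name):
        return TEMPLATES[(group, name)]

    def add_html(self, src, title):
        # Append one leaf entry per HTML page, keyed by its bare filename.
        template = self.get_template('directory', 'item_leaf')
        self.content += template.format(href=src.split('/')[-1], title=title)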
Example #47
    def create_book(self):
        #   确定文件信息
        title = Match.fix_filename(self.book_title)
        if self.is_split:
            title = self.book_title + u'_卷{}'.format(self.chapter_no)

        #   先切换到电子书临时资源目录下
        Path.chdir(Path.book_pool_path)
        epub = Epub(title)
        for task_result in self.task_result_list:
            chapter_src = ''
            # info_page
            if task_result.task.task_type == Type.question:
                chapter_src = self.generate_question_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.answer:
                chapter_src = self.generate_question_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.collection:
                chapter_src = self.generate_collection_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.topic:
                chapter_src = self.generate_topic_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.author:
                chapter_src = self.generate_author_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.column:
                chapter_src = self.generate_column_info_page(
                    task_result.info_page)
            elif task_result.task.task_type == Type.article:
                chapter_src = self.generate_article_info_page(
                    task_result.info_page)
            epub.create_chapter(chapter_src, task_result.get_title())
            for question in task_result.question_list:
                #   添加图片文件
                for filename in question.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                question_src = self.generate_question_page(question)
                epub.add_html(question_src, question.question_info.title)

            for column in task_result.column_list:
                #   添加图片文件
                for filename in column.img_filename_list:
                    epub.add_image(Path.image_pool_path + '/' + filename)
                for article in column.article_list:
                    article_src = self.generate_article_page(article)
                    epub.add_html(article_src, article.title)
            epub.finish_chapter()

        epub.set_creator(u'ZhihuHelp1.8.0')
        epub.set_language(u'zh-cn')
        epub.set_book_id()
        epub.set_output_path(Path.result_path)
        epub.add_css(Path.base_path + u'/www/css/markdown.css')
        epub.add_css(Path.base_path + u'/www/css/customer.css')
        epub.add_css(Path.base_path + u'/www/css/normalize.css')
        epub.add_css(Path.base_path + u'/www/css/bootstrap.css')
        epub.create()

        Path.reset_path()
        return
Example #48
 def reset_path():
     Path.chdir(EpubPath.work_path)
     return
Example #49
File: download.py Project: bindx/EE-Book
    def download_button_clicked(self):
        tags = str(self.custom_tags.text())

        # url_id = self.recipes.model.data(1, QtCore.Qt.UserRole)    # TODO: 获得选中的recipes
        url_id = str(self.row_clicked(self.recipes.currentIndex()))

        if url_id == 'None':
            QtGui.QMessageBox.information(self, u"Error", u"选择需要爬取的网站!")
            return

        readlist_content = self.plainTextEdit.toPlainText()

        if readlist_content == '':
            QtGui.QMessageBox.information(self, u"Error", u"请在文本框中输入网址")
            return

        read_list_path = Path.read_list_path

        readList_file = open(read_list_path, 'w')
        readList_file.write(readlist_content)

        readList_file.close()

        game = EEBook(recipe_kind=url_id)

        progress_dlg = QProgressDialog(self)        # TODO: 设置大小, 区域
        progress_dlg.setWindowModality(Qt.WindowModal)
        progress_dlg.setMinimumDuration(5)
        progress_dlg.setWindowTitle(u"请等待")
        progress_dlg.setLabelText(u"制作中...请稍候")
        progress_dlg.setCancelButtonText(u"取消")
        progress_dlg.resize(350, 250)
        progress_dlg.show()
        progress_dlg.setRange(0, 20)

        for i in range(0, 15):
            progress_dlg.setValue(i)
            QThread.msleep(100)

        for i in range(15, 20):
            progress_dlg.setValue(i)
            QThread.msleep(100)
            if progress_dlg.wasCanceled():
                QtGui.QMessageBox.information(self, u"Error", u"电子书制作失败, 请重新操作")
                return

            try:
                filename = game.begin()      # TODO: 一次只能生成一本书
            except TypeError:
                QtGui.QMessageBox.information(self, u"Error", u"第一次使用请登录")
                progress_dlg.close()
                return
            progress_dlg.close()

            info_filename = ','.join(filename)
            QtGui.QMessageBox.information(self, u"info", u"电子书"+str(info_filename)+u"制作成功")

            for item in filename:
                file_path = EPUBSTOR_DIR + '/' + item
                Path.copy(str(file_path+'.epub'), LIBRARY_DIR)
                file_name = os.path.basename(str(file_path))
                book_id = file_name.split('.epub')[0]

                Path.mkdir(LIBRARY_DIR + book_id)
                shutil.move(LIBRARY_DIR+book_id+'.epub', LIBRARY_DIR+book_id)

                book = Book(str(book_id))
                book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
                book.tags += tags.replace(' ', '')
                book.tags += ','+str(self.now_url)
                if self.add_title_tag.isChecked():
                    book.tags += ','+str(book.title)
                insert_library(book)
            return
Example #50
 def init_epub_path(work_path):
     """
     设置工作地址,根据该路径进行创建文件夹,生成epub,压缩等操作
     """
     EpubPath.set_work_path(work_path)
     Path.mkdir(EpubPath.meta_inf_path)
     Path.mkdir(EpubPath.oebps_path)
     Path.chdir(EpubPath.oebps_path)
     Path.mkdir(EpubPath.html_path)
     Path.mkdir(EpubPath.image_path)
     Path.mkdir(EpubPath.style_path)
     return
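`init_epub_path` lays out the working tree that the later `add_html` / `add_image` / `add_css` calls copy files into. Roughly, with the standard library and the usual EPUB directory names (the exact names are whatever `EpubPath` defines):

import os

def init_epub_tree(work_path):
    # Typical EPUB working layout: META-INF/ next to OEBPS/{html,images,style}.
    for sub in ('META-INF',
                os.path.join('OEBPS', 'html'),
                os.path.join('OEBPS', 'images'),
                os.path.join('OEBPS', 'style')):
        target = os.path.join(work_path, sub)
        if not os.path.isdir(target):
            os.makedirs(target)
    os.chdir(os.path.join(work_path, 'OEBPS'))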
Example #51
    def start(self):
        print ' 中文研报 '

        stockList = []


        for raw_front_page_index in range(1, 251):
            fileN = '策略'

            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)

       #    http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p=2&js=var%20UxmjGoYW={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&
            burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=CLBG&cmd=4&code=&ps=50&p="
            # burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p="
            uu = u"&js=var%20GdYXcAjX={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&"

            url = '%s%s%s' % (burl, str(raw_front_page_index), uu)

            # print url

            content = Http.get_content(url)

            if content:
                try:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:

                        xxxs = str(article).split(',')
                        rticlet = xxxs[0]

                        preTitle = xxxs[5]

                        if '川财' in str(preTitle) or '或' in str(preTitle):
                            continue

                        # if '历史' in str(preTitle) or '周期' in str(preTitle) or '成长' in str(preTitle):
                        # if '政治' in str(preTitle) or '中央经济' in str(preTitle) or '贸易战' in str(preTitle):
                        if '日本' in str(preTitle):
                            print preTitle
                            date_time = datetime.datetime.strptime(rticlet, '%Y/%m/%d %H:%M:%S')

                            infoCode = xxxs[1]
                            destU = u"http://data.eastmoney.com/report/{}/cl,{}.html ".format(
                                date_time.strftime('%Y%m%d'), infoCode)

                            print destU

                            result = Http.get_content(destU)
                            result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('h1')
                            tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                            sp = xxlist_p_list.find_all('span')

                            ttime = str((sp[1]).text)

                            date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            # print (sp[2]).text
                            # print (sp[3]).text

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                            # fields: time, brokerage, report title, author

                            print fileName

                            urlsp = sp[-1]

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the target directory

                            list_pcyc_li = urlsp.find_all('a')
                            for li in list_pcyc_li:
                                ttt = li.get('href')
                                Path.mkdirAndPath(basePath)
                                print ttt

                                Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    #   when the download URL is empty there is nothing to fetch
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)



                except Exception as e:
                    # keep crawling the next page, but show what went wrong
                    print('next: {}'.format(e))
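
The js.aspx interface returns a "var XXX={...}" assignment rather than bare JSON, which is why the code splits on '=' before json.loads, and each record in data is itself a comma-joined string. Below is a small sketch of just that parsing step; the field positions (0 = publish time, 1 = infoCode, 5 = title) follow the indexing used above, and the rest of the record layout is not shown here:

import json

def parse_report_list(raw_response):
    # strip the 'var XXX=' prefix, then pull out the fields the crawler actually uses
    payload = json.loads(str(raw_response).split('=')[-1])
    reports = []
    for article in payload['data']:
        fields = str(article).split(',')
        reports.append({'time': fields[0], 'infoCode': fields[1], 'title': fields[5]})
    return reports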
예제 #52
0
    def download_button_clicked(self):
        tags = str(self.custom_tags.text())

        # url_id = self.recipes.model.data(1, QtCore.Qt.UserRole)    # TODO: get the selected recipes
        url_id = str(self.row_clicked(self.recipes.currentIndex()))

        if url_id == 'None':
            QtGui.QMessageBox.information(self, u"Error", u"选择需要爬取的网站!")
            return

        readlist_content = self.plainTextEdit.toPlainText()

        if readlist_content == '':
            QtGui.QMessageBox.information(self, u"Error", u"请在文本框中输入网址")
            return

        read_list_path = Path.read_list_path

        with open(read_list_path, 'w') as read_list_file:
            read_list_file.write(readlist_content)

        game = EEBook(recipe_kind=url_id)

        progress_dlg = QProgressDialog(self)  # TODO: set the size and placement
        progress_dlg.setWindowModality(Qt.WindowModal)
        progress_dlg.setMinimumDuration(5)
        progress_dlg.setWindowTitle(u"请等待")
        progress_dlg.setLabelText(u"制作中...请稍候")
        progress_dlg.setCancelButtonText(u"取消")
        progress_dlg.resize(350, 250)
        progress_dlg.show()
        progress_dlg.setRange(0, 20)

        for i in range(0, 15):
            progress_dlg.setValue(i)
            QThread.msleep(100)

        for i in range(15, 20):
            progress_dlg.setValue(i)
            QThread.msleep(100)
            if progress_dlg.wasCanceled():
                QtGui.QMessageBox.information(self, u"Error",
                                              u"电子书制作失败, 请重新操作")
                return

            try:
                filename = game.begin()  # TODO: only one book can be generated at a time
            except TypeError:
                QtGui.QMessageBox.information(self, u"Error", u"第一次使用请登录")
                progress_dlg.close()
                return
            progress_dlg.close()

            info_filename = ','.join(filename)
            QtGui.QMessageBox.information(
                self, u"info", u"电子书" + str(info_filename) + u"制作成功")

            for item in filename:
                file_path = EPUBSTOR_DIR + '/' + item
                Path.copy(str(file_path + '.epub'), LIBRARY_DIR)
                file_name = os.path.basename(str(file_path))
                book_id = file_name.split('.epub')[0]

                Path.mkdir(LIBRARY_DIR + book_id)
                shutil.move(LIBRARY_DIR + book_id + '.epub',
                            LIBRARY_DIR + book_id)

                book = Book(str(book_id))
                book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
                book.tags += tags.replace(' ', '')
                book.tags += ',' + str(self.now_url)
                if self.add_title_tag.isChecked():
                    book.tags += ',' + str(book.title)
                insert_library(book)
            return
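
In both variants of this handler the actual book build sits inside the 15..20 progress loop and the method returns on the first pass, so the dialog mostly animates a fixed delay. Below is a hypothetical sketch of the same cancellable-dialog idea with the work factored out into a callable, built on standard PyQt4 calls (QProgressDialog, QApplication.processEvents); it is not the project's own method:

from PyQt4 import QtCore, QtGui

def run_with_progress(parent, steps, do_one_step):
    # do_one_step() performs one slice of the job; returns False if the user cancels
    dlg = QtGui.QProgressDialog(parent)
    dlg.setWindowModality(QtCore.Qt.WindowModal)
    dlg.setLabelText(u"制作中...请稍候")
    dlg.setRange(0, steps)
    dlg.show()
    for i in range(steps):
        if dlg.wasCanceled():
            dlg.close()
            return False
        do_one_step()
        dlg.setValue(i + 1)
        QtGui.QApplication.processEvents()   # keep the dialog responsive between steps
    dlg.close()
    return True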
예제 #53
0
    def create_single_html_book(self, book_package):
        title = book_package.get_title()
        if not title:
            # skip automatically when the e-book title is empty,
            # otherwise the 'rm -rf /' style disaster below would happen
            return
        Path.reset_path()
        Path.chdir(Path.result_path)
        Path.rmdir(u'./' + title)
        Path.mkdir(u'./' + title)
        Path.chdir(u'./' + title)
        page = []
        for book in book_package.book_list:
            page += book.page_list
        content = u' \r\n '.join([Match.html_body(x.content) for x in page]).replace(u'../images/', u'./images/')
        with open(TemplateConfig.content_base_uri) as html:
            content = html.read().format(title=title, body=content).replace(u'../style/', u'./')
        with open(title + u'.html', 'w') as html:
            html.write(content)
        Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
        Path.copy(Path.www_css + u'/customer.css', u'./customer.css')
        Path.copy(Path.www_css + u'/markdown.css', u'./markdown.css')
        Path.copy(Path.www_css + u'/normalize.css', u'./normalize.css')
        Path.reset_path()
        return
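
The single-html export above boils down to joining every page body, pouring the result into a template that exposes {title} and {body} placeholders, and rewriting the relative image/style paths. Below is a stripped-down sketch of that merge step; Match.html_body is replaced by a trivial stand-in because its implementation is not shown here:

def merge_pages_to_html(title, pages, template_text):
    # pages: objects with a .content attribute, as in book.page_list
    def html_body(raw_html):
        return raw_html                     # stand-in for Match.html_body()
    body = u' \r\n '.join(html_body(page.content) for page in pages)
    body = body.replace(u'../images/', u'./images/')
    return template_text.format(title=title, body=body).replace(u'../style/', u'./')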
예제 #54
0
    def start(self):
        print 'start 东财研报'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            for line in read_list.readlines():
                splits = line.split('#')
                code = str(splits[0])
                name = str(splits[1]).strip()
                print name
                stockList.append({'URL': code, 'NAME': name})

        for xx in stockList:
            for raw_front_page_index in range(1, 5):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)
                # url = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&p=1&code=000333&rt=51734025"

                burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&"
                uu = u"p={0}&code={1}&rt="

                url = '%s%s' % (burl, uu.format(raw_front_page_index, uux))

                content = Http.get_content(url)

                if content:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:
                        rticlet = article['datetime']

                        date_time = datetime.datetime.strptime(rticlet, '%Y-%m-%dT%H:%M:%S')
                        destU = u"http://data.eastmoney.com/report/{}/{}.html ".format(date_time.strftime('%Y%m%d'),
                                                                                       article['infoCode'])

                        result = Http.get_content(destU)
                        result = unicode(result, 'GBK').encode('UTF-8')

                        xxsoup = BeautifulSoup(result, 'html.parser')

                        title_tationl = xxsoup.find_all('h1')
                        tt = str(title_tationl[0].text).strip()

                        xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                        sp = xxlist_p_list.find_all('span')

                        ttime = str((sp[1]).text)

                        date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                        # print date_time.strftime('%Y-%m-%d')

                        ttime = date_time.strftime('%Y-%m-%d')

                        # print (sp[2]).text
                        # print (sp[3]).text

                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)

                        fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                        # fields: time, brokerage, report title, author

                        print fileName

                        urlsp = sp[-1]

                        basePath = '{}/{}'.format(sdPath, fileName)

                        # print basePath

                        # create the target directory

                        list_pcyc_li = urlsp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')
                            Path.mkdirAndPath(basePath)
                            print ttt

                            Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                            else:
                                #   when the download URL is empty there is nothing to fetch
                                content = ''
                            if len(content) > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
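
Both crawlers name the downloaded PDF as "date_brokerage_title_author.pdf" after stripping characters that would break a file path. Below is a small sketch of that naming step; the regex is a rough stand-in for Match.replace_specile_chars, whose real behaviour is not shown here:

import re

def build_report_filename(report_date, brokerage, raw_title, author):
    # drop characters that are unsafe in file names, then remove path separators entirely
    title = re.sub(u'[\\\\:*?"<>|]', u'', raw_title).replace(u'/', u'')
    return u'{}_{}_{}_{}.pdf'.format(report_date, brokerage, title.strip(), author)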