Example #1
 def create_single_html_book(self, book_package):
     title = book_package.get_title()
     if not title:
         # skip when the ebook title is empty;
         # otherwise the rmdir/mkdir calls below would operate on './', an 'rm -rf /'-style disaster
         return
     Path.reset_path()
     Path.chdir(Path.result_path)
     Path.rmdir(u'./' + title)
     Path.mkdir(u'./' + title)
     Path.chdir(u'./' + title)
     page = []
     for book in book_package.book_list:
         page += book.page_list
     content = u' \r\n '.join([Match.html_body(x.content) for x in page]).replace(u'../images/', u'./images/')
     with open(TemplateConfig.content_base_uri) as html:
         content = html.read().format(title=title, body=content).replace(u'../style/', u'./')
     with open(title + u'.html', 'w') as html:
         html.write(content)
     Path.copy(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), u'./images')
     Path.copy(Path.www_css + u'/customer.css', u'./customer.css')
     Path.copy(Path.www_css + u'/markdown.css', u'./markdown.css')
     Path.copy(Path.www_css + u'/normalize.css', u'./normalize.css')
     Path.reset_path()
     return
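
A minimal, self-contained sketch of the template-filling step above, assuming a template whose body carries {title} and {body} placeholders (the real TemplateConfig.content_base_uri template is not shown in these examples):

    # -*- coding: utf-8 -*-
    import io

    pages = [u'<p>first page</p>', u'<p>second page</p>']
    template = u'<html><head><title>{title}</title></head><body>{body}</body></html>'

    # join the per-page bodies the same way create_single_html_book does,
    # then fill the named placeholders with str.format
    body = u' \r\n '.join(pages)
    html = template.format(title=u'demo', body=body)

    with io.open('demo.html', 'w', encoding='utf-8') as out:
        out.write(html)
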
Example #2
 def init_epub_path(work_path):
     """
     设置工作地址,根据该路径进行创建文件夹,生成epub,压缩等操作
     """
     EpubPath.set_work_path(work_path)
     Path.mkdir(EpubPath.meta_inf_path)
     Path.mkdir(EpubPath.oebps_path)
     Path.chdir(EpubPath.oebps_path)
     Path.mkdir(EpubPath.html_path)
     Path.mkdir(EpubPath.image_path)
     Path.mkdir(EpubPath.style_path)
     return
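
For context, init_epub_path is laying down the standard EPUB skeleton (META-INF/ for container.xml, OEBPS/ for the content documents). A dependency-free sketch of the same tree, assuming the EpubPath attributes map to the usual names:

    import os

    def make_epub_skeleton(work_path):
        # hypothetical stand-in for EpubPath: builds META-INF plus the OEBPS subtree
        for sub in ('META-INF',
                    os.path.join('OEBPS', 'html'),
                    os.path.join('OEBPS', 'images'),
                    os.path.join('OEBPS', 'style')):
            path = os.path.join(work_path, sub)
            if not os.path.isdir(path):
                os.makedirs(path)
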
Example #4
 def create_single_html_book(self):
     title = '_'.join([book.epub.title for book in self.book_list])
     title = title.strip()[:128]  # stay under the Windows filename length limit
     title = ExtraTools.fix_filename(title)  # strip characters that are illegal in filenames
     Path.reset_path()
     Path.chdir(Path.result_path)
     Path.rmdir(u'./' + title)
     Path.mkdir(u'./' + title)
     Path.chdir(u'./' + title)
     page = []
     for book in self.book_list:
         page += book.page_list
     content = ' \r\n<hr /> \r\n '.join([Match.html_body(x.content) for x in page]).replace('../images/', './images/')
     with open(Path.base_path + '/src/template/content/single_html.html') as html:
         template = html.read().format(title=title, content=content)
     with open(title + u'.html', 'w') as html:
         html.write(template)
     shutil.copytree(Path.html_pool_path + u'/../{}/OEBPS/images'.format(title), './images')
     shutil.copy(Path.www_css + '/front.css', './front.css')
     shutil.copy(Path.www_css + '/markdown.css', './markdown.css')
     Path.reset_path()
     return
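
ExtraTools.fix_filename is not shown in these examples; a plausible sketch of such a sanitizer, assuming its only jobs are dropping characters Windows forbids and trimming whitespace (the 128-character cap is applied by the caller above):

    import re

    def fix_filename(name):
        # hypothetical stand-in: remove the characters Windows rejects in filenames
        cleaned = re.sub(r'[\\/:*?"<>|]', '', name)
        # collapse runs of whitespace left behind by the removals
        return ' '.join(cleaned.split())
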
Example #5
    def add_book(self):
        u"""
        打开已经在文件系统的电子书到电子书管理器中
        :return:
        """
        # Get filename and show only .epub files    Mac 系统下返回的是native fiel dialog
        book_path = QtGui.QFileDialog.getOpenFileName(self, u'打开Epub格式电子书',
                                                      ".", "(*.epub)")

        if str(book_path) == '':
            # no ebook was selected
            return

        if os.path.dirname(str(book_path)) + os.sep != str(LIBRARY_DIR):
            shutil.copy(str(book_path), LIBRARY_DIR)

        file_name = os.path.basename(str(book_path))
        book_id = file_name.split('.epub')[0]
        bookdata_book_catalog = LIBRARY_DIR + book_id

        Path.mkdir(bookdata_book_catalog)

        Debug.logger.debug(u"移入bookdata中的是:" + str(LIBRARY_DIR + file_name))
        Debug.logger.debug(u"bookdata中的书:" + str(bookdata_book_catalog))
        Debug.logger.debug(u"book_path:" + os.path.dirname(str(book_path)))
        if os.path.dirname(str(book_path)) != bookdata_book_catalog:
            try:
                shutil.move(LIBRARY_DIR + file_name, bookdata_book_catalog)
            except shutil.Error:
                Debug.logger.debug(u"TODO:添加过这个书,删除原来的书")
                pass
        else:
            Debug.logger.debug(u"是相同文件夹, 添加的是bookdata中的书")
        os.remove(LIBRARY_DIR + file_name)
        book = Book(book_id)
        book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
        insert_library(book)
        self.update_library()
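
Note the comparison fixed above: the original used "str(book_path) is ''", which tests object identity and only appears to work because CPython interns some strings. Equality, or plain truthiness, is the reliable emptiness check:

    book_path = ''
    print(book_path is '')   # identity: implementation-dependent, not a real emptiness test
    print(book_path == '')   # equality: True exactly when the string is empty
    print(not book_path)     # truthiness: the idiomatic spelling
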
Example #7
    def download_button_clicked(self):
        tags = str(self.custom_tags.text())

        # url_id = self.recipes.model.data(1, QtCore.Qt.UserRole)    # TODO: get the selected recipe
        url_id = str(self.row_clicked(self.recipes.currentIndex()))

        if url_id == 'None':
            QtGui.QMessageBox.information(self, u"Error", u"选择需要爬取的网站!")
            return

        readlist_content = self.plainTextEdit.toPlainText()

        if readlist_content == '':
            QtGui.QMessageBox.information(self, u"Error", u"请在文本框中输入网址")
            return

        read_list_path = Path.read_list_path

        with open(read_list_path, 'w') as readList_file:
            readList_file.write(readlist_content)

        game = EEBook(recipe_kind=url_id)

        progress_dlg = QProgressDialog(self)        # TODO: set the dialog size and placement
        progress_dlg.setWindowModality(Qt.WindowModal)
        progress_dlg.setMinimumDuration(5)
        progress_dlg.setWindowTitle(u"请等待")
        progress_dlg.setLabelText(u"制作中...请稍候")
        progress_dlg.setCancelButtonText(u"取消")
        progress_dlg.resize(350, 250)
        progress_dlg.show()
        progress_dlg.setRange(0, 20)

        for i in range(0, 15):
            progress_dlg.setValue(i)
            QThread.msleep(100)

        for i in range(15, 20):
            progress_dlg.setValue(i)
            QThread.msleep(100)
            if progress_dlg.wasCanceled():
                QtGui.QMessageBox.information(self, u"Error", u"电子书制作失败, 请重新操作")
                return

            try:
                filename = game.begin()      # TODO: only one book can be generated per run
            except TypeError:
                QtGui.QMessageBox.information(self, u"Error", u"第一次使用请登录")
                progress_dlg.close()
                return
            progress_dlg.close()

            info_filename = ','.join(filename)
            QtGui.QMessageBox.information(self, u"info", u"电子书"+str(info_filename)+u"制作成功")

            for item in filename:
                file_path = EPUBSTOR_DIR + '/' + item
                Path.copy(str(file_path+'.epub'), LIBRARY_DIR)
                file_name = os.path.basename(str(file_path))
                book_id = file_name.split('.epub')[0]

                Path.mkdir(LIBRARY_DIR + book_id)
                shutil.move(LIBRARY_DIR+book_id+'.epub', LIBRARY_DIR+book_id)

                book = Book(str(book_id))
                book.date = time.strftime(ISOTIMEFORMAT, time.localtime())
                book.tags += tags.replace(' ', '')
                book.tags += ','+str(self.now_url)
                if self.add_title_tag.isChecked():
                    book.tags += ','+str(book.title)
                insert_library(book)
            return
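
The concatenations above (LIBRARY_DIR + book_id, file_path + '.epub') silently assume LIBRARY_DIR ends with a path separator. A hedged sketch of the same shelving step with os.path.join, which drops that assumption (shelve_book and its arguments are hypothetical names):

    import os
    import shutil

    def shelve_book(library_dir, epubstor_dir, item):
        # copy the generated epub into the library, then move it into its own folder
        src = os.path.join(epubstor_dir, item + '.epub')
        book_id = os.path.basename(item)
        book_dir = os.path.join(library_dir, book_id)
        if not os.path.isdir(book_dir):
            os.makedirs(book_dir)
        shutil.copy(src, library_dir)
        shutil.move(os.path.join(library_dir, book_id + '.epub'), book_dir)
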
Example #8
 def init_path(self):
     Path.rmdir(u'./' + self.title)
     Path.mkdir(u'./' + self.title)
     Path.chdir(u'./' + self.title)
     EpubPath.init_epub_path(Path.get_pwd())
     return
Example #9
    def start(self):
        print 'start JRJ_Report'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            for line in read_list.readlines():
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:

            for raw_front_page_index in range(1, 8):

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('td', class_="left")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:

                            time.sleep(1)
                            result = Http.get_content(xxurl)
                            result = unicode(str(result),
                                             'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            # title_tationl = xxsoup.find_all('h1')
                            # tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('p',
                                                            class_='title')[0]
                            xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                            realu = str(xxlist_p_list).replace(
                                str(xxlist_ds), '', 1)

                            realuxsoup = BeautifulSoup(realu, 'html.parser')

                            sp = str(realuxsoup.text).split(' ')

                            ttime = sp[1]

                            if '发表于' in ttime:
                                ttime = sp[2]

                            # print (sp[2]).text
                            # print (sp[3]).text

                            # print ttime

                            all_main = xxsoup.find_all('div', class_='main')[0]

                            realuxsoup = BeautifulSoup(str(all_main),
                                                       'html.parser')

                            reaupp = realuxsoup.find_all('p')

                            for pp in reaupp:
                                list_pcyc_li = pp.find_all('a')

                                for li in list_pcyc_li:
                                    print li.text
                                    ttt = li.get('href')

                                    print ttt

                                    fileName = u"{}_{}.pdf".format(
                                        ttime,
                                        str(li.text).replace('/', ""))

                                    print fileName

                                    basePath = '/ink/work/62/ink/{}/{}'.format(
                                        fileN, fileName)

                                    Path.mkdirAndPath(basePath)

                                    Debug.print_in_single_line(
                                        u'开始下载   {}'.format(ttt))
                                    if ttt:
                                        content = Http.get_content(url=ttt,
                                                                   timeout=180)
                                        if not content:
                                            Debug.logger.debug(
                                                u'pdf『{}』下载失败'.format(ttt))
                                            content = ''
                                        else:
                                            Debug.print_in_single_line(
                                                u'pdf {} 下载完成'.format(ttt))
                                    else:
                                        # no download URL, so there is nothing to fetch
                                        content = ''
                                    if len(content) > 10:
                                        with open(basePath, "wb") as pdf:
                                            pdf.write(content)
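
The unicode(str(result), 'GBK').encode('UTF-8') step above is Python 2's transcode idiom: decode the GBK page into unicode, then re-encode as UTF-8 before handing it to BeautifulSoup. In isolation:

    # -*- coding: utf-8 -*-
    raw = u'研报'.encode('gbk')        # stand-in for the GBK response body
    text = raw.decode('gbk')           # bytes -> unicode
    utf8_bytes = text.encode('utf-8')  # unicode -> UTF-8 bytes for the parser
    print(utf8_bytes.decode('utf-8'))
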
Example #10
    def start(self):
        print 'start 东财研报'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            for line in read_list.readlines():
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 5):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)
                # url = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&p=1&code=000333&rt=51734025"

                burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&"
                uu = u"p={0}&code={1}&rt="

                url = '%s%s' % (burl, uu.format(raw_front_page_index, uux))

                content = Http.get_content(url)

                if content:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:
                        rticlet = article['datetime']

                        date_time = datetime.datetime.strptime(rticlet, '%Y-%m-%dT%H:%M:%S')
                        destU = u"http://data.eastmoney.com/report/{}/{}.html ".format(date_time.strftime('%Y%m%d'),
                                                                                       article['infoCode'])

                        result = Http.get_content(destU)
                        result = unicode(result, 'GBK').encode('UTF-8')

                        xxsoup = BeautifulSoup(result, 'html.parser')

                        title_tationl = xxsoup.find_all('h1')
                        tt = str(title_tationl[0].text).strip()

                        xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                        sp = xxlist_p_list.find_all('span')

                        ttime = str((sp[1]).text)

                        date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                        # print date_time.strftime('%Y-%m-%d')

                        ttime = date_time.strftime('%Y-%m-%d')

                        # print (sp[2]).text
                        # print (sp[3]).text

                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)

                        fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                        # time _ brokerage _ title _ author

                        print fileName

                        urlsp = sp[-1]

                        basePath = '{}/{}'.format(sdPath, fileName)

                        # print basePath

                        # create the target directory

                        list_pcyc_li = urlsp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')
                            Path.mkdirAndPath(basePath)
                            print ttt

                            Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                            else:
                                # no download URL, so there is nothing to fetch
                                content = ''
                            if len(content) > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
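
The js.aspx endpoint answers with JavaScript shaped like "var LhAYbcgn={...}" rather than bare JSON, which is why the code splits on '=' before json.loads. A sketch of the unwrapping; str.partition cuts only at the first '=', so it also survives '=' characters inside the payload:

    import json

    content = 'var LhAYbcgn={"data":["..."],"pages":"3","count":"61"}'
    _, _, payload = content.partition('=')
    jdata = json.loads(payload)
    print(jdata['pages'])   # -> 3
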
Example #11
    def start(self):
        print ' 中文研报 '

        stockList = []


        for raw_front_page_index in range(1, 251):
            fileN = '策略'

            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)

       #    http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p=2&js=var%20UxmjGoYW={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&
            burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=CLBG&cmd=4&code=&ps=50&p="
            # burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p="
            uu = u"&js=var%20GdYXcAjX={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&"

            url = '%s%s%s' % (burl, str(raw_front_page_index), uu)

            # print url

            content = Http.get_content(url)

            if content:
                try:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:

                        xxxs = str(article).split(',')
                        rticlet = xxxs[0]

                        preTitle = xxxs[5]

                        if '川财' in str(preTitle) or '或' in str(preTitle):
                            continue

                        # if str(preTitle).__contains__('历史') or str(preTitle).__contains__('周期')or str(preTitle).__contains__('成长'):
                        # if str(preTitle).__contains__('政治') or str(preTitle).__contains__('中央经济')or str(preTitle).__contains__('贸易战'):
                        if '日本' in str(preTitle):
                            print preTitle
                            date_time = datetime.datetime.strptime(rticlet, '%Y/%m/%d %H:%M:%S')

                            infoCode = xxxs[1]
                            destU = u"http://data.eastmoney.com/report/{}/cl,{}.html ".format(
                                date_time.strftime('%Y%m%d'), infoCode)

                            print destU

                            result = Http.get_content(destU)
                            result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('h1')
                            tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                            sp = xxlist_p_list.find_all('span')

                            ttime = str((sp[1]).text)

                            date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            # print (sp[2]).text
                            # print (sp[3]).text

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                            # time _ brokerage _ title _ author

                            print fileName

                            urlsp = sp[-1]

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the target directory

                            list_pcyc_li = urlsp.find_all('a')
                            for li in list_pcyc_li:
                                ttt = li.get('href')
                                Path.mkdirAndPath(basePath)
                                print ttt

                                Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    # no download URL, so there is nothing to fetch
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)



                except Exception as e:
                    print('next')
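
The report pages print timestamps as, e.g., 2019年03月08日 14:05; strptime treats the CJK characters as literal text in the format string, which is all the conversion above relies on. A round-trip sketch:

    # -*- coding: utf-8 -*-
    import datetime

    ttime = u'2019年03月08日 14:05'
    date_time = datetime.datetime.strptime(ttime, u'%Y年%m月%d日 %H:%M')
    print(date_time.strftime('%Y-%m-%d'))   # -> 2019-03-08
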
Example #12
    def start(self):
        print 'start 东财股吧 研报'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_list:
            for line in read_list.readlines():
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 3):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)

                burl = u"http://guba.eastmoney.com/list,{},2,f_{}.html"

                content = Http.get_content(burl.format(uux, raw_front_page_index))

                xxsoup = BeautifulSoup(content, 'html.parser')

                tagrt = xxsoup.find_all('div', id='articlelistnew')[0]

                ols = tagrt.find_all('div', class_='articleh normal_post')
                olss = tagrt.find_all('div', class_='articleh normal_post odd')

                splicy = []

                for xxos in ols:
                    splicy.append(xxos)
                for xxos in olss:
                    splicy.append(xxos)

                for inkl in splicy:

                    try:

                        inklinkl = BeautifulSoup(str(inkl), 'html.parser')

                        spp = inklinkl.find_all('span', class_='l3')[0]

                        list_pcyc_li = spp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')

                            print ttt

                            destU = u'http://guba.eastmoney.com{}'.format(ttt)

                            result = Http.get_content(destU)
                            # result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('div', id='zwconttbt')
                            tt = str(title_tationl[0].text).strip()
                            print tt

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            title = title.replace('查看原文', '')

                            ttime = xxsoup.find_all('p', class_='publishdate')[0]

                            tttttime = str(ttime.text)[-10:]

                            print tttttime

                            date_time = datetime.datetime.strptime(tttttime, '%Y-%m-%d')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            fileName = u"{}_{}.pdf".format(ttime, title)
                            # time _ title

                            print fileName

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the target directory

                            spx = xxsoup.find_all('span', class_='zwtitlepdf')[0]

                            pdfu = spx.find_all('a')
                            for li in pdfu:
                                ttt = li.get('href')

                                print ttt
                                Path.mkdirAndPath(basePath)

                                Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    # no download URL, so there is nothing to fetch
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
                    except Exception as e:
                        print('next')
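
With the copy-paste bug fixed above (the second loop originally appended the stale xxos from the first), the two appends can also be collapsed so there is nothing left to keep in sync; a sketch with itertools.chain:

    from itertools import chain

    ols = ['a', 'b']    # stand-ins for the two find_all result lists
    olss = ['c', 'd']

    splicy = list(chain(ols, olss))   # one pass over both lists
    print(splicy)                     # ['a', 'b', 'c', 'd']
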
Example #15
    def start(self):
        print 'start JRJ_Report'

        stockList = []

        stockList.append({'URL': '1', 'NAME': '宏观研究'})
        # stockList.append({'URL': '8', 'NAME': '策略趋势'})

        for xx in stockList:

            for raw_front_page_index in range(5, 50):

                print '开始第' + str(raw_front_page_index) + '页面 下载'

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/Volumes/MacintoshHD/File/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('div', class_="yb_con")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if not 'http://istock.jrj.com.cn/list,yanbao.html' == xxurl:

                            try:

                                result = Http.get_content(xxurl)
                                result = unicode(str(result), 'GBK').encode('UTF-8')

                                xxsoup = BeautifulSoup(result, 'html.parser')

                                # title_tationl = xxsoup.find_all('h1')
                                # tt = str(title_tationl[0].text).strip()

                                xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                                xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                                realu = str(xxlist_p_list).replace(str(xxlist_ds), '', 1)

                                realuxsoup = BeautifulSoup(realu, 'html.parser')

                                sp = str(realuxsoup.text).split(' ')

                                ttime = sp[1]

                                if '发表于' in ttime:
                                    ttime = sp[2]

                                # print (sp[2]).text
                                # print (sp[3]).text

                                # print ttime

                                all_main = xxsoup.find_all('div', class_='main')[0]

                                realuxsoup = BeautifulSoup(str(all_main), 'html.parser')

                                reaupp = realuxsoup.find_all('p')

                                for pp in reaupp:
                                    list_pcyc_li = pp.find_all('a')

                                    for li in list_pcyc_li:
                                        print li.text
                                        ttt = li.get('href')

                                        # print ttt

                                        ftype = 'pdf'

                                        if str(ttt).endswith('.xlsx'):
                                            ftype = 'xlsx'

                                        fileName = u"{}_{}.{}".format(ttime, str(li.text).replace('/', ""), ftype)

                                        print fileName

                                        basePath = '{}/{}'.format(sdPath, fileName)

                                        Path.mkdirAndPath(basePath)

                                        Debug.print_in_single_line(u'开始下载   {}  '.format(ttt))
                                        if ttt:
                                            content = Http.get_content(url=ttt, timeout=180)
                                            if not content:
                                                # Debug.logger.debug(u'文件『{}』下载失败'.format(ttt))
                                                content = ''
                                            else:
                                                Debug.print_in_single_line(u'文件 {} 下载完成'.format(ttt))
                                        else:
                                            # no download URL, so there is nothing to fetch
                                            content = ''
                                        if not os.path.exists(basePath):
                                            if len(content) > 10:
                                                with open(basePath, "wb") as pdf:
                                                    pdf.write(content)
                            except Exception as e:
                                print 'Exception ' + e.message
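
Example #15 special-cases .xlsx attachments with endswith; if more types appear, deriving the extension from the URL's last path segment is less brittle. A sketch, assuming the href ends in a real filename:

    import posixpath

    def guess_ext(url, default='pdf'):
        # extension of the last path segment, query string stripped; fall back to pdf
        ext = posixpath.splitext(url.split('?')[0])[1].lstrip('.')
        return ext.lower() or default

    print(guess_ext('http://example.com/files/report.xlsx'))  # -> xlsx
    print(guess_ext('http://example.com/files/report'))       # -> pdf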