def parsel_detail_one():
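    # Parse locally saved book detail pages (gb18030 HTML), resolve each
    # record's title/provider_subject from the MySQL `detail` table, and
    # stage metadata rows for a bulk insert into modify_title_info_zt.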
    conn_1 = pymysql.connect(host=DBHOST, user=DBUSER, password=DBPWD, database=DB)
    conn_2 = sqlite3.connect('mirrorimutmeixingbook_20191218.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type_ = '1'  # avoid shadowing the built-in type()
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_2 = []
    now_time = time.strftime('%Y%m%d')
    fdir = '%s/%s' % (detail_path, now_time)
    for fname, filename in utils.file_list(fdir):
        rawid = fname.replace(".html", "")
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        html = Selector(text=text, type="html")
        creator = html.xpath(
            "//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract(
            )[0].replace("作者:", "")
        if creator == "unknow":
            creator = ""
        if "ja" in rawid:
            id_ = rawid.replace('ja', '')
            url = "http://202.207.22.13:100/Soft_Showja.asp?SoftID=%s" % id_
            language = "JA"
            country = "JP"
            Lngid = utils.GetLngid(sub_db_id, rawid)
        else:
            language = "EN"
            country = "US"
            url = "http://202.207.22.13:100/Soft_Show.asp?SoftID=%s" % rawid
            Lngid = utils.GetLngid(sub_db_id, rawid)
        sql = "select title, provider_subject from detail where url = %s"
        cur = conn_1.cursor()
        cur.execute(sql, (url,))  # parameterized to avoid quoting/injection issues
        rows = cur.fetchall()
        title = rows[0][0].replace("\n", ' ')
        provider_subject = rows[0][1].replace("数字图书;", '')
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        result_2.append((Lngid, rawid, provider, type_, language, country,
                         provider_url, provider_id, batch, title, creator,
                         provider_subject, date, date_created, medium))
    utils.parse_results_to_sql(conn_2, sql_in, result_2)
    utils.printf("Inserted the remaining %s rows" % len(result_2))
    result_2.clear()
    def parse_detail_one(self, filename, fullname):
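        # Parse one saved ydylcn.com article page: title, source, author and
        # publish date come from the info spans, the body paragraphs form the
        # description; returns one metadata tuple, or False on a parse error.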
        language = 'ZH'
        country = 'CN'
        provider = 'ydylcninfo'
        type_ = 14
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        rawid = filename.replace('.html', '')
        date = '1900'
        date_created = '19000000'
        infotype = fullname.split('\\')[-2]
        url = 'https://www.ydylcn.com/%s/%s.shtml' % (infotype, rawid)
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        lngid = utils.GetLngid('00042', rawid)

        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        try:
            title = sel.xpath('//div[@class="tit"]/h1/text()').extract_first()
            creator = description_source = description = ''
            for divstr in sel.xpath(
                    '//div[@class="info"]/span/text()').extract():
                utils.printf('divstr:%s' % divstr)
                if divstr.startswith('来源:'):
                    description_source = divstr.replace('来源:', '').replace(
                        '、', ';').replace(';', ';').strip()
                elif divstr.startswith('作者:'):
                    creator = divstr.replace('作者:', '').strip().replace(
                        ' ', ';').strip(';')
                elif divstr.startswith('发布时间:'):
                    date_created = divstr.replace('发布时间:',
                                                  '').replace('-', '').strip()
                    if len(date_created) == 6:
                        date_created = date_created + '00'
                    date = date_created[0:4]
            descriptions = sel.xpath(
                "//div[@class='txt']/p[@style='text-align: justify;']/span/text()"
            ).extract()
            for item in descriptions:
                description += item + "\n"

            onemessage = (lngid, rawid, creator, title, description,
                          description_source, date, date_created, language,
                          country, provider, provider_url, provider_id, type_,
                          medium, batch)
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False

        return onemessage
Example #3
 def parse_detail(self):
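     # Walk today's cover and detail folders, parse each saved video page for
     # title/description/breadcrumb categories, and bulk-insert the rows into
     # video.db3; cover paths are set only when a matching .jpg was downloaded.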
     cover_list = []
     cover_now_path = self.coverPath + '\\' + self.now_time
     for root, dirs, files in os.walk(cover_now_path):
         for file in files:
             rawid = file.replace('.jpg','')
             cover_list.append(rawid)
     print(len(cover_list))
     conn = sqlite3.connect("video.db3")
     now_path = self.detailPath + '\\' + self.now_time
     sub_db_id = '203'
     sub_db = 'DMT'
     provider = 'fifeduvideo'
     type_ = '10'
     language = 'ZH'
     country = 'CN'
     date = '1900'
     date_created = '19000000'
     medium = '2'
     sql = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
     result = []
     for root, dirs, files in os.walk(now_path):
         for file in files:
             rawid = file.replace('.html','')
             Lngid = utils.GetLngid(sub_db_id, rawid)
             provider_url = provider + '@' + "http://lib.fifedu.com/toVideoPage.do?id=%s" % rawid
             provider_id = provider + '@' + rawid
             if rawid in cover_list:
                 cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
             else:
                 cover = ''
             batch = str(self.now_time) + '00'
             file_path = root + '/' + file
             print(file_path)
             with open(file_path, encoding='utf8') as f:
                 text = f.read()
             html = Selector(text=text, type='html')
             provider_subject = description = title = ''
             provider_subjects = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[1:-1]
             title = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[-1]
             description = html.xpath("//div[@class='tabvalue']/text()").extract_first('').strip()
             for item in provider_subjects:
                 provider_subject += item + ';'
             provider_subject = provider_subject.replace('在线课程库;', '').replace('玩转多语种;', '').replace('视听练兵场;', '')
             result.append(
                 (Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, medium)
             )
             # if utils.parse_results_to_sql(conn, sql, result, 100):
             #     print("插入%s条" % len(result))
             #     result.clear()
     utils.parse_results_to_sql(conn, sql, result)
     print("Inserted the remaining %s rows" % len(result))
     result.clear()
Example #4
def parse_detail():
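    # Parse saved 360elib book pages: the subject comes from the directory
    # name, the bibliographic fields from ASP.NET span ids, and rows are
    # flushed to zt_template.db3 in batches of 1000.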
    result = []
    conn_db3 = sqlite3.connect("zt_template.db3")
    sql_in = """
    insert into modify_title_info_zt (Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, creator, medium , publisher) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """
    # Basic, constant metadata for every row
    language = 'ZH'
    country = 'CN'
    type_ = '1'
    provider = '360elibbook'
    sub_db_id = '258'
    batch = now_time + '00'
    medium = "2"
    date = "1900"
    date_created = date + "0000"
    fdir = "%s\\%s" % (detail_path, now_time)
    for fname, dir_ in utils.file_list(fdir):
        utils.printf(dir_)
        pa = r"E:\\work\\360elib\\detail\\%s\\(.*)\\" % (now_time)
        provider_subject = re.findall(pa,dir_)[0]
        if provider_subject == 'None':
            provider_subject = ""
    with open(dir_, encoding='utf-8') as f:
            text = f.read()
    html = Selector(text=text, type='html')
    rawid = fname.replace(".html", "")
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_url = provider + '@' + "http://www.360elib.com:2100/chinese/web/Details.aspx?id=%s" % (rawid)
        provider_id = provider + '@' + rawid
        title = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_name']/text()").extract_first()
        creator = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_zz']/text()").extract_first("").replace(", ","").replace(",","").replace(",","").replace("、","")
        publisher = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_cbs']/text()").extract_first("")
        description = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_bookintro']/text()").extract_first("")
        cover_rawid = rawid.lower()
        cover_p = '%s/%s/%s.jpg' % (cover_path,now_time,cover_rawid)
        if os.path.exists(cover_p):
            cover = "/smartlib" + "/" + provider + "/" + cover_rawid + ".jpg"
        else:
            cover = ""
        result.append(
            (Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, creator, medium , publisher)
        )
        if utils.parse_results_to_sql(conn_db3, sql_in, result, 1000):
            utils.printf("Inserted %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn_db3, sql_in, result)
    utils.printf("Inserted the remaining %s rows" % len(result))
    result.clear()
Example #5
    def parse_index_one(self, filename, fullname):
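        # Parse one saved pishu.com.cn video page: the title from the header
        # span, then creation date, keywords and abstract from the labelled
        # spans in the vedioCon block; returns a tuple, or False on error.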
        language = 'ZH'
        country = 'CN'
        provider = 'pishuvideo'
        type_ = 10
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        rawid = filename.replace('.html', '')
        publisher = '社会科学文献出版社'
        date = '1900'
        date_created = '19000000'
        url = 'https://www.pishu.com.cn/skwx_ps/multimedia/VideoDetail?SiteID=14&type=Video&ID=%s' % rawid
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        lngid = utils.GetLngid('00059', rawid)
       
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        try:
            title = sel.xpath('//div[@class="zl_vedioTit"]/span/text()').extract_first()
            creator = title_alternative = identifier_pisbn = title_series = subject = description = ''
            for spanTag in sel.xpath('//div[@class="vedioCon"]/span'):
                spanstr = spanTag.xpath('string(.)').extract_first().strip()
                # utils.printf('trstr:%s' % trstr)
                if spanstr.startswith('制作时间:'):
                    date_created = spanTag.xpath('./following::text()[1]').extract_first().replace('-','')
                    date = date_created[:4]
                elif spanstr.startswith('关键词:'):
                    subject = spanTag.xpath('./following::text()[1]').extract_first().replace(' ',';')
                    subject = re.sub(';+',';',subject).strip().strip(';')
                elif spanstr.startswith('内容摘要:'):
                    # utils.printf('trstr:%s' % trstr)
                    description = spanTag.xpath('./following::text()[1]').extract_first().strip()
                    # utils.printf('identifier_pisbn:%s' % identifier_pisbn)
                
            onemessage = (lngid, rawid, title, subject,description, publisher, date, date_created, language, country,
            provider,provider_url, provider_id,type_, medium, batch)
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False

        return onemessage
Example #6
def parse_detail():
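    # Parse saved book pages (gb18030 HTML) with the _parse_detail_one helper
    # and bulk-insert the metadata into template.db3, printing a running count.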
    conn = sqlite3.connect('template.db3')
    language = "EN"
    type_ = "1"  # avoid shadowing the built-in type()
    medium = "2"
    provider = "mirrorbeihuakingbook"
    country = "US"
    sub_db_id = "217"
    now_time = time.strftime('%Y%m%d')
    batch = now_time + "00"
    stmt = (
        '''insert into modify_title_info_zt(lngid,rawid,title,creator,description,date,date_created,identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);'''
    )
    results = []
    cnt = 0
    dir_path = detail_path + '/' + now_time
    for file, fullpath in utils.file_list(dir_path):
        with open(fullpath, encoding='gb18030') as fp:
            txt = fp.read()
        title, creator, publishers, date, identifier_pisbn, description = _parse_detail_one(txt)
        # print(title, creator, publishers, date, identifier_pisbn, description)
        date_created = date + '0000'
        basename, _, ext = file.partition(".")
        rawid = basename
        provider_url = provider + "@http://10.5.23.18:8079/book/bookinfo.aspx?id=" + rawid
        provider_id = provider + "@" + rawid
        lngID = utils.GetLngid(sub_db_id, rawid)
        results.append(
            (
                lngID, rawid, title, creator, description, date, date_created, identifier_pisbn, language,
                country, provider, provider_url, provider_id, type_, medium, batch, publishers
            )
        )
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            cnt += len(results)
            print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
            results.clear()
    utils.parse_results_to_sql(conn, stmt, results)
    cnt += len(results)
    print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
    conn.close()
    def parse_detail_one(self, filename, fullname):
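        # Parse one saved ydylcn.com report page: author, date, page count,
        # series and source book from the labelled spans, Chinese and English
        # abstracts and keyword lists, plus an author bio; returns a tuple,
        # or False on error.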
        language = 'ZH'
        country = 'CN'
        provider = 'ydylcninfo'
        type_ = 14
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        rawid = filename.replace('.html', '')
        publisher = '社会科学文献出版社'
        date = '1900'
        date_created = '19000000'
        url = 'https://www.ydylcn.com/skwx_ydyl/initDatabaseDetail?siteId=1&contentId=%s&contentType=literature' % rawid
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        lngid = utils.GetLngid('00042', rawid)

        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        try:
            title = sel.xpath('//div[@class="info"]/h1/text()').extract_first()
            creator = pagecount = source = title_series = subject = description = subject_en = creator_bio = ''
            for spanTag in sel.xpath(
                    '//div[@class="info"]/div[@class="con"]/span'):
                spanstr = spanTag.xpath('string(.)').extract_first().strip()
                # utils.printf('spanstr:%s' % spanstr)
                if spanstr.startswith('作者:'):
                    creator = spanstr.replace('作者:',
                                              '').strip().replace(' ', ';')
                elif spanstr.startswith('出版日期:'):
                    date_created = spanstr.replace(
                        '出版日期:', '').strip().replace('年', '').replace('月', '')
                    if len(date_created) == 6:
                        date_created = date_created + '00'
                    date = date_created[0:4]
                elif spanstr.startswith('报告页数:'):
                    pagecount = spanstr.replace('报告页数:',
                                                '').replace('页', '').replace(
                                                    ' ', '').strip()
                elif spanstr.startswith('所属丛书:'):
                    title_series = spanTag.xpath('./a/@title').extract_first()
                elif spanstr.startswith('所属图书:'):
                    source = spanTag.xpath('./a/@title').extract_first()
            description = sel.xpath(
                'string(//div[@class="item"]/div[@class="txt"])'
            ).extract_first(default='').strip()
            description_en = sel.xpath(
                'string(//div[@class="item en"]/div[@class="txt"])'
            ).extract_first(default='').strip()
            for divTag in sel.xpath('//div[@class="keywords"]'):
                divstr = divTag.xpath('string(.)').extract_first().strip()
                if divstr.startswith('关键词:'):
                    for aTag in divTag.xpath('./a/text()'):
                        subject = subject + aTag.extract().strip() + ';'
                    subject = subject.strip(';')
                elif divstr.startswith('Keywords:'):
                    for aTag in divTag.xpath('./a/text()'):
                        subject_en = subject_en + aTag.extract().strip() + ';'
                    subject_en = subject_en.strip(';')
            for divTag in sel.xpath('//div[@class="desc"]/div/p'):
                divstr = divTag.xpath('string(.)').extract_first().strip()
                if divstr.find('暂无简介') < 0:
                    creator_bio = creator_bio + divstr + '\n'
            creator_bio = creator_bio.strip()
            onemessage = (lngid, rawid, creator, title, title_series, subject,
                          subject_en, description, description_en, pagecount,
                          source, creator_bio, publisher, date, date_created,
                          language, country, provider, provider_url,
                          provider_id, type_, medium, batch)
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False

        return onemessage
    def parse_list_one(self, filename, fullname):
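        # Parse one saved ydylcn.com book page: bibliographic fields from the
        # labelled description rows plus the chapter article ids scraped from
        # onclick handlers; returns (metadata tuple, id list), or
        # (False, False) on error.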
        language = 'ZH'
        country = 'CN'
        provider = 'ydylcnbook'
        type_ = 1
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        rawid = filename.replace('.html', '')
        publisher = '社会科学文献出版社'
        date = '1900'
        date_created = '19000000'
        url = 'https://www.ydylcn.com/skwx_ydyl/bookdetail?SiteID=1&ID=%s' % rawid
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        lngid = utils.GetLngid('00041', rawid)
        cover = '/smartlib/ydylcnbook/%s.jpg' % rawid
        cover_path = '%s/%s.jpg' % (self.cover_path, rawid)
        if not os.path.exists(cover_path):
            cover = ''
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        try:
            title = sel.xpath(
                '//div[@class="tit-g1"]/h3/text()').extract_first()
            creator = title_alternative = identifier_pisbn = title_series = subject = description = ''
            for divTag in sel.xpath('//div[@class="desc"]/p'):
                divstr = divTag.xpath('string(.)').extract_first().strip()
                # utils.printf('divstr:%s' % divstr)
                if divstr.startswith('英 文 名:'):
                    title_alternative = divstr.replace('英 文 名:', '').strip()
                elif divstr.startswith('作    者:'):
                    for author in divTag.xpath('./a/text()'):
                        creator = creator + author.extract() + ';'
                    creator = creator.strip(';')
                elif divstr.startswith('I S B N:'):
                    # utils.printf('divstr:%s' % divstr)
                    identifier_pisbn = divstr.replace('I S B N:',
                                                      '').replace('-',
                                                                  '').strip()
                    # utils.printf('identifier_pisbn:%s' % identifier_pisbn)
                elif divstr.startswith('丛 书 名:'):
                    title_series = divstr.replace('丛 书 名:', '').strip()
                elif divstr.startswith('关键词:'):
                    for keyword in divTag.xpath('./a/text()'):
                        subject = subject + keyword.extract() + ';'
                    subject = subject.strip(';')
            description = sel.xpath(
                'string(//div[@class="item"]/div[@class="txt"])'
            ).extract_first().strip()
            description = description.replace('●', '').strip()
            onemessage = (lngid, rawid, creator, title, title_alternative,
                          title_series, cover, subject, identifier_pisbn,
                          description, publisher, date, date_created, language,
                          country, provider, provider_url, provider_id, type_,
                          medium, batch)
            bookdetaillist = []
            for article_id in sel.xpath(
                    '//ul[@class="list-article-1"]/li/h5/a/@onclick'):
                pt = re.compile(r'toGeDataBase\((\d+),.*?\)')
                m = pt.match(article_id.extract())
                if m:
                    # utils.printf('文章号%s' % m.group(1))
                    bookdetaillist.append(m.group(1))
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False, False

        return onemessage, bookdetaillist
Example #9
    def parse_detail_one(self, filename, fullname):
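        # Parse one saved HEP journal article with BeautifulSoup: most fields
        # come from citation_* meta tags, the abstract and author/affiliation
        # tables; superscript markers are matched to authors to build
        # "name[affiliation]" creator strings.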
        try:
            language = 'EN'
            country = 'CN'
            provider = 'hepjournal'
            type_ = 3
            medium = 2
            batch = time.strftime('%Y%m%d') + '00'
            identifier_doi = filename.replace('.html', '').replace('_', '/')
            rawid = identifier_doi
            gch = fullname.split('\\')[-2]
            source, identifier_pissn, identifier_eissn, identifier_cnno = self.dic[
                gch]
            publisher = 'Higher Education Press'
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            soup = BeautifulSoup(text, 'lxml')
            url = ''
            if gch == 'engi':
                url = 'http://engineering.org.cn/EN/%s' % rawid
            elif gch == 'laf' and not rawid.split('/')[0].startswith('10.'):
                identifier_doi = ''
                urlTag = soup.select_one('meta[name*="HW.ad-path"]')
                url = urlTag.get('content').strip()
                rawid = url.replace('http://journal.hep.com.cn/laf/EN/', '')
            else:
                url = 'http://journal.hep.com.cn/%s/EN/%s' % (gch, rawid)
            provider_url = provider + '@' + url
            provider_id = provider + '@' + rawid
            gch = provider + "@" + gch
            lngid = utils.GetLngid('00025', rawid)

            title = ''
            titleTag = soup.select_one('div.J_biaoti_en')
            if titleTag:
                title = ''.join(titleTag.stripped_strings)

            description = ''
            for tdTag in soup.select('td[class="J_zhaiyao"]'):
                if tdTag.select_one('p'):
                    description = ''.join(tdTag.p.stripped_strings)
                    break
                bTag = tdTag.select_one('b')
                if bTag:
                    if bTag.get_text() == 'Abstract':
                        description = ''.join(tdTag.stripped_strings).replace(
                            'Abstract', '')
                        break
            date_created = date = ''
            dateTag = soup.select_one('meta[name*="DC.Date"]')
            if dateTag:
                date_created = dateTag.get('content').replace('-', '')
            else:
                date_created = '19000000'
            if date_created == '':
                for spanTag in soup.select('span[class*="J_zhaiyao"]'):
                    strspan = ''.join(spanTag.stripped_strings)
                    if strspan.startswith('Online First Date:'):
                        date_created = strspan.replace('Online First Date:',
                                                       '').strip()
                        date_created = replacedate(date_created)
                        break

            date = date_created[:4]

            subject = ''
            subjectTag = soup.select_one('meta[name*="keywords"]')
            if subjectTag:
                subject = subjectTag.get('content').replace(',',
                                                            ';').strip(';')
                subject = re.sub(r'</?\w+[^>]*>', '', subject)
                if subject == '&nbsp':
                    subject = ''

            beginpage = ''
            beginpageTag = soup.select_one('meta[name*="citation_firstpage"]')
            if beginpageTag:
                beginpage = beginpageTag.get('content').strip()

            endpage = ''
            endpageTag = soup.select_one('meta[name*="citation_lastpage"]')
            if endpageTag:
                endpage = endpageTag.get('content').strip()

            if endpage == '':
                endpage = beginpage

            page = ''
            if not beginpage == '':
                page = beginpage + '-' + endpage

            volume = ''
            volumeTag = soup.select_one('meta[name*="citation_volume"]')
            if volumeTag:
                volume = volumeTag.get('content').strip()

            issue = ''
            issueTag = soup.select_one('meta[name*="citation_issue"]')
            if issueTag:
                issue = issueTag.get('content').strip()

            creator = ''
            authorTag = soup.select_one('td[class="J_author_EN"]')
            if authorTag:
                if authorTag.select_one('sup'):
                    supTags = authorTag.select('sup')
                    suplist = []
                    tmpsup = ''
                    cnt = 0
                    authorlist = []
                    for supTag in supTags:
                        strsup = supTag.get_text()
                        tmpsup += strsup
                        cnt += 1
                        nextTag = supTag.next_sibling
                        beforeTag = supTag.previous_sibling
                        if isinstance(beforeTag, NavigableString):
                            author = beforeTag.replace('(', '').replace(
                                ')', '').strip().strip(',')
                            if not author == '':
                                authorlist.append(author)

                        # supTag.extract()
                        if isinstance(nextTag, NavigableString):
                            if nextTag == '(':
                                suplist.append(tmpsup.strip(','))
                                tmpsup = ''
                                continue
                        if not tmpsup.endswith(','):
                            suplist.append(tmpsup)
                            tmpsup = ''
                        elif cnt == len(supTags):
                            suplist.append(tmpsup.strip(','))
                    # tmpauthor = ''.join(authorTag.stripped_strings)
                    # tmpauthor = tmpauthor.replace('(', '').replace(')', '').strip().strip(',')
                    # authorlist = tmpauthor.split(',')

                    if len(authorlist) == len(suplist):
                        for i in range(len(authorlist)):
                            creator = creator + '%s[%s];' % (authorlist[i],
                                                             suplist[i])
                    elif len(authorlist) == len(supTags):
                        for i in range(len(authorlist)):
                            creator = creator + '%s[%s];' % (
                                authorlist[i],
                                supTags[i].get_text().strip(','))
                    # print(authorlist)
                    # print(suplist)
                    if creator == '':
                        for i in range(len(authorlist)):
                            if len(authorlist) < len(suplist):
                                creator = creator + '%s[%s];' % (authorlist[i],
                                                                 suplist[i])
                            else:
                                creator = creator + authorlist[i] + ';'
                    creator = creator.strip(';')

                else:
                    creator = ''.join(authorTag.stripped_strings)
                    creator = creator.replace('(',
                                              '').replace(')', '').replace(
                                                  ',', ';').strip()
            creator = re.sub(r';\s*', ';', creator)
            insTag = soup.select_one('td[class="J_author"]')
            creator_institution = ''
            if insTag:
                for brTag in insTag.select('br'):
                    brTag.insert_after(soup.new_string(";"))
                affiliation = ''.join(insTag.stripped_strings)
                affiliation = re.sub(r'\n', '', affiliation)
                for ins in affiliation.split(';'):
                    ins = ins.strip()
                    ptins = re.compile(r'(\w{1,2})\.\s*(.*)')
                    m = ptins.match(ins)
                    if m:
                        creator_institution = creator_institution + '[%s]%s;' % (
                            m.group(1), m.group(2))
                if creator_institution == '':
                    creator_institution = affiliation
                creator_institution = creator_institution.strip(';')

            onemessage = (lngid, rawid, creator, title, volume, issue, page,
                          beginpage, endpage, publisher, subject, date,
                          creator_institution, date_created, source,
                          identifier_pissn, identifier_eissn, identifier_cnno,
                          description, identifier_doi, language, country,
                          provider, provider_url, provider_id, type_, medium,
                          batch, gch)
            return onemessage
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False
Example #10
def parse_detail():
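    # Walk the saved fzwjt.com course pages, derive the category from the
    # directory layout, fetch each lecturer bio with a follow-up request,
    # and bulk-insert the rows into video.db3.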
    conn = sqlite3.connect('video.db3')
    cover_list = []
    now_time = time.strftime('%Y%m%d')
    cover_now_path = cover_path + '\\' + now_time
    for root, dirs, files in os.walk(cover_now_path):
        for file in files:
            rawid = file.replace('.jpg', '')
            cover_list.append(rawid)
    sub_db_id = '223'
    provider = 'fzwjtvideo'
    type_ = '10'  # avoid shadowing the built-in type()
    language = 'ZH'
    country = 'CN'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    result = []
    sql = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, title_sub, creator, creator_bio, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    for fname, filename in utils.file_list(detail_path):
        replace_1 = r'E:\work\fzwjt\detail' + '\\' + now_time + '\\'
        rawid = fname.replace(".html", '')
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_subject = filename.replace(replace_1,
                                            '').replace('\\' + fname, '')
        provider_url = provider + '@' + "http://www.fzwjt.com/Course/Detail/%s" % rawid
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        cover = ''
        if rawid in cover_list:
            cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
        with open(filename, encoding='utf8') as f:
            text = f.read()
        html = Selector(text=text, type='html')
        title_sub = creator_bio = ""
        title = html.xpath(
            "//div[@class='cInfo']/dl[@class='base']/dt/b/text()"
        ).extract_first('')
        description = html.xpath(
            "//div[@class='cInfo']/dl[@class='base']/dd[@class='info']/text()"
        ).extract_first('')
        title_subs = html.xpath(
            "//ul[@class='courseListL']/li/p/a/text()").extract()
        for item in title_subs:
            title_sub += item + ';'
        title_sub = title_sub[0:-1]
        creator = html.xpath(
            "//ul[@class='courseListL']/li/span[2]/text()").extract_first("")
        if creator == "解说":
            creator = ""
            creator_bio = ""
        else:
            # Fetch one URL from the next level down to get the lecturer bio
            a = "http://www.fzwjt.com" + html.xpath(
                "//ul[@class='courseListL']/li/p/a/@href").extract_first("")
            feature = 'tagB'
            res = utils.get_html(a, feature=feature, proxies=proxy, timeout=50)
            if res:
                html_2 = Selector(res.text, 'html')
                creator_bio = html_2.xpath(
                    "//div[@class='tagB']/p/text()").extract_first("").replace(
                        "&quot;", '').strip()
        utils.printf(title, 'write right')
        # utils.printf(title,creator,creator_bio)
        result.append(
            (Lngid, rawid, provider, type_, language, country, provider_url,
             provider_id, cover, batch, title, description, title_sub, creator,
             creator_bio, provider_subject, date, date_created, medium))
        if utils.parse_results_to_sql(conn, sql, result, 100):
            print("Inserted %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("Inserted the remaining %s rows" % len(result))
    result.clear()
    def parse_detail_one(self, filename, fullname, db3type):
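        # Parse one saved engineering.org.cn JSON record ('resultValue'):
        # normalize the dates, strip markup from title/abstract/keywords,
        # match author superscripts to affiliations, and return a zt-style
        # or meta-style tuple depending on db3type.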
        try:
            language = 'EN'
            country = 'CN'
            provider = 'hepengineeringjournal'
            type_ = 3
            medium = 2
            batch = time.strftime('%Y%m%d') + '00'
            identifier_doi = ''
            rawid = ''
            gch = fullname.split('\\')[-2]
            source, identifier_pissn, identifier_cnno = self.dic[gch]
            publisher = 'Higher Education Press'
            with open(fullname, encoding='utf8') as f:
                text = f.read()
            dicitem = json.loads(text)['resultValue']  # json.loads() dropped the encoding argument in Python 3.9

            date_created = dicitem['publicTime']
            if date_created == '':
                date_created = dicitem['onlineTime']
            if date_created != '':
                date_created = replacedate(date_created)
            elif dicitem['year'] != '':
                date_created = str(dicitem['year']) + '0000'
            else:
                date_created = '19000000'
            date = date_created[:4]
            identifier_doi = dicitem['doiNm']
            rawid = filename.replace('.json', '')

            is_oa = dicitem['isOa']

            # if identifier_doi == '':
            #     articlenum_dic = json.loads(dicitem['attachment'],encoding='utf-8')
            #     if articlenum_dic.__contains__('fileName'):
            #         articlenum = articlenum_dic['fileName'].strip('.pdf')
            #     else:
            #         articlenum = articlenum_dic['key'].split('_')[-1].strip('.pdf')
            #     url = 'http://www.engineering.org.cn/en/article/%s' % articlenum
            # else:
            #     url = 'http://www.engineering.org.cn/en/%s' % identifier_doi
            url = 'http://www.engineering.org.cn/default/page/loadPageIndex?pageId=ab4265bb601844d298ec9cd21f046661&id=%s' % rawid.split(
                '_')[-1]
            provider_url = provider + '@' + url
            provider_id = provider + '@' + rawid
            gch = provider + "@" + gch
            lngid = utils.GetLngid('00036', rawid)

            beginpage = str(dicitem['pageStart'])
            endpage = str(dicitem['pageEnd'])
            if endpage == '' or endpage == '0':
                endpage = beginpage
            page = ''
            if not beginpage == '':
                page = beginpage + '-' + endpage

            volume = dicitem['volumeNm']
            issue = dicitem['issueNm']

            dr = re.compile(r'<[^>]+>', re.S)

            subject = dicitem['keyWords'].replace(',',
                                                  ';').replace(';',
                                                               ';').strip()
            subject = re.sub(r';\s+', ';', subject)
            subject = re.sub(r'\s+;', ';', subject)
            subject = dr.sub('', subject)

            title = dicitem['title']
            title = dr.sub('', title)

            description = dicitem['summary']
            description = dr.sub('', description)
            if description == '' or description == '&nbsp;':
                description = dicitem['content']
                soup = BeautifulSoup(description, 'lxml')
                description = soup.get_text()
            description = description.strip()

            author_1st = ''
            creator = ''
            # authortext = dicitem['articleAuthor'].replace('&eacute;','é').replace('&egrave;','è').replace('&rsquo;','\'')
            authortext = dicitem['articleAuthor']
            if authortext.find('<sup>') > 0:
                authortext = authortext.replace('&nbsp;', '').replace(
                    '</sup>, ', '</sup>、')
                for author in authortext.split('、'):
                    author = author.strip()
                    # utils.printf(author)
                    ptsup = re.compile('.*?(<sup>(.*)</sup>).*?')
                    m = ptsup.match(author)
                    if m:
                        dauthor = author.replace(m.group(1), '').strip()
                        dauthor = dr.sub('', dauthor)
                        if author_1st == '':
                            author_1st = dauthor
                        creator = creator + '%s[%s];' % (
                            dauthor, dr.sub('',
                                            m.group(2).strip().strip(',')))
            else:
                creator = authortext.replace('、&nbsp; ', ';').replace(
                    '、 ', ';').replace('、', ';')
                creator = dr.sub('', creator)
                creator = creator.replace('&nbsp;', '')
                if author_1st == '':
                    author_1st = creator.split(';')[0]
            creator = creator.strip(';').replace(' and ', ';')
            creator = html.unescape(creator)

            organ_1st = ''
            creator_institution = ''
            institutiontext = dicitem['authorUnit']
            if creator.find('[') > -1:
                if institutiontext.find('<sup>') > -1:
                    institutiontext = institutiontext.replace(
                        '<sup>', '<br /><sup>')
                    for ins in institutiontext.split('<br />'):
                        ins = ins.strip()
                        ptsup = re.compile('.*(<sup>(.*?)</sup>).*')
                        m = ptsup.match(ins)
                        if m:
                            dins = ins.replace(m.group(1), '')
                            dins = dr.sub('', dins).strip()
                            if organ_1st == '':
                                organ_1st = dins.strip('. ')
                            creator_institution = creator_institution + '[%s]%s;' % (
                                m.group(2).strip(), dins.strip('. '))
                elif institutiontext.find('<p>') > -1:
                    soup = BeautifulSoup(institutiontext, 'lxml')
                    ptp = re.compile(r'^(\w)\s?\.\s?(.*?)$')
                    for pTag in soup.select('p'):
                        ptext = pTag.get_text()
                        m = ptp.match(ptext)
                        if m:
                            if organ_1st == '':
                                organ_1st = m.group(2).strip()
                            creator_institution = creator_institution + '[%s]%s;' % (
                                m.group(1).strip(), m.group(2).strip())
                else:
                    creator_institution = dr.sub('', institutiontext)
            else:

                creator_institution = dr.sub('', institutiontext)
            creator_institution = creator_institution.replace(
                '&nbsp;', '').replace('&#39;',
                                      '\'').strip(';').replace(';;', ';')
            organ_1st = organ_1st.replace('&nbsp;', '').replace(
                '&#39;', '\'').strip(';').replace(';;', ';')
            organ_1st = html.unescape(organ_1st)
            creator_institution = html.unescape(creator_institution)
            creator_institution = creator_institution.replace('(', '').replace(
                ')', '').replace('(', '').replace(')', '')

            if db3type == 'zt':
                onemessage = (lngid, rawid, creator, title, volume, issue,
                              page, beginpage, endpage, publisher, subject,
                              date, creator_institution, date_created, source,
                              identifier_pissn, is_oa, identifier_cnno,
                              description, identifier_doi, language, country,
                              provider, provider_url, provider_id, type_,
                              medium, batch, gch)
                return onemessage

            recv_date = dicitem['receiveTime']
            if recv_date != '':
                recv_date = replacedate(recv_date)
            accept_date = dicitem['onlineTime']
            if accept_date != '':
                accept_date = replacedate(accept_date)
            revision_date = dicitem['backTime']
            if revision_date != '':
                revision_date = replacedate(revision_date)
            journal_raw_id = fullname.split('\\')[-2]

            cited_cnt = dicitem['citedCount']
            if cited_cnt:
                cited_cnt = str(cited_cnt)
            else:
                cited_cnt = '0'  # keep the type consistent with the str branch
            down_cnt = dicitem['downloadCount']
            if down_cnt:
                down_cnt = str(down_cnt)
            else:
                down_cnt = '0'

            sub_db = 'QK'
            product = 'ENGINEERING'
            provider = 'HEP'
            sub_db_id = '00036'
            batch = time.strftime("%Y%m%d_%H%M%S")
            down_date = time.strftime("%Y%m%d")
            if dicitem['ossKey']:
                fulltext_type = 'pdf'
            else:
                fulltext_type = ''

            onemessage = (creator, author_1st, creator_institution, organ_1st,
                          title, subject, date, recv_date, accept_date,
                          revision_date, date_created, volume, issue,
                          journal_raw_id, source, page, beginpage, endpage,
                          is_oa, cited_cnt, down_cnt, lngid, rawid, product,
                          sub_db, provider, sub_db_id, type_, url, country,
                          language, batch, down_date, publisher,
                          identifier_pissn, identifier_cnno, description,
                          identifier_doi, fulltext_type)
            if db3type == 'meta':
                return onemessage
            else:
                return False

        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False
Example #12
    def parse_list_one(self, filename, fullname):
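        # Parse one saved pishu.com.cn book page: bibliographic fields from
        # the labelled table rows, the summary block as the description, and
        # chapter article ids from onclick handlers; returns
        # (metadata tuple, id list), or (False, False) on error.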
        language = 'ZH'
        country = 'CN'
        provider = 'pishubook'
        type_ = 1
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        rawid = filename.replace('.html', '')
        publisher = '社会科学文献出版社'
        date = '1900'
        date_created = '19000000'
        url = 'https://www.pishu.com.cn/skwx_ps/bookdetail?SiteID=14&ID=%s' % rawid
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        lngid = utils.GetLngid('00056', rawid)
        cover = '/smartlib/pishubook/%s/%s.jpg' % (rawid[:2], rawid)
        cover_path = '%s/%s/%s.jpg' % (self.cover_path, rawid[:2], rawid)
        if not os.path.exists(cover_path):
            cover = ''
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        try:
            title = sel.xpath('//h3[@class="Buy_tit2"]/text()').extract_first()
            creator = title_alternative = identifier_pisbn = title_series = subject = description = ''
            for trTag in sel.xpath(
                    '//div[@class="books margintop10"]/table/tbody/tr'):
                trstr = trTag.xpath('string(.)').extract_first().strip()
                # utils.printf('trstr:%s' % trstr)
                if trstr.startswith('英 文 名:'):
                    title_alternative = trstr.replace('英 文 名:', '').strip()
                elif trstr.startswith('作 者:'):
                    for author in trTag.xpath('./td/a/text()'):
                        creator = creator + author.extract() + ';'
                    creator = creator.strip(';')
                elif trstr.startswith('I S B N:'):
                    # utils.printf('trstr:%s' % trstr)
                    identifier_pisbn = trstr.replace('I S B N:',
                                                     '').replace('-',
                                                                 '').strip()
                    # utils.printf('identifier_pisbn:%s' % identifier_pisbn)
                elif trstr.startswith('丛 书 名:'):
                    title_series = trstr.replace('丛 书 名:', '').strip()
                elif trstr.startswith('关 键 词:'):
                    for keyword in trTag.xpath('./td/a/text()'):
                        subject = subject + keyword.extract() + ';'
                    subject = subject.strip(';')
                elif trstr.startswith('出版日期:'):
                    date_created = trstr.replace('出版日期:',
                                                 '').strip().replace('-', '')
                    date = date_created[:4]
            description = sel.xpath(
                'string(//div[@class="main_right fr margintop20"]/div/div[@class="summaryCon"])'
            ).extract_first(default='').strip('<<').strip()
            description = description.replace('●', '').replace('•', '').strip()
            onemessage = (lngid, rawid, creator, title, title_alternative,
                          title_series, cover, subject, identifier_pisbn,
                          description, publisher, date, date_created, language,
                          country, provider, provider_url, provider_id, type_,
                          medium, batch)
            bookdetaillist = []
            for article_id in sel.xpath(
                    '//ul[@class="w_checkbox"]/li/a/@onclick'):
                pt = re.compile(r'toGeDataBase\((\d+),.*?\)')
                m = pt.match(article_id.extract())
                if m:
                    # utils.printf('文章号%s' % m.group(1))
                    bookdetaillist.append(m.group(1))
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False, False

        return onemessage, bookdetaillist
Example #13
    def parse_detail_one(self, filename, fullname, db3type):
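        # Parse one saved arc.aiaa.org book page: DOI-derived ids, authors,
        # print/electronic ISBNs, the series from the <title> tag, the
        # abstract, and the PDF price; returns a tuple, or False on error.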
        language = 'EN'
        country = 'US'
        provider = 'aiaabook'
        type_ = 1
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        identifier_doi = '10.2514/' + filename.replace('.html', '')
        rawid = identifier_doi
        lngid = utils.GetLngid('00108', rawid)
        provider_url = provider + '@' + 'https://arc.aiaa.org/doi/book/' + identifier_doi
        provider_id = provider + '@' + identifier_doi
        publisher = 'American Institute of Aeronautics and Astronautics'

        date = self.dic[identifier_doi]  # DOI -> publication year map built elsewhere
        date_created = date + '0000'
        cover = ''
        cover_path = '%s/%s.jpg' % (self.cover_path, identifier_doi)
        if os.path.exists(cover_path):
            cover = '/smartlib/aiaabook/%s.jpg' % identifier_doi

        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        creator = description = ''
        try:
            identifier_pisbn = identifier_eisbn = title_series = price = ''
            title = sel.xpath('//h5[@class="teaser__group-title"]/text()'
                              ).extract_first().strip()
            # if title == '':
            #     title = sel.xpath('//h1[@class="citation__title"]/text()').extract_first(default='').strip()
            creator = sel.xpath(
                'string(//ul[@class="rlist--inline loa mobile-authors"])'
            ).extract_first(default='').strip().replace('&nbsp;', ' ').replace(
                ' and ', ';')
            creator = creator.strip(';').replace(',', ';')
            creator = re.sub(r'\s+;', ';', creator)
            creator = re.sub(r';\s+', ';', creator)
            for divTag in sel.xpath('//div[@class="teaser__item"]'):
                divstr = divTag.xpath('./text()').extract_first(
                    default='').strip()
                if divstr.startswith('ISBN (print):'):
                    identifier_pisbn = divstr.replace('ISBN (print):',
                                                      '').strip().replace(
                                                          '-', '')
                elif divstr.startswith('eISBN:'):
                    identifier_eisbn = divstr.replace('eISBN:',
                                                      '').strip().replace(
                                                          '-', '')

            title_series = sel.xpath('//head/title/text()').extract_first(
                default='')
            title_series = title_series.split('|')[-1].strip()

            description = sel.xpath('string(//div[@class="NLM_abstract"])'
                                    ).extract_first(default='').strip()
            if description.startswith('Description'):
                description = description[11:].strip()
            elif description.startswith('About the Book'):
                description = description[14:].strip()
            for divpriceTag in sel.xpath(
                    '//div[@class="book-product__content"]'):
                pricelist = divpriceTag.xpath(
                    './div/span[@class="book-product__price__value"]/text()'
                ).extract()
                index = len(pricelist)
                if index > 0:
                    price = pricelist[index - 1].strip()
                product_header = divpriceTag.xpath(
                    './h4/text()').extract_first(default='')
                if product_header == 'PDF':
                    break
            onemessage = (lngid, rawid, creator, title, identifier_pisbn,
                          identifier_eisbn, description, publisher, cover,
                          title_series, date, date_created, price, language,
                          country, provider, provider_url, identifier_doi,
                          provider_id, type_, medium, batch)

            if db3type == 'zt':
                return onemessage

            # keyword = subject
            # subject = ''
            # journal_raw_id = fullname.split('\\')[-2]
            # for sub_str in sel.xpath('//div[@class="jonListTitle"]/a/text()'):
            #     sub_str = sub_str.extract().strip()
            #     if sub_str == '首页':
            #         continue
            #     subject = subject + sub_str + ';'
            # subject = subject.strip(';')

            # down_cnt = divTag.xpath('./ul/li/span[@class="span02"]/text()').extract_first().replace('下载量:','').strip().replace(',','')
            # fulltext_type = ''
            # pdfTag = divTag.xpath('./ul/li/a[@id="clicknumber"]').extract_first()
            # if pdfTag:
            #     fulltext_type = 'pdf;'
            # htmlTag =  divTag.xpath('./ul/li/span[@id="ctl00_ContentPlaceHolder1_html_show"]/a')
            # if htmlTag:
            #     fulltext_type += 'html;'
            # xmlTag =  divTag.xpath('./ul/li/span[@id="ctl00_ContentPlaceHolder1_xml_show"]/a')
            # if xmlTag:
            #     fulltext_type += 'xml;'
            # fulltext_type = fulltext_type.strip(';')
            # product = 'HANS'
            # sub_db = 'QK'
            # provider = 'HANS'
            # sub_db_id = '00046'
            # provider_url = url
            # batch = time.strftime("%Y%m%d_%H%M%S")
            # down_date = time.strftime("%Y%m%d")
            # down_cnt = down_cnt + '@' + down_date
            # # utils.printf(subject,down_cnt,fulltext_type)
            # refTaglist = divTag.xpath('./div/table/tr/td[@width="45"]')
            # ref_cnt = ''
            # if len(refTaglist) > 0:
            #     ref_cnt = refTaglist[-1].xpath('string(.)').extract_first().strip().replace('[','').replace(']','')

            # onemessage = (creator,author_1st,creator_institution,organ_1st,title,title_alternative,keyword,date,
            # date_created,volume,issue,journal_raw_id,source,source_en,page,beginpage,endpage,subject,is_oa,down_cnt,
            # lngid,rawid,product,sub_db,provider,sub_db_id,type_,provider_url,country,language,batch,down_date,publisher,
            # identifier_pissn,identifier_eissn,description,description_en,identifier_doi,description_fund,ref_cnt,
            # fulltext_type)
            # if db3type == 'meta':
            #     return onemessage
            # else:
            #     return False
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return False
Example #14
    def parse_detail(self):
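        # Pull unprocessed rows from the list table, fetch each book page,
        # follow its iframe to the iyangcong.com catalog endpoint, and build
        # a bilingual table of contents for the description field.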
        cover_list = []
        cover_now_path = self.coverPath + '\\' + self.now_time
        for root, dirs, files in os.walk(cover_now_path):
            for file in files:
                rawid = file.replace('.jpg', '')
                cover_list.append(rawid)
        print(len(cover_list))
        conn = sqlite3.connect("book.db3")
        sub_db_id = '202'
        sub_db = 'TS'
        provider = 'fifedubook'
        type_ = '1'
        language = 'ZH'
        country = 'CN'
        date = '1900'
        date_created = '19000000'
        medium = '2'
        result = []
        result_2 = []
        sql_insert = "insert into modify_title_info_zt(title, rawid, Lngid, provider, provider_id, provider_url, cover, batch, description_unit, type, language, country, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
        sql_up = 'update list set stat = 1 where rawid = %s'
        while True:
            sql = "select rawid, url, title from list where stat = 0 limit 1000"
            cur = self.conn.cursor()
            cur.execute(sql)
            rows = cur.fetchall()
            if len(rows) == 0:
                break
            else:
                for rawid, url, title in rows:
                    print(title)
                    Lngid = utils.GetLngid(sub_db_id, rawid)
                    provider_url = provider + '@' + "http://lib.fifedu.com/toVideoPage.do?id=%s" % rawid
                    provider_id = provider + '@' + rawid
                    if rawid in cover_list:
                        cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
                    else:
                        cover = ''
                    batch = str(self.now_time) + '00'
                    try:
                        res = requests.get(url,
                                           headers=self.headers,
                                           proxies=self.proxy)
                        fe = 'iframe'
                        if res.status_code == 200:
                            if res.text.find(fe) > 0:
                                html = Selector(res.text, type='html')
                                mulu_id = html.xpath(
                                    "//div[@align='center']/iframe/@src"
                                ).extract_first('').replace(
                                    'http://www.iyangcong.com/service/ilearning/reading?id=',
                                    '')
                                mulu_url = 'http://www.iyangcong.com/book/catalog/1/%s' % mulu_id
                                res_mulu = requests.get(mulu_url)
                                if res_mulu.text == 'null':
                                    mulu_url = 'http://www.iyangcong.com/book/catalog/10/%s' % mulu_id
                                    res_mulu = requests.get(mulu_url)
                                mulu_list = json.loads(res_mulu.text)
                                mulu_zh = mulu_en = ""
                                # Each catalog title arrives wrapped in layout
                                # markup; strip the same fixed set of tags from
                                # both the Chinese and English titles.
                                tags = ('<p>', '</p>', '<font color=#003D79>',
                                        '<font color=#844200>',
                                        '<font color=brown>', '</font>',
                                        '<center>', '</center>', '<CENTER>',
                                        '</CENTER>', '[^1]', '<ol type="a">',
                                        '<ol>', '</ol>', '<li></li>', '<li>',
                                        '</li>', '<u>', '</u>', '<strong>',
                                        '</strong>', '<h1>', '</h1>', '<h2>',
                                        '</h2>', '<div align="center">',
                                        '</div>')
                                for mulu in mulu_list:
                                    title_zh = mulu['title_zh']
                                    title_en = mulu['title_en']
                                    for tag in tags:
                                        title_zh = title_zh.replace(tag, '')
                                        title_en = title_en.replace(tag, '')
                                    title_zh = title_zh.strip()
                                    title_en = title_en.strip()
                                    if title_zh:
                                        mulu_zh += title_zh + ';'
                                    if title_en:
                                        mulu_en += title_en + ';'
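                                # Prefer the Chinese TOC; fall back to English
                                # when every Chinese title came back empty.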
                                if mulu_zh.replace(';', '') == '':
                                    description_unit = mulu_en
                                else:
                                    description_unit = mulu_zh
                                # print(description_unit)
                                result.append(
                                    (title, rawid, Lngid, provider,
                                     provider_id, provider_url, cover, batch,
                                     description_unit, type_, language,
                                     country, date, date_created, medium))
                                result_2.append((rawid,))  # 1-tuple: executemany needs a row sequence, not a bare string
                                if utils.parse_results_to_sql(
                                        conn, sql_insert, result, 100):
                                    print("插入%s条" % len(result))
                                    result.clear()
                                if utils.parse_results_to_sql(
                                        self.conn, sql_up, result_2, 100):
                                    print("更新%s条" % len(result_2))
                                    result_2.clear()
                            else:
                                print('no iframe found on detail page')
                    except Exception as e:
                        print(e)
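                # Final flush: write whatever is still buffered for this page of rows.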
                utils.parse_results_to_sql(conn, sql_insert, result)
                print("插入%s条" % len(result))
                result.clear()
                utils.parse_results_to_sql(self.conn, sql_up, result_2)
                print("更新%s条" % len(result_2))
                result_2.clear()
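Both methods above lean on utils.parse_results_to_sql for buffered batch writes without ever showing it. The sketch below is a minimal reconstruction inferred only from the call sites in these snippets; the function name appears in the examples, but this body, its num parameter, and its return convention are assumptions about the internal utils module, not its actual source.

def parse_results_to_sql(conn, sql, results, num=None):
    """Hypothetical batching helper (assumed, not the real utils module)."""
    # With num set, only flush once the buffer holds at least num rows and
    # report that to the caller (who then clears the list); with num=None,
    # flush everything immediately.
    if num is not None and len(results) < num:
        return False
    cur = conn.cursor()
    cur.executemany(sql, results)  # one round trip for the whole batch
    conn.commit()
    return True

Committing once per batch rather than once per row is what keeps the insert loops cheap; the caller owns the buffer, so the helper never mutates results itself.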