def parsel_detail_one():
    # keyword arguments for pymysql (newer versions require them)
    conn_1 = pymysql.connect(host=DBHOST, user=DBUSER, password=DBPWD, database=DB)
    conn_2 = sqlite3.connect('mirrorimutmeixingbook_20191218.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_in = ("insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, "
              "provider_url, provider_id, batch, title, creator, provider_subject, date, "
              "date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
    result_2 = []
    now_time = time.strftime('%Y%m%d')
    fdir = '%s/%s' % (detail_path, now_time)
    for _, filename in utils.file_list(fdir):
        rawid = _.replace(".html", "")
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, "html")
        creator = html.xpath(
            "//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract()[0].replace("作者:", "")
        if creator == "unknow":
            creator = ""
        if "ja" in rawid:
            id_ = rawid.replace('ja', '')
            url = "http://202.207.22.13:100/Soft_Showja.asp?SoftID=%s" % id_
            language = "JA"
            country = "JP"
        else:
            language = "EN"
            country = "US"
            url = "http://202.207.22.13:100/Soft_Show.asp?SoftID=%s" % rawid
        Lngid = utils.GetLngid(sub_db_id, rawid)
        # parameterized query instead of string interpolation
        cur = conn_1.cursor()
        cur.execute("select title, provider_subject from detail where url = %s", (url,))
        rows = cur.fetchall()
        title = rows[0][0].replace("\n", ' ')
        provider_subject = rows[0][1].replace("数字图书;", '')
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        result_2.append((Lngid, rawid, provider, type, language, country, provider_url,
                         provider_id, batch, title, creator, provider_subject, date,
                         date_created, medium))
    utils.parse_results_to_sql(conn_2, sql_in, result_2)
    utils.printf("inserted remaining %s rows" % len(result_2))
    result_2.clear()
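# The batching helper utils.parse_results_to_sql is used above (and throughout the
# parsers below) but is not defined in this file. The sketch below is a minimal guess
# at the behaviour these call sites assume: buffer rows, flush them with executemany
# once the batch is full, and report whether a flush happened so the caller can clear
# its list. Name and signature mirror the calls above; the body is an assumption, not
# the project's actual implementation.
def parse_results_to_sql(conn, sql, results, batch_size=None):
    if batch_size is not None and len(results) < batch_size:
        return False                 # keep accumulating rows
    cur = conn.cursor()
    cur.executemany(sql, results)    # one round trip for the whole batch
    conn.commit()
    return True                      # caller clears the buffer after a flush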
def parse_detail_one(self, filename, fullname):
    language = 'ZH'
    country = 'CN'
    provider = 'ydylcninfo'
    type_ = 14
    medium = 2
    batch = time.strftime('%Y%m%d') + '00'
    rawid = filename.replace('.html', '')
    date = '1900'
    date_created = '19000000'
    infotype = fullname.split('\\')[-2]
    url = 'https://www.ydylcn.com/%s/%s.shtml' % (infotype, rawid)
    provider_url = provider + '@' + url
    provider_id = provider + '@' + rawid
    lngid = utils.GetLngid('00042', rawid)
    with open(fullname, encoding='utf8') as f:
        text = f.read()
    sel = Selector(text=text)
    try:
        title = sel.xpath('//div[@class="tit"]/h1/text()').extract_first()
        creator = description_source = description = ''
        for divstr in sel.xpath('//div[@class="info"]/span/text()').extract():
            utils.printf('divstr:%s' % divstr)
            if divstr.startswith('来源:'):
                description_source = divstr.replace('来源:', '').replace('、', ';').replace(';', ';').strip()
            elif divstr.startswith('作者:'):
                creator = divstr.replace('作者:', '').strip().replace(' ', ';').strip(';')
            elif divstr.startswith('发布时间:'):
                date_created = divstr.replace('发布时间:', '').replace('-', '').strip()
                if len(date_created) == 6:
                    date_created = date_created + '00'
                date = date_created[0:4]
        descriptions = sel.xpath(
            "//div[@class='txt']/p[@style='text-align: justify;']/span/text()").extract()
        for item in descriptions:
            description += item + "\n"
        onemessage = (lngid, rawid, creator, title, description, description_source,
                      date, date_created, language, country, provider, provider_url,
                      provider_id, type_, medium, batch)
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False
    return onemessage
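# parse_detail_one above returns a single onemessage tuple rather than writing to the
# database itself. A hypothetical driver, assuming utils.file_list yields
# (filename, fullpath) pairs and utils.parse_results_to_sql batches as sketched
# earlier, could wire the parser to an insert statement like this. The wiring is
# illustrative only, not the project's actual runner.
def parse_detail_all(self, detail_dir, conn, sql_insert):
    results = []
    for filename, fullname in utils.file_list(detail_dir):
        onemessage = self.parse_detail_one(filename, fullname)
        if not onemessage:
            continue  # parse failures were already logged by the parser
        results.append(onemessage)
        if utils.parse_results_to_sql(conn, sql_insert, results, 1000):
            utils.printf('inserted %s rows' % len(results))
            results.clear()
    utils.parse_results_to_sql(conn, sql_insert, results)
    utils.printf('inserted remaining %s rows' % len(results))
    results.clear()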
def parse_detail(self):
    cover_list = []
    cover_now_path = self.coverPath + '\\' + self.now_time
    for root, dirs, files in os.walk(cover_now_path):
        for file in files:
            rawid = file.replace('.jpg', '')
            cover_list.append(rawid)
    print(len(cover_list))
    conn = sqlite3.connect("video.db3")
    now_path = self.detailPath + '\\' + self.now_time
    sub_db_id = '203'
    sub_db = 'DMT'
    provider = 'fifeduvideo'
    type_ = '10'
    language = 'ZH'
    country = 'CN'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql = ("insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, "
           "provider_url, provider_id, cover, batch, title, description, provider_subject, "
           "date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
    result = []
    for root, dirs, files in os.walk(now_path):
        for file in files:
            rawid = file.replace('.html', '')
            Lngid = utils.GetLngid(sub_db_id, rawid)
            provider_url = provider + '@' + "http://lib.fifedu.com/toVideoPage.do?id=%s" % rawid
            provider_id = provider + '@' + rawid
            if rawid in cover_list:
                cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
            else:
                cover = ''
            batch = str(self.now_time) + '00'
            file_path = root + '/' + file
            print(file_path)
            with open(file_path, encoding='utf8') as f:
                text = f.read()
            html = Selector(text, type='html')
            provider_subject = description = title = ''
            provider_subjects = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[1:-1]
            title = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[-1]
            description = html.xpath("//div[@class='tabvalue']/text()").extract_first('').strip()
            for item in provider_subjects:
                provider_subject += item + ';'
            provider_subject = provider_subject.replace('在线课程库;', '').replace('玩转多语种;', '').replace('视听练兵场;', '')
            result.append(
                (Lngid, rawid, provider, type_, language, country, provider_url, provider_id,
                 cover, batch, title, description, provider_subject, date, date_created, medium))
            # if utils.parse_results_to_sql(conn, sql, result, 100):
            #     print("inserted %s rows" % len(result))
            #     result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("inserted remaining %s rows" % len(result))
    result.clear()
def parse_detail():
    result = []
    conn_db3 = sqlite3.connect("zt_template.db3")
    sql_in = """
        insert into modify_title_info_zt
        (Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch,
         title, description, provider_subject, date, date_created, creator, medium, publisher)
        values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """
    # basic metadata shared by every record
    language = 'ZH'
    country = 'CN'
    type_ = '1'
    provider = '360elibbook'
    sub_db_id = '258'
    batch = now_time + '00'
    medium = "2"
    date = "1900"
    date_created = date + "0000"
    fdir = "%s\\%s" % (detail_path, now_time)
    for _, dir_ in utils.file_list(fdir):
        utils.printf(dir_)
        pa = r"E:\\work\\360elib\\detail\\%s\\(.*)\\" % (now_time)
        provider_subject = re.findall(pa, dir_)[0]
        if provider_subject == 'None':
            provider_subject = ""
        with open(dir_, encoding='utf-8') as f:
            text = f.read()
        html = Selector(text, 'html')
        rawid = _.replace(".html", "")
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_url = provider + '@' + "http://www.360elib.com:2100/chinese/web/Details.aspx?id=%s" % (rawid)
        provider_id = provider + '@' + rawid
        title = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_name']/text()").extract_first()
        creator = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_zz']/text()").extract_first(
            "").replace(", ", "").replace(",", "").replace("，", "").replace("、", "")
        publisher = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_cbs']/text()").extract_first("")
        description = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_bookintro']/text()").extract_first("")
        cover_rawid = rawid.lower()
        cover_p = '%s/%s/%s.jpg' % (cover_path, now_time, cover_rawid)
        if os.path.exists(cover_p):
            cover = "/smartlib" + "/" + provider + "/" + cover_rawid + ".jpg"
        else:
            cover = ""
        result.append(
            (Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover,
             batch, title, description, provider_subject, date, date_created, creator, medium, publisher))
        if utils.parse_results_to_sql(conn_db3, sql_in, result, 1000):
            utils.printf("inserted %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn_db3, sql_in, result)
    utils.printf("inserted remaining %s rows" % len(result))
    result.clear()
def parse_index_one(self, filename, fullname):
    language = 'ZH'
    country = 'CN'
    provider = 'pishuvideo'
    type_ = 10
    medium = 2
    batch = time.strftime('%Y%m%d') + '00'
    rawid = filename.replace('.html', '')
    publisher = '社会科学文献出版社'
    date = '1900'
    date_created = '19000000'
    url = 'https://www.pishu.com.cn/skwx_ps/multimedia/VideoDetail?SiteID=14&type=Video&ID=%s' % rawid
    provider_url = provider + '@' + url
    provider_id = provider + '@' + rawid
    lngid = utils.GetLngid('00059', rawid)
    with open(fullname, encoding='utf8') as f:
        text = f.read()
    sel = Selector(text=text)
    try:
        title = sel.xpath('//div[@class="zl_vedioTit"]/span/text()').extract_first()
        creator = title_alternative = identifier_pisbn = title_series = subject = description = ''
        for spanTag in sel.xpath('//div[@class="vedioCon"]/span'):
            spanstr = spanTag.xpath('string(.)').extract_first().strip()
            # utils.printf('spanstr:%s' % spanstr)
            if spanstr.startswith('制作时间:'):
                date_created = spanTag.xpath('./following::text()[1]').extract_first().replace('-', '')
                date = date_created[:4]
            elif spanstr.startswith('关键词:'):
                subject = spanTag.xpath('./following::text()[1]').extract_first().replace(' ', ';')
                subject = re.sub(';+', ';', subject).strip().strip(';')
            elif spanstr.startswith('内容摘要:'):
                description = spanTag.xpath('./following::text()[1]').extract_first().strip()
        onemessage = (lngid, rawid, title, subject, description, publisher, date, date_created,
                      language, country, provider, provider_url, provider_id, type_, medium, batch)
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False
    return onemessage
def parse_detail():
    conn = sqlite3.connect('template.db3')
    language = "EN"
    type = "1"
    medium = "2"
    provider = "mirrorbeihuakingbook"
    country = "US"
    sub_db_id = "217"
    now_time = time.strftime('%Y%m%d')
    batch = now_time + "00"
    stmt = ("insert into modify_title_info_zt(lngid,rawid,title,creator,description,date,"
            "date_created,identifier_pisbn,language,country,provider,provider_url,provider_id,"
            "type,medium,batch,publisher) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);")
    results = []
    cnt = 0
    dir_path = detail_path + '/' + now_time
    for file, fullpath in utils.file_list(dir_path):
        with open(fullpath, encoding='gb18030') as fp:
            txt = fp.read()
        title, creator, publishers, date, identifier_pisbn, description = _parse_detail_one(txt)
        # print(title, creator, publishers, date, identifier_pisbn, description)
        date_created = date + '0000'
        basename, _, ext = file.partition(".")
        rawid = basename
        provider_url = provider + "@http://10.5.23.18:8079/book/bookinfo.aspx?id=" + rawid
        provider_id = provider + "@" + rawid
        lngID = utils.GetLngid(sub_db_id, rawid)
        results.append(
            (lngID, rawid, title, creator, description, date, date_created, identifier_pisbn,
             language, country, provider, provider_url, provider_id, type, medium, batch, publishers))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            cnt += len(results)
            print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
            results.clear()
    utils.parse_results_to_sql(conn, stmt, results)
    cnt += len(results)
    print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
    conn.close()
def parse_detail_one(self, filename, fullname):
    language = 'ZH'
    country = 'CN'
    provider = 'ydylcninfo'
    type_ = 14
    medium = 2
    batch = time.strftime('%Y%m%d') + '00'
    rawid = filename.replace('.html', '')
    publisher = '社会科学文献出版社'
    date = '1900'
    date_created = '19000000'
    url = 'https://www.ydylcn.com/skwx_ydyl/initDatabaseDetail?siteId=1&contentId=%s&contentType=literature' % rawid
    provider_url = provider + '@' + url
    provider_id = provider + '@' + rawid
    lngid = utils.GetLngid('00042', rawid)
    with open(fullname, encoding='utf8') as f:
        text = f.read()
    sel = Selector(text=text)
    try:
        title = sel.xpath('//div[@class="info"]/h1/text()').extract_first()
        creator = pagecount = source = title_series = subject = description = subject_en = creator_bio = ''
        for spanTag in sel.xpath('//div[@class="info"]/div[@class="con"]/span'):
            spanstr = spanTag.xpath('string(.)').extract_first().strip()
            # utils.printf('spanstr:%s' % spanstr)
            if spanstr.startswith('作者:'):
                creator = spanstr.replace('作者:', '').strip().replace(' ', ';')
            elif spanstr.startswith('出版日期:'):
                date_created = spanstr.replace('出版日期:', '').strip().replace('年', '').replace('月', '')
                if len(date_created) == 6:
                    date_created = date_created + '00'
                date = date_created[0:4]
            elif spanstr.startswith('报告页数:'):
                pagecount = spanstr.replace('报告页数:', '').replace('页', '').replace(' ', '').strip()
            elif spanstr.startswith('所属丛书:'):
                title_series = spanTag.xpath('./a/@title').extract_first()
            elif spanstr.startswith('所属图书:'):
                source = spanTag.xpath('./a/@title').extract_first()
        description = sel.xpath(
            'string(//div[@class="item"]/div[@class="txt"])').extract_first(default='').strip()
        description_en = sel.xpath(
            'string(//div[@class="item en"]/div[@class="txt"])').extract_first(default='').strip()
        for divTag in sel.xpath('//div[@class="keywords"]'):
            divstr = divTag.xpath('string(.)').extract_first().strip()
            if divstr.startswith('关键词:'):
                for aTag in divTag.xpath('./a/text()'):
                    subject = subject + aTag.extract().strip() + ';'
                subject = subject.strip(';')
            elif divstr.startswith('Keywords:'):
                for aTag in divTag.xpath('./a/text()'):
                    subject_en = subject_en + aTag.extract().strip() + ';'
                subject_en = subject_en.strip(';')
        for divTag in sel.xpath('//div[@class="desc"]/div/p'):
            divstr = divTag.xpath('string(.)').extract_first().strip()
            if divstr.find('暂无简介') < 0:
                creator_bio = creator_bio + divstr + '\n'
        creator_bio = creator_bio.strip()
        onemessage = (lngid, rawid, creator, title, title_series, subject, subject_en, description,
                      description_en, pagecount, source, creator_bio, publisher, date, date_created,
                      language, country, provider, provider_url, provider_id, type_, medium, batch)
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False
    return onemessage
def parse_list_one(self, filename, fullname):
    language = 'ZH'
    country = 'CN'
    provider = 'ydylcnbook'
    type_ = 1
    medium = 2
    batch = time.strftime('%Y%m%d') + '00'
    rawid = filename.replace('.html', '')
    publisher = '社会科学文献出版社'
    date = '1900'
    date_created = '19000000'
    url = 'https://www.ydylcn.com/skwx_ydyl/bookdetail?SiteID=1&ID=%s' % rawid
    provider_url = provider + '@' + url
    provider_id = provider + '@' + rawid
    lngid = utils.GetLngid('00041', rawid)
    cover = '/smartlib/ydylcnbook/%s.jpg' % rawid
    cover_path = '%s/%s.jpg' % (self.cover_path, rawid)
    if not os.path.exists(cover_path):
        cover = ''
    with open(fullname, encoding='utf8') as f:
        text = f.read()
    sel = Selector(text=text)
    try:
        title = sel.xpath('//div[@class="tit-g1"]/h3/text()').extract_first()
        creator = title_alternative = identifier_pisbn = title_series = subject = description = ''
        for divTag in sel.xpath('//div[@class="desc"]/p'):
            divstr = divTag.xpath('string(.)').extract_first().strip()
            # utils.printf('divstr:%s' % divstr)
            if divstr.startswith('英 文 名:'):
                title_alternative = divstr.replace('英 文 名:', '').strip()
            elif divstr.startswith('作 者:'):
                for author in divTag.xpath('./a/text()'):
                    creator = creator + author.extract() + ';'
                creator = creator.strip(';')
            elif divstr.startswith('I S B N:'):
                identifier_pisbn = divstr.replace('I S B N:', '').replace('-', '').strip()
            elif divstr.startswith('丛 书 名:'):
                title_series = divstr.replace('丛 书 名:', '').strip()
            elif divstr.startswith('关键词:'):
                for keyword in divTag.xpath('./a/text()'):
                    subject = subject + keyword.extract() + ';'
                subject = subject.strip(';')
        description = sel.xpath(
            'string(//div[@class="item"]/div[@class="txt"])').extract_first().strip()
        description = description.replace('●', '').strip()
        onemessage = (lngid, rawid, creator, title, title_alternative, title_series, cover, subject,
                      identifier_pisbn, description, publisher, date, date_created, language, country,
                      provider, provider_url, provider_id, type_, medium, batch)
        bookdetaillist = []
        for article_id in sel.xpath('//ul[@class="list-article-1"]/li/h5/a/@onclick'):
            pt = re.compile(r'toGeDataBase\((\d+),.*?\)')
            m = pt.match(article_id.extract())
            if m:
                # utils.printf('article id %s' % m.group(1))
                bookdetaillist.append(m.group(1))
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False, False
    return onemessage, bookdetaillist
def parse_detail_one(self, filename, fullname):
    try:
        language = 'EN'
        country = 'CN'
        provider = 'hepjournal'
        type_ = 3
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        identifier_doi = filename.replace('.html', '').replace('_', '/')
        rawid = identifier_doi
        gch = fullname.split('\\')[-2]
        source, identifier_pissn, identifier_eissn, identifier_cnno = self.dic[gch]
        publisher = 'Higher Education Press'
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        soup = BeautifulSoup(text, 'lxml')
        url = ''
        if gch == 'engi':
            url = 'http://engineering.org.cn/EN/%s' % rawid
        elif gch == 'laf' and not rawid.split('/')[0].startswith('10.'):
            identifier_doi = ''
            urlTag = soup.select_one('meta[name*="HW.ad-path"]')
            url = urlTag.get('content').strip()
            rawid = url.replace('http://journal.hep.com.cn/laf/EN/', '')
        else:
            url = 'http://journal.hep.com.cn/%s/EN/%s' % (gch, rawid)
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        gch = provider + "@" + gch
        lngid = utils.GetLngid('00025', rawid)
        title = ''
        titleTag = soup.select_one('div.J_biaoti_en')
        if titleTag:
            title = ''.join(titleTag.stripped_strings)
        description = ''
        for tdTag in soup.select('td[class="J_zhaiyao"]'):
            if tdTag.select_one('p'):
                description = ''.join(tdTag.p.stripped_strings)
                break
            bTag = tdTag.select_one('b')
            if bTag:
                if bTag.get_text() == 'Abstract':
                    description = ''.join(tdTag.stripped_strings).replace('Abstract', '')
                    break
        date_created = date = ''
        dateTag = soup.select_one('meta[name*="DC.Date"]')
        if dateTag:
            date_created = dateTag.get('content').replace('-', '')
        else:
            date_created = '19000000'
        if date_created == '':
            for spanTag in soup.select('span[class*="J_zhaiyao"]'):
                strspan = ''.join(spanTag.stripped_strings)
                if strspan.startswith('Online First Date:'):
                    date_created = strspan.replace('Online First Date:', '').strip()
                    date_created = replacedate(date_created)
                    break
        date = date_created[:4]
        subject = ''
        subjectTag = soup.select_one('meta[name*="keywords"]')
        if subjectTag:
            subject = subjectTag.get('content').replace(',', ';').strip(';')
            subject = re.sub(r'</?\w+[^>]*>', '', subject)
            if subject == ' ':
                subject = ''
        beginpage = ''
        beginpageTag = soup.select_one('meta[name*="citation_firstpage"]')
        if beginpageTag:
            beginpage = beginpageTag.get('content').strip()
        endpage = ''
        endpageTag = soup.select_one('meta[name*="citation_lastpage"]')
        if endpageTag:
            endpage = endpageTag.get('content').strip()
        if endpage == '':
            endpage = beginpage
        page = ''
        if not beginpage == '':
            page = beginpage + '-' + endpage
        volume = ''
        volumeTag = soup.select_one('meta[name*="citation_volume"]')
        if volumeTag:
            volume = volumeTag.get('content').strip()
        issue = ''
        issueTag = soup.select_one('meta[name*="citation_issue"]')
        if issueTag:
            issue = issueTag.get('content').strip()
        creator = ''
        authorTag = soup.select_one('td[class="J_author_EN"]')
        if authorTag:
            if authorTag.select_one('sup'):
                supTags = authorTag.select('sup')
                suplist = []
                tmpsup = ''
                cnt = 0
                authorlist = []
                for supTag in supTags:
                    strsup = supTag.get_text()
                    tmpsup += strsup
                    cnt += 1
                    nextTag = supTag.next_sibling
                    beforeTag = supTag.previous_sibling
                    if isinstance(beforeTag, NavigableString):
                        author = beforeTag.replace('(', '').replace(')', '').strip().strip(',')
                        if not author == '':
                            authorlist.append(author)
                        # supTag.extract()
                    if isinstance(nextTag, NavigableString):
                        if nextTag == '(':
                            suplist.append(tmpsup.strip(','))
                            tmpsup = ''
                            continue
                        if not tmpsup.endswith(','):
                            suplist.append(tmpsup)
                            tmpsup = ''
                    elif cnt == len(supTags):
                        suplist.append(tmpsup.strip(','))
                # tmpauthor = ''.join(authorTag.stripped_strings)
                # tmpauthor = tmpauthor.replace('(', '').replace(')', '').strip().strip(',')
                # authorlist = tmpauthor.split(',')
                if len(authorlist) == len(suplist):
                    for i in range(len(authorlist)):
                        creator = creator + '%s[%s];' % (authorlist[i], suplist[i])
                elif len(authorlist) == len(supTags):
                    for i in range(len(authorlist)):
                        creator = creator + '%s[%s];' % (authorlist[i], supTags[i].get_text().strip(','))
                # print(authorlist)
                # print(suplist)
                if creator == '':
                    for i in range(len(authorlist)):
                        if len(authorlist) < len(suplist):
                            creator = creator + '%s[%s];' % (authorlist[i], suplist[i])
                        else:
                            creator = creator + authorlist[i] + ';'
                creator = creator.strip(';')
            else:
                creator = ''.join(authorTag.stripped_strings)
                creator = creator.replace('(', '').replace(')', '').replace(',', ';').strip()
                creator = re.sub(r';\s*', ';', creator)
        insTag = soup.select_one('td[class="J_author"]')
        creator_institution = ''
        if insTag:
            for brTag in insTag.select('br'):
                brTag.insert_after(soup.new_string(";"))
            affiliation = ''.join(insTag.stripped_strings)
            affiliation = re.sub(r'\n', '', affiliation)
            for ins in affiliation.split(';'):
                ins = ins.strip()
                ptins = re.compile(r'(\w{1,2})\.\s*(.*)')
                m = ptins.match(ins)
                if m:
                    creator_institution = creator_institution + '[%s]%s;' % (m.group(1), m.group(2))
            if creator_institution == '':
                creator_institution = affiliation
        creator_institution = creator_institution.strip(';')
        onemessage = (lngid, rawid, creator, title, volume, issue, page, beginpage, endpage,
                      publisher, subject, date, creator_institution, date_created, source,
                      identifier_pissn, identifier_eissn, identifier_cnno, description,
                      identifier_doi, language, country, provider, provider_url, provider_id,
                      type_, medium, batch, gch)
        return onemessage
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False
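from datetime import datetime

# replacedate() is called above (and in the hepengineeringjournal parser below) but is
# not defined in this file. The sketch below is a guess at its intent: normalise a
# human-readable date into the YYYYMMDD form the rest of the pipeline stores. The
# accepted input formats are assumptions, not the project's actual list.
def replacedate(raw):
    raw = raw.strip()
    for fmt in ('%Y-%m-%d', '%d %B %Y', '%B %d, %Y'):
        try:
            return datetime.strptime(raw, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    return raw.replace('-', '')  # fall back to just stripping separators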
def parse_detail():
    conn = sqlite3.connect('video.db3')
    cover_list = []
    now_time = time.strftime('%Y%m%d')
    cover_now_path = cover_path + '\\' + now_time
    for root, dirs, files in os.walk(cover_now_path):
        for file in files:
            rawid = file.replace('.jpg', '')
            cover_list.append(rawid)
    sub_db_id = '223'
    provider = 'fzwjtvideo'
    type = '10'
    language = 'ZH'
    country = 'CN'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    result = []
    sql = ("insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, "
           "provider_url, provider_id, cover, batch, title, description, title_sub, creator, "
           "creator_bio, provider_subject, date, date_created, medium) "
           "values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
    for _, filename in utils.file_list(detail_path):
        replace_1 = r'E:\work\fzwjt\detail' + '\\' + now_time + '\\'
        rawid = _.replace(".html", '')
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_subject = filename.replace(replace_1, '').replace('\\' + _, '')
        provider_url = provider + '@' + "http://www.fzwjt.com/Course/Detail/%s" % rawid
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        cover = ''
        if rawid in cover_list:
            cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
        with open(filename, encoding='utf8') as f:
            text = f.read()
    
        html = Selector(text, 'html')
        title_sub = creator_bio = ""
        title = html.xpath("//div[@class='cInfo']/dl[@class='base']/dt/b/text()").extract_first('')
        description = html.xpath("//div[@class='cInfo']/dl[@class='base']/dd[@class='info']/text()").extract_first('')
        title_subs = html.xpath("//ul[@class='courseListL']/li/p/a/text()").extract()
        for item in title_subs:
            title_sub += item + ';'
        title_sub = title_sub[0:-1]
        creator = html.xpath("//ul[@class='courseListL']/li/span[2]/text()").extract_first("")
        if creator == "解说":
            creator = ""
            creator_bio = ""
        else:
            # follow one lesson URL a level down to fetch the lecturer bio
            a = "http://www.fzwjt.com" + html.xpath("//ul[@class='courseListL']/li/p/a/@href").extract_first("")
            feature = 'tagB'
            res = utils.get_html(a, feature=feature, proxies=proxy, timeout=50)
            if res:
                html_2 = Selector(res.text, 'html')
                creator_bio = html_2.xpath("//div[@class='tagB']/p/text()").extract_first("").replace("&quot;", '').strip()
        utils.printf(title, 'write right')
        # utils.printf(title, creator, creator_bio)
        result.append(
            (Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover,
             batch, title, description, title_sub, creator, creator_bio, provider_subject, date,
             date_created, medium))
        if utils.parse_results_to_sql(conn, sql, result, 100):
            print("inserted %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("inserted remaining %s rows" % len(result))
    result.clear()
def parse_detail_one(self, filename, fullname, db3type):
    try:
        language = 'EN'
        country = 'CN'
        provider = 'hepengineeringjournal'
        type_ = 3
        medium = 2
        batch = time.strftime('%Y%m%d') + '00'
        identifier_doi = ''
        rawid = ''
        gch = fullname.split('\\')[-2]
        source, identifier_pissn, identifier_cnno = self.dic[gch]
        publisher = 'Higher Education Press'
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        # json.loads() no longer accepts an encoding argument on Python 3.9+
        dicitem = json.loads(text)['resultValue']
        date_created = dicitem['publicTime']
        if date_created == '':
            date_created = dicitem['onlineTime']
        if date_created != '':
            date_created = replacedate(date_created)
        elif dicitem['year'] != '':
            date_created = str(dicitem['year']) + '0000'
        else:
            date_created = '19000000'
        date = date_created[:4]
        identifier_doi = dicitem['doiNm']
        rawid = filename.replace('.json', '')
        is_oa = dicitem['isOa']
        # if identifier_doi == '':
        #     articlenum_dic = json.loads(dicitem['attachment'])
        #     if articlenum_dic.__contains__('fileName'):
        #         articlenum = articlenum_dic['fileName'].strip('.pdf')
        #     else:
        #         articlenum = articlenum_dic['key'].split('_')[-1].strip('.pdf')
        #     url = 'http://www.engineering.org.cn/en/article/%s' % articlenum
        # else:
        #     url = 'http://www.engineering.org.cn/en/%s' % identifier_doi
        url = ('http://www.engineering.org.cn/default/page/loadPageIndex?'
               'pageId=ab4265bb601844d298ec9cd21f046661&id=%s' % rawid.split('_')[-1])
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        gch = provider + "@" + gch
        lngid = utils.GetLngid('00036', rawid)
        beginpage = str(dicitem['pageStart'])
        endpage = str(dicitem['pageEnd'])
        if endpage == '' or endpage == '0':
            endpage = beginpage
        page = ''
        if not beginpage == '':
            page = beginpage + '-' + endpage
        volume = dicitem['volumeNm']
        issue = dicitem['issueNm']
        dr = re.compile(r'<[^>]+>', re.S)
        subject = dicitem['keyWords'].replace('，', ';').replace('；', ';').strip()
        subject = re.sub(r';\s+', ';', subject)
        subject = re.sub(r'\s+;', ';', subject)
        subject = dr.sub('', subject)
        title = dicitem['title']
        title = dr.sub('', title)
        description = dicitem['summary']
        description = dr.sub('', description)
        if description == '' or description == ' ':
            description = dicitem['content']
            soup = BeautifulSoup(description, 'lxml')
            description = soup.get_text()
        description = description.strip()
        author_1st = ''
        creator = ''
        # authortext = dicitem['articleAuthor'].replace('&eacute;', 'é').replace('&egrave;', 'è').replace('&rsquo;', '\'')
        authortext = dicitem['articleAuthor']
        if authortext.find('<sup>') > 0:
            authortext = authortext.replace(' ', '').replace('</sup>, ', '</sup>、')
            for author in authortext.split('、'):
                author = author.strip()
                # utils.printf(author)
                ptsup = re.compile('.*?(<sup>(.*)</sup>).*?')
                m = ptsup.match(author)
                if m:
                    dauthor = author.replace(m.group(1), '').strip()
                    dauthor = dr.sub('', dauthor)
                    if author_1st == '':
                        author_1st = dauthor
                    creator = creator + '%s[%s];' % (dauthor, dr.sub('', m.group(2).strip().strip(',')))
        else:
            creator = authortext.replace('、 ', ';').replace('、 ', ';').replace('、', ';')
            creator = dr.sub('', creator)
            creator = creator.replace(' ', '')
        if author_1st == '':
            author_1st = creator.split(';')[0]
        creator = creator.strip(';').replace(' and ', ';')
        creator = html.unescape(creator)
        organ_1st = ''
        creator_institution = ''
        institutiontext = dicitem['authorUnit']
        if creator.find('[') > -1:
            if institutiontext.find('<sup>') > -1:
                institutiontext = institutiontext.replace('<sup>', '<br /><sup>')
                for ins in institutiontext.split('<br />'):
                    ins = ins.strip()
                    ptsup = re.compile('.*(<sup>(.*?)</sup>).*')
                    m = ptsup.match(ins)
                    if m:
                        dins = ins.replace(m.group(1), '')
                        dins = dr.sub('', dins).strip()
                        if organ_1st == '':
                            organ_1st = dins.strip('. ')
                        creator_institution = creator_institution + '[%s]%s;' % (m.group(2).strip(), dins.strip('. '))
            elif institutiontext.find('<p>') > -1:
                soup = BeautifulSoup(institutiontext, 'lxml')
                ptp = re.compile(r'^(\w)\s?\.\s?(.*?)$')
                for pTag in soup.select('p'):
                    ptext = pTag.get_text()
                    m = ptp.match(ptext)
                    if m:
                        if organ_1st == '':
                            organ_1st = m.group(2).strip()
                        creator_institution = creator_institution + '[%s]%s;' % (m.group(1).strip(), m.group(2).strip())
            else:
                creator_institution = dr.sub('', institutiontext)
        else:
            creator_institution = dr.sub('', institutiontext)
        creator_institution = creator_institution.replace(' ', '').replace('&rsquo;', '\'').strip(';').replace(';;', ';')
        organ_1st = organ_1st.replace(' ', '').replace('&rsquo;', '\'').strip(';').replace(';;', ';')
        organ_1st = html.unescape(organ_1st)
        creator_institution = html.unescape(creator_institution)
        creator_institution = creator_institution.replace('(', '').replace(')', '').replace('（', '').replace('）', '')
        if db3type == 'zt':
            onemessage = (lngid, rawid, creator, title, volume, issue, page, beginpage, endpage,
                          publisher, subject, date, creator_institution, date_created, source,
                          identifier_pissn, is_oa, identifier_cnno, description, identifier_doi,
                          language, country, provider, provider_url, provider_id, type_, medium,
                          batch, gch)
            return onemessage
        recv_date = dicitem['receiveTime']
        if recv_date != '':
            recv_date = replacedate(recv_date)
        accept_date = dicitem['onlineTime']
        if accept_date != '':
            accept_date = replacedate(accept_date)
        revision_date = dicitem['backTime']
        if revision_date != '':
            revision_date = replacedate(revision_date)
        journal_raw_id = fullname.split('\\')[-2]
        cited_cnt = dicitem['citedCount']
        if cited_cnt:
            cited_cnt = str(cited_cnt)
        else:
            cited_cnt = 0
        down_cnt = dicitem['downloadCount']
        if down_cnt:
            down_cnt = str(down_cnt)
        else:
            down_cnt = 0
        sub_db = 'QK'
        product = 'ENGINEERING'
        provider = 'HEP'
        sub_db_id = '00036'
        batch = time.strftime("%Y%m%d_%H%M%S")
        down_date = time.strftime("%Y%m%d")
        if dicitem['ossKey']:
            fulltext_type = 'pdf'
        else:
            fulltext_type = ''
        onemessage = (creator, author_1st, creator_institution, organ_1st, title, subject, date,
                      recv_date, accept_date, revision_date, date_created, volume, issue,
                      journal_raw_id, source, page, beginpage, endpage, is_oa, cited_cnt, down_cnt,
                      lngid, rawid, product, sub_db, provider, sub_db_id, type_, url, country,
                      language, batch, down_date, publisher, identifier_pissn, identifier_cnno,
                      description, identifier_doi, fulltext_type)
        if db3type == 'meta':
            return onemessage
        else:
            return False
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False
def parse_list_one(self, filename, fullname):
    language = 'ZH'
    country = 'CN'
    provider = 'pishubook'
    type_ = 1
    medium = 2
    batch = time.strftime('%Y%m%d') + '00'
    rawid = filename.replace('.html', '')
    publisher = '社会科学文献出版社'
    date = '1900'
    date_created = '19000000'
    url = 'https://www.pishu.com.cn/skwx_ps/bookdetail?SiteID=14&ID=%s' % rawid
    provider_url = provider + '@' + url
    provider_id = provider + '@' + rawid
    lngid = utils.GetLngid('00056', rawid)
    cover = '/smartlib/pishubook/%s/%s.jpg' % (rawid[:2], rawid)
    cover_path = '%s/%s/%s.jpg' % (self.cover_path, rawid[:2], rawid)
    if not os.path.exists(cover_path):
        cover = ''
    with open(fullname, encoding='utf8') as f:
        text = f.read()
    sel = Selector(text=text)
    try:
        title = sel.xpath('//h3[@class="Buy_tit2"]/text()').extract_first()
        creator = title_alternative = identifier_pisbn = title_series = subject = description = ''
        for trTag in sel.xpath('//div[@class="books margintop10"]/table/tbody/tr'):
            trstr = trTag.xpath('string(.)').extract_first().strip()
            # utils.printf('trstr:%s' % trstr)
            if trstr.startswith('英 文 名:'):
                title_alternative = trstr.replace('英 文 名:', '').strip()
            elif trstr.startswith('作 者:'):
                for author in trTag.xpath('./td/a/text()'):
                    creator = creator + author.extract() + ';'
                creator = creator.strip(';')
            elif trstr.startswith('I S B N:'):
                identifier_pisbn = trstr.replace('I S B N:', '').replace('-', '').strip()
            elif trstr.startswith('丛 书 名:'):
                title_series = trstr.replace('丛 书 名:', '').strip()
            elif trstr.startswith('关 键 词:'):
                for keyword in trTag.xpath('./td/a/text()'):
                    subject = subject + keyword.extract() + ';'
                subject = subject.strip(';')
            elif trstr.startswith('出版日期:'):
                date_created = trstr.replace('出版日期:', '').strip().replace('-', '')
                date = date_created[:4]
        description = sel.xpath(
            'string(//div[@class="main_right fr margintop20"]/div/div[@class="summaryCon"])'
        ).extract_first(default='').strip('<<').strip()
        description = description.replace('●', '').replace('•', '').strip()
        onemessage = (lngid, rawid, creator, title, title_alternative, title_series, cover, subject,
                      identifier_pisbn, description, publisher, date, date_created, language,
                      country, provider, provider_url, provider_id, type_, medium, batch)
        bookdetaillist = []
        for article_id in sel.xpath('//ul[@class="w_checkbox"]/li/a/@onclick'):
            pt = re.compile(r'toGeDataBase\((\d+),.*?\)')
            m = pt.match(article_id.extract())
            if m:
                # utils.printf('article id %s' % m.group(1))
                bookdetaillist.append(m.group(1))
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False, False
    return onemessage, bookdetaillist
def parse_detail_one(self, filename, fullname, db3type):
    language = 'EN'
    country = 'US'
    provider = 'aiaabook'
    type_ = 1
    medium = 2
    batch = time.strftime('%Y%m%d') + '00'
    identifier_doi = '10.2514/' + filename.replace('.html', '')
    rawid = identifier_doi
    lngid = utils.GetLngid('00108', rawid)
    provider_url = provider + '@' + 'https://arc.aiaa.org/doi/book/' + identifier_doi
    provider_id = provider + '@' + identifier_doi
    publisher = 'American Institute of Aeronautics and Astronautics'
    date = '1900'
    date_created = '19000000'
    date = self.dic[identifier_doi]
    date_created = date + '0000'
    cover = ''
    cover_path = '%s/%s.jpg' % (self.cover_path, identifier_doi)
    if os.path.exists(cover_path):
        cover = '/smartlib/aiaabook/%s.jpg' % identifier_doi
    with open(fullname, encoding='utf8') as f:
        text = f.read()
    sel = Selector(text=text)
    creator = description = ''
    try:
        identifier_pisbn = identifier_eisbn = title_series = price = ''
        title = sel.xpath('//h5[@class="teaser__group-title"]/text()').extract_first().strip()
        # if title == '':
        #     title = sel.xpath('//h1[@class="citation__title"]/text()').extract_first(default='').strip()
        creator = sel.xpath(
            'string(//ul[@class="rlist--inline loa mobile-authors"])'
        ).extract_first(default='').strip().replace(' ', ' ').replace(' and ', ';')
        creator = creator.strip(';').replace(',', ';')
        creator = re.sub(r'\s+;', ';', creator)
        creator = re.sub(r';\s+', ';', creator)
        for divTag in sel.xpath('//div[@class="teaser__item"]'):
            divstr = divTag.xpath('./text()').extract_first(default='').strip()
            if divstr.startswith('ISBN (print):'):
                identifier_pisbn = divstr.replace('ISBN (print):', '').strip().replace('-', '')
            elif divstr.startswith('eISBN:'):
                identifier_eisbn = divstr.replace('eISBN:', '').strip().replace('-', '')
        title_series = sel.xpath('//head/title/text()').extract_first(default='')
        title_series = title_series.split('|')[-1].strip()
        description = sel.xpath('string(//div[@class="NLM_abstract"])').extract_first(default='').strip()
        if description.startswith('Description'):
            description = description[11:].strip()
        elif description.startswith('About the Book'):
            description = description[14:].strip()
        for divpriceTag in sel.xpath('//div[@class="book-product__content"]'):
            pricelist = divpriceTag.xpath('./div/span[@class="book-product__price__value"]/text()').extract()
            index = len(pricelist)
            if index > 0:
                price = pricelist[index - 1].strip()
            product_header = divpriceTag.xpath('./h4/text()').extract_first(default='')
            if product_header == 'PDF':
                break
        onemessage = (lngid, rawid, creator, title, identifier_pisbn, identifier_eisbn, description,
                      publisher, cover, title_series, date, date_created, price, language, country,
                      provider, provider_url, identifier_doi, provider_id, type_, medium, batch)
        if db3type == 'zt':
            return onemessage
        # keyword = subject
        # subject = ''
        # journal_raw_id = fullname.split('\\')[-2]
        # for sub_str in sel.xpath('//div[@class="jonListTitle"]/a/text()'):
        #     sub_str = sub_str.extract().strip()
        #     if sub_str == '首页':
        #         continue
        #     subject = subject + sub_str + ';'
        # subject = subject.strip(';')
        # down_cnt = divTag.xpath('./ul/li/span[@class="span02"]/text()').extract_first().replace('下载量:', '').strip().replace(',', '')
        # fulltext_type = ''
        # pdfTag = divTag.xpath('./ul/li/a[@id="clicknumber"]').extract_first()
        # if pdfTag:
        #     fulltext_type = 'pdf;'
        # htmlTag = divTag.xpath('./ul/li/span[@id="ctl00_ContentPlaceHolder1_html_show"]/a')
        # if htmlTag:
        #     fulltext_type += 'html;'
        # xmlTag = divTag.xpath('./ul/li/span[@id="ctl00_ContentPlaceHolder1_xml_show"]/a')
        # if xmlTag:
        #     fulltext_type += 'xml;'
        # fulltext_type = fulltext_type.strip(';')
        # product = 'HANS'
        # sub_db = 'QK'
        # provider = 'HANS'
        # sub_db_id = '00046'
        # provider_url = url
        # batch = time.strftime("%Y%m%d_%H%M%S")
        # down_date = time.strftime("%Y%m%d")
        # down_cnt = down_cnt + '@' + down_date
        # # utils.printf(subject, down_cnt, fulltext_type)
        # refTaglist = divTag.xpath('./div/table/tr/td[@width="45"]')
        # ref_cnt = ''
        # if len(refTaglist) > 0:
        #     ref_cnt = refTaglist[-1].xpath('string(.)').extract_first().strip().replace('[', '').replace(']', '')
        # onemessage = (creator, author_1st, creator_institution, organ_1st, title, title_alternative, keyword, date,
        #               date_created, volume, issue, journal_raw_id, source, source_en, page, beginpage, endpage, subject, is_oa, down_cnt,
        #               lngid, rawid, product, sub_db, provider, sub_db_id, type_, provider_url, country, language, batch, down_date, publisher,
        #               identifier_pissn, identifier_eissn, description, description_en, identifier_doi, description_fund, ref_cnt,
        #               fulltext_type)
        # if db3type == 'meta':
        #     return onemessage
        # else:
        #     return False
    except:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        utils.logerror(exMsg)
        utils.logerror(fullname)
        return False
def parse_detail(self):
    cover_list = []
    cover_now_path = self.coverPath + '\\' + self.now_time
    for root, dirs, files in os.walk(cover_now_path):
        for file in files:
            rawid = file.replace('.jpg', '')
            cover_list.append(rawid)
    print(len(cover_list))
    conn = sqlite3.connect("book.db3")
    sub_db_id = '202'
    sub_db = 'TS'
    provider = 'fifedubook'
    type_ = '1'
    language = 'ZH'
    country = 'CN'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    result = []
    result_2 = []
    sql_insert = ("insert into modify_title_info_zt(title, rawid, Lngid, provider, provider_id, "
                  "provider_url, cover, batch, description_unit, type, language, country, date, "
                  "date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
    sql_up = 'update list set stat = 1 where rawid = %s'
    # markup fragments stripped from the catalogue titles returned by the API;
    # the same literal tokens the original chained .replace() calls removed
    tags = ('<p>', '</p>', '<font color=#003D79>', '</font>', '<center>', '</center>', '[^1]',
            '<ol type="a">', '<li></li>', '</ol>', '<ol>', '<li>', '</li>', '<u>', '</u>',
            '<strong>', '</strong>', '<CENTER>', '</CENTER>', '<h1>', '</h1>', '<h2>', '</h2>',
            '<font color=#844200>', '<font color=brown>', '<div align="center">', '</div>')

    def clean(s):
        for tag in tags:
            s = s.replace(tag, '')
        return s.strip()

    while True:
        sql = "select rawid, url, title from list where stat = 0 limit 1000"
        cur = self.conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        for rawid, url, title in rows:
            print(title)
            Lngid = utils.GetLngid(sub_db_id, rawid)
            provider_url = provider + '@' + "http://lib.fifedu.com/toVideoPage.do?id=%s" % rawid
            provider_id = provider + '@' + rawid
            if rawid in cover_list:
                cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
            else:
                cover = ''
            batch = str(self.now_time) + '00'
            try:
                res = requests.get(url, headers=self.headers, proxies=self.proxy)
                fe = 'iframe'
                if res.status_code == 200:
                    if res.text.find(fe) > 0:
                        html = Selector(res.text, type='html')
                        mulu_id = html.xpath("//div[@align='center']/iframe/@src").extract_first('').replace(
                            'http://www.iyangcong.com/service/ilearning/reading?id=', '')
                        mulu_url = 'http://www.iyangcong.com/book/catalog/1/%s' % mulu_id
                        res_mulu = requests.get(mulu_url)
                        if res_mulu.text == 'null':
                            mulu_url = 'http://www.iyangcong.com/book/catalog/10/%s' % mulu_id
                            res_mulu = requests.get(mulu_url)
                        mulu_list = json.loads(res_mulu.text)
                        mulu_zh = mulu_en = ""
                        for mulu in mulu_list:
                            title_zh = clean(mulu['title_zh'])
                            if title_zh != '':
                                mulu_zh += title_zh + ';'
                            title_en = clean(mulu['title_en'])
                            if title_en != '':
                                mulu_en += title_en + ';'
                        if mulu_zh.replace(';', '') == '':
                            description_unit = mulu_en
                        else:
                            description_unit = mulu_zh
                        # print(description_unit)
                        result.append(
                            (title, rawid, Lngid, provider, provider_id, provider_url, cover, batch,
                             description_unit, type_, language, country, date, date_created, medium))
                        result_2.append((rawid,))
                        if utils.parse_results_to_sql(conn, sql_insert, result, 100):
                            print("inserted %s rows" % len(result))
                            result.clear()
                        if utils.parse_results_to_sql(self.conn, sql_up, result_2, 100):
                            print("updated %s rows" % len(result_2))
                            result_2.clear()
                    else:
                        print('not find fe')
            except Exception as e:
                print(e)
    utils.parse_results_to_sql(conn, sql_insert, result)
    print("inserted %s rows" % len(result))
    result.clear()
    utils.parse_results_to_sql(self.conn, sql_up, result_2)
    print("updated %s rows" % len(result_2))
    result_2.clear()