def down_vol():
    sql_up = "update vol set stat = 1 where vol_url = %s"
    result = []
    while True:
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        cur = conn.cursor()
        sql = "select pub_year,vol,num,vol_url from vol where stat = 0 limit 1000"
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for pub_year, vol, num, vol_url in rows:
                fdir = detail_path + '\\' + now_time + '\\' + pub_year + '\\' + vol
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                feature = "highwire-cite-title"
                res = utils.get_html(vol_url, feature=feature, proxies=proxy, timeout=30)
                if res:
                    fname = '%s/%s.html' % (fdir, num)
                    with open(fname, 'w', encoding='utf8') as f:
                        f.write(res.text)
                    utils.printf("Downloaded year %s volume %s issue %s" % (pub_year, vol, num))
                    result.append((vol_url,))  # one-element tuple for the single %s placeholder
                if utils.parse_results_to_sql(conn, sql_up, result, 50):
                    print("Updated %s rows" % len(result))
                    result.clear()
    utils.parse_results_to_sql(conn, sql_up, result)
    print("Updated the remaining %s rows" % len(result))
    result.clear()
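# utils.parse_results_to_sql is called by every function in this section but its source is
# not included here. The sketch below is only a guess at its batching contract, inferred
# from the call sites (the name, signature and commit behaviour are assumptions): with a
# batch size it flushes and returns True only once enough rows are buffered; without one
# it flushes whatever is left.
def _parse_results_to_sql_sketch(conn, sql, results, num=None):
    if num is not None and len(results) < num:
        return False  # not enough rows buffered yet; the caller keeps appending
    cur = conn.cursor()
    cur.executemany(sql, results)  # every element must be a parameter tuple, e.g. (vol_url,)
    conn.commit()
    cur.close()
    return True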
def get_info_from_journal_list():
    sql = """
        insert ignore into journal_list (j_id, journal_url, journal_name, journal_issn)
        values (%s,%s,%s,%s)
    """
    result = []
    for root, dirs, files in os.walk(journal_list_path):
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        for file in files:
            file_path = root + '/' + file
            print(file_path)
            with open(file_path, encoding='utf-8') as fp:
                text = fp.read()
            html = Selector(text, 'html')
            journal_url_list = html.xpath("//table[@class='tabel-lab']//tr/td/a/@href").extract()
            for i, item in enumerate(journal_url_list):
                # e.g. http://www.sinomed.ac.cn/en/journalSearch.do?method=detail&id=27706&db=journal&dbtype=en
                journal_url = "http://www.sinomed.ac.cn/en/" + item
                j_id = re.findall("detail&id=(.*)&db=journal", item)[0]
                journal_name = html.xpath("//table[@class='tabel-lab']//tr/td/a/text()").extract()[i]
                journal_issn = html.xpath("//table[@class='tabel-lab']//tr/td[2]/span/text()").extract()[i].strip()
                result.append((j_id, journal_url, journal_name, journal_issn))
            if utils.parse_results_to_sql(conn, sql, result, 1000):
                print("Inserted %s rows" % len(result))
                result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("Inserted the remaining %s rows" % len(result))
    result.clear()
def run(self):
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    sql_up = "update detail set stat = 1 where url = %s"
    result = []
    while True:
        url = sql_queue.get()
        result.append((url,))  # one-element tuple for the single %s placeholder
        utils.printf(result)
        utils.parse_results_to_sql(conn, sql_up, result)
        print("Updated %s rows" % len(result))
        result.clear()
def down_cover(self):
    now_path = self.coverPath + '\\' + self.now_time
    if not os.path.exists(now_path):
        os.makedirs(now_path)
    sql_up = "update list set cover_stat = 1 where rawid = %s"
    result = []
    while True:
        sql = "select rawid, cover_url, title from list where cover_stat = 0 limit 1000"
        cur = self.conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for rawid, cover_url, title in rows:
                path = now_path + '/%s.jpg' % rawid
                if os.path.exists(path):
                    result.append((rawid,))
                    print('%s exists' % path)
                elif 'zhlx.png' in cover_url:
                    result.append((rawid,))
                    print('%s no cover' % path)
                elif cover_url == 'http://lib.fifedu.com/upload_dir/':
                    result.append((rawid,))
                    print('%s no cover' % path)
                else:
                    try:
                        res = requests.get(cover_url, headers=self.headers)
                        if res.status_code == 200:
                            path = now_path + '/%s.jpg' % rawid
                            if utils.Img2Jpg(res.content, path):
                                print('%s -- down cover right' % title)
                                result.append((rawid,))
                            else:
                                print('%s -- down cover error' % title)
                        else:
                            print('status_code != 200 ~')
                    except Exception as e:
                        print(e)
                if utils.parse_results_to_sql(self.conn, sql_up, result, 100):
                    print('Updated %s rows' % len(result))
                    result.clear()
    utils.parse_results_to_sql(self.conn, sql_up, result)
    print('Updated the remaining %s rows' % len(result))
    result.clear()
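# utils.Img2Jpg above is likewise external. A plausible sketch of what it does, modelled on
# the inline PIL code in get_journal_cover() further down; the name, signature and the
# (108, 150) default size are assumptions, not the real helper:
def _img2jpg_sketch(content, path, size=(108, 150)):
    try:
        src = Image.open(io.BytesIO(content))  # decode the downloaded bytes (PIL's Image and io assumed imported)
        src.resize(size, Image.ANTIALIAS).convert('RGB').save(path, 'JPEG')
        return True
    except Exception as e:
        print(e)
        return False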
def parse_detail(self):
    cover_list = []
    cover_now_path = self.coverPath + '\\' + self.now_time
    for root, dirs, files in os.walk(cover_now_path):
        for file in files:
            rawid = file.replace('.jpg', '')
            cover_list.append(rawid)
    print(len(cover_list))
    conn = sqlite3.connect("video.db3")
    now_path = self.detailPath + '\\' + self.now_time
    sub_db_id = '203'
    sub_db = 'DMT'
    provider = 'fifeduvideo'
    type_ = '10'
    language = 'ZH'
    country = 'CN'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, cover, batch, title, description, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result = []
    for root, dirs, files in os.walk(now_path):
        for file in files:
            rawid = file.replace('.html', '')
            Lngid = utils.GetLngid(sub_db_id, rawid)
            provider_url = provider + '@' + "http://lib.fifedu.com/toVideoPage.do?id=%s" % rawid
            provider_id = provider + '@' + rawid
            if rawid in cover_list:
                cover = "/smartlib" + "/" + provider + "/" + rawid + ".jpg"
            else:
                cover = ''
            batch = str(self.now_time) + '00'
            file_path = root + '/' + file
            print(file_path)
            with open(file_path, encoding='utf8') as f:
                text = f.read()
            html = Selector(text, type='html')
            provider_subject = description = title = ''
            provider_subjects = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[1:-1]
            title = html.xpath("//div[@class='detailnavbody']/a/text()").extract()[-1]
            description = html.xpath("//div[@class='tabvalue']/text()").extract_first('').strip()
            for item in provider_subjects:
                provider_subject += item + ';'
            provider_subject = provider_subject.replace('在线课程库;', '').replace('玩转多语种;', '').replace('视听练兵场;', '')
            result.append((Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover,
                           batch, title, description, provider_subject, date, date_created, medium))
            # if utils.parse_results_to_sql(conn, sql, result, 100):
            #     print("Inserted %s rows" % len(result))
            #     result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("Inserted the remaining %s rows" % len(result))
    result.clear()
def parse_list(self, message):
    utils.printf('%s: parsing list pages started...' % self.provider)
    conn = utils.init_db('mysql', 'cambridgejournal')
    result = []
    issueresult = []
    stmt = 'insert ignore into article(uid,url,gch) Values(%s,%s,%s)'
    sql = 'insert ignore into issue(url,stat) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.list_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        gch = fullname.split('\\')[-2]
        soup = BeautifulSoup(text, 'lxml')
        ulTags = soup.select('ul.details')
        if len(ulTags) == 0:
            utils.logerror(fullname + '\n')
        for ulTag in ulTags:
            try:
                aTag = ulTag.select_one('li.title > a')
                if not aTag:
                    aTag = ulTag.select_one('li.title > h5 > a')
                if aTag:
                    url = aTag.get('href')
                    uid = url.split('/')[-1]
                    result.append((uid, url, gch))
            except:
                utils.printf(fullname)
                utils.logerror(fullname)
                break
        if filename.find('_') < 0:
            pageTag = soup.select_one('ul.pagination')
            if pageTag:
                pTags = pageTag.select('li > a')
                for pTag in pTags:
                    if operator.eq(pTag.get_text(), 'Last'):
                        pagenum = int(pTag.get('data-page-number'))
                        for page in range(2, pagenum + 1):
                            uri = '/core/journals/%s/issue/%s?pageNum=%s' % (gch, filename.replace('.html', ''), page)
                            issueresult.append((uri, 0))
        if utils.parse_results_to_sql(conn, stmt, result, 1000):
            cnt += len(result)
            result.clear()
            utils.printf(cnt)
    utils.parse_results_to_sql(conn, stmt, result)
    utils.parse_results_to_sql(conn, sql, issueresult)
    cnt += len(result)
    utils.printf(cnt)
    utils.printf('Issues with more than one page: %s' % len(issueresult))
    conn.close()
    utils.printf('%s: parsing list pages finished...' % self.provider)
    self.senddistributefinish('startdown_detail')
def get_journal_cover():
    sql_up = "update journal_list set stat_cover = 1 where journal_url = %s"
    result = []
    while True:
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        cur = conn.cursor()
        sql = "select j_id, journal_url, journal_name, cover_url from journal_list where stat_cover = 0 limit 1000"
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for j_id, journal_url, journal_name, cover_url in rows:
                print(journal_name)
                if 'journalNameNull' in cover_url:
                    print("Bad cover image")
                    result.append((journal_url,))
                elif cover_url == "":
                    print("No cover image")
                    result.append((journal_url,))
                else:
                    try:
                        res = requests.get(cover_url, headers=Headers, timeout=80)
                        if res.status_code == 200:
                            filename = '%s/%s.jpg' % (cover_path, j_id)
                            srcImg = Image.open(io.BytesIO(res.content))
                            dstImg = srcImg.resize((108, 150), Image.ANTIALIAS).convert('RGB')
                            dstImg.save(filename, 'JPEG')
                            print('Downloaded %s-%s successfully' % (journal_name, j_id))
                            provider = 'sinomedkpjournal'
                            pathtxt = r'E:\down_data_e\sinomed\kp\sinomedkpjournal_cover_20191028.txt'
                            s = provider + '@' + j_id + '★' + '/smartlib/' + provider + '/' + j_id + '.jpg' + '\n'
                            with open(pathtxt, 'a', encoding='utf-8') as f:
                                f.write(s)
                            result.append((journal_url,))
                        else:
                            print("%s cover %s: status_code != 200" % (journal_name, cover_url))
                    except Exception as e:
                        print(e)
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    print("Updated %s rows" % len(result))
                    result.clear()
    utils.parse_results_to_sql(conn, sql_up, result)
    print("Updated the remaining %s rows" % len(result))
    result.clear()
def parse_detail():
    result = []
    conn_db3 = sqlite3.connect("zt_template.db3")
    sql_in = """
        insert into modify_title_info_zt (Lngid, rawid, provider, type, language, country, provider_url, provider_id,
        cover, batch, title, description, provider_subject, date, date_created, creator, medium, publisher)
        values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """
    # basic metadata
    language = 'ZH'
    country = 'CN'
    type_ = '1'
    provider = '360elibbook'
    sub_db_id = '258'
    batch = now_time + '00'
    medium = "2"
    date = "1900"
    date_created = date + "0000"
    fdir = "%s\\%s" % (detail_path, now_time)
    for _, dir_ in utils.file_list(fdir):
        utils.printf(dir_)
        pa = r"E:\\work\\360elib\\detail\\%s\\(.*)\\" % (now_time)
        provider_subject = re.findall(pa, dir_)[0]
        if provider_subject == 'None':
            provider_subject = ""
        with open(dir_, encoding='utf-8') as f:
            text = f.read()
        html = Selector(text, 'html')
        rawid = _.replace(".html", "")
        Lngid = utils.GetLngid(sub_db_id, rawid)
        provider_url = provider + '@' + "http://www.360elib.com:2100/chinese/web/Details.aspx?id=%s" % (rawid)
        provider_id = provider + '@' + rawid
        title = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_name']/text()").extract_first()
        creator = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_zz']/text()").extract_first("").replace(", ", "").replace(",", "").replace(",", "").replace("、", "")
        publisher = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_cbs']/text()").extract_first("")
        description = html.xpath("//span[@id='ctl00_ContentPlaceHolder1_lb_bookintro']/text()").extract_first("")
        cover_rawid = rawid.lower()
        cover_p = '%s/%s/%s.jpg' % (cover_path, now_time, cover_rawid)
        if os.path.exists(cover_p):
            cover = "/smartlib" + "/" + provider + "/" + cover_rawid + ".jpg"
        else:
            cover = ""
        result.append((Lngid, rawid, provider, type_, language, country, provider_url, provider_id, cover, batch,
                       title, description, provider_subject, date, date_created, creator, medium, publisher))
        if utils.parse_results_to_sql(conn_db3, sql_in, result, 1000):
            utils.printf("Inserted %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn_db3, sql_in, result)
    utils.printf("Inserted the remaining %s rows" % len(result))
    result.clear()
def down_detail(self):
    now_path = self.detailPath + '\\' + self.now_time
    if not os.path.exists(now_path):
        os.makedirs(now_path)
    sql_up = "update list set stat = 1 where rawid = %s"
    result = []
    while True:
        sql = "select rawid, url, title from list where stat = 0 limit 1000"
        cur = self.conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for rawid, url, title in rows:
                print(title)
                print(url)
                path = now_path + '/%s.html' % rawid
                if os.path.exists(path):
                    result.append((rawid,))
                    print('%s exists' % path)
                else:
                    try:
                        res = requests.get(url, headers=self.headers, proxies=self.proxy)
                        fee = 'detailnavbody'
                        if res.status_code == 200:
                            res.encoding = res.apparent_encoding
                            if res.text.find(fee) > 0:
                                path = now_path + '/%s.html' % rawid
                                with open(path, mode='w', encoding='utf-8') as f:
                                    f.write(res.content.decode('utf8'))
                                print('%s -- down right' % title)
                                result.append((rawid,))
                            else:
                                print("feature string not found ~")
                        else:
                            print('status_code != 200 ~')
                    except Exception as e:
                        print(e)
                if utils.parse_results_to_sql(self.conn, sql_up, result, 100):
                    print('Updated %s rows' % len(result))
                    result.clear()
    utils.parse_results_to_sql(self.conn, sql_up, result)
    print('Updated the remaining %s rows' % len(result))
    result.clear()
def parsel_detail_one():
    conn_1 = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    conn_2 = sqlite3.connect('mirrorimutmeixingbook_20191218.db3')
    sub_db_id = '243'
    provider = 'mirrorimutmeixingbook'
    type = '1'
    date = '1900'
    date_created = '19000000'
    medium = '2'
    sql_in = "insert into modify_title_info_zt(Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title, creator, provider_subject, date, date_created, medium) values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
    result_2 = []
    now_time = time.strftime('%Y%m%d')
    fdir = '%s/%s' % (detail_path, now_time)
    for _, filename in utils.file_list(fdir):
        rawid = _.replace(".html", "")
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, "html")
        creator = html.xpath("//table[@style='WORD-BREAK: break-all']//tr/td/text()").extract()[0].replace("作者:", "")
        if creator == "unknow":
            creator = ""
        if "ja" in rawid:
            id_ = rawid.replace('ja', '')
            url = "http://202.207.22.13:100/Soft_Showja.asp?SoftID=%s" % id_
            language = "JA"
            country = "JP"
            Lngid = utils.GetLngid(sub_db_id, rawid)
        else:
            language = "EN"
            country = "US"
            url = "http://202.207.22.13:100/Soft_Show.asp?SoftID=%s" % rawid
            Lngid = utils.GetLngid(sub_db_id, rawid)
        sql = "select title,provider_subject from detail where url = '%s'" % url
        cur = conn_1.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        title = rows[0][0].replace("\n", ' ')
        provider_subject = rows[0][1].replace("数字图书;", '')
        provider_url = provider + '@' + url
        provider_id = provider + '@' + rawid
        batch = str(now_time) + '00'
        result_2.append((Lngid, rawid, provider, type, language, country, provider_url, provider_id, batch, title,
                         creator, provider_subject, date, date_created, medium))
    utils.parse_results_to_sql(conn_2, sql_in, result_2)
    utils.printf("Inserted the remaining %s rows" % len(result_2))
    result_2.clear()
def parse_list():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    regex_bookid = re.compile(r"bookinfo.aspx\?id=(\d+)")
    stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
    results = []
    for _, filename in utils.file_list(list_path):
        with open(filename, encoding='gb18030') as f:
            text = f.read()
        bookidlist = regex_bookid.findall(text)
        for bookid in bookidlist:
            results.append((bookid, 0))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('Inserted', total, 'rows into the database')
    utils.parse_results_to_sql(conn, stmt, results)
    print('Inserted', len(results), 'rows into the database')
def wirte_bigjson():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    big_json_filepath = big_json_path + '/' + '%s.big_json' % now_time
    sql_up = "update detail set stat = 1 where url = %s"
    result = []
    while True:
        down_date = now_time
        sql = "select url from detail where stat=0 limit 1000"
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for url in rows:
                url = url[0]
                feature = "highwire-cite-metadata"
                res = utils.get_html(url, feature=feature, proxies=proxy, timeout=30)
                # res.encoding = 'utf-8'
                if res:
                    html = res.text.strip()
                    sumDict = dict()
                    sumDict['provider_url'] = url
                    sumDict['down_date'] = down_date
                    sumDict['htmlText'] = html
                    with open(big_json_filepath, mode='a', encoding='utf-8') as f:
                        line = json.dumps(sumDict, ensure_ascii=False).strip() + '\n'
                        f.write(line)
                    utils.printf(url, 'write to big_json')
                    result.append((url,))  # one-element tuple for the single %s placeholder
                    if os.path.getsize(big_json_filepath) // (1024 * 1024 * 1024) >= 2:
                        big_json_filepath = big_json_path + '/' + '%s_%s.big_json' % (str(now_time), random.randrange(111, 999))
                        print("File size reached 2 GB, switching to %s" % big_json_filepath)
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    print("Updated %s rows" % len(result))
                    result.clear()
    utils.parse_results_to_sql(conn, sql_up, result)
    print("Updated the remaining %s rows" % len(result))
    result.clear()
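# utils.get_html (used by down_vol, wirte_bigjson and down_detail) is assumed to wrap
# requests.get with retries, optional proxies and a sanity check that the `feature`
# marker appears in the page; this sketch reflects only those call sites and may differ
# from the real helper:
def _get_html_sketch(url, feature=None, proxies=None, timeout=30, retries=3):
    for _ in range(retries):
        try:
            res = requests.get(url, proxies=proxies, timeout=timeout)
            if res.status_code == 200 and (feature is None or feature in res.text):
                return res  # callers read res.text / res.content
        except requests.RequestException as e:
            print(e)
    return None  # callers treat a falsy result as a failed download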
def parselist():
    base_link = 'http://www.sinomed.ac.cn'
    now_time = datetime.datetime.now().strftime("%Y%m%d")
    file_path = path + '\\' + now_time
    sql = """insert ignore into detail(provider_url,title,author,pub_date,degree,organ,contributor)
             values(%s,%s,%s,%s,%s,%s,%s)"""
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)  # assumed: same connection setup as the other functions here
    count = 0
    for root, dirs, files in os.walk(file_path):
        for file in files:
            file_name = root + '\\' + file
            # print(file_name)
            with open(file_name, encoding='utf-8') as fp:
                text = fp.read()
            html = Selector(text, 'html')
            div_all = html.xpath("//div[@class='right-wztxt fL']")
            for div in div_all:
                author = pub_date = degree = organ = contributor = ""
                # title
                title = div.xpath(".//h2/span/a/text()").extract_first('')
                count += 1
                # url
                provider_url = base_link + div.xpath(".//h2/span/a/@href").extract_first('')
                for p in div.xpath(".//p").xpath("string(.)").extract():
                    p = p.replace('\n', '').replace('\t', '').replace(' ', '')
                    # author
                    if p.startswith("研究生姓名:"):
                        author = p.replace("研究生姓名:", '')
                    # publication date
                    if p.startswith("出版时间:"):
                        pub_date = p.replace("出版时间:", '').replace("-", '')
                    # degree conferred
                    if p.startswith("授予学位:"):
                        degree = p.replace("授予学位:", '')
                    # degree-granting institution
                    if p.startswith("授予学位单位:"):
                        organ = p.replace("授予学位单位:", '')
                    # supervisor
                    if p.startswith("导师:"):
                        contributor = p.replace("导师:", '')
                result.append((provider_url, title, author, pub_date, degree, organ, contributor))
            if utils.parse_results_to_sql(conn, sql, result, 100):
                print("Inserted %s rows" % len(result))
                result.clear()
    utils.parse_results_to_sql(conn, sql, result)
    print("All inserts finished")
def process_cover(self, message):
    self.count = self.count + 1
    self.sqlList.append((1, message))
    if self.count % 2 == 1:
        utils.printf('%s: downloaded %s cover images' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'ydylcninfo', 4)
        stmt = 'update book set cover_stat=%s where bookid=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        conn.close()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'ydylcninfo', 4)
        stmt = 'update book set cover_stat=%s where bookid=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        utils.printf('%s: cover download finished' % self.provider)
def parse_detail(self):
    super().parse_detail()
    language = "EN"
    type = "1"
    medium = "2"
    provider = "cqjtukingbook"
    country = "US"
    batch = time.strftime('%Y%m%d') + "00"
    stmt = ('''insert into modify_title_info_zt(lngid,rawid,title,creator,description,subject,date,date_created,
            identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);''')
    conn = utils.init_db("sqlite3", self.template_file)
    results = []
    cnt = 0
    for file, fullpath in utils.file_list(self.detail_path):
        with open(fullpath, encoding='utf8') as fp:
            txt = fp.read()
        try:
            title, creator, publishers, date, identifier_pisbn, subject, description = self._parse_detail_one(txt)
        except:
            exMsg = '* ' + traceback.format_exc()
            logerror(fullpath)
            logerror(exMsg)
            continue
        date_created = date + '0000'
        basename, _, ext = file.partition(".")
        rawid = basename
        provider_url = provider + "@http://123.56.143.23/kingbookwaiwen/book/info.aspx?id=" + rawid
        provider_id = provider + "@" + rawid
        lngID = "CQJTU_KINGBOOK_TS_" + rawid
        results.append((lngID, rawid, title, creator, description, subject, date, date_created, identifier_pisbn,
                        language, country, provider, provider_url, provider_id, type, medium, batch, publishers))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            cnt += len(results)
            print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
            results.clear()
    utils.parse_results_to_sql(conn, stmt, results)
    cnt += len(results)
    print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
    conn.close()
def process_index(self, message):
    self.count = self.count + 1
    self.sqlList.append((1, message))
    if self.count % 20 == 1:
        utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'bioonejournal')
        stmt = 'update issuelist set stat=%s where url=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        conn.close()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'bioonejournal')
        stmt = 'update issuelist set stat=%s where url=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        utils.printf('%s:down_index finish' % self.provider)
        self.sendwork('parse_index')
def parse_index():
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    sql = """
        insert ignore into catalog (g_name,g_link) values (%s,%s)
    """
    fname = "%s/index.html" % (index_path)
    with open(fname, encoding='utf-8') as f:
        text = f.read()
    html = Selector(text, 'html')
    g_link_name_list = html.xpath("//div[@class='g-link']/a/text()").extract()
    g_link_list = html.xpath("//div[@class='g-link']/a/@href").extract()
    for num, g_link in enumerate(g_link_list):
        g_name = g_link_name_list[num]
        result.append((g_name, g_link))
    utils.parse_results_to_sql(conn, sql, result)
    utils.printf("Inserted %s category rows" % len(result))
def parse_list(self):
    super().parse_list()
    conn = utils.init_db('mysql', 'cqjtu_kingbook')
    base_url = "http://123.56.143.23/kingbookwaiwen/book/"
    regex_bookid = re.compile(r"info.aspx\?id=(\d+)")
    stmt = 'insert ignore into book (bookid,stat) values(%s,%s)'
    results = []
    for _, filename in utils.file_list(self.list_path):
        with open(filename, encoding='utf8') as f:
            text = f.read()
        bookidlist = regex_bookid.findall(text)
        for bookid in bookidlist:
            results.append((bookid, 0))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            total = len(results)
            results.clear()
            print('Inserted', total, 'rows into the database')
    utils.parse_results_to_sql(conn, stmt, results)
    print('Inserted', len(results), 'rows into the database')
def parse_html(self, message):
    utils.printf('%s: parsing start page started...' % self.provider)
    conn = utils.init_db('mysql', 'aiaajournal', 2)
    result = []
    stmt = 'insert ignore into journal(journal_name,url,eissn,cover_url,active) Values(%s,%s,%s,%s,%s)'
    active = 0
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        if filename == 'active.html':
            active = 1
        else:
            active = 0
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        try:
            sel = Selector(text=text)
            for liTag in sel.xpath('//li[@class="search-item clearfix"]'):
                journal_name = liTag.xpath('./div/h4/a/text()').extract_first().strip()
                url = liTag.xpath('./div/h4/a/@href').extract_first().replace('journal', 'loi')
                eissn = liTag.xpath('./div/div/div/span[@class="meta__eissn"]/text()').extract_first().replace('eISSN: ', '').strip()
                cover_url = liTag.xpath('./div/a/img/@src').extract_first().strip()
                result.append((journal_name, url, eissn, cover_url, active))
            utils.printf(len(result))
        except:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            utils.logerror(exMsg)
            utils.logerror(fullname)
            return
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s: parsing start page finished...' % self.provider)
    self.senddistributefinish('startdown_index')
def parse_detail():
    conn = sqlite3.connect('template.db3')
    language = "EN"
    type = "1"
    medium = "2"
    provider = "mirrorbeihuakingbook"
    country = "US"
    sub_db_id = "217"
    now_time = time.strftime('%Y%m%d')
    batch = now_time + "00"
    # 17 columns, 17 placeholders
    stmt = ('''insert into modify_title_info_zt(lngid,rawid,title,creator,description,date,date_created,
            identifier_pisbn,language,country,provider,provider_url,provider_id,type,medium,batch,publisher)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);''')
    results = []
    cnt = 0
    dir_path = detail_path + '/' + now_time
    for file, fullpath in utils.file_list(dir_path):
        with open(fullpath, encoding='gb18030') as fp:
            txt = fp.read()
        title, creator, publishers, date, identifier_pisbn, description = _parse_detail_one(txt)
        # print(title, creator, publishers, date, identifier_pisbn, description)
        date_created = date + '0000'
        basename, _, ext = file.partition(".")
        rawid = basename
        provider_url = provider + "@http://10.5.23.18:8079/book/bookinfo.aspx?id=" + rawid
        provider_id = provider + "@" + rawid
        lngID = utils.GetLngid(sub_db_id, rawid)
        results.append((lngID, rawid, title, creator, description, date, date_created, identifier_pisbn, language,
                        country, provider, provider_url, provider_id, type, medium, batch, publishers))
        if utils.parse_results_to_sql(conn, stmt, results, 1000):
            cnt += len(results)
            print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
            results.clear()
    utils.parse_results_to_sql(conn, stmt, results)
    cnt += len(results)
    print('%s:%s' % (time.strftime("%Y/%m/%d %X"), cnt))
    conn.close()
def parse_article_list():
    """
    Change the time variable on each run so it matches the downloaded article_list_2_path folder.
    """
    sql = """
        insert ignore into article_list(j_id, article_name, article_url) values(%s,%s,%s)
    """
    base_url = "http://www.sinomed.ac.cn"
    result = []
    # shijian = '20191031'
    # file_p = article_list_2_path + '\\' + shijian
    for root, dirs, files in os.walk(article_list_path):
        conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
        for file in files:
            try:
                file_path = root + '/' + file
                print(file_path)
                # j_id = re.findall(r'E:\\down_data_e\\sinomed\\en\\article_list_2\\20191031\\(.*)\\', file_path)[0]
                j_id = re.findall(r'E:\\down_data_e\\sinomed\\en\\article_list\\(.*)\\', file_path)[0]
                with open(file_path, encoding='utf-8') as fp:
                    text = fp.read()
                soup = BeautifulSoup(text, 'lxml')
                article_url_list = soup.select('div.right-wztxt.fL > h2 > div > a')
                for atag in article_url_list:
                    url = base_url + atag['href']
                    article_name = atag.text
                    result.append((j_id, article_name, url))
                if utils.parse_results_to_sql(conn, sql, result, 1000):
                    print("Inserted %s detail-page rows" % len(result))
                    result.clear()
                utils.parse_results_to_sql(conn, sql, result)
                print("Inserted the remaining %s detail-page rows" % len(result))
                result.clear()
            except Exception as e:
                print(e)
                line = file_path + '\t' + str(e) + '\n'
                with open("log.txt", 'a', encoding='utf-8') as f:
                    f.write(line)
def process_list(self, message):
    self.count = self.count + 1
    self.sqlList.append(message)
    if self.count % 40 == 1:
        utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'ydylcnbook', 4)
        stmt = 'update book set stat=%s where bookid=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        conn.close()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'ydylcnbook', 4)
        stmt = 'update book set stat=%s where bookid=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        utils.printf('downloadlist finish')
        self.senddistributefinish('startdown_cover')
def process_index(self, message):
    self.count = self.count + 1
    self.sqlList.append(message)
    if self.count % 40 == 1:
        utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'pishuinfo', 4)
        stmt = 'update video set stat=%s where video_id=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        conn.close()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'pishuinfo', 4)
        stmt = 'update video set stat=%s where video_id=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        utils.printf('downloadindex finish')
        self.sendwork('parse_index')
def parse_html(self, message):
    utils.printf('%s: parsing start page started...' % self.provider)
    conn = utils.init_db('mysql', 'pishuinfo', 4)
    result = []
    stmt = 'insert ignore into video(video_id,stat) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        for href in sel.xpath('//*[@id="TitleList"]/div/a/@href'):
            video_id = href.re(r'.*ID=(\d+)&isHost=.*')[0]
            result.append((video_id, 0))
        utils.printf(len(result))
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s: parsing start page finished...' % self.provider)
    self.senddistributefinish('startdown_index')
def xiugai():
    sql_uu = "update journal_info set j_id = %s where journal_name = %s"
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    # the select has no stat filter, so a single pass over journal_list is enough
    sql = "select j_id,journal_name from journal_list"
    cur.execute(sql)
    rows = cur.fetchall()
    for j_id, journal_name in rows:
        result.append((j_id, journal_name))
        if utils.parse_results_to_sql(conn, sql_uu, result, 1000):
            print("Updated %s rows" % len(result))
            result.clear()
    utils.parse_results_to_sql(conn, sql_uu, result)
    print("Updated the remaining %s rows" % len(result))
    result.clear()
def process_list(self, message):
    self.count = self.count + 1
    self.sqlList.append(message)
    if self.count % 40 == 1:
        utils.printf('%s: downloaded %s pages' % (self.provider, self.count))
        conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
        stmt = 'update issue set stat=%s where journal_id=%s and issue_id=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        conn.close()
    if self.count % 100 == 0:
        self.refreshproxypool()
    if self.count == self.totalcount:
        conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
        stmt = 'update issue set stat=%s where journal_id=%s and issue_id=%s'
        utils.parse_results_to_sql(conn, stmt, self.sqlList)
        self.sqlList.clear()
        utils.printf('downloadlist finish')
        self.sendwork('parse_list')
def down_detail():
    result = []
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    cur = conn.cursor()
    sql = "select provider_subject,url from detail where stat = 0 limit 1000"
    sql_up = "update detail set stat = 1 where url = %s"
    while True:
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for provider_subject, url in rows:
                fdir = "%s/%s/%s" % (detail_path, now_time, provider_subject)
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                rawid = url.replace("/chinese/web/Details.aspx?id=", "")
                fname = "%s/%s.html" % (fdir, rawid)
                if os.path.exists(fname):
                    utils.printf("%s exists" % rawid)
                    result.append((url,))
                    continue
                feature = "ctl00_ContentPlaceHolder1_lb_name"
                all_url = "http://www.360elib.com:2100" + url
                res = utils.get_html(all_url, feature=feature)
                if res:
                    # print(res.apparent_encoding)
                    with open(fname, 'w', encoding='utf-8') as f:
                        f.write(res.content.decode('gb18030'))
                    utils.printf("Downloaded %s successfully" % rawid)
                    result.append((url,))
                if utils.parse_results_to_sql(conn, sql_up, result, 1000):
                    utils.printf("Updated %s rows" % len(result))
                    result.clear()
    utils.parse_results_to_sql(conn, sql_up, result)
    utils.printf("Updated the remaining %s rows" % len(result))
    result.clear()
def parse_html(self, message):
    utils.printf('%s: parsing start page started...' % self.provider)
    conn = utils.init_db('mysql', 'ydylcnbook', 4)
    result = []
    stmt = 'insert ignore into book(bookid,cover_url) Values(%s,%s)'
    cnt = 0
    for filename, fullname in utils.file_list(self.html_path):
        with open(fullname, encoding='utf8') as f:
            text = f.read()
        sel = Selector(text=text)
        for aTag in sel.xpath('//ul[@class="list-book-1"]/li/a'):
            bookid = aTag.xpath('./@href').extract_first().split('=')[-1]
            cover_url = aTag.xpath('./div/div/img/@src').extract_first()
            result.append((bookid, cover_url))
        utils.printf(len(result))
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s: parsing start page finished...' % self.provider)
    self.senddistributefinish('startdown_list')
def parse_html(self, message):
    utils.printf('%s: parsing start page started...' % self.provider)
    conn = utils.init_db('mysql', 'cambridgejournal')
    result = []
    stmt = 'insert ignore into journal(url,cover_url) Values(%s,%s)'
    cnt = 0
    fname = self.html_path + '/html.html'
    with open(fname, encoding='utf8') as f:
        text = f.read()
    soup = BeautifulSoup(text, 'lxml')
    aTaglist = soup.select('ul.listings > li > div > div > a')
    for aTag in aTaglist:
        url = aTag.get('href')
        cover_url = ''
        result.append((url, cover_url))
    utils.parse_results_to_sql(conn, stmt, result)
    cnt += len(result)
    utils.printf(cnt)
    conn.close()
    utils.printf('%s: parsing start page finished...' % self.provider)
    self.sendwork('startdown_index')
def parse_index():
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    result = []
    sql_in = "insert ignore into list(provider_subject,url) values (%s,%s)"
    for _, filedir in utils.file_list(index_path):
        with open(filedir, mode='r', encoding='gb18030') as f:
            text = f.read()
        html = Selector(text, 'html')
        big_subject = html.xpath("//table[@class='txt_css']//td[2]/a[2]/text()").extract_first()
        list_urls = html.xpath("//table[@class='title_main']//td[@class='title_maintxt'][1]//a/@href").extract()
        for i, item in enumerate(list_urls):
            provider_subject = big_subject + ";" + html.xpath(
                "//table[@class='title_main']//td[@class='title_maintxt'][1]//a/@title").extract()[i]
            url = "http://202.207.22.13:100/" + item
            result.append((provider_subject, url))
    utils.parse_results_to_sql(conn, sql_in, result)
    print('Inserted', len(result), 'rows into the database')