# Shared imports for the down_cover variants collected below.  `utils` is the
# project's own helper module; DBHOST/DBUSER/DBPWD/DB, cover_path, now_time and
# proxy are module-level globals defined elsewhere in each original project.
import os
import time
import traceback

import pymysql
import requests
from bs4 import BeautifulSoup
from parsel import Selector  # assumption: Selector(res.text, 'html') matches parsel's API

import utils


def down_cover(self, message):
    """Download one book cover (message = [bookid, cover_url]) via a proxy."""
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    bookid = message[0]
    cover_url = message[1]
    filename = self.cover_path + '/' + bookid + '.jpg'
    if os.path.exists(filename):
        self.senddistributefinish('process_cover', bookid)
        return
    try:
        proxy = self.getproxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(cover_url, headers=HEADER, timeout=20, proxies=proxies)
        # resp = requests.get(cover_url, headers=HEADER, timeout=20)
    except Exception:
        # Requeue the message so the download is retried later.
        self.sendwork('down_cover', message)
        return
    if utils.Img2Jpg(resp.content, filename):
        utils.printf('Downloaded cover %s successfully' % filename)
        self.senddistributefinish('process_cover', bookid)
    else:
        self.sendwork('down_cover', message)
        return
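# All of the down_cover variants in this file lean on three helpers from the
# project-local `utils` module.  The sketch below is NOT that module's code; it
# is a minimal, assumed implementation of utils.Img2Jpg inferred purely from
# its call sites (raw response bytes plus a target .jpg path in, boolean
# success out), included only to make the snippets above and below easier to follow.
def Img2Jpg(content, filename):
    """Write raw image bytes to `filename`; return True on apparent success."""
    # Treat empty bodies and HTML error pages as failures instead of saving them.
    if not content or content.lstrip()[:1] == b'<':
        return False
    try:
        with open(filename, 'wb') as f:
            f.write(content)
        return os.path.getsize(filename) > 0
    except OSError:
        return False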
def downcover():
    """Scrape journal cover images from the AIP Scitation homepage."""
    url = 'https://aip.scitation.org'
    cover_dir_fullpath = os.path.dirname(os.path.abspath(__file__)) + '/cover'
    if not os.path.exists(cover_dir_fullpath):
        os.makedirs(cover_dir_fullpath)
    try:
        resp = utils.get_html(url)
    except Exception:
        # exMsg = '* ' + traceback.format_exc()
        # print(exMsg)
        return False
    if not resp:
        return False
    if resp.text.find('</html>') < 0:
        return False
    soup = BeautifulSoup(resp.content.decode('utf8'), 'lxml')
    divList = soup.select('div.publicationCoverImage')
    # divpb = soup.select_one('div', attrs={'data-widget-id': 'bfd39502-c303-4169-88ba-1d2b9bba85ab'})
    for divtag in divList:
        coverurl = url + divtag.a.img['src']
        covername = cover_dir_fullpath + '/' + divtag.a['href'].split('/')[-1].lower() + '.jpg'
        if os.path.exists(covername):
            continue
        resp = utils.get_html(coverurl)
        if not resp:  # guard added: skip covers that could not be fetched
            continue
        if utils.Img2Jpg(resp.content, covername):
            utils.printf('Downloaded', covername, 'successfully...')
        time.sleep(3)
    # apburl = 'https://aip.scitation.org/pb-assets/images/publications/apb/apl-bioeng-1483023557097.jpg'
    # apbname = cover_dir_fullpath + '/' + 'apb.jpg'
    # resp = utils.get_html(apburl)
    # if utils.Img2Jpg(resp.content, apbname):
    #     utils.printf('Downloaded', apbname, 'successfully...')
    return True
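# utils.get_html is the second shared helper.  Judging from its call sites --
# get_html(url), get_html(cover_url, proxies=proxy, timeout=50) -- it wraps
# requests.get and returns the Response on success or a falsy value on failure,
# which is why callers guard with `if res:`.  This is a hedged sketch of that
# behaviour, not the project's actual implementation.
def get_html(url, proxies=None, timeout=20, retries=3):
    """GET `url` with a browser User-Agent; return the Response or None."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    for _ in range(retries):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
    return None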
def down_cover():
    """Batch-download video covers listed in MySQL and mark them done."""
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    now_time = time.strftime('%Y%m%d')
    dirpath = cover_path + '/' + now_time
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
    sql_up = "update video set cover_stat = 1 where rawid = %s"
    result = []
    while True:
        sql = "select rawid,cover_url from video where cover_stat=0 limit 1000"
        cur = conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for rawid, cover_url in rows:
                path = dirpath + '/%s.jpg' % rawid
                res = utils.get_html(cover_url, proxies=proxy, timeout=50)
                if res:
                    if os.path.exists(path):
                        result.append(rawid)
                        utils.printf("Cover for", rawid, "already exists...")
                    else:
                        if utils.Img2Jpg(res.content, path):
                            result.append(rawid)
                            utils.printf("Downloaded", rawid, "successfully...")
                        else:
                            print('%s -- down cover error' % rawid)
                # Flush cover_stat updates to the database every 100 rows.
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    total = len(result)
                    result.clear()
                    print('Updated', total, 'results in the database')
    utils.parse_results_to_sql(conn, sql_up, result)
    print('Updated', len(result), 'results in the database')
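# utils.parse_results_to_sql is the third shared helper.  From the way it is
# called above -- parse_results_to_sql(conn, sql_up, result, 100) inside the
# loop, then once more without the threshold after the loop -- it appears to be
# a batched executemany: it only writes once the buffer holds `num` rows (or
# unconditionally when `num` is omitted) and returns True so the caller knows
# to clear the buffer.  A minimal sketch under that assumption; the real helper
# may differ.
def parse_results_to_sql(conn, sql, results, num=None):
    """Flush `results` through cur.executemany(sql, results) and commit."""
    if not results or (num is not None and len(results) < num):
        return False  # nothing to flush yet
    cur = conn.cursor()
    cur.executemany(sql, results)
    conn.commit()
    return True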
def down_cover(self, message):
    """Download journal covers for hepengineeringjournal, retrying each until it succeeds."""
    utils.printf('Start downloading cover images')
    if not self.cover_path:
        self.initpath()
    self.refreshproxypool()
    conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
    cur = conn.cursor()
    cur.execute("select journal_id,cover_url from journal where cover_url!=''")
    rows = cur.fetchall()
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    for journal_id, cover_url in rows:
        filename = self.cover_path + '/' + journal_id + '.jpg'
        if os.path.exists(filename):
            continue
        while True:
            try:
                proxy = self.getproxy()
                proxies = {'http': proxy, 'https': proxy}
                resp = requests.get(cover_url, headers=HEADER, timeout=20, proxies=proxies)
                # resp = requests.get(cover_url, headers=HEADER, timeout=20)
            except Exception:
                utils.printf(filename)
                continue
            if utils.Img2Jpg(resp.content, filename):
                utils.printf('Downloaded cover %s successfully' % filename)
                break
    self.sendwork('mapcover')
def down_cover(self, message):
    """Download one AIAA cover (message = [article_url, cover_url]) via a proxy."""
    HEADER = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    url = message[0]
    cover_url = message[1]
    filename = '%s/10.2514/%s.jpg' % (self.cover_path, url.split('/')[-1])
    if os.path.exists(filename):
        self.senddistributefinish('process_cover', url)
        return
    try:
        proxy = self.getproxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get('https://arc.aiaa.org' + cover_url, headers=HEADER, timeout=20, proxies=proxies)
        # resp = requests.get(cover_url, headers=HEADER, timeout=20)
    except Exception:
        exMsg = '* ' + traceback.format_exc()
        print(exMsg)
        self.sendwork('down_cover', message)
        return
    if utils.Img2Jpg(resp.content, filename):
        utils.printf('Downloaded cover %s successfully' % filename)
        self.senddistributefinish('process_cover', url)
    else:
        self.sendwork('down_cover', message)
        return
def down_cover():
    """Download the current-issue covers of the four ASPET journals and log the smartlib mapping."""
    provider = 'aspetjournal'
    path = r'E:\work\DMAD\cover'
    if not os.path.exists(path):
        os.mkdir(path)
    pathtxt = '%s/%s_cover_%s.txt' % (path, provider, now_time)
    dic_cover = {
        "dmd": "http://dmd.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
        "jpet": "http://jpet.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
        "molpharm": "http://molpharm.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
        "pharmrev": "http://pharmrev.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
    }
    for name, url in dic_cover.items():
        res = utils.get_html(url)
        if res:
            cover_path = '%s/%s.jpg' % (path, name)
            utils.Img2Jpg(res.content, cover_path)
            s = provider + '@' + name + '★' + '/smartlib/' + provider + '/' + name + '.jpg' + '\n'
            with open(pathtxt, 'a', encoding='utf-8') as f:
                f.write(s)
            utils.printf('Downloaded %s cover successfully' % name)
def down_cover(self):
    """Download list covers in batches of 1000, tracking per-row failure counts."""
    now_path = self.coverPath + '\\' + self.now_time
    if not os.path.exists(now_path):
        os.makedirs(now_path)
    sql_up = "update list set cover_stat = 1 where rawid = %s"
    sql_fail = "update list set fail_count = %s where rawid = %s"
    result = []
    result_fail = []
    while True:
        sql = "select rawid, cover_url, title, fail_count from list where cover_stat = 0 and fail_count < 10 limit 1000"
        cur = self.conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for rawid, cover_url, title, fail_count in rows:
                path = now_path + '/%s.jpg' % rawid
                if os.path.exists(path):
                    result.append(rawid)
                    print('%s exists' % path)
                elif 'zhlx.png' in cover_url:
                    # Placeholder image; mark as done without downloading.
                    result.append(rawid)
                    print('%s no ' % path)
                elif cover_url == 'http://lib.fifedu.com/upload_dir/':
                    # Empty cover URL; mark as done without downloading.
                    result.append(rawid)
                    print('%s no ' % path)
                else:
                    try:
                        res = requests.get(cover_url, headers=self.headers)
                        if res.status_code == 200:
                            path = now_path + '/%s.jpg' % rawid
                            # utils.Img2Jpg(res.content, path)
                            if utils.Img2Jpg(res.content, path):
                                print('%s -- down cover right' % title)
                                result.append(rawid)
                            else:
                                print('%s -- down cover error' % title)
                                fail_count += 1
                                result_fail.append((fail_count, rawid))
                                utils.parse_results_to_sql(self.conn, sql_fail, result_fail)
                        else:
                            print('status_code != 200 ~')
                            fail_count += 1
                            result_fail.append((fail_count, rawid))
                            utils.parse_results_to_sql(self.conn, sql_fail, result_fail)
                    except Exception as e:
                        print(e)
                        # pass
                if utils.parse_results_to_sql(self.conn, sql_up, result, 100):
                    print('Updated %s rows' % len(result))
                    result.clear()
    utils.parse_results_to_sql(self.conn, sql_up, result)
    print('Updated the remaining %s rows' % len(result))
    result.clear()
def down_cover():
    """Download 360elib detail covers, updating cover_stat / failcount in batches."""
    conn = pymysql.connect(DBHOST, DBUSER, DBPWD, DB)
    sql_up = "update detail set cover_stat = 1 where cover_url = %s"
    sql_fail = "update detail set failcount = failcount+1 where cover_url = %s"
    result = []
    result_fail = []
    fdir = cover_path + '/' + now_time
    if not os.path.exists(fdir):
        os.mkdir(fdir)
    while True:
        cur = conn.cursor()
        sql = "select url,cover_url from detail where cover_stat = 0 and failcount < 5 limit 1000"
        cur.execute(sql)
        rows = cur.fetchall()
        if len(rows) == 0:
            break
        else:
            for url, cover_url in rows:
                rawid = url.replace("/chinese/web/Details.aspx?id=", "")
                url = "http://www.360elib.com:2100" + cover_url
                res = utils.get_html(url)
                if res:
                    new_id = rawid.lower()
                    fname = "%s/%s.jpg" % (fdir, new_id)
                    if utils.Img2Jpg(res.content, fname):
                        result.append(cover_url)
                        utils.printf("%s cover downloaded successfully" % new_id)
                    else:
                        result_fail.append(cover_url)
                        utils.printf("error")
                else:
                    result_fail.append(cover_url)
                    utils.printf("error")
                if utils.parse_results_to_sql(conn, sql_fail, result_fail, 100):
                    print("Updated %s failed rows" % len(result_fail))
                    result_fail.clear()
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    print("Updated %s rows" % len(result))
                    result.clear()
    utils.parse_results_to_sql(conn, sql_up, result)
    print("Updated the remaining %s rows" % len(result))
    result.clear()
    utils.parse_results_to_sql(conn, sql_fail, result_fail)
    print("Updated the remaining %s failed rows" % len(result_fail))
    result_fail.clear()
def down_cover():
    """Download the current PNAS cover and append its smartlib mapping to a text file."""
    path = r'E:\work\PNAS\cover\pnas.jpg'
    url = "https://www.pnas.org/content/by/year"
    res = utils.get_html(url)
    if res:
        html = Selector(res.text, 'html')
        cover_url = html.xpath("//div[@class='cover-issue-image']/a/img/@src").extract_first()
        utils.printf(cover_url)
        cover = requests.get(cover_url)
        if cover:
            utils.Img2Jpg(cover.content, path)
            provider = 'pnasjournal'
            pathtxt = r'E:\work\PNAS\cover\pnasjournal_cover_20191212.txt'
            s = provider + '@' + 'pnas' + '★' + '/smartlib/' + provider + '/' + 'pnas' + '.jpg' + '\n'
            with open(pathtxt, 'a', encoding='utf-8') as f:
                f.write(s)
def down_cover(self, message):
    """Download a Cambridge Core journal cover (message = [url, filename, retry_count])."""
    url = message[0]
    filename = message[1]
    count = message[2]
    if os.path.exists(filename):
        self.senddistributefinish('process_cover')
        return
    if count == 5:
        # Give up after five attempts.
        self.senddistributefinish('process_cover')
        return
    feature = 'on-the-cover-homepage'
    utils.logerror(url)
    utils.printf(url)
    resp = self.gethtml('https://www.cambridge.org' + url)
    if not resp:
        self.sendwork('down_cover', (url, filename, count + 1))
        return
    if resp.content.decode('utf-8').find(feature) < 0:
        print("can not find on-the-cover-homepage")
        # Fall back to the latest-issue page, which uses a different cover block.
        feature = 'row cover collapse'
        resp = self.gethtml('https://www.cambridge.org' + url + '/latest-issue', feature)
        if not resp:
            self.sendwork('down_cover', (url, filename, count + 1))
            return
    soup = BeautifulSoup(resp.content.decode('utf-8'), 'lxml')
    imgtag = soup.select_one('div.image.on-the-cover-homepage > a > img')
    if not imgtag:
        imgtag = soup.select_one('div.large-4.medium-4.small-12.columns > div > div > div > a > img')
    if not imgtag:
        utils.logerror('%s: cover image error' % url)
        self.senddistributefinish('process_cover')
        return
    cover_url = 'https:' + imgtag.get('src').replace('https:', '')
    resp = self.gethtml(cover_url, feature=None, coverflag=True)
    if not resp:
        self.sendwork('down_cover', (url, filename, count + 1))
        return
    if utils.Img2Jpg(resp.content, filename):
        utils.printf('Downloaded cover %s successfully' % filename)
        self.senddistributefinish('process_cover')
    else:
        utils.printf('%s: cover image error' % url)
        utils.logerror('%s: cover image error' % url)
        self.senddistributefinish('process_cover')
def down_cover(self, message):
    """Download one ASCE Library cover (message = [fname, cover_url]), retrying until it succeeds."""
    fname = message[0]
    cover_url = message[1]
    if os.path.exists(fname):
        self.senddistributefinish('process_cover', cover_url)
        return
    url = 'https://ascelibrary.org' + cover_url
    while True:
        try:
            proxy = self.getproxy()
            proxies = {'http': proxy, 'https': proxy}
            resp = requests.get(url, proxies=proxies)
        except Exception:
            exMsg = '* ' + traceback.format_exc()
            print(exMsg)
            continue
        if utils.Img2Jpg(resp.content, fname):
            utils.printf('Downloaded cover %s successfully' % fname)
            self.senddistributefinish('process_cover', cover_url)
            return
def down_cover(self, message):
    """Scrape the Science journals page and download each journal's cover icon."""
    url = 'http://www.sciencemag.org/journals'
    while True:
        resp = self.gethtml(url)
        if resp:
            break
    html = resp.content.decode('utf-8')
    soup = BeautifulSoup(html, 'lxml')
    alist = soup.select('div.media__icon > a')
    for aTag in alist:
        imageTag = aTag.select_one('img')
        filename = self.cover_path + '/' + aTag.get('href').replace('http://', '').split('.')[0] + '.jpg'
        if imageTag:
            imageurl = 'http:' + imageTag.get('src')
            while True:
                try:
                    resp = requests.get(imageurl)
                except Exception:
                    continue
                if utils.Img2Jpg(resp.content, filename):
                    break
            utils.printf('Cover downloaded successfully')
    self.sendwork('mapcover')