def process_item(self, item, spider):
    """Download every image listed in ``item['image_urls']`` below ``self.savePath``.

    Each entry of ``item['image_urls']`` is read with ``.get('url')``. The
    image is stored as ``<self.savePath>/full/<EncryptUtil.md5(url)>.jpg``;
    a file that already exists at that path is not downloaded again.
    Progress and failures are reported through ``spider.logDao``.

    Args:
        item: mapping with an ``'image_urls'`` iterable of dict-like entries.
        spider: object exposing ``logDao.info`` / ``logDao.warn``.

    Returns:
        The ``item`` object that was passed in.

    NOTE(review): the signature matches a Scrapy item-pipeline hook, which
    is expected to hand the item on, hence the explicit return. The
    per-image result dicts collected in ``image_urls`` are not written back
    to the item here -- confirm whether ``item['image_urls']`` should be
    replaced by them.
    """
    image_urls = []
    for image_url in item['image_urls']:
        url = image_url.get('url')
        urlHash = EncryptUtil.md5(url)
        path = 'full/' + str(urlHash) + '.jpg'
        detailPath = self.savePath + '/' + path

        # Create the target directory on demand.
        saveDir = self.savePath + '/full'
        if not FileUtil.dirIsExist(saveDir):
            FileUtil.createDir(saveDir)

        if FileUtil.fileIsExist(detailPath):
            spider.logDao.info(u'图片已经存在本地:' + url)
            downloaded = True
        else:
            try:
                fileResponse = requests.get(url, timeout=10)
                if fileResponse.status_code == 200:
                    # The context manager closes (and flushes) the handle
                    # even when the write fails.
                    with open(detailPath, 'wb') as imageFile:
                        imageFile.write(fileResponse.content)
                    spider.logDao.info(u'图片成功下载:' + url)
                    downloaded = True
                else:
                    spider.logDao.info(u'下载图片失败:' + url)
                    downloaded = False
            except Exception as e:
                # Best effort: one failing image must not abort the item.
                print(e)
                spider.logDao.warn(u'下载图片失败:' + url)
                downloaded = False

        if downloaded:
            image_url_new = {'ok': True, 'x': {'url': url, 'path': path}}
        else:
            image_url_new = {'ok': False, 'x': {'url': url}}
        image_urls.append(image_url_new)

        # Idle for 2 seconds between images.
        TimerUtil.sleep(2)
    return item
def delImg(url):
    """Delete the locally stored image file that belongs to ``url``.

    The file is looked up in the ``image`` directory next to this module
    under the name ``<EncryptUtil.md5(url)>.jpg``. The outcome (success or
    the error text) is printed; exceptions are not propagated.
    """
    image_dir = os.path.dirname(os.path.realpath(__file__)) + u'/image'
    hashed_name = str(EncryptUtil.md5(url)) + '.jpg'
    # NOTE(review): the backslash separator is kept as-is so the path stays
    # identical to the one used when the image was written.
    detailPath = image_dir + '\\' + hashed_name
    try:
        FileUtil.delFile(detailPath)
        print(u'删除图片成功:%s' % detailPath)
    except Exception as err:
        print(u'删除图片失败:%s' % str(err))
def downLoadCss(styleUrls):
    """Collect the styles for every URL in ``styleUrls``.

    Protocol-relative URLs (``//host/...``) are prefixed with ``http:``.
    Results of ``getStyle`` are memoised per ``EncryptUtil.md5(url)`` for
    the duration of this call, so a repeated URL is fetched only once.
    URLs for which no styles are available are skipped.

    Returns:
        A list with one styles entry per input URL that yielded styles,
        in input order.
    """
    collected = []
    fetched_by_hash = {}
    for raw_url in styleUrls:
        if raw_url.startswith(u'//'):
            full_url = u'http:' + raw_url
        else:
            full_url = raw_url

        # The hash of the normalised URL is the memo key.
        key = EncryptUtil.md5(full_url)
        if key not in fetched_by_hash:
            # Not seen yet in this call: fetch it and remember it.
            fetched = getStyle(full_url)
            if fetched:
                fetched_by_hash[key] = fetched

        known = fetched_by_hash.get(key)
        if known:
            collected.append(known)
    return collected
def _fetchAndCacheStyle(url, file_path):
    """Download the styles for ``url`` and, when non-empty, cache them in ``file_path``."""
    styles = CssUtil.downLoad(url)
    if styles:
        with open(file_path, u'w') as cache_file:
            json.dump(
                {
                    u'update_time': datetime.datetime.now().strftime(
                        u'%Y-%m-%d %H:%M:%S'),
                    u'url': url,
                    u'styles': styles
                },
                cache_file)
    return styles


def _readCachedStyle(file_path):
    """Return ``(is_fresh, styles)`` for the cache file at ``file_path``.

    ``is_fresh`` is False when the entry is 5 or more days old, or when the
    file cannot be parsed (for example after an interrupted write).
    """
    try:
        with open(file_path, u'r') as cache_file:
            detail = json.load(cache_file)
        styles = detail[u'styles']
        update_time = datetime.datetime.strptime(
            detail[u'update_time'], u'%Y-%m-%d %H:%M:%S')
    except (ValueError, KeyError, TypeError):
        # A corrupt cache entry is treated like an expired one so that it is
        # replaced instead of failing on every call.
        return False, None
    space_day = (datetime.datetime.now() - update_time).days
    return space_day < 5, styles


def getStyle(url):
    """Return the styles for ``url``, using an on-disk cache.

    The cache file is located via ``getFilePath(EncryptUtil.md5(url))``.
    A cached entry younger than 5 days is returned directly. Otherwise the
    old file is deleted, the styles are downloaded with
    ``CssUtil.downLoad`` and, when non-empty, written back to the cache.

    Returns:
        The cached styles, or whatever ``CssUtil.downLoad(url)`` returned.
    """
    file_path = getFilePath(EncryptUtil.md5(url))
    if os.path.exists(file_path):
        is_fresh, styles = _readCachedStyle(file_path)
        if is_fresh:
            return styles
        # Expired or unreadable: drop it and fall through to a download.
        # Downloading directly (instead of recursing) cannot loop forever
        # when the file survives the deletion.
        FileUtil.delFile(file_path)
    return _fetchAndCacheStyle(url, file_path)
def getHashCode(self, source_url):
    """Return the hash code of ``source_url`` as computed by ``EncryptUtil.md5``."""
    hash_code = EncryptUtil.md5(source_url)
    return hash_code
def getWxArticleHashCode(self, title, wx_account, source_id):
    """Return the hash code of a WeChat article.

    WeChat only; use ``getHashCode`` for everything else. The key is the
    UTF-8 encoded ``title`` and ``wx_account`` followed by
    ``str(source_id)``, hashed with ``EncryptUtil.md5``.
    """
    encoded_title = title.encode('utf8')
    encoded_account = wx_account.encode('utf8')
    id_text = str(source_id)
    return EncryptUtil.md5(encoded_title + encoded_account + id_text)
def _fetchImageToDisk(url, detailPath):
    """Download ``url`` to ``detailPath``; return True only for an HTTP 200 response.

    Payloads larger than 100 KB are passed through
    ``ImageCompressUtil().resizeImg`` with a 600x600 target and quality 80,
    overwriting the stored file.
    """
    fileResponse = requests.get(url, timeout=10)
    if fileResponse.status_code != 200:
        return False

    content = fileResponse.content
    # Close the handle before the file is re-read by the resizer and by
    # os.path.getsize below, so they always see the complete payload.
    with open(detailPath, 'wb') as imageFile:
        imageFile.write(content)

    if len(content) > 100 * 1024:
        ImageCompressUtil().resizeImg(ori_img=detailPath,
                                      dst_img=detailPath,
                                      dst_w=600,
                                      dst_h=600,
                                      save_q=80)

    # Floor division keeps the whole-KB figures on Python 2 and Python 3.
    print(u'图片成功下载,大小:' + str(len(content) // 1024) + 'kb ' + url)
    print(u'最终存储图片,大小:' + str(os.path.getsize(detailPath) // 1024) + 'kb ' + url)
    return True


def downLoadImage(image_url_sources):
    """Download the images described by ``image_url_sources``.

    Each entry is read with ``.get('url')``. Images are stored in the
    ``image`` directory next to this module as
    ``<EncryptUtil.md5(url)>.jpg``; a file that already exists is reused.

    Returns:
        A list with one dict per entry, in input order:
        ``{'ok': True, 'x': {'url', 'path', 'fileName'}}`` when the image is
        available locally, ``{'ok': False, 'x': {'url'}}`` otherwise.
    """
    # Loop-invariant target directory.
    file_path = os.path.dirname(os.path.realpath(__file__)) + u'/image'

    image_urls = []
    for image_url in image_url_sources:
        if not os.path.isdir(file_path):
            os.mkdir(file_path)

        url = image_url.get('url')
        print(url)
        fileName = str(EncryptUtil.md5(url)) + '.jpg'
        # NOTE(review): the backslash separator is kept because delImg
        # rebuilds exactly this path when removing the file.
        detailPath = file_path + '\\' + fileName

        if FileUtil.fileIsExist(detailPath):
            print(u'图片已经存在本地:' + url)
            available = True
        else:
            try:
                available = _fetchImageToDisk(url, detailPath)
            except Exception:
                # Best effort: a failing image must not abort the batch.
                available = False
            if not available:
                print(u'下载图片失败:' + url)

        if available:
            image_url_new = {
                'ok': True,
                'x': {
                    'url': url,
                    'path': detailPath,
                    'fileName': fileName
                }
            }
        else:
            image_url_new = {'ok': False, 'x': {'url': url}}
        image_urls.append(image_url_new)

        # Idle for 2 seconds between images.
        TimerUtil.sleep(2)
    return image_urls