import os
import re
import time

import requests
from bs4 import BeautifulSoup


def Process_SubPage(save_path, img_url):
    # As long as we are not requesting the images themselves, the site usually
    # does not block us, so there is no need to rotate headers or IPs here.
    start_html = requests.get(get_inner_link(img_url), headers=headers)
    start_html.encoding = 'utf-8'
    bsObj = BeautifulSoup(start_html.text, 'html.parser')  # parse the sub-page HTML

    # Handle torrent links: walk every <a> inside the post body.
    for a in bsObj.find("div", {"id": "read_tpc"}).findAll("a"):
        if ('href' in a.attrs) and ('title' not in a.attrs):
            # re.match anchors the pattern at the start of the string: it returns a
            # Match object only if the href begins with the pattern, otherwise None.
            if re.match(r'http://www3.uptorrentfilespacedownhostabc', a.attrs['href']):
                url2 = a.attrs['href']
                get_torrent(url2, save_path)

    # Handle images: walk every <img> inside the post body.
    img_ind = 1  # index used to name the saved files
    for a_img in bsObj.find("div", {"id": "read_tpc"}).findAll("img"):
        if 'src' in a_img.attrs:
            image = download_single_image(a_img.attrs['src'])
            # A non-empty response means the download succeeded; a payload this small
            # usually means the image data is broken (all black), i.e. the site itself
            # has no real data for this picture.
            if image and len(image.content) > 40:
                os.chdir(save_path)
                with open(str(img_ind) + '.jpg', 'ab') as f:
                    print('Image downloaded! Saving image ' + str(img_ind) +
                          ', size ' + str(len(image.content)) + ' bytes')
                    f.write(image.content)
                img_ind += 1
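# The torrent links matched above are handed to a get_torrent helper that is not
# part of this section. A minimal sketch, assuming it simply downloads the .torrent
# file behind the link and writes it into save_path (name, signature and return
# value are assumptions, not the project's actual code):

import os
import requests

def get_torrent(url, save_path):
    ua = {'User-Agent': 'Mozilla/5.0'}  # assumption: any reasonable User-Agent
    resp = requests.get(url, headers=ua, timeout=15)
    if resp.status_code != 200 or not resp.content:
        return False
    # Derive a file name from the URL; fall back to a fixed name if it is empty.
    name = url.rstrip('/').split('/')[-1] or 'download'
    if not name.endswith('.torrent'):
        name += '.torrent'
    with open(os.path.join(save_path, name), 'wb') as f:
        f.write(resp.content)
    return True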
# Simpler variant of Process_SubPage: downloads only the images from the
# sub-page and skips the torrent-link handling above.
def Process_SubPage(save_path, img_url):
    # As long as we are not requesting the images themselves, the site usually
    # does not block us, so there is no need to rotate headers or IPs here.
    start_html = requests.get(get_inner_link(img_url), headers=headers)
    start_html.encoding = 'utf-8'
    bsObj = BeautifulSoup(start_html.text, 'html.parser')
    print('Sub-page fetched, trying to process its images')

    img_ind = 1  # index used to name the saved files
    for a_img in bsObj.find("div", {"id": "read_tpc"}).findAll("img"):
        if 'src' in a_img.attrs:
            print('Image URL: ' + a_img.attrs['src'])
            image = download_single_image(a_img.attrs['src'])
            time.sleep(0.3)  # small pause between downloads
            # A non-empty response means the download succeeded; a size check
            # (e.g. len(image.content) > 40000) could also be used to filter out
            # broken, all-black images that carry almost no data.
            if image:
                os.chdir(save_path)
                with open(str(img_ind) + '.jpg', 'ab') as f:
                    print('Image downloaded! Saving image ' + str(img_ind) +
                          ', size ' + str(len(image.content)) + ' bytes')
                    f.write(image.content)
                img_ind += 1
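# Both variants above (and the fragment below) rely on a download_single_image
# helper that is not shown in this section. A minimal sketch of what it is assumed
# to do: fetch the image bytes, retrying a few times, and return the requests
# Response, or None if every attempt fails (names and defaults are assumptions):

import requests

def download_single_image(img_src, retries=3, timeout=10):
    ua = {'User-Agent': 'Mozilla/5.0'}  # assumption: any reasonable User-Agent
    for _ in range(retries):
        try:
            resp = requests.get(img_src, headers=ua, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass  # network error, retry
    return None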
# Fragment from an older download routine. It assumes 'site' (the page URL),
# 'PWD' (output directory), the counter 'PhotoName', the file-name suffix 'c'
# and 'headers' are all defined earlier in that script.
import socket
import urllib.error

try:
    # Originally: Page = urllib.request.urlopen(site). The attributes used below
    # (.encoding / .content) belong to a requests.Response, so fetch with requests.
    Page = requests.get(site, headers=headers)
except UnicodeDecodeError:
    print('-----UnicodeDecodeError url:', site)
except urllib.error.URLError:
    print('-----urlError url:', site)
except socket.timeout:
    print('-----socket timeout:', site)

Coding = Page.encoding
Content = Page.content  # .decode(Coding).encode('utf-8')
ContentSoup = BeautifulSoup(Content, 'html.parser')
jpg = ContentSoup.find_all('img', {'class': 'scrollLoading'})
for photo in jpg:
    PhotoAdd = photo.get('src')
    PhotoName += 1
    Name = str(PhotoName) + c
    # The direct streaming download below was superseded by download_single_image:
    # r = requests.get(PhotoAdd, stream=True)
    # with open(PWD + Name, 'ab') as fd:
    #     for chunk in r.iter_content():
    #         fd.write(chunk)
    print(PhotoAdd)
    image = download_single_image(PhotoAdd)
    # A non-empty response means the download succeeded; a payload this small usually
    # means the image data is broken (all black), i.e. the site has no real data for it.
    if image and len(image.content) > 40:
        os.chdir(PWD)
        with open(str(Name), 'ab') as f:
            print('Image downloaded! Saving image ' + str(Name) +
                  ', size ' + str(len(image.content)) + ' bytes')
            f.write(image.content)
# print("You have downloaded %d photos" % PhotoName)