Example #1
import os
import re

import requests
from bs4 import BeautifulSoup


def Process_SubPage(save_path, img_url):
    # Fetch the sub-page. Plain HTML requests are rarely blocked, so there
    # is no need to rotate headers or IPs here (unlike image requests).
    start_html = requests.get(get_inner_link(img_url), headers=headers)
    start_html.encoding = 'utf-8'
    bsObj = BeautifulSoup(start_html.text, 'html.parser')  # parsed page data

    # Collect torrent links from the post body. re.match anchors at the
    # start of the string: it returns a Match object only when the pattern
    # matches from the very first character, otherwise None.
    for a in bsObj.find("div", {"id": "read_tpc"}).findAll("a"):
        if ('href' in a.attrs) and ('title' not in a.attrs):
            if re.match(r'http://www3.uptorrentfilespacedownhostabc', a.attrs['href']):
                url2 = a.attrs['href']
                get_torrent(url2, save_path)

    img_ind = 1  # running index, used as the file name
    for a_img in bsObj.find("div", {"id": "read_tpc"}).findAll("img"):  # process images
        if 'src' in a_img.attrs:
            image = download_single_image(a_img.attrs['src'])
            # A non-empty response means the download succeeded; a very
            # small payload usually means broken (all-black) image data,
            # typically because the image really is missing on the site.
            if image and len(image.content) > 40:
                os.chdir(save_path)
                f = open(str(img_ind) + '.jpg', 'ab')
                print('Got image! Saving image ' + str(img_ind) +
                      ', size ' + str(len(image.content)))
                f.write(image.content)
                f.close()
            img_ind += 1
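Both versions of Process_SubPage rely on helpers defined elsewhere in the scraper (headers, get_inner_link, download_single_image, get_torrent). A minimal sketch of the two downloaders, assuming download_single_image is a small retry wrapper around requests.get and get_torrent simply saves the response body; only the names and call signatures come from the examples above, everything else is an assumption:

import os

import requests


# Hypothetical sketch of download_single_image: a retry wrapper around
# requests.get; the retry count and timeout are assumptions.
def download_single_image(url, retries=3, timeout=10):
    for _ in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp  # callers read resp.content
        except requests.RequestException:
            pass  # retry on transient network errors
    return None  # a falsy result signals a failed download


# Hypothetical sketch matching the get_torrent(url2, save_path) call above.
def get_torrent(url, save_path):
    resp = requests.get(url, timeout=10)
    if resp.status_code == 200:
        name = url.rstrip('/').split('/')[-1] or 'download.torrent'
        with open(os.path.join(save_path, name), 'wb') as f:
            f.write(resp.content)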
Example #2
import os
import time

import requests
from bs4 import BeautifulSoup


def Process_SubPage(save_path, img_url):
    start_html = requests.get(
        get_inner_link(img_url),
        headers=headers)  # plain HTML requests are rarely blocked, so no need to rotate headers or IPs
    start_html.encoding = 'utf-8'
    bsObj = BeautifulSoup(start_html.text, 'html.parser')
    print('Sub-page loaded, starting to process images')
    img_ind = 1  # running index, used as the file name
    for a_img in bsObj.find("div", {"id": "read_tpc"}).findAll("img"):  # process images
        if 'src' in a_img.attrs:
            print('Image URL: ' + a_img.attrs['src'])
            image = download_single_image(a_img.attrs['src'])
            time.sleep(0.3)  # throttle between downloads

            # if image and len(image.content) > 40000:  # non-empty means success; a tiny payload usually means broken (all-black) or missing image data
            if image:
                os.chdir(save_path)
                f = open(str(img_ind) + '.jpg', 'ab')
                print('Got image! Saving image ' + str(img_ind) +
                      ', size ' + str(len(image.content)))
                f.write(image.content)
                f.close()
            img_ind += 1
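A possible call site for the function above; the thread URL is a placeholder and the directory handling is an assumption (note that Process_SubPage changes the working directory via os.chdir as a side effect):

import os

save_path = os.path.join(os.getcwd(), 'images')
os.makedirs(save_path, exist_ok=True)  # hypothetical target directory
Process_SubPage(save_path, 'http://example.com/htm_data/some_thread.html')  # placeholder URL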
Example #3
    # Fetch the page whose content we want to scrape; the original urllib
    # variant is kept for reference:
    # Page = urllib.request.urlopen(site)
    try:
        Page = requests.get(site)
    except UnicodeDecodeError as e:
        print('-----UnicodeDecodeError url:', site)
        return
    except urllib.error.URLError as e:
        print("-----urlError url:", site)
        return
    except socket.timeout as e:
        print("-----socket timeout:", site)
        return

    Coding = Page.encoding
    Content = Page.content  # .decode(Coding).encode('utf-8')
    ContentSoup = BeautifulSoup(Content, 'html.parser')
    jpg = ContentSoup.find_all('img', {'class': 'scrollLoading'})  # lazy-loaded images
    for photo in jpg:
        PhotoAdd = photo.get('src')
        PhotoName += 1
        Name = str(PhotoName) + c  # c is presumably the file extension suffix, e.g. '.jpg'
        print(PhotoAdd)
        # Streaming alternative, kept from the original:
        # r = requests.get(PhotoAdd, stream=True)
        # with open(PWD + Name, 'ab') as fd:
        #     for chunk in r.iter_content():
        #         fd.write(chunk)
        image = download_single_image(PhotoAdd)
        # Non-empty means the download succeeded; a very small payload
        # usually means broken (all-black) or missing image data.
        if image and len(image.content) > 40:
            os.chdir(PWD)
            f = open(str(Name), 'ab')
            print('Got image! Saving image ' + str(Name) + ', size ' + str(len(image.content)))
            f.write(image.content)
            f.close()
        # print("You have downloaded %d photos" % PhotoName)
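Example #3 keeps a streaming variant only as a comment. For large images, streaming with iter_content avoids buffering the whole file in memory; a self-contained sketch of that approach (the function name, chunk size, and timeout are assumptions):

import requests


def stream_image(url, dest_path, chunk_size=8192, timeout=10):
    # Hypothetical helper: write the response body to disk chunk by
    # chunk instead of holding the whole image in memory.
    r = requests.get(url, stream=True, timeout=timeout)
    r.raise_for_status()  # surface HTTP errors instead of saving garbage
    with open(dest_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)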