def download_image(url):
    """Download the image at *url* into the local ``images`` directory.

    The destination file name is the last '/'-separated segment of the URL.
    """
    # Py3: urlretrieve lives in urllib.request (urllib.urlretrieve is Py2-only).
    import urllib.request

    # Derive the file name from the URL's final path component.
    filename = url.split('/')[-1]
    path = os.path.join('images', filename)
    urllib.request.urlretrieve(url, filename=path)
def customer():
    """Consumer loop: pop face URLs off the shared ``FACE_URL_LIST`` and save
    each image into the local ``images`` directory.

    Runs forever. Access to the shared list is guarded by the global ``gLock``.
    NOTE(review): this busy-waits while the list is empty — a ``queue.Queue``
    or ``threading.Condition`` in the producer/consumer setup would be kinder
    to the CPU; left as-is to preserve the original control flow.
    """
    # Py3: urlretrieve lives in urllib.request (urllib.urlretrieve is Py2-only).
    import urllib.request

    while True:
        gLock.acquire()
        if not FACE_URL_LIST:
            gLock.release()
            continue
        face_url = FACE_URL_LIST.pop()
        gLock.release()
        # File name = last path component of the URL.
        filename = face_url.split('/')[-1]
        path = os.path.join('images', filename)
        urllib.request.urlretrieve(face_url, filename=path)
def NARRdload(bdate, hr, filedir):
    """Download NARR 'narr-a' GRIB files from the NOAA NOMADS archive.

    Parameters
    ----------
    bdate : sequence of str
        Dates as 'YYYYMMDD' strings.
    hr : str
        Two-digit hour of day, e.g. '03' (the file stamp becomes '0300').
    filedir : str
        Local destination directory; created if it does not exist.

    Returns
    -------
    list of str
        Local path ('filedir/fname') for every requested file, whether or
        not it had to be downloaded on this call.
    """
    # urllib3 has no urlretrieve; the stdlib urllib.request does.
    import urllib.request

    if not os.path.isdir(filedir):
        os.makedirs(filedir)
        print('create folder: {}'.format(filedir))  # message typo 'foler' fixed
    flist = []
    for i, day in enumerate(bdate):
        webdir = day[0:6]  # YYYYMM selects the monthly directory on the server
        fname = 'narr-a_221_%s_%s00_000.grb' % (day, hr)
        flist.append('%s/%s' % (filedir, fname))
        weburl = 'http://nomads.ncdc.noaa.gov/data/narr/%s/%s/%s' % (
            webdir, day, fname)
        dname = '%s/%s' % (filedir, fname)
        print('Downloading %d of %d: %s' % (i + 1, len(bdate), fname))
        # Skip files already on disk so re-runs fetch only what is missing.
        if not os.path.exists(dname):
            urllib.request.urlretrieve(weburl, dname)
    return flist
from urllib.parse import parse_qs, urlparse

import csv
import webbrowser

import requests
from bs4 import BeautifulSoup, SoupStrainer

# Look up each parcel serial number from the CSV against the Utah County
# land-records site, then print the 'Mailing Address:' cell of the last
# successfully fetched page.
#
# NOTE(review): the original code called enumerate(serials, i) with an
# undefined `i`, passed the arguments to the (nonexistent) urllib3.urlretrieve
# in the wrong order, and swallowed every error with a bare `except`. The
# rewrite treats each CSV line as one serial number and fetches with requests.
response = None
with open('../ExtractedNumbers.csv', 'r') as serials:
    fieldnames = ['Serial_Number', 'Owner']
    for line in serials:
        serial = line.strip()
        if not serial:
            continue  # skip blank lines
        try:
            response = requests.get(
                "http://www.utahcounty.gov/LandRecords/Property.asp?av_serial="
                + serial)
        except requests.RequestException:
            # Best-effort: a failed lookup should not abort the whole run.
            continue
# `with` closes the file; the explicit serials.close() was redundant.

print('Done')

if response is not None:
    # Parse only <td> cells — much cheaper than parsing the full document.
    only_td_tags = SoupStrainer("td")
    soup = BeautifulSoup(response.text, "html.parser", parse_only=only_td_tags)
    targetCell = soup.find(text="Mailing Address:")
    print(targetCell.parent.parent.text)
# Persist one user's scraped text posts, collected image URLs, and images.
# NOTE(review): `user_id`, `result`, `urllist_set`, `word_count` and
# `image_count` are defined earlier in the full script — not visible here.
# urllib3 has no urlretrieve/urlopen helpers; use the stdlib urllib.request.
import urllib.request

with open("/Users/Personals/%s" % user_id, "wb") as fo:
    fo.write(result)  # assumes `result` is bytes — TODO confirm
word_path = os.getcwd() + '/%d' % user_id
print(u'文字微博爬取完毕')

# Write every collected image URL, one per line. Opened in text mode because
# we are writing str (the original wrote str to a 'wb' handle — a Py2-ism).
with open("/Users/Personals/%s_imageurls" % user_id, "w") as fo2:
    fo2.write("".join(eachlink + "\n" for eachlink in urllist_set))
print(u'图片链接爬取完毕')

if not urllist_set:
    print(u'该页面中不存在图片')
else:
    # Download the images into ./weibo_image, numbered 1.jpg, 2.jpg, ...
    image_path = os.getcwd() + '/weibo_image'
    if not os.path.exists(image_path):
        os.mkdir(image_path)
    x = 1
    for imgurl in urllist_set:
        temp = image_path + '/%s.jpg' % x
        print(u'正在下载第%s张图片' % x)
        try:
            # urlretrieve follows redirects itself, so the original
            # urlopen().geturl() indirection is unnecessary.
            urllib.request.urlretrieve(imgurl, temp)
        except Exception:
            # Best-effort: report the failed URL and keep going.
            print(u"该图片下载失败:%s" % imgurl)
        x += 1

print(u'原创微博爬取完毕,共%d条,保存路径%s' % (word_count - 4, word_path))
print(u'微博图片爬取完毕,共%d张,保存路径%s' % (image_count - 1, image_path))
# Scrape the images from a page and save them to the local disk.
# urllib3 has no urlretrieve; the stdlib urllib.request does.
import urllib.request


def getHtml(url):
    """Fetch *url* with urllib3 and return the response body decoded as UTF-8."""
    http = urllib3.PoolManager()
    request = http.request('GET', url)
    html = str(request.data, encoding="utf-8")
    return html


def getImg(html):
    """Return all .jpg URLs in *html* matched by the page-specific regex."""
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    return imglist


html = getHtml("http://tieba.baidu.com/p/2460150866")
# print(html)
imglist = getImg(html)
print(imglist)
# Save each image locally as 0.jpg, 1.jpg, ...
for x, imgurl in enumerate(imglist):
    urllib.request.urlretrieve(imgurl, '/Applications/MAMP/image/%s.jpg' % x)
# Search Google Images for a word with Selenium and save the first thumbnail.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# urllib3 has no urlretrieve; the stdlib urllib.request does.
import urllib.request

driver = webdriver.Chrome('chromedriver.exe')

word = "coffee"
url = "http://images.google.com/search?q=" + word + "&tbm=isch&sout=1"
driver.get(url)

# XPath of the first result thumbnail in the image grid.
imageXpathSelector = '//*[@id="islrg"]/div[1]/div[1]/a[1]/div[1]/img'
img = driver.find_element_by_xpath(imageXpathSelector)

src = img.get_attribute('src')
# NOTE(review): thumbnail `src` may be a data: URI — urlretrieve handles
# that scheme, but confirm this is the intended image source.
urllib.request.urlretrieve(src, word + ".jpg")
driver.close()