def down(info):
    """Download one image described by *info* into a per-album folder.

    Expects ``info`` to carry:
      - "imgUrl":    absolute image URL to fetch
      - "pathtitle": album sub-directory name appended to the global ``Path``
      - "title":     file name to save under

    Uses module globals ``Path`` (base download dir), ``threadLock``
    (serialises directory creation / existence checks across worker
    threads) and ``h`` (request-header provider).
    """
    img_url = info["imgUrl"]
    path_title = info["pathtitle"]
    title = info["title"]
    global Path, threadLock
    file_path = "{}{}".format(Path, path_title)
    target = "{}/{}".format(file_path, title)
    # Hold the lock across the check-and-create with a context manager so it
    # is released even if an os call raises (the original's manual
    # acquire/release could leave the lock held on an unexpected exception).
    with threadLock:
        # exist_ok covers both "already there" and the original's
        # mkdir-then-makedirs fallback in a single call.
        os.makedirs(file_path, exist_ok=True)
        already = os.path.exists(target)
    if already:
        print("{}已存在,跳过".format(title))
        return
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
          "下载图片:{},到:{}".format(img_url, path_title))
    resource = requests.get(img_url, headers=h.getHeader(), verify=False)
    with open(target, mode="wb") as fh:
        fh.write(resource.content)
def pages(html):
    """Parse an album page: queue its cover image for download, then walk
    every pagination link and queue each gallery image found there.

    Relies on module globals ``thread_pool`` (executor running ``down``),
    ``passUrl`` (shared list of already-visited URL paths), ``threadLock``
    (guards ``passUrl``) and ``h`` (request-header provider).
    """
    site = "https://www.vmgirls.com"

    def _strip_site(u):
        # BUGFIX: the original used str.lstrip(site), which strips any
        # leading characters found in the *set* of its argument — not the
        # prefix string — and could eat part of the path. Remove the exact
        # prefix instead.
        return u[len(site):] if u.startswith(site) else u

    doc = pq(html)
    next_pages = doc("div.nav-links>a.post-page-numbers")
    path_title = doc("h1.post-title").text()
    cover_url = re.findall("pic: '(.*?)',", html)[0]
    thread_pool.submit(down, {
        "imgUrl": cover_url,
        "pathtitle": path_title,
        "title": cover_url.split("/")[-1],
    })
    for n in next_pages:
        url_temp = pq(n).attr("href")
        try:
            # Already visited?  (Note: like the original, a hit aborts the
            # whole walk, not just this link.)
            if _strip_site(url_temp) in passUrl:
                return
            res = requests.get(url_temp, headers=h.getHeader(), verify=False)
            page_doc = pq(res.text)
            for a in page_doc(".nc-light-gallery").find('a'):
                link = pq(a)
                img_url = link.attr("href")
                thread_pool.submit(down, {
                    "imgUrl": img_url,
                    "pathtitle": link.attr("alt"),
                    "title": img_url.split("/")[-1],
                })
            # Context manager guarantees the lock is released on error.
            with threadLock:
                passUrl.append(_strip_site(url_temp))
        except Exception:
            # Best-effort crawl: one bad page must not abort the album.
            pass
def getUrl(url):
    """Fetch *url* (unless already visited) and hand its HTML to ``filterUrl``.

    ``passUrl`` is the shared visited list; ``threadLock`` guards appends;
    ``h`` supplies request headers.
    """
    site = "https://www.vmgirls.com"
    # BUGFIX: str.lstrip(prefix) strips a character *set*, not the prefix
    # string, so it could remove legitimate leading path characters.
    # Strip the exact site prefix instead.
    temp_url = url[len(site):] if url.startswith(site) else url
    if temp_url in passUrl:
        return
    try:
        # Context manager releases the lock even if append() raises.
        with threadLock:
            passUrl.append(temp_url)
        res = requests.get(url, headers=h.getHeader(), verify=False)
        filterUrl(res.text, url)
    except Exception:
        # Best-effort: a failed fetch must not kill the crawler thread.
        pass
def getUrl(url):
    """Mark *url* visited, fetch it through a rotating proxy, and submit the
    resulting HTML to ``filterUrl`` on the shared thread pool.

    Uses module globals ``passUrl``/``threadLock`` (visited list + guard),
    ``getProxy`` (proxy provider), ``h`` (request headers) and
    ``thread_pool`` (worker executor).
    """
    print("getUrl")
    try:
        # Context manager releases the lock even if append() raises; the
        # original's manual acquire/release could leave the lock held.
        with threadLock:
            passUrl.append(url)
        print("正在请求", url)
        proxies = getProxy()
        res = requests.get(url, headers=h.getHeader(), proxies=proxies,
                           verify=False)
        thread_pool.submit(filterUrl, res.text)
    except Exception:
        # Best-effort: one failed request must not crash the worker.
        pass
def down(info):
    """Download one image into the global ``Path`` directory.

    ``info["imgUrl"]`` is a protocol-relative thumbnail URL; "normal" is
    swapped for "middle" to fetch the larger variant.  ``info["title"]``
    becomes the file name, with the extension taken from the URL.

    Uses module globals ``Path`` (target dir), ``threadLock`` (guards the
    directory check/create) and ``h`` (request headers).
    """
    img_url = ("https:" + info["imgUrl"]).replace("normal", "middle")
    title = info["title"]
    global Path, threadLock
    file_path = "{}/{}.{}".format(Path, title, img_url.split(".")[-1])
    # Context manager releases the lock even if an os call raises (the
    # original's manual acquire/release could stall the other workers).
    with threadLock:
        # exist_ok replaces the original mkdir-then-makedirs fallback.
        os.makedirs(Path, exist_ok=True)
        exists = os.path.exists(file_path)
    if exists:
        print("{}已存在,跳过".format(title))
        return
    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
          "下载图片:{},到:{}".format(title, Path))
    resource = requests.get(img_url, headers=h.getHeader(), verify=False)
    with open(file_path, mode="wb") as fh:
        fh.write(resource.content)
def getProxy():
    """Ask the proxy service for a fresh proxy and return it in the
    ``scheme://ip:port`` form that ``requests``' ``proxies=`` expects."""
    res = requests.get("http://47.106.86.144:8088/getProxy")
    info = json.loads(res.text)
    return f'{info["type"]}://{info["ip"]}:{info["port"]}'


if __name__ == '__main__':
    url = "https://www.hahamx.cn/pic"
    res = requests.get(url, headers=h.getHeader(), verify=False)
    dom = pq(res.text)
    # Derive the total page count: the last numeric pagination label wins.
    page = 0
    for link in dom(".pagination-link").items():
        try:
            page = int(link.text())
        except ValueError:
            # Narrowed from a bare ``except``: only skip non-numeric
            # labels ("next", "…"); anything else should surface.
            pass
    filterUrl(res.text)