Example #1
def down(info):
    imgUrl = info["imgUrl"]
    pathtitle = info["pathtitle"]
    title = info["title"]

    global Path, threadLock

    filePath = "{}{}".format(Path, pathtitle)

    # Serialize directory creation and the existence check across worker threads
    with threadLock:
        if not os.path.exists(filePath):
            os.makedirs(filePath, exist_ok=True)

        # Check whether the file already exists
        alreadyExists = os.path.exists("{}/{}".format(filePath, title))

    if alreadyExists:
        print("{} already exists, skipping".format(title))
        return

    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
          "downloading image: {} to: {}".format(imgUrl, pathtitle))

    resource = requests.get(imgUrl, headers=h.getHeader(), verify=False)
    with open("{}/{}".format(filePath, title), mode="wb") as fh:
        fh.write(resource.content)
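
The snippets on this page rely on module-level state that is defined elsewhere in the crawler: a download root Path, a threading.Lock named threadLock, a ThreadPoolExecutor named thread_pool, a shared visited list passUrl, and a request-header helper h. A minimal sketch of that scaffolding, with the concrete values and the header helper stubbed out as assumptions, could look like this:

import os
import re
import json
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

import requests
from pyquery import PyQuery as pq

Path = "./vmgirls/"              # assumed download root used by down()
threadLock = threading.Lock()    # guards passUrl and the filesystem checks
thread_pool = ThreadPoolExecutor(max_workers=8)
passUrl = []                     # pages already crawled (shared across threads)


class h:
    # Hypothetical stand-in for the project's header helper
    @staticmethod
    def getHeader():
        return {"User-Agent": "Mozilla/5.0"}
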
Example #2
def pages(html):
    doc = pq(html)
    nextPage = doc("div.nav-links>a.post-page-numbers")

    pathTitle = doc("h1.post-title").text()

    # The first image URL is embedded in the page's inline script
    imgurl = re.findall("pic: '(.*?)',", html)[0]
    parts = imgurl.split("/")

    d = {"imgUrl": imgurl, "pathtitle": pathTitle, "title": parts[-1]}
    thread_pool.submit(down, d)

    for n in nextPage:
        urlTemp = pq(n).attr("href")
        try:
            # str.lstrip strips a character set, not a prefix, so use
            # removeprefix (Python 3.9+) to drop the site root.
            pageKey = urlTemp.removeprefix("https://www.vmgirls.com")
            if pageKey in passUrl:
                return

            res = requests.get(urlTemp, headers=h.getHeader(), verify=False)
            doc = pq(res.text)

            # Each <a> inside the gallery container links to a full-size image
            info = doc(".nc-light-gallery").find('a')
            for i in info:
                a = pq(i)
                pathTitle = a.attr("alt")
                imgurl = a.attr("href")
                parts = imgurl.split("/")

                d = {"imgUrl": imgurl, "pathtitle": pathTitle, "title": parts[-1]}
                thread_pool.submit(down, d)

            # Record the page as visited under the lock
            with threadLock:
                passUrl.append(pageKey)

        except Exception:
            pass
Example #3
def getUrl(url):
    # str.lstrip strips a character set, not a prefix; removeprefix
    # (Python 3.9+) keeps the URL path as the de-duplication key.
    tempUrl = url.removeprefix("https://www.vmgirls.com")

    # Check and record the URL under one lock so two threads cannot
    # both decide to crawl the same page.
    with threadLock:
        if tempUrl in passUrl:
            return
        passUrl.append(tempUrl)

    try:
        res = requests.get(url, headers=h.getHeader(), verify=False)
        html = res.text
        filterUrl(html, url)
    except Exception:
        pass
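
The original version of this function (and of pages above) called str.lstrip with the full site root. lstrip treats its argument as a set of characters to remove, not as a prefix, so URL paths whose first letters happen to appear in the site root get mangled. A small illustration (the /special/beauty path is made up just to show the effect):

site = "https://www.vmgirls.com"
print("https://www.vmgirls.com/special/beauty".lstrip(site))        # ecial/beauty
print("https://www.vmgirls.com/special/beauty".removeprefix(site))  # /special/beauty
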
Example #4
def getUrl(url):
    print("getUrl")

    try:
        # Record the URL as visited under the lock
        with threadLock:
            passUrl.append(url)

        print("requesting", url)

        # requests expects proxies as a dict keyed by scheme,
        # while getProxy() returns a single proxy URL string.
        proxy = getProxy()
        proxies = {"http": proxy, "https": proxy}

        res = requests.get(url,
                           headers=h.getHeader(),
                           proxies=proxies,
                           verify=False)
        html = res.text

        thread_pool.submit(filterUrl, html)

    except Exception:
        pass
Example #5
def down(info):
    # The page uses protocol-relative URLs; swap the "normal" size for "middle"
    imgUrl = ("https:" + info["imgUrl"]).replace("normal", "middle")
    title = info["title"]

    global Path, threadLock

    # Serialize directory creation and the existence check across worker threads
    with threadLock:
        if not os.path.exists(Path):
            os.makedirs(Path, exist_ok=True)

        filePath = "{}/{}.{}".format(Path, title, imgUrl.split(".")[-1])

        # Check whether the file already exists
        alreadyExists = os.path.exists(filePath)

    if alreadyExists:
        print("{} already exists, skipping".format(title))
        return

    print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
          "downloading image: {} to: {}".format(title, Path))

    resource = requests.get(imgUrl, headers=h.getHeader(), verify=False)
    with open(filePath, mode="wb") as fh:
        fh.write(resource.content)
Example #6

def getProxy():
    # Ask the proxy-pool service for one proxy and format it as a proxy URL
    res = requests.get("http://47.106.86.144:8088/getProxy")

    info = json.loads(res.text)

    proxy = f'{info["type"]}://{info["ip"]}:{info["port"]}'
    return proxy


if __name__ == '__main__':
    # getProxy()

    url = "https://www.hahamx.cn/pic"
    res = requests.get(url, headers=h.getHeader(), verify=False)
    dom = pq(res.text)
    ls = dom(".pagination-link").items()

    page = 0

    # Get the total page count from the last numeric pagination link
    for i in ls:
        try:
            page = int(i.text())
        except ValueError:
            pass

    filterUrl(res.text)
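
One thing to note about getProxy: it returns a single proxy URL string, but requests expects its proxies argument to be a dict mapping scheme to proxy URL, so the returned string is typically wrapped before being passed to requests.get. A short usage sketch with an illustrative proxy address:

import requests

proxy = "http://1.2.3.4:8080"               # illustrative value in the shape getProxy() returns
proxies = {"http": proxy, "https": proxy}   # route both HTTP and HTTPS through the proxy

res = requests.get("https://www.hahamx.cn/pic", proxies=proxies, timeout=10)
print(res.status_code)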