Example No. 1
def indexgeter(qi):
    TargetTag = False
    indexlist = []
    # Read the position reached in the previous run from the log
    lastpage, lastgid, lasttoken = getlastindex()
    lastpage = int(lastpage)
    print('Fetching a proxy for the API crawler')
    APIProxy = getIP()
    # Reuse the API proxy to find the page that holds the last entry
    # processed in the previous run
    if lasttoken is not None:
        lastpage = findPage(lastpage, lastgid, lasttoken, APIProxy)
    print("Starting the index crawler")
    # Resume from the page and entry reached last time
    geter = getindex(lastpage=lastpage, token=lasttoken)
    while True:
        # Fetch the next page once the index queue drops below 5 items
        if qi.qsize() < 5:
            # Only refill when the index list is empty
            if len(indexlist) == 0:
                try:
                    # Fetch the index
                    indexlist = geter.getlist()
                except GetIndexError as e:
                    print(e)
                    break
            try:
                # Call the API crawler
                APIdata = getAPIdata(indexlist, APIProxy)
                # Build the list of value objects
                dataOVlist = datadump(APIdata)
                # Feed the value objects into the queue
                for dataOV in dataOVlist:
                    if dataOV.getindex()[1] == TARGET[1]:
                        TargetTag = True
                        break
                    qi.put(dataOV)
                # Reset the lists
                indexlist = []
                dataOVlist = []
            except BanIPError:
                print("The API crawler's proxy was banned; switching proxies")
                APIProxy = getIP()
            except APIError as e:
                print(e)
                break
        if TargetTag:
            print("Reached the target position; index generator stopping")
            break
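
The helpers getlastindex() and findPage() are defined elsewhere. Judging by the "lastpage&index.txt" file that __read_html (Example No. 15) writes, the log is a single "page,gid,token" line, so getlastindex() is presumably something like the following sketch; the file name comes from Example No. 15, the body is an assumption:

import os

def getlastindex(path="lastpage&index.txt"):
    # Hypothetical reader for the "lastpage,gid,token" line written by
    # __read_html (Example No. 15); returns strings, so the caller
    # converts lastpage with int() as above
    if not os.path.exists(path):
        return 0, None, None
    with open(path) as f:
        lastpage, lastgid, lasttoken = f.read().strip().split(",")
    return lastpage, lastgid, lasttoken
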
Example No. 2
def findPage(lastpage, lastindex, proxy):
    excookies = requests.utils.cookiejar_from_dict(COOKIE_DICT,
                                                   cookiejar=None,
                                                   overwrite=True)
    ehheaders = {
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.7, ja; q=0.3',
        'Connection': 'Keep-Alive',
        'Host': 'exhentai.org',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    }
    print("Looking for the position reached in the previous run")
    while True:
        try:
            html = requests.get('https://exhentai.org/?page=' + str(lastpage),
                                headers=ehheaders,
                                cookies=excookies,
                                proxies=proxy).text
            if 'Archive Download' not in html and 'IP' in html:
                # Got a ban page instead of the gallery list: switch proxies
                proxy = getIP()
            elif lastindex[1] not in html:
                lastpage += 1
                continue
            else:
                # Found the page containing the last processed entry
                return lastpage
        except BaseException:
            # Network or proxy hiccup: retry the same page
            pass
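
One inconsistency worth flagging: this snippet feeds getIP()'s return value straight into requests' proxies argument, while other snippets (getindex.__init__ in Example No. 5, __read_html in Example No. 15) treat it as an [ip, port] pair and build the mapping themselves. Presumably one version of getIP() already returns a mapping; the conversion pattern used elsewhere is, as a minimal sketch with an illustrative address:

import requests

# [ip, port] pair in the format getIP() returns elsewhere in these snippets
IPandport = ['104.196.177.247', 80]
# requests-style proxies mapping, as built in getindex.__init__ below
proxies = {"https": "http://%s:%s" % (IPandport[0], str(IPandport[1]))}
html = requests.get('https://exhentai.org/', proxies=proxies).text
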
Example No. 3
def go(no=1):
    IPandport = ['104.196.177.247', 80]
    indexs = getindex(no).geter()
    proxycount = 0
    count = 0
    try:
        for index in indexs:
            while True:
                try:
                    id = index[2]
                    data = getfav_rat(index[0], index[1], IPandport)
                    writedata(data[0], data[1], data[2], id)
                    sleep(0.5)
                    count += 1
                    no += 1
                    proxycount = 0
                    break
                except ProxyError:
                    proxycount += 1
                    sleep(5)
                except ExPandaError:
                    raise
                except BanIPError:
                    IPandport = getIP()
                    proxycount = 0
                if proxycount >= 5:
                    IPandport = getIP()
                    proxycount = 0
            # Pause for ten seconds after every 100 records
            if count == 100:
                sleep(10)
                count = 0
    except BaseException as e:
        print('An unknown error occurred')
        # On error, log the traceback plus the current no, id and proxy
        logging.exception(e)
        print(no)
        print(id)
        print(IPandport)
        remind('Program terminated abnormally')
Example No. 4
def changeproxies():
    try:
        IPandport = getIP()
    except (IPProxyPoolRunError, IPPoolEmpError):
        # Fall back to my own proxy when the pool is down or empty
        IPandport = MyIPandport
    if IPandport[0] in banedIPlist and IPandport[0] == '144.168.63.75':
        print('No usable IPs left')
        raise NOIPError
    return IPandport
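
The project-specific exceptions used throughout these snippets (NOIPError, BanIPError, IPPoolEmpError, IPProxyPoolRunError, ProxyInvaError, and so on) are never defined here; presumably they are plain Exception subclasses along these lines, shown as a sketch rather than the original definitions:

class NOIPError(Exception):
    """No usable proxy IP is left."""

class BanIPError(Exception):
    """The current proxy IP has been banned by the server."""

class IPPoolEmpError(Exception):
    """The IP pool is empty."""

class IPProxyPoolRunError(Exception):
    """The IPProxyPool service is not running."""

class ProxyInvaError(Exception):
    """The current proxy failed a liveness test."""
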
Example No. 5
    def __init__(self, lastpage=0, token=None):
        self.__lastpage = lastpage  # current page number
        self.__token = token  # last entry of the previous page
        self.__excookies = requests.utils.cookiejar_from_dict(
            config.COOKIE_DICT, cookiejar=None, overwrite=True)  # load cookies
        self.__IPandport = getIP()
        self.__proxies = {
            "https":
            "http://%s:%s" % (self.__IPandport[0], str(self.__IPandport[1]))
        }
        print("Index crawler started")
Example No. 6
    def __open_next(self):
        ErrorCount = 0
        while True:
            try:
                return self.__open_Ex(self.__excookies,
                                      exurl='https://exhentai.org/?page=' +
                                      str(self.__lastpage))
            except ExOpenError:
                print("The proxy or the Ex server is acting up; retrying in 5 seconds")
                ErrorCount += 1
                sleep(5)
            except BanIPError:
                print()
                print("The index crawler's IP was banned; switching IPs")
                self.__IPandport = getIP()
                self.__proxies = {
                    "https":
                    "http://%s:%s" %
                    (self.__IPandport[0], str(self.__IPandport[1]))
                }
            except BaseException as e:
                print("An unknown error occurred; retrying in 5 seconds")
                print("Error output:", e.__str__())
                ErrorCount += 1
                sleep(5)
            if ErrorCount >= 10:
                print("Index fetch failed more than 10 times; switching IPs")
                self.__IPandport = getIP()
                self.__proxies = {
                    "https":
                    "http://%s:%s" %
                    (self.__IPandport[0], str(self.__IPandport[1]))
                }
Example No. 7
def reProcess(qi, qd, qe, n):
    if n < 3:
        try:
            IPandport = getIP()
            print("Got a new usable IP; starting a process")
            Process(target=dataget, args=(qi, qd, qe, IPandport)).start()
            # Increment n only when a new process was successfully started
            n += 1
        except IPPoolEmpError:
            pass
        except IPProxyPoolRunError:
            print("IPProxyPoolRunError")
    return n
Example No. 8
def changeip(IPandport):
    i = 0
    while i < 5:
        try:
            # Keep using the current proxy if it still works
            testIP(IPandport)
            return IPandport
        except ProxyInvaError:
            i += 1
    try:
        # The current proxy no longer works; switch IPs
        return getIP()
    except IPPoolEmpError:
        print("IP pool is empty; using the backup IP to finish assigned work")
        raise
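
testIP() is not shown either. A plausible sketch, assuming it probes the target site through the proxy and maps any request failure to ProxyInvaError; the probe URL and timeout are assumptions:

import requests

def testIP(IPandport, timeout=5):
    # Hypothetical liveness probe for an [ip, port] proxy pair
    proxies = {"https": "http://%s:%s" % (IPandport[0], str(IPandport[1]))}
    try:
        requests.get('https://exhentai.org/', proxies=proxies, timeout=timeout)
    except requests.exceptions.RequestException:
        raise ProxyInvaError
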
Example No. 9
def reProcess(qip, n):
    if n < 20:
        while True:
            print("Trying to restart crawler threads")
            try:
                IPandport = getIP()
                print("Got a new usable IP")
                n += 1
                qip.put(IPandport)
                if n >= 10:
                    break
            except IPPoolEmpError:
                print("IP pool is empty")
                break
            except IPProxyPoolRunError:
                print("IPProxyPoolRunError")
                break
    return n
Example No. 10
def reProcess(qip, n):
    # If fewer threads than the configured maximum are actually running,
    # the error-handling process will keep trying to fetch new IPs,
    # so don't set the thread count too high
    if n < THREAD_MAX:
        while True:
            print("Trying to restart crawler threads")
            try:
                IPandport = getIP()
                print("Got a new usable IP")
                n += 1
                qip.put(IPandport)
                if n >= 10:
                    break
            except IPPoolEmpError:
                print("IP pool is empty")
                break
            except IPProxyPoolRunError:
                print("IPProxyPoolRunError")
                break
    return n
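
The error_handing thread that calls reProcess is never shown. Given the [px, data] records the crawler threads push onto the error queue and the thread arguments in the __main__ block (Example No. 17), it presumably looks roughly like this sketch; the body is an assumption:

def error_handing(qdata, qerror, qip, n):
    # qdata is part of the thread's args in Example No. 17 but unused here
    while True:
        # [px, data] pushed by a crawler thread that hit an unhandled error
        errordata = qerror.get()
        print("A crawler thread died:", errordata)
        n -= 1  # one fewer live crawler thread
        # Try to hand out fresh proxies so new threads can start
        n = reProcess(qip, n)
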
Example No. 11
def __dataget(qi, qd, qe, px, n):
    lock = threading.Lock()
    count = 0
    try:
        ProxyErrorCount = 0
        while True:
            if qi.empty():
                sleep(1)
            else:
                while True:
                    with lock:
                        index = qi.get()
                    id = index[2]
                    try:
                        data = getfav_rat(index[0], index[1], px)
                        sleep(0.5)
                        data.append(id)
                        with lock:
                            qd.put(data)
                        count += 1
                        ProxyErrorCount = 0
                    except ProxyError:
                        ProxyErrorCount += 1
                        sleep(10)
                    if ProxyErrorCount >= 10:
                        # Switch IPs after ten consecutive proxy errors
                        print("Switching IP")
                        px = getIP()
                        ProxyErrorCount = 0
                    if count >= 100:
                        sleep(10)
    # On any unhandled error (no proxies left, sad panda, ...), push the
    # current proxy and current data onto the error queue
    except BaseException as e:
        print(e.__str__())
        n -= 1
        errordata = [px, index, id]
        qe.put(errordata)
Example No. 12
def dataget(qi, qd, qe, px):
    count = 0
    try:
        ProxyErrorCount = 0
        while True:
            if qi.empty():
                sleep(1)
            else:
                while True:
                    index = qi.get()
                    id = index[2]
                    try:
                        data = getfav_rat(index[0], index[1], px)
                        sleep(0.5)
                        data.append(id)
                        qd.put(data)
                        print(data)
                        count += 1
                        ProxyErrorCount = 0
                    except IsExHon:
                        print("This gallery is Ex-only; skipping")
                        break
                    except ProxyError:
                        ProxyErrorCount += 1
                        sleep(10)
                    if ProxyErrorCount >= 10:
                        # Switch IPs after ten consecutive proxy errors
                        print("Switching IP")
                        px = getIP()
                        ProxyErrorCount = 0
                    if count >= 100:
                        sleep(10)
    # On any unhandled error (no proxies left, sad panda, ...), push the
    # current proxy and current data onto the error queue
    except BaseException as e:
        print(e.__str__())
        # Record the current proxy, the item in progress, and its id
        errordata = [px, index, id]
        qe.put(errordata)
Example No. 13
def __dataget(qi, qd, qe, px, n):
    count = 0
    try:
        ProxyErrorCount = 0
        while True:
            if qi.empty():
                sleep(1)
            else:
                while True:
                    data = qi.get()
                    try:
                        index = data.getindex()
                        webdata = getfav_rat(index[0], index[1], px)
                        data.update(favorited=webdata['favorited'],
                                    ratings=webdata['ratings'],
                                    elanguage=webdata['elanguage'],
                                    title_jpn=webdata['title_jpn'])
                        sleep(0.5)
                        qd.put(data)
                        count += 1
                        ProxyErrorCount = 0
                    # When crawling Ex galleries, every 404 comes from the
                    # proxy server
                    except ProxyError:
                        ProxyErrorCount += 1
                        sleep(10)
                    if ProxyErrorCount >= 10:
                        # Switch IPs after ten consecutive proxy errors
                        print("Switching crawler IP")
                        px = getIP()
                        ProxyErrorCount = 0
                    if count >= 100:
                        sleep(10)
    # On any unhandled error (no proxies left, sad panda, ...), push the
    # current proxy and current data onto the error queue
    except BaseException as e:
        print("Unknown error in crawler process:", e.__str__())
        n -= 1
        errordata = [px, data]
        qe.put(errordata)
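
The value objects flowing through the queues expose getindex() and update(**kwargs) (see indexgeter in Example No. 1 and the update() call above). A minimal sketch of such a class, assuming the index is a (gid, token) pair; the class name and internals are hypothetical:

class DataOV:
    """Hypothetical value object matching the getindex()/update() usage."""

    def __init__(self, gid, token):
        self.__index = (gid, token)
        self.__fields = {}

    def getindex(self):
        # (gid, token); indexgeter compares getindex()[1] against TARGET[1]
        return self.__index

    def update(self, **kwargs):
        # Merge in favorited / ratings / elanguage / title_jpn, etc.
        self.__fields.update(kwargs)
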
Example No. 14
from multiprocessing import Process, Queue
from Proxy.IPPool import getIP
import threading
'''
Postscript: the index, data-processing and error-handling workers run as
three threads inside the main process, while the crawlers run as child
processes. In practice nothing actually controls the crawler processes,
so running outside PyCharm may leave zombie processes.
Also, each crawler process is still single-process, single-threaded, which
is very inefficient for heavily IO-bound work like crawling.
Truth be told, I only wrote the multithreaded version because my computer's
fan was about to explode..
'''

if __name__ == '__main__':
    # Index queue
    qindex = Queue()
    # Data queue
    qdata = Queue()
    # Error queue
    qerror = Queue()
    threading.Thread(target=index_get, args=(qindex, 197078)).start()
    threading.Thread(target=read, args=(qdata, )).start()
    n = 0
    # Start at most four crawler processes
    while n < 4:
        try:
            Process(target=dataget,
                    args=(qindex, qdata, qerror, getIP())).start()
            n += 1
        except BaseException:
            break
    threading.Thread(target=error_handing,
                     args=(qindex, qdata, qerror, n)).start()
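
One way to address the zombie-process concern raised in the postscript: keep the Process handles and reap them at shutdown. A sketch of how the startup loop above could do that bookkeeping; the join-at-exit policy is an assumption, and the original code discards the handles:

procs = []
while n < 4:
    try:
        p = Process(target=dataget, args=(qindex, qdata, qerror, getIP()))
        p.start()
        procs.append(p)
        n += 1
    except BaseException:
        break
# ... start the remaining threads as above, then reap the crawler
# processes (they return after pushing an error record) before exiting
for p in procs:
    p.join()
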
Example No. 15
    def __read_html(self):
        hlist = []
        ErrorCount = 0
        while True:
            try:
                bsobj = BeautifulSoup(self.__open_next(), 'html.parser')
                table = bsobj.find('table', {'class': 'itg'})
                for link in table.findAll(
                        'a',
                        href=re.compile(
                            r'https://exhentai\.org/g/[0-9]{1,8}/[A-Za-z0-9]{10}/'
                        )):
                    if 'href' in link.attrs:
                        hlist.append(self.__parse_html(link.attrs['href']))
                # Check whether the last entry of the previous page got
                # pushed onto this page; if so, trim the index list
                if self.__token in hlist:
                    lastindex = hlist.index(self.__token)
                    hlist = hlist[lastindex + 1:]
                # Update the last-entry marker
                self.__token = hlist[-1]
                # Persist the last-entry marker next to the config module
                rootpath = os.path.abspath(config.__file__)
                lastindexpath = os.path.join(os.path.dirname(rootpath),
                                             "lastpage&index.txt")
                with open(lastindexpath, 'w') as f:
                    f.write(
                        str(self.__lastpage) + "," + str(self.__token[0]) +
                        ',' + self.__token[1])
                self.__lastpage += 1
                return hlist
            except BaseException as e:
                print("Unknown error in the list crawler; retrying")
                print("Error info:", e.__str__())
                rootpath = os.path.abspath(config.__file__)
                logpath = os.path.join(os.path.dirname(rootpath), "Log.txt")
                with open(logpath, 'a') as f:
                    f.write("List crawler error: " + e.__str__())
                ErrorCount += 1
            if ErrorCount > 5:
                self.__IPandport = getIP()
                self.__proxies = {
                    "https":
                    "http://%s:%s" %
                    (self.__IPandport[0], str(self.__IPandport[1]))
                }
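
__parse_html is not shown. Judging by the gallery-URL regex above and the "page,gid,token" log line, each list entry is presumably a (gid, token) pair pulled from URLs of the form https://exhentai.org/g/<gid>/<token>/, roughly as in this sketch:

    def __parse_html(self, href):
        # Hypothetical: split https://exhentai.org/g/<gid>/<token>/
        # into its gid and token components
        parts = href.rstrip('/').split('/')
        return (parts[-2], parts[-1])
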
Example No. 16
def go(id=1, IPandport=MyIPandport):
    i = 0
    # Index counter
    indexcount = 0
    # Batch of indexes for the JSON request
    indexlist = []
    # Proxy error counter
    proxycount = 0
    IPandport = getIP()
    jsonStr = {"method": "gdata", "gidlist": indexlist, "namespace": 1}
    geter = getindex(id).geter()
    try:
        for index in geter:
            indexlist.append([index[0], index[1]])
            indexcount += 1
            # Retry the batch of 25 until it is written successfully
            while indexcount == 25:
                try:
                    datajson = getdata(jsonStr, IPandport)
                    id = writedata(datajson, id)
                    print('Wrote up to %s' % (id - 1))
                    i += 1
                    indexcount = 0  # reset the index counter on success
                    indexlist = []  # reset the index list on success
                    jsonStr = {
                        "method": "gdata",
                        "gidlist": indexlist,
                        "namespace": 1
                    }
                    proxycount = 0  # reset the proxy error counter on success
                except requests.exceptions.ProxyError as e:
                    proxycount += 1
                    sleep(10)
                    print(e.__str__())
                except BanIPError:
                    print('Proxy %s has been banned; switching proxies' %
                          (IPandport[0]))
                    banedIPlist.append(IPandport[0])
                    IPandport = changeproxies()
                    proxycount = 0  # reset the proxy error counter
                except ConnectionResetError:
                    print('Proxy %s has been banned; switching proxies' %
                          (IPandport[0]))
                    banedIPlist.append(IPandport[0])
                    IPandport = changeproxies()
                    proxycount = 0  # reset the proxy error counter
                if proxycount == 3:
                    print('Current proxy failed three times in a row; '
                          'switching proxies')
                    IPandport = changeproxies()
                    proxycount = 0  # reset the proxy error counter

            if i == 5:
                sleep(6)
                i = 0

        # Flush any leftover partial batch
        if len(indexlist) != 0 and len(indexlist) != 25:
            try:
                datajson = getdata(jsonStr, IPandport)
                id = writedata(datajson, id)
                indexcount = 0  # reset the index counter on success
                indexlist = []  # reset the index list on success
                jsonStr = {
                    "method": "gdata",
                    "gidlist": indexlist,
                    "namespace": 1
                }
                proxycount = 0  # reset the proxy error counter on success
            except requests.exceptions.ProxyError as e:
                proxycount += 1
                print(e.__str__())
            except BanIPError:
                banedIPlist.append(IPandport[0])
                IPandport = changeproxies()
                proxycount = 0  # reset the proxy error counter
            if proxycount == 3:
                print('Current proxy failed three times in a row; '
                      'switching proxies')
                IPandport = changeproxies()
                proxycount = 0  # reset the proxy error counter

    except BaseException as e:
        print('An unknown error occurred')
        logging.exception(e)
        print(id)
        print(jsonStr)
        remind('Program terminated abnormally')
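
getdata() is not shown, but the jsonStr payload matches the public E-Hentai gdata API. A sketch of what it presumably does, assuming the standard api.php endpoint and a plain POST through the proxy; the real getdata() also raises BanIPError, which this sketch omits:

import requests

def getdata(jsonStr, IPandport):
    # Hypothetical: POST the gdata batch to the E-Hentai API via the proxy
    proxies = {"https": "http://%s:%s" % (IPandport[0], str(IPandport[1]))}
    r = requests.post('https://api.e-hentai.org/api.php',
                      json=jsonStr, proxies=proxies, timeout=10)
    return r.json()
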
Example No. 17
if __name__ == "__main__":
    # Index queue
    qindex = Queue()
    # Data queue
    qdata = Queue()
    # Error queue
    qerror = Queue()
    # Proxy queue
    qip = Queue()
    # Start the index-generator process
    Process(target=indexgeter, args=(qindex, )).start()
    # Start the crawler processes
    n = 0
    while n < PROCESS_MAX:
        Process(target=webdatageter, args=(qindex, qdata, qerror, qip)).start()
        n += 1
    # Data-writer thread
    threading.Thread(target=data_writer, args=(qdata, )).start()
    # Error-handling thread
    threading.Thread(target=error_handing,
                     args=(qdata, qerror, qip, n)).start()
    # Fetch THREAD_MAX * THREAD_MAX proxies for starting the crawler threads
    n = 0
    while n < THREAD_MAX * THREAD_MAX:
        try:
            qip.put(getIP())
            n += 1
        except BaseException:
            break