Пример #1
0
 def __init__(self):
     """Initialise the crawl state flags and build the helper objects."""
     # Plain crawl state.
     self.currPage = 0
     self.ipValid = None
     self.needNextUrl = False
     # Whether the request parameters should be changed for the next
     # request; after a server-side error they are kept as they are.
     self.needChangeParams = True

     # Helper objects, constructed in the same relative order as before.
     self.ipDao = IPDao()
     self.log = Log()
     self.bookDetailDao = BookDetailParse()
     self.getIpFromXici = GetIpFromXici()
Пример #2
0
class BookDetailThreadLaunch(object):
    """Starts and supervises a bounded set of ``BookDetailSpiderThread``s.

    Each worker is created for one book id and is given its own connector
    obtained from ``DBDao().getConnector()``. Finished workers are reaped
    by ``checkNeedAddThread``, which also closes their connector.
    """

    def __init__(self):
        # Workers that have been started and not yet reaped.
        self.threadList = []
        # Maximum number of workers tracked at the same time.
        self.threadMaxSize = 50
        self.log = Log()

    def start(self):
        """Start the input thread, then run the launch loop (never returns)."""
        inputThread = ThreadInput()
        inputThread.start()
        self.spiderBookDetail()

    def spiderBookDetail(self):
        """Keep the worker list filled, one worker per consecutive book id.

        The starting id is read from the saved launch index and falls back
        to 4000000 when nothing usable is stored. The next id is persisted
        after every successful launch. The outer loop runs forever, reaping
        dead workers once per second; new workers are only launched while
        ``Global.consoleToStopCatch`` is false.
        """
        bookDetailJson = self.log.getBookDetailThreadLaunchIndex() or {}
        # .get() so that a missing log (the ``or {}`` fallback) yields the
        # default start id instead of raising KeyError.
        bookId = bookDetailJson.get('detailIndex') or 4000000
        while True:
            self.checkNeedAddThread()
            # Strict ``<`` keeps the list at threadMaxSize workers at most.
            while (not Global.consoleToStopCatch
                   and len(self.threadList) < self.threadMaxSize):
                print(bookId)
                try:
                    thread_ = BookDetailSpiderThread(bookId,
                                                     DBDao().getConnector())
                    thread_.start()
                    self.threadList.append(thread_)
                    bookId += 1
                    self.log.saveBookDetailThreadLaunchIndex(bookId)
                except Exception as e:
                    # Best effort: report the failure and back off until the
                    # next one-second tick instead of spinning silently.
                    # %r avoids encoding errors on non-ASCII messages.
                    print("failed to start spider thread for book %s: %r"
                          % (bookId, e))
                    break
            time.sleep(1)

    def checkNeedAddThread(self):
        """Reap dead workers and return how many free slots remain.

        For every worker whose ``isAlive()`` is false, its connector is
        closed and the worker is removed from ``threadList``.

        Returns:
            ``threadMaxSize - len(threadList)`` after reaping.
        """
        # Iterate over a snapshot: removing from the list being iterated
        # would skip the element that follows each removed one.
        for currThread in list(self.threadList):
            if not currThread.isAlive():
                print("dead:" + str(currThread.bookId))
                currThread.connector.close()
                self.threadList.remove(currThread)
        # How many more workers could still be added.
        return self.threadMaxSize - len(self.threadList)
Пример #3
0
 def __init__(self, connector):
     """Initialise the thread base class and the per-thread crawl state.

     connector: passed to ``IPDao`` and ``BookCatchRecordDao`` and also
     kept on ``self.connector``.
     """
     threading.Thread.__init__(self)
     self.connector = connector

     # Plain crawl state.
     self.currPage = 0
     self.ipValid = None
     self.needNextUrl = False
     self.isException = False
     # Original note on this flag: whether the parameters need to be
     # changed; after a server-side error they are kept as they are.
     # NOTE(review): the note looks copied from a differently named flag;
     # confirm it describes canOverCatch.
     self.canOverCatch = True

     # Helper objects, constructed in the same relative order as before.
     self.ipDao = IPDao(connector)
     self.log = Log()
     self.bookCatchRecordDao = BookCatchRecordDao(connector)
Пример #4
0
 def __init__(self):
     """Start with no threads, a thread cap of 50 and a ``Log`` instance."""
     self.threadMaxSize = 50
     self.threadList = []
     self.log = Log()
Пример #5
0
class BookDetailSpider(object):
    def __init__(self):
        """Initialise the crawl state flags and build the helper objects."""
        # Plain crawl state.
        self.currPage = 0
        self.ipValid = None
        self.needNextUrl = False
        # Whether the request parameters should be changed for the next
        # request; after a server-side error they are kept as they are.
        self.needChangeParams = True

        # Helper objects, constructed in the same relative order as before.
        self.ipDao = IPDao()
        self.log = Log()
        self.bookDetailDao = BookDetailParse()
        self.getIpFromXici = GetIpFromXici()

    def start(self):
        detailIndexLogDict = self.log.getBookDetailIndexLog() or {}
        index = detailIndexLogDict['detailIndex'] or 1
        while True:
            print "-----------------当前第", index, "条----------------------"
            if index >= 2000001:
                print "-----------------到达第", 2000001, "条----------------------"
                break
            self.log.saveBookDetailIndexLog(index)
            url = "https://book.douban.com/subject/%s/" % (index, )
            print url
            self.request(url, index)
            time.sleep(int(format(random.randint(0, 1))))
            if self.needChangeParams:
                index += 1
            self.needChangeParams = True

    def request(self, url, book_id):
        user_agent = random.choice(Constant.USER_AGENTS)
        try:
            self.checkIP()
            if self.ipValid:
                proxies = {
                    "http": "http://%s:%s" % self.ipValid,
                    "https": "http://%s:%s" % self.ipValid
                }
                print proxies
                response = requests.get(url,
                                        proxies=proxies,
                                        headers={"User-Agent": user_agent},
                                        timeout=10)
                req_code = response.status_code
                req_msg = response.reason
                print "返回状态 ", req_code, " 返回状态消息 ", req_msg
                if req_code >= 400:
                    if req_code == 404:
                        print "没有本页资源"
                    else:
                        print "返回状态错误", req_code
                        self.exceptionOperate_1()
                else:
                    print "请求通过"
                    print "开始解析"
                    # 解析文档
                    self.bookDetailDao.start(response.text, url, book_id)
            else:
                print "没有ip了,等", "不改变参数"
                raise requests.exceptions.ProxyError("")
        except mysql.connector.errors.InterfaceError, e:
            print "数据库连接出问题", "不改变参数"
            self.exceptionOperate_1()
        except requests.exceptions.ConnectTimeout, e:
            print "服务器连接超时", "不改变参数"
            self.exceptionOperate_1()