import time

# ThreadInput, Global, Log, DBDao and BookDetailSpiderThread are project modules
# whose import paths are not shown in this fragment.
class BookDetailThreadLaunch(object):
    def __init__(self):
        self.threadList = []
        self.threadMaxSize = 50
        self.log = Log()
        # self.connector =

    def start(self):
        # start the console input thread (listens for a stop command)
        inputThread = ThreadInput()
        inputThread.start()
        self.spiderBookDetail()

    def spiderBookDetail(self):
        # resume from the last saved book id, falling back to 4000000
        bookDetailJson = self.log.getBookDetailThreadLaunchIndex() or {}
        bookId = bookDetailJson.get('detailIndex') or 4000000
        while True:
            self.checkNeedAddThread()
            while not Global.consoleToStopCatch and len(self.threadList) <= self.threadMaxSize:
                print bookId
                try:
                    # one worker thread per book id, each with its own DB connection
                    thread_ = BookDetailSpiderThread(bookId, DBDao().getConnector())
                    thread_.start()
                    self.threadList.append(thread_)
                    bookId += 1
                    self.log.saveBookDetailThreadLaunchIndex(bookId)
                except Exception:
                    # ignore a failed launch and retry the same book id on the next pass
                    pass
            time.sleep(1)

    def checkNeedAddThread(self):
        # drop finished threads and close their DB connections
        # (iterate over a copy so removal is safe)
        for currThread in self.threadList[:]:
            if not currThread.isAlive():
                print "dead:" + str(currThread.bookId)
                currThread.connector.close()
                self.threadList.remove(currThread)
        # report how many more threads can still be started
        return self.threadMaxSize - len(self.threadList)
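# Usage sketch (an assumption, not part of the original source): how the launcher
# above would typically be started as a script. The __main__ guard is added only
# for illustration; BookDetailThreadLaunch is the class defined above.
if __name__ == '__main__':
    BookDetailThreadLaunch().start()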
import threading

# IPDao, Log and BookCatchRecordDao are project modules whose import paths are not
# shown in this fragment. The class header and the bookId parameter are inferred
# from BookDetailThreadLaunch, which constructs BookDetailSpiderThread(bookId, connector)
# and later reads thread.bookId.
class BookDetailSpiderThread(threading.Thread):
    def __init__(self, bookId, connector):
        threading.Thread.__init__(self)
        self.needNextUrl = False
        self.ipValid = None
        self.currPage = 0
        self.ipDao = IPDao(connector)
        self.log = Log()
        self.canOverCatch = True  # whether to change parameters; on a server error keep the current ones unchanged
        self.bookCatchRecordDao = BookCatchRecordDao(connector)
        self.bookId = bookId  # inferred: the launcher logs dead threads by book id
        self.connector = connector
        self.isException = False
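# Minimal sketch (hypothetical; the project's actual run() lies outside this
# fragment): a threading.Thread subclass does its work in run(). Here the crawl
# step is stubbed out and failures only flag isException, so the launcher can
# reap the thread once isAlive() turns False.
class _SpiderThreadSketch(threading.Thread):
    def __init__(self, bookId):
        threading.Thread.__init__(self)
        self.bookId = bookId
        self.isException = False

    def run(self):
        try:
            # a real worker would fetch https://book.douban.com/subject/<bookId>/
            # here and persist the parsed result
            pass
        except Exception:
            self.isException = True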
import random
import time

import requests
import mysql.connector

# Constant, IPDao, Log, BookDetailParse and GetIpFromXici are project modules
# whose import paths are not shown in this fragment.
class BookDetailSpider(object):
    def __init__(self):
        self.needNextUrl = False
        self.ipValid = None
        self.currPage = 0
        self.ipDao = IPDao()
        self.log = Log()
        self.needChangeParams = True  # whether to advance to the next index; on a server error keep the current parameters unchanged
        self.bookDetailDao = BookDetailParse()
        self.getIpFromXici = GetIpFromXici()

    def start(self):
        # resume from the last logged index, falling back to 1
        detailIndexLogDict = self.log.getBookDetailIndexLog() or {}
        index = detailIndexLogDict.get('detailIndex') or 1
        while True:
            print "----------------- item", index, "----------------------"
            if index >= 2000001:
                print "----------------- reached item", 2000001, ", stopping ----------------------"
                break
            self.log.saveBookDetailIndexLog(index)
            url = "https://book.douban.com/subject/%s/" % (index,)
            print url
            self.request(url, index)
            # pause 0 or 1 second between requests
            time.sleep(random.randint(0, 1))
            if self.needChangeParams:
                index += 1
            self.needChangeParams = True

    def request(self, url, book_id):
        user_agent = random.choice(Constant.USER_AGENTS)
        try:
            self.checkIP()
            if self.ipValid:
                # self.ipValid is a (host, port) tuple for the current proxy
                proxies = {
                    "http": "http://%s:%s" % self.ipValid,
                    "https": "http://%s:%s" % self.ipValid
                }
                print proxies
                response = requests.get(url,
                                        proxies=proxies,
                                        headers={"User-Agent": user_agent},
                                        timeout=10)
                req_code = response.status_code
                req_msg = response.reason
                print "status", req_code, "reason", req_msg
                if req_code >= 400:
                    if req_code == 404:
                        print "page has no resource"
                    else:
                        print "bad status code", req_code
                        self.exceptionOperate_1()
                else:
                    print "request ok"
                    print "start parsing"
                    # parse the detail page and store the result
                    self.bookDetailDao.start(response.text, url, book_id)
            else:
                print "no proxy IPs left, waiting", "keep parameters unchanged"
                raise requests.exceptions.ProxyError("")
        except mysql.connector.errors.InterfaceError, e:
            print "database connection problem", "keep parameters unchanged"
            self.exceptionOperate_1()
        except requests.exceptions.ConnectTimeout, e:
            print "server connection timed out", "keep parameters unchanged"
            self.exceptionOperate_1()
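# Minimal sketch (an assumption, not the project's confirmed API) of the proxy
# selection that request() above relies on via self.checkIP()/self.ipValid: take a
# (host, port) tuple from the IP DAO and, when the pool is empty, refill it from
# the Xici proxy site. getOneIp() and start() are assumed method names.
def check_ip_sketch(ip_dao, ip_fetcher):
    ip = ip_dao.getOneIp()        # assumed: returns (host, port) or None
    if not ip:
        ip_fetcher.start()        # assumed: scrapes fresh proxies into the pool
        ip = ip_dao.getOneIp()
    return ip                     # the caller would store this in self.ipValid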