import time
import traceback


def run(self):
    global queue_href, mutex_href_get, mutex_href_put
    mutex_href_get.acquire()
    while queue_href.qsize() > 0:
        # Take a link from the shared queue
        viewHref = str(queue_href.get())
        mutex_href_get.release()
        # Fetch and parse the page
        result = get_page(viewHref)
        try:
            with mutex_href_put:
                if isinstance(result, list):
                    # Parsed successfully: persist the extracted records
                    saveUrls(result)
                elif result == 1:
                    # Connection error
                    logUrlConnectError(viewHref)
                elif result == 2:
                    # Format error
                    logUrlFormError(viewHref)
        except Exception:
            traceback.print_exc()
        mutex_href_get.acquire()
    mutex_href_get.release()
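
# Sketch (not in the original source): one way the shared globals used by
# run() above might be initialised and the worker threads launched.
# HrefThread stands in for whatever threading.Thread subclass owns run();
# its name and the thread count are assumptions for illustration.
import queue
import threading

queue_href = queue.Queue()
mutex_href_get = threading.Lock()
mutex_href_put = threading.Lock()


def start_href_workers(hrefs, thread_count=4):
    for href in hrefs:
        queue_href.put(href)  # seed the shared work queue
    workers = [HrefThread() for _ in range(thread_count)]
    for t in workers:
        t.start()
    for t in workers:
        t.join()  # wait until the queue is drained
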
def run(self):
    global queue_date, mutex_date_get, mutex_date_put
    mutex_date_get.acquire()
    while queue_date.qsize() > 0:
        # Take a crawl date from the shared queue
        crawldate = str(queue_date.get())
        mutex_date_get.release()
        # Fetch and parse the page for this date
        result = get_page(crawldate)
        try:
            with mutex_date_put:
                if isinstance(result, list):
                    # Parsed successfully: persist the extracted records
                    saveUrls(result)
                elif result == 1:
                    # Connection error
                    logUrlConnectError(crawldate)
        except Exception:
            traceback.print_exc()
        mutex_date_get.acquire()
    mutex_date_get.release()
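
# Note (assumption, not in the original): queue.Queue is already thread-safe,
# so guarding qsize()/get() with an external lock adds a race between the size
# check and the get(). An idiomatic drain loop needs no lock at all:


def drain(work_queue, handle):
    while True:
        try:
            item = work_queue.get_nowait()  # raises queue.Empty once drained
        except queue.Empty:
            break
        handle(item)  # process outside any lock
        work_queue.task_done()
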
def get_new(spiderUrl, localLatestDate):
    pageNum = 1
    nowPageDate = "1970-01-01"  # publish date of the last item on the current page
    errCount = 0                # consecutive connection-error count
    while errCount <= 3:        # give up after more than three consecutive errors
        time.sleep(2)
        viewHref = spiderUrl + str(pageNum)
        result = get_page(viewHref)
        if isinstance(result, list):
            nowPageDate = result[-1][-2]  # publish date of the last item on this page
            print('page %d: %d items, last date %s' % (pageNum, len(result), nowPageDate))
            saveUrls(result)
            errCount = 0
            if nowPageDate < localLatestDate:
                # Reached the locally newest date (ISO date strings compare in order): stop
                break
            pageNum += 1  # otherwise crawl the next page
        elif result == 1:
            # Connection error: retry the same page
            logUrlConnectError(viewHref)
            errCount += 1
        elif result == 2:
            # Format error: log it and skip to the next page
            logUrlFormError(viewHref)
            errCount = 0
            pageNum += 1
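
# Sketch: how get_new() might be invoked for an incremental crawl. The URL
# and date below are illustrative placeholders, not values from the source;
# get_new() appends the page number to spiderUrl, and publish dates are
# compared as ISO-format strings.
if __name__ == '__main__':
    spider_url = 'http://example.com/news/list?page='
    local_latest_date = '2018-01-01'  # newest publish date already stored locally
    get_new(spider_url, local_latest_date)
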
def getErrorUrlAgain():
    # Retry every URL that previously failed with a connection error
    datelist = readErrorUrl()
    for crawldate in datelist:
        result = get_page(crawldate)
        try:
            if isinstance(result, list):
                # Parsed successfully: persist and clear the error record
                saveUrls(result)
                deleteErrorUrl(crawldate)
            elif result == 1:
                # Still failing: log the connection error again
                logUrlConnectError(crawldate)
        except Exception:
            traceback.print_exc()
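
# Sketch (assumption): readErrorUrl() and deleteErrorUrl() are not shown in
# the source. A minimal file-backed version, assuming one failed URL per line
# in a hypothetical error_urls.txt, could look like this:
import os

ERROR_URL_FILE = 'error_urls.txt'  # hypothetical path


def readErrorUrl():
    # Return every previously failed URL, one per line
    if not os.path.exists(ERROR_URL_FILE):
        return []
    with open(ERROR_URL_FILE, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


def deleteErrorUrl(url):
    # Drop a URL from the error file once it has been crawled successfully
    remaining = [u for u in readErrorUrl() if u != url]
    with open(ERROR_URL_FILE, 'w', encoding='utf-8') as f:
        for u in remaining:
            f.write(u + '\n')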