def run(self):
    while not self.NOT_EXIST:
        # Queue is empty -> mark the flag and stop this worker
        if self.queue.empty():
            self.NOT_EXIST = True
            self.queue.task_done()
            break
        url = self.queue.get()
        try:
            response = requests.get(url,
                                    headers=RequestModel.getHeaders(),
                                    proxies=RequestModel.getProxies(),
                                    timeout=3)
            print('Top worker thread ' + str(self.id) + ' requested [ ' + url
                  + ' ], status: ' + str(response.status_code))
            # The dytt8 pages are served as GBK; without this the text is garbled
            response.encoding = 'GBK'
            if response.status_code != 200:
                # Re-queue the failed URL and back off before retrying
                self.queue.put(url)
                time.sleep(20)
            else:
                # Parse the detail page and hand the record to the content queue
                temp = dytt_Lastest.getMoiveInforms(url, response.text)
                TaskQueue.getContentQueue().put(temp)
                time.sleep(5)
        except Exception as e:
            # self.queue.put(url)
            print(e)
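# --- Illustrative sketch (assumption, not part of the original source) ------
# The run() loop above reads self.queue, self.id and self.NOT_EXIST, so the
# Top worker is presumably a threading.Thread subclass wired up roughly like
# this; the class name TopThreadSketch and the constructor signature are
# guesses, not the project's actual definitions.
import threading

class TopThreadSketch(threading.Thread):
    def __init__(self, thread_id, url_queue):
        super().__init__()
        self.id = thread_id        # only used in the log messages
        self.queue = url_queue     # shared Queue of detail-page URLs to crawl
        self.NOT_EXIST = False     # flipped to True once the queue drains

# Hypothetical usage: start a small pool of Top workers on a shared queue.
# for i in range(5):
#     TopThreadSketch(i, detail_url_queue).start()
# -----------------------------------------------------------------------------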
def getMaxsize(cls):
    response = requests.get(cls.breakoutUrl,
                            headers=RequestModel.getHeaders(),
                            proxies=RequestModel.getProxies(),
                            timeout=3)
    # The dytt8 pages are served as GBK; without this the text is garbled
    response.encoding = 'GBK'
    selector = etree.HTML(response.text)
    # Extract the options of the paging <select name="sldd"> element
    optionList = selector.xpath("//select[@name='sldd']/text()")
    # The first option duplicates the index page, so subtract 1
    return len(optionList) - 1
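# --- Illustrative sketch (assumption, not part of the original source) ------
# One plausible use of getMaxsize(): derive the URLs of all list pages so the
# Floor workers can crawl them. The list_23_{n}.html numbering scheme below is
# an assumption about how the pages behind cls.breakoutUrl are paginated.
maxsize = 5  # stand-in for dytt_Lastest.getMaxsize()
listPageUrls = ['http://www.dytt8.net/html/gndy/dyzz/list_23_%d.html' % n
                for n in range(1, maxsize + 1)]
# Each of these URLs would then be put onto the Floor workers' queue.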
def run(self):
    while not self.NOT_EXIST:
        # Queue is empty -> mark the flag and stop this worker
        if self.queue.empty():
            self.NOT_EXIST = True
            self.queue.task_done()
            break
        url = self.queue.get()
        try:
            response = requests.get(url,
                                    headers=RequestModel.getHeaders(),
                                    proxies=RequestModel.getProxies(),
                                    timeout=3)
            print('Floor worker thread ' + str(self.id) + ' requested [ ' + url
                  + ' ], status: ' + str(response.status_code))
            # The dytt8 pages are served as GBK; without this the text is garbled
            response.encoding = 'GBK'
            if response.status_code != 200:
                # Re-queue the failed URL and back off before retrying
                self.queue.put(url)
                time.sleep(20)
            else:
                # Each list page yields a batch of relative detail-page URLs
                moivePageUrlList = dytt_Lastest.getMoivePageUrlList(response.text)
                for item in moivePageUrlList:
                    each = self.host + item
                    TaskQueue.putToMiddleQueue(each)
                time.sleep(3)
        except Exception as e:
            # self.queue.put(url)
            print(e)
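# --- Illustrative sketch (assumption, not part of the original source) ------
# Both workers hand data off through a shared TaskQueue helper
# (putToMiddleQueue, getContentQueue). A minimal version could simply wrap two
# module-level queue.Queue instances; the real project class may look different.
import queue

class TaskQueueSketch:
    _middleQueue = queue.Queue()    # detail-page URLs waiting for Top workers
    _contentQueue = queue.Queue()   # parsed movie records waiting to be stored

    @classmethod
    def putToMiddleQueue(cls, url):
        cls._middleQueue.put(url)

    @classmethod
    def getMiddleQueue(cls):
        return cls._middleQueue

    @classmethod
    def getContentQueue(cls):
        return cls._contentQueue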
# Sample output:
# ========================================================================================
# Top worker thread 3 requested [ http://www.dytt8.net/html/gndy/dyzz/20141029/46502.html ], status: 200
# Top worker thread 2 requested [ http://www.dytt8.net/html/gndy/dyzz/20141026/46484.html ], status: 200
# Top worker thread 0 requested [ http://www.dytt8.net/html/gndy/dyzz/20141022/46458.html ], status: 200
# ========================================================================================
# Top worker thread 4 requested [ http://www.dytt8.net/html/gndy/dyzz/20120815/38998.html ], status: 200
# Top worker thread 3 requested [ http://www.dytt8.net/html/gndy/dyzz/20120811/38936.html ], status: 200
# Top worker thread 0 requested [ http://www.dytt8.net/html/gndy/dyzz/20120825/39129.html ], status: 200
# Top worker thread 1 requested [ http://www.dytt8.net/html/gndy/dyzz/20120809/38919.html ], status: 200
# Top worker thread 2 requested [ http://www.dytt8.net/html/gndy/dyzz/20120807/38894.html ], status: 200
# Top worker thread 3 requested [ http://www.dytt8.net/html/gndy/dyzz/20120904/39251.html ], status: 200

# Standalone test of the detail-page xpath expressions
url = 'http://www.dytt8.net/html/gndy/dyzz/20120807/38894.html'
response = requests.get(url,
                        headers=RequestModel.getHeaders(),
                        proxies=RequestModel.getProxies(),
                        timeout=3)
print('Requested [ ' + url + ' ], status: ' + str(response.status_code))
# The dytt8 pages are served as GBK; without this the text is garbled
response.encoding = 'GBK'
selector = etree.HTML(response.text)
# print(response.text)

# Body text of the movie detail page
content = selector.xpath(
    "//div[@class='co_content8']/ul/tr/td/div/td/p/text()")
print('Pass 1: content')
print(content)

# The xpath matches two images: the first is the poster, the second a screenshot
imgs = selector.xpath("//div[@class='co_content8']/ul/tr/td/div/td/p/img/@src")
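# --- Illustrative sketch (assumption, not part of the original source) ------
# getMoiveInforms presumably packages the raw xpath results (the text lines in
# `content` plus the two image URLs in `imgs`) into one record per movie. A
# minimal, self-contained version of that idea; the field names are made up.
def build_movie_record(page_url, text_lines, img_urls):
    return {
        'url': page_url,
        'poster': img_urls[0] if img_urls else None,
        'screenshot': img_urls[1] if len(img_urls) > 1 else None,
        'intro': '\n'.join(line.strip() for line in text_lines if line.strip()),
    }

# Example: build_movie_record(url, content, imgs) -> dict ready for the content queue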