예제 #1
0
    def run(self):
        """Drain the URL queue, fetching each page and queueing its parsed info.

        The loop runs until ``self.NOT_EXIST`` becomes truthy. Each URL taken
        from ``self.queue`` is requested with a 3 second timeout. A non-200
        answer puts the URL back on the queue and backs off for 20 seconds;
        a 200 answer is parsed with ``dytt_Lastest.getMoiveInforms`` and the
        result is put on ``TaskQueue.getContentQueue()``.

        Any exception raised while handling a URL is printed and that URL is
        dropped (best-effort crawling); the worker then moves on.

        Assumes ``self.queue`` follows the standard ``queue.Queue`` API
        (``get_nowait``/``put``/``task_done``) -- TODO confirm against the
        class constructor.
        """
        # Function-scope import so the method does not depend on a
        # module-level import that may not exist.
        from queue import Empty

        while not self.NOT_EXIST:
            # Take the next URL without blocking. Testing empty() and then
            # calling a blocking get() can hang forever when another worker
            # grabs the last item between the two calls.
            try:
                url = self.queue.get_nowait()
            except Empty:
                # Queue drained: record that on the instance (the original
                # assigned to a throw-away local) and stop this worker.
                self.NOT_EXIST = True
                break

            try:
                response = requests.get(url,
                                        headers=RequestModel.getHeaders(),
                                        proxies=RequestModel.getProxies(),
                                        timeout=3)
                print('Top 子线程 ' + str(self.id) + ' 请求【 ' + url + ' 】的结果: ' +
                      str(response.status_code))

                # The site's pages must be decoded as GBK, otherwise the text
                # comes out garbled.
                response.encoding = 'GBK'

                if response.status_code != 200:
                    # Retry later: re-queue the URL and back off.
                    self.queue.put(url)
                    time.sleep(20)
                else:
                    temp = dytt_Lastest.getMoiveInforms(url, response.text)
                    TaskQueue.getContentQueue().put(temp)
                # Pause between requests.
                time.sleep(5)

            except Exception as e:
                # Best-effort: report the failure and continue with the next
                # URL (the failed URL is intentionally not re-queued).
                print(e)
            finally:
                # Exactly one task_done() per successful get(), so that
                # queue.join() bookkeeping stays balanced. A re-queued URL
                # was put() above before this runs, so the unfinished count
                # never drops to zero prematurely.
                self.queue.task_done()
예제 #2
0
    def getMaxsize(cls):
        """Return the page count derived from the ``sldd`` select on ``cls.breakoutUrl``.

        The page is fetched with the shared request headers/proxies and a
        3 second timeout, decoded as GBK, and parsed with lxml. The result is
        the number of text nodes directly under ``<select name="sldd">``
        minus one.
        """
        target = cls.breakoutUrl
        request_options = {
            'headers': RequestModel.getHeaders(),
            'proxies': RequestModel.getProxies(),
            'timeout': 3,
        }
        page = requests.get(target, **request_options)

        # The site's pages must be decoded as GBK, otherwise the text comes
        # out garbled.
        page.encoding = 'GBK'

        # Pull the entries of the page-selection drop-down.
        entries = etree.HTML(page.text).xpath("//select[@name='sldd']/text()")

        # The first page is listed twice, hence the minus one.
        return len(entries) - 1
예제 #3
0
    def run(self):
        """Drain the listing-URL queue and enqueue every movie page URL found.

        The loop runs until ``self.NOT_EXIST`` becomes truthy. Each URL taken
        from ``self.queue`` is requested with a 3 second timeout. A non-200
        answer puts the URL back on the queue and backs off for 20 seconds;
        a 200 answer is parsed with ``dytt_Lastest.getMoivePageUrlList`` and
        each returned item, prefixed with ``self.host``, is handed to
        ``TaskQueue.putToMiddleQueue``.

        Any exception raised while handling a URL is printed and that URL is
        dropped (best-effort crawling); the worker then moves on.

        Assumes ``self.queue`` follows the standard ``queue.Queue`` API
        (``get_nowait``/``put``/``task_done``) -- TODO confirm against the
        class constructor.
        """
        # Function-scope import so the method does not depend on a
        # module-level import that may not exist.
        from queue import Empty

        while not self.NOT_EXIST:
            # Take the next URL without blocking. Testing empty() and then
            # calling a blocking get() can hang forever when another worker
            # grabs the last item between the two calls.
            try:
                url = self.queue.get_nowait()
            except Empty:
                # Queue drained: record that on the instance (the original
                # assigned to a throw-away local) and stop this worker.
                self.NOT_EXIST = True
                break

            try:
                response = requests.get(url,
                                        headers=RequestModel.getHeaders(),
                                        proxies=RequestModel.getProxies(),
                                        timeout=3)
                print('Floor 子线程 ' + str(self.id) + ' 请求【 ' + url + ' 】的结果: ' +
                      str(response.status_code))

                # The site's pages must be decoded as GBK, otherwise the text
                # comes out garbled.
                response.encoding = 'GBK'

                if response.status_code != 200:
                    # Retry later: re-queue the URL and back off.
                    self.queue.put(url)
                    time.sleep(20)
                else:
                    moivePageUrlList = dytt_Lastest.getMoivePageUrlList(
                        response.text)
                    for item in moivePageUrlList:
                        # Items are joined onto the host to form the URL that
                        # is queued for the next stage.
                        TaskQueue.putToMiddleQueue(self.host + item)
                # Pause between requests.
                time.sleep(3)

            except Exception as e:
                # Best-effort: report the failure and continue with the next
                # URL (the failed URL is intentionally not re-queued).
                print(e)
            finally:
                # Exactly one task_done() per successful get(), so that
                # queue.join() bookkeeping stays balanced. A re-queued URL
                # was put() above before this runs, so the unfinished count
                # never drops to zero prematurely.
                self.queue.task_done()
예제 #4
0
# Sample log lines, kept verbatim, in the form
#   "Top 子线程 <id> 请求【 <url> 】的结果: <status>"
# i.e. "Top worker thread <id>, request for <url>, result: <HTTP status>".
# ========================================================================================
# Top 子线程 3 请求【 http://www.dytt8.net/html/gndy/dyzz/20141029/46502.html 】的结果: 200
# Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20141026/46484.html 】的结果: 200
# Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20141022/46458.html 】的结果: 200
# ========================================================================================
# Top 子线程 4 请求【 http://www.dytt8.net/html/gndy/dyzz/20120815/38998.html 】的结果: 200
# Top 子线程 3 请求【 http://www.dytt8.net/html/gndy/dyzz/20120811/38936.html 】的结果: 200
# Top 子线程 0 请求【 http://www.dytt8.net/html/gndy/dyzz/20120825/39129.html 】的结果: 200
# Top 子线程 1 请求【 http://www.dytt8.net/html/gndy/dyzz/20120809/38919.html 】的结果: 200
# Top 子线程 2 请求【 http://www.dytt8.net/html/gndy/dyzz/20120807/38894.html 】的结果: 200
# Top 子线程 3 请求【 http://www.dytt8.net/html/gndy/dyzz/20120904/39251.html 】的结果: 200

# NOTE(review): the next line is a commented-out triple-quote marker,
# presumably left over from toggling the code below on/off -- confirm.
# '''
# Fetch one of the detail pages listed above and probe its XPath structure.
url = 'http://www.dytt8.net/html/gndy/dyzz/20120807/38894.html'
response = requests.get(url,
                        headers=RequestModel.getHeaders(),
                        proxies=RequestModel.getProxies(),
                        timeout=3)
print(' 请求【 ' + url + ' 】的结果: ' + str(response.status_code))
# Decode the body as GBK before reading response.text.
response.encoding = 'GBK'
selector = etree.HTML(response.text)
# print(response.text)   # page content

# First attempt: text nodes of the <p> elements under the 'co_content8' div.
content = selector.xpath(
    "//div[@class='co_content8']/ul/tr/td/div/td/p/text()")
print('第 1 次: content')
print(content)

# Two images are matched: the first is the poster, the second is a screenshot
# from the movie.
imgs = selector.xpath("//div[@class='co_content8']/ul/tr/td/div/td/p/img/@src")