Example #1
 def newThread(self):
     file = open("d:/IP/availableIP.txt", "w")
     for ip in self.ip_pool:
         file.write(ip + '\n')
     file.close()
     self.ip_pool = []
     file = open("d:/IP/ip.txt")
     ipList = []
     print("test file")
     for ip in file:
         ipList.append(ip)
     file.close()
     ipSize = len(ipList)
     for xx in ipList:
         ip = str(xx)
         if ":" in ip:
             self.requestCount += 1
             rr = Request(url="https://baidu.com/",
                          callback=lambda response, typeid=ip.strip():
                          self.parse_ipResponse(response, typeid))
             rr.meta['proxy'] = "http://" + ip.strip()
             print("ip test-----------------------")
             print(ip.strip())
             print(str(self.requestCount))
             print("------------------------------")
             print("------------------------------")
             print("------------------------------")
             print("------------------------------")
             rr.dont_filter = True
             # the timeout meta key read by Scrapy's DownloadTimeoutMiddleware is lowercase
             rr.meta['download_timeout'] = 5
             if self.requestCount == ipSize:
                 self.requestCount = 0
                 # newThread() is a generator; iterate it so the recursive pass actually runs
                 yield from self.newThread()
                 break
             yield rr
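
Note: Scrapy's Request also accepts dont_filter and meta directly in its constructor, and the timeout key honoured by the built-in DownloadTimeoutMiddleware is the lowercase download_timeout. A minimal sketch of the same proxy-test request built that way (make_proxy_test_request is a hypothetical helper name, not part of the original spider):

    from scrapy import Request

    def make_proxy_test_request(self, ip):
        # equivalent to the request built above, configured via constructor arguments
        ip = ip.strip()
        return Request(
            url="https://baidu.com/",
            callback=lambda response, typeid=ip: self.parse_ipResponse(response, typeid),
            meta={'proxy': "http://" + ip, 'download_timeout': 5},
            dont_filter=True,
        )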
Example #2
 def start_requests(self):
     while True:
         file = open("d:/IP/ip.txt")
         ipList = []
         print("test file")
         for ip in file:
             ipList.append(ip)
         file.close()
         ipSize = len(ipList)
         for xx in ipList:
             ip = str(xx)
             if ":" in ip:
                 self.requestCount += 1
                 rr = Request(url=random.choice(self.webs),
                              callback=lambda response, typeid=ip.strip():
                              self.parse_ipResponse(response, typeid))
                 rr.meta['proxy'] = "http://" + ip.strip()
                 rr.headers.setdefault('User-Agent',
                                       random.choice(self.user_agent))
                 print("ip test-----------------------")
                 print(ip.strip())
                 print(str(self.requestCount))
                 print("------------------------------")
                 print("------------------------------")
                 print("------------------------------")
                 print("------------------------------")
                 rr.dont_filter = True
                 # rr.meta['download_timeout'] = 5
                 yield rr
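
The User-Agent set via headers.setdefault above can also be passed through the constructor's headers argument, and the lambda capture can be replaced with cb_kwargs (Scrapy >= 1.7). A hedged sketch assuming the same self.webs and self.user_agent lists and a parse_ipResponse(self, response, typeid) signature:

    rr = Request(
        url=random.choice(self.webs),
        callback=self.parse_ipResponse,
        cb_kwargs={'typeid': ip.strip()},   # passed on to the callback as a keyword argument
        headers={'User-Agent': random.choice(self.user_agent)},
        meta={'proxy': "http://" + ip.strip()},
        dont_filter=True,
    )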
Example #3
 def start_requests(self):
     request = Request(url="http://www.qidian.com")
     request.meta['dont_retry'] = True
     # When the proxy pool is empty, don't wait for a proxy; the proxy-crawling spider doesn't use the proxies it crawls
     request.meta['dont_wait_proxy'] = True
     request.dont_filter = True
     yield request
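
dont_retry is a standard meta key honoured by Scrapy's built-in RetryMiddleware, whereas dont_wait_proxy only has an effect if a custom downloader middleware checks it. A hypothetical sketch of such a middleware (ProxyPoolMiddleware and its proxy_pool list are assumptions, not code from the original project):

    class ProxyPoolMiddleware:
        def __init__(self):
            self.proxy_pool = []  # assumed to be filled elsewhere

        def process_request(self, request, spider):
            # requests flagged with the custom 'dont_wait_proxy' meta key go out without a proxy
            if request.meta.get('dont_wait_proxy'):
                return None
            if self.proxy_pool:
                request.meta['proxy'] = self.proxy_pool[0]
            return None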
Example #4
 def start_requests(self):
     file = open(
         "D:/Python/scrapy_learning/getproxiesIP/getproxiesIP/files/ip.txt")
     for ip in file:
         ip = ip.strip()  # drop the trailing newline read from the file
         rr = Request(url="https://baidu.com/",
                      callback=lambda response, typeid=ip:
                      self.parse_ipResponse(response, typeid))
         rr.meta['proxy'] = "http://" + ip
         rr.dont_filter = True
         # the timeout meta key recognised by Scrapy is 'download_timeout'
         rr.meta['download_timeout'] = 5
         yield rr
Example #5
 def start_requests(self):
     # Randomly crawl proxy pages from the first 100 pages
     flag = True
     while flag:
         flag = ProxySpiderSwitch.flag
         page = random.choice(range(1, 100))
         url = 'http://www.xicidaili.com/wt/%d' % page
         request = Request(url=url)
         # Skip ProxyFilterMiddleware and don't resend the request; if it fails, it fails
         request.meta['dont_retry'] = True
         # When the proxy pool is empty, don't wait for a proxy; the proxy-crawling spider doesn't use the proxies it crawls
         request.meta['dont_wait_proxy'] = True
         request.dont_filter = True
         yield request
Example #6
 def gen_request(self, url, **kwargs):
     """
     generate a Request
     :param url: the url to build the Request for
     :return: Request
     """
     r = Request(
         url=url,
         callback=self.parse,
         errback=self.parse_err,
     )
     if 'dont_filter' in kwargs:
         # any dont_filter keyword (whatever its value) disables duplicate filtering
         r.dont_filter = True
     return r
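
Note that gen_request only checks whether the dont_filter key is present in kwargs, so passing it at all (even as False) marks the request as unfiltered. A short usage sketch (the URL is a placeholder):

    def start_requests(self):
        # duplicate-filtered request (default behaviour)
        yield self.gen_request("http://example.com/page/1")
        # bypasses the dupefilter because the keyword is present
        yield self.gen_request("http://example.com/page/1", dont_filter=True)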
Example #7
 def retry_request_with_get(self, request: Request) -> Generator[Request, None, None]:
     request.method = 'GET'
     request.dont_filter = True
     yield request
Example #8
 def retry_request_with_get(
         self, request: Request) -> Generator[Request, None, None]:
     request.method = 'GET'
     request.dont_filter = True
     yield request
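
Examples #7 and #8 mutate the failed request in place and re-yield it; because dont_filter is set, the dupefilter will not drop the re-issued request. A hedged sketch of how such a helper might be driven from an errback (errback_retry is a hypothetical name; failure.request is the Request that triggered the failure):

    def errback_retry(self, failure):
        # retry the failed request as a plain GET, bypassing the dupefilter
        yield from self.retry_request_with_get(failure.request)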
Example #9
    def parse(self, response):
        # TODO You must first dispatch tasks with utils' redisListUpload, otherwise the spider will wait until a task arrives!

        url = response.url
        print("当前解析页", url)
        request = response.request
        if '玄鸟' not in response.text and 'company_getinfos' not in url:
            print("虽然正常可达,但登陆失败,需要更新cookie")
            print(response.text)
            with open("../web_msg/qcc_loginErr.html", "w",
                      encoding='utf-8') as fp:
                fp.write(response.text)
            raise Exception("虽然正常可达,但登陆失败,需要更新cookie")
        elif '#ipo' in url:
            print("该页为上市信息,不采集")
            # IPO不采集
            request._set_url = (request.url.replace('#ipo', '#base'))
            request.dont_filter = True

            yield request
        elif 'base' in url or 'firm' in url:
            print("该页为基本信息")
            item = QichachaHtmlItem()
            item['base_html'] = response.text
            item['name'] = response.css('h1::text')[0].extract()
            item['id'] = url.split('firm_')[1].split('.')[0]
            item['mid_requests'] = 1  # MID for 谢鋆's Request download framework
            print("Qichacha front page parsed successfully:", item['name'], item['id'])
            requestNew = Request(
                url=self.URL_BASE.format(item['id'], item['name'], 'susong'))
            requestNew.meta['item'] = item
            requestNew.meta['item']['mid_requests'] = 2
            requestNew.priority = request.priority + 100
            requestNew.dont_filter = True
            yield requestNew
        elif 'susong' in url:
            print("该页为法律诉讼")
            request.meta['item']['susong_html'] = response.text
            request.meta['item']['mid_requests'] = 2  # 谢鋆Request下载框架MID

            requestNew = Request(url=request.url.replace('susong', 'run'))
            requestNew.meta['item'] = request.meta['item']
            requestNew.priority = request.priority + 100
            requestNew.dont_filter = True
            yield requestNew
        elif 'run' in url:
            print("该页为经营状况")
            request.meta['item']['run_html'] = response.text
            request.meta['item']['mid_requests'] = 2  # 谢鋆Request下载框架MID

            requestNew = Request(url=request.url.replace('run', 'fengxian'))
            requestNew.meta['item'] = request.meta['item']
            requestNew.priority = request.priority + 100
            requestNew.dont_filter = True
            yield requestNew
        elif 'fengxian' in url:
            print("该页为经营风险")
            request.meta['item']['fengxian_html'] = response.text
            request.meta['item']['mid_requests'] = 2  # 谢鋆Request下载框架MID

            requestNew = Request(url=request.url.replace('fengxian', 'report'))
            requestNew.meta['item'] = request.meta['item']
            requestNew.priority = request.priority + 100
            requestNew.dont_filter = True
            yield requestNew
        elif 'report' in url:
            print("该页为企业年报")
            request.meta['item']['report_html'] = response.text
            request.meta['item']['mid_requests'] = 2  # 谢鋆Request下载框架MID

            requestNew = Request(url=request.url.replace('report', 'history'))
            requestNew.meta['item'] = request.meta['item']
            requestNew.priority = request.priority + 100
            requestNew.dont_filter = True
            yield requestNew
        elif 'history' in url:
            print("该页为历史股东")
            request.meta['item']['history_html'] = response.text
            request.meta['item']['mid_requests'] = 2  # 谢鋆Request下载框架MID
            print('存储数据')
            yield request.meta['item']

        else:
            print("跳转到了异常页,停止抓取")
            print(response.text)
            raise Exception("抓取异常,url不包括关键字,停止抓取")