def getProxies(self):
    # Load the logging configuration
    getLogConfig()
    proxy_model_list = []
    response = super(data5uSpider, self).getProxies()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//ul[@class="l2"]')
    for info in infos:
        try:
            # Relative xpaths keep each field paired with its own row,
            # so a row with a missing cell cannot shift later rows.
            ip = info.xpath('./span[1]/li/text()')[0]            # IP
            port = info.xpath('./span[2]/li/text()')[0]          # port
            anonymity = info.xpath('./span[3]/li/a/text()')[0]   # anonymity level
            proxy_type = info.xpath('./span[4]/li/a/text()')[0]  # type
            area = info.xpath('./span[6]/li/a[1]/text()')[0]     # area: province
            area = area + info.xpath('./span[6]/li/a[2]/text()')[0]  # area: city
            speed = info.xpath('./span[8]/li/text()')[0]         # speed
            if proxy_type == 'http' or proxy_type == 'https':
                # print(proxy_type + "://" + ip + ":" + port)
                proxy = proxyModel()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_type(proxy_type)
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survivalTime("")
                proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("Scraped a total of " + str(len(proxy_model_list)) + " proxies from " + self.agent)
    return proxy_model_list
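Each spider delegates the HTTP fetch to a shared parent class via super().getProxies(), which is not shown here. A minimal sketch of what that base class might look like, assuming a per-spider `url` attribute and the requests library (the class name, attribute names, and header value are all assumptions, not the original code):

import requests

class baseSpider(object):
    # Hypothetical base class: each subclass is assumed to set
    # self.url (the listing page) and self.agent (the site name).
    def getProxies(self):
        # A plain browser User-Agent to avoid trivial UA-based blocking.
        headers = {'User-Agent': 'Mozilla/5.0'}
        # Return the raw response; subclasses parse it with lxml or re.
        return requests.get(self.url, headers=headers, timeout=10)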
def getProxies(self):
    # Load the logging configuration
    getLogConfig()
    proxy_model_list = []
    response = super(xiciSpider, self).getProxies()
    selector = etree.HTML(response.text)
    infos = selector.xpath('//tr[@class="odd"]')
    for info in infos:
        try:
            ip = info.xpath('./td[2]/text()')[0]          # IP
            port = info.xpath('./td[3]/text()')[0]        # port
            anonymity = info.xpath('./td[5]/text()')[0]   # anonymity level
            proxy_type = info.xpath('./td[6]/text()')[0]  # type
            # The area cell may be empty; fall back to '' instead of
            # indexing blindly (xpath() returns a list, so a bare [0]
            # would raise IndexError rather than yield None).
            area_list = info.xpath('./td[4]/a/text()')    # area
            area = area_list[0] if area_list else ''
            speed = info.xpath('./td[7]/div/@title')[0]   # speed
            survivalTime = info.xpath('./td[9]/text()')[0]  # survival time
            print(ip + " | " + port + " | " + anonymity + " | " + proxy_type +
                  " | " + area + " | " + speed + " | " + survivalTime)
            proxy = proxyModel()
            proxy.set_ip(ip)
            proxy.set_port(port)
            proxy.set_type(proxy_type)
            proxy.set_anonymity(anonymity)
            proxy.set_area(area)
            proxy.set_speed(speed)
            proxy.set_agent(self.agent)
            proxy.set_survivalTime(survivalTime)
            proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("Scraped a total of " + str(len(proxy_model_list)) + " proxies from " + self.agent)
    return proxy_model_list
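The same empty-cell pattern recurs in every spider, so it can be factored into a tiny helper. This function is hypothetical (not in the original code), but shows one way to make every per-field extraction tolerant of missing nodes:

def first(nodes, default=''):
    """Return the first item of an xpath() result list, or a default."""
    return nodes[0] if nodes else default

# Usage inside the loop above:
# area = first(info.xpath('./td[4]/a/text()'))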
def getProxies(self):
    # Load the logging configuration
    getLogConfig()
    proxy_model_list = []
    response = super(ip181Spider, self).getProxies()
    # This site is encoded in gb2312
    response.encoding = 'gb2312'
    selector = etree.HTML(response.text)
    infos = selector.xpath('//div[@class="col-md-12"]/table/tbody/tr')
    for i, info in enumerate(infos):
        if i == 0:
            # The first row is the column header; skip it
            # (enumerate starts at 0, so the header is index 0, not 1).
            continue
        try:
            ip = info.xpath('./td[1]/text()')[0]          # IP
            port = info.xpath('./td[2]/text()')[0]        # port
            anonymity = info.xpath('./td[3]/text()')[0]   # anonymity level
            proxy_type = info.xpath('./td[4]/text()')[0]  # type
            speed = info.xpath('./td[5]/text()')[0]       # speed
            area = info.xpath('./td[6]/text()')[0]        # area
            # print(ip + " | " + port + " | " + anonymity + " | " + proxy_type + " | " + speed + " | " + area)
            proxy = proxyModel()
            proxy.set_ip(ip)
            proxy.set_port(port)
            if proxy_type == 'HTTP,HTTPS':
                proxy.set_type('http')
            else:
                proxy.set_type(proxy_type.lower())
            proxy.set_anonymity(anonymity)
            proxy.set_area(area)
            proxy.set_speed(speed)
            proxy.set_agent(self.agent)
            proxy.set_survivalTime("")
            proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("Scraped a total of " + str(len(proxy_model_list)) + " proxies from " + self.agent)
    return proxy_model_list
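Hard-coding gb2312 works for this one site, but requests can also guess the charset from the response body. A short sketch using the real `apparent_encoding` property (the URL is illustrative; the spider actually gets its response via the base class):

import requests

response = requests.get('http://www.ip181.com/', timeout=10)  # URL assumed
# apparent_encoding is a chardet-based guess from the body bytes,
# useful when the Content-Type header lies or is missing.
response.encoding = response.apparent_encoding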
def __init__(self):
    # Load the logging configuration
    getLogConfig()
    self.__proxyTable = 'proxy'
    self.conn = pymysql.connect(
        host=config.MYSQL_HOST,
        db=config.MYSQL_DBNAME,
        user=config.MYSQL_USER,
        passwd=config.MYSQL_PASSWORD,
        charset='utf8',  # set the charset explicitly, or Chinese text may come back garbled
        use_unicode=False)
    # Keep the connection open for the object's lifetime; wrapping this in a
    # `with self.conn:` block would let pymysql clean up the connection as
    # soon as the block exits, leaving the cursor unusable.
    self.cursor = self.conn.cursor()
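With the cursor open, persisting one proxyModel is a single parameterized INSERT. A minimal sketch of such a method, assuming the `proxy` table's columns mirror the model's fields and that proxyModel exposes getters matching its setters (both assumptions, since neither is shown above):

def saveProxy(self, proxy):
    # Parameterized SQL lets pymysql escape the scraped strings for us.
    sql = ("INSERT INTO " + self.__proxyTable +
           " (ip, port, type, anonymity, area, speed, agent, survivalTime)"
           " VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    self.cursor.execute(sql, (proxy.get_ip(), proxy.get_port(), proxy.get_type(),
                              proxy.get_anonymity(), proxy.get_area(), proxy.get_speed(),
                              proxy.get_agent(), proxy.get_survivalTime()))
    self.conn.commit()  # pymysql does not autocommit by default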
def getProxies(self):
    # Load the logging configuration
    getLogConfig()
    proxy_model_list = []
    response = super(kuaidailiSpider, self).getProxies()
    # One capture group per <td> column; only the first six are used below.
    # Raw strings keep the \s escapes out of Python's string-escape rules.
    pattern = re.compile(
        r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>'
        r'\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>'
        r'\s.*?<td.*?>(.*?)</td>\s.*?</tr>', re.S)
    infos = re.findall(pattern, response.text)
    for item in infos:
        try:
            ip = item[0]          # IP
            port = item[1]        # port
            anonymity = item[2]   # anonymity level
            proxy_type = item[3]  # type
            area = item[4]        # area
            speed = item[5]       # speed
            if proxy_type == 'HTTP' or proxy_type == 'HTTPS':
                # print(proxy_type.lower() + "://" + ip + ":" + port)
                proxy = proxyModel()
                proxy.set_ip(ip)
                proxy.set_port(port)
                proxy.set_type(proxy_type.lower())
                proxy.set_anonymity(anonymity)
                proxy.set_area(area)
                proxy.set_speed(speed)
                proxy.set_agent(self.agent)
                proxy.set_survivalTime("")
                proxy_model_list.append(proxy)
        except Exception as e:
            logging.debug(e)
    logging.debug("Scraped a total of " + str(len(proxy_model_list)) + " proxies from " + self.agent)
    return proxy_model_list
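The seven capture groups map positionally onto the table columns, which is easy to verify against a fabricated row. The HTML snippet below is illustrative, not the site's real markup:

import re

pattern = re.compile(
    r'<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>'
    r'\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>'
    r'\s.*?<td.*?>(.*?)</td>\s.*?</tr>', re.S)

sample = '''<tr>
<td>1.2.3.4</td>
<td>8080</td>
<td>high-anon</td>
<td>HTTP</td>
<td>Beijing</td>
<td>0.5s</td>
<td>2018-01-01</td>
</tr>'''
print(re.findall(pattern, sample))
# [('1.2.3.4', '8080', 'high-anon', 'HTTP', 'Beijing', '0.5s', '2018-01-01')]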
def start_spider(self):
    '''Run multiple scrapy spiders inside the same process.'''
    self.process = CrawlerProcess(self.setting)
    # self.process.crawl(Demospider)  # swap in your own scrapy spider class
    self.process.start()

def stop(self):
    '''Stop the spiders and the proxy pool.'''
    self.isRunning = False
    # Release resources
    getProxyPoolWorker().stopWork()
    # TODO: stop the scrapy spider as well?

def start(self):
    '''Start the spiders and the proxy pool.'''
    self.start_proxyPool()
    self.start_spider()


if __name__ == '__main__':
    # Load the logging configuration
    getLogConfig()
    manager = SpiderManager()
    manager.start()
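For reference, scrapy's CrawlerProcess.crawl() takes the spider class itself (or a Crawler), not an instance. A minimal standalone sketch with a placeholder spider:

import scrapy
from scrapy.crawler import CrawlerProcess

class DemoSpider(scrapy.Spider):
    # Placeholder spider: swap in your own scrapy.Spider subclass.
    name = 'demo'
    start_urls = ['http://example.com']

    def parse(self, response):
        self.logger.info('fetched %s', response.url)

process = CrawlerProcess()   # or CrawlerProcess(self.setting) inside the manager
process.crawl(DemoSpider)    # pass the class, not DemoSpider()
process.start()              # blocks until all queued crawls finish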