def initPool(self):
    headers = {
        # 'Cookie': '__jsluid=7047e65c298237d485207bb867f6d903; __jsl_clearance=1556350912.141|0|ObnYAjOyNX3tzLrsd9c%2Btx7qzRk%3D; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1556350916; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1556351251',
        # 'Host': 'www.66ip.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
    }
    # Fetch candidate proxies from two free proxy-list sites.
    reqs = [
        Request(
            'get',
            'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=',
            headers=headers),
        Request(
            'get',
            'http://www.89ip.cn/tqdl.html?api=1&num=100&port=&address=&isp=',
            headers=headers)
    ]
    spider = Spider(
        parser=ipParser(),
        writter=CommonWritter(writeMode=CommonWritter.WritterMode.EXTEND))
    spider.run(reqs)
    # self.__pool = [{'http': i} for i in spider.getItems()]
    # https://tool.lu/ip/ajax.html
    # Keep only the proxies that can actually reach an IP-echo page.
    for i in spider.getItems():
        proxy = {'http': i}
        try:
            timeout = 10
            res = requests.get('http://2019.ip138.com/ic.asp',
                               proxies=proxy, timeout=timeout)
            group = re.search(r'<center>.*?((?:\d+\.){3}\d+).*?</center>',
                              res.text)
            if group:
                print(group[0], "---------", proxy['http'])
                self.__pool.append(proxy)
            else:
                print(group)
                print('invalid', proxy['http'])
        except Exception:
            print('invalid and exception', proxy['http'])
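# --- Illustrative addition, not part of the original class ---
# A minimal sketch of how the validated pool built above might be consumed:
# return a random proxy dict, or None when the pool is empty. The method
# name getProxy is an assumption, not an API of the original code.
def getProxy(self):
    import random
    return random.choice(self.__pool) if self.__pool else None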
                '数据提供方': item['orgName'],
                '数据名称': item['name'],
                # '更新时间': item['updTime'],
                'id': item['id'],
                'list': item['list']
            })
        # Return (follow-up requests, parsed items); there are no follow-up requests here.
        return [], items


# Writer class; xlsx output is used here.
writter = XlsxWritter(writeMode=XlsxWritter.WritterMode.EXTEND)
# Create the spider; the parser class is CatalogParser.
cataSpider = Spider(name="cataSpider", parser=CatalogParser(), writter=writter)
cataSpider.run(reqs)
# Save the data.
cataSpider.write(file_key + 'catalog.xlsx', write_header=True)

############################################################################
# Using the catalog info, continue crawling the API endpoint details and test each endpoint.
# Fetch the catalog items.
cata = cataSpider.getItems(type='dict')
# Proxy pool
# ppool = ProxyPool()
url_template = 'http://data.guizhou.gov.cn/dataopen/api/dataset/{}/apidata?callback=jQuery111305121739185053189_1555829556219&_=1555829556226'
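# --- Illustrative sketch, not in the original excerpt ---
# Assuming each catalog item exposes the dataset id under 'id' (as parsed
# above by CatalogParser), the follow-up API requests could be built roughly
# like this; the name apiReqs is chosen here for illustration.
apiReqs = [Request('get', url_template.format(item['id'])) for item in cata]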
        doc = response.html()
        topicCata = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div/div[1]/ul[1]/li/a')
        data = []
        for item in topicCata:
            data.append({
                'name': item.text,
                'url': item.get('href'),
                'num': item.text.split('(')[-1][:-1]
            })
        return [], data


topicCataWritter = XlsxWritter(XlsxWritter.WritterMode.EXTEND)
topicCataSpider = Spider(name='topicspider',
                         parser=TopicClassifyParser(),
                         writter=topicCataWritter)


# City / district classification
class CityCataParser(HtmlParser):
    def parse(self, response):
        doc = response.html()
        topicCata = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div/div[1]/ul[3]/li/a')
        data = []
        for item in topicCata:
            data.append({
                'name': item.text,
                'url': item.get('href'),
                'num': item.text.split('(')[-1][:-1]
data["验证更新时间"] = True else: data["验证更新时间"] = False except Exception as e: data['验证更新时间'] = '日期格式异常:' + str(e) # 返回下一次爬取的请求集合与解析到的数据 return [], data # 将数据写进xlsx中,因为数据有多层嵌套,所以此时选择写入json文件中,如需要写入xlsx中,取消下方注释即可 dimenInfoWritter = XlsxWritter(writeMode=XlsxWritter.WritterMode.APPEND) # 创建爬虫对象 dimenSpider = Spider(name='dimenSpider', parser=DimensionParser(), writter=dimenInfoWritter) if __name__ == '__main__': from cataSpider import cataSpider, getCataIndex # 通过filekey指定不同城市或者不同政府机构,支持以下四个 filekey = '深圳市' cataInfo = getCataIndex(key=filekey, classfied='city') # 如何运行该爬虫: # 下载安装python环境 # 下载安装爬虫库 sspider : 使用该命令即可`pip install sspider` # type/0 数据集
        doc = response.html()
        data = []
        # /html/body/div[5]/table/tbody/tr/td[3]/div[2]
        items = doc.xpath(
            '/html/body/div[5]/table/tbody/tr/td[3]/div/div[1]/a')[1:6]
        for item in items:
            data.append({
                'url': host + item.get('href'),
                '名称': item.text.strip()
            })
        return [], data


cataWritter = XlsxWritter(writeMode=XlsxWritter.WritterMode.EXTEND)
cataSpider = Spider(name="cataSpider", parser=cataParser(), writter=cataWritter)


class dimenParser(HtmlParser):
    def parse(self, response):
        doc = response.html()
        # other_info carries the catalog row that triggered this request.
        data = response.request.other_info
        trs = doc.xpath('/html/body/div[5]/div/div[2]/div/div[1]/table/tr')
        for td in trs:
            item = td.xpath('string(.)').strip()
            temp = item.split('\r\n')
            if len(temp) >= 2:
                data[temp[0].strip()] = temp[-1].strip()  # split into label / value
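                # Illustrative note (not in the original source): a row whose
                # flattened text reads 'label\r\n \r\nvalue' is stored as
                # data['label'] = 'value'.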