def parse(self, response):
    data = response.jsonp()['data']
    # Retrieve the info that was attached to the request
    item = response.request.other_info
    try:
        item['接口地址'] = data['ifsAddr']
        start = time.time()
        res = requests.get(data['requestDemo'])
        if res.status_code == 200 and res.text:
            item['API_AVALIABLE'] = '可用'
            # Download and save the returned interface data
            print(item['数据名称'], '可用')
            path = 'source/' + item['数据名称'] + '.' + data['supportFormat']
            saveContent(path, res.content)
        else:
            item['API_AVALIABLE'] = '不可用'
        item['测试响应时长'] = time.time() - start
        if 'supportFormat' in data:
            item['支持格式'] = data['supportFormat']
            # Validate the format here if needed
            # pass
        # Merge the detail fields into the item; duplicate keys get a '~1' suffix
        for key in data:
            if key in item:
                item[key + '~1'] = data[key]
            else:
                item[key] = data[key]
    except Exception as e:
        item['API_AVALIABLE'] = '接口详情页面异常' + str(e)
    # Return the requests for the next crawl round and the parsed data
    return [], item
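# `saveContent(path, content)` is imported from datacommon.util (see the Beijing
# re-download script further down); its implementation is not part of this listing.
# A minimal sketch, assuming it only creates the target directory and writes raw bytes:
import os


def saveContent(path, content):
    """Write binary content to `path`, creating parent directories if needed."""
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        f.write(content)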
def parse(self, response):
    doc = response.html()
    data = response.request.other_info
    trs = doc.xpath('/html/body/div[5]/div/div[2]/div/div[1]/table/tr')
    # Each row holds a "key\r\n...\r\nvalue" pair; split it and store it in the data dict
    for tr in trs:
        item = tr.xpath('string(.)').strip()
        temp = item.split('\r\n')
        if len(temp) >= 2:
            data[temp[0].strip()] = temp[-1].strip()
    # Test the API
    test = 'API测试'
    try:
        headers = {'Authorization': token}
        res = requests.get(data['请求示例:'], headers=headers)
        file_format = '.json' if 'json' in data['支持格式:'] else '.' + data['支持格式:']
        path = 'source/' + data['名称'] + file_format
        saveContent(path, res.content)
        data[test] = '可用'
        data[test + '_status'] = res.status_code
        data[test + '_text'] = res.text
    except Exception as e:
        data[test] = '异常: ' + str(e)
    # Give a clearer reason when the request-example field is missing entirely
    if '请求示例:' not in data:
        data[test] = '异常: 无请求示例'
    return [], data
def parse(self, response):
    doc = response.html()
    data = response.request.other_info
    url = doc.xpath('/html/body/div[1]/div[2]/div[2]/p/a')[0].text
    param = doc.xpath(
        '/html/body/div[1]/div[2]/div[4]/table/tbody/tr/td[4]')[0].text
    des = doc.xpath(
        '/html/body/div[1]/div[2]/div[6]/p')[0].xpath('string(.)')
    # Fill the request template with the personal token and the file id
    data['apiurl'] = url.replace('个人唯一标识码', token).replace('文件编号', param)
    data['apidoc'] = des.strip()
    # Test the API
    test = 'API测试'
    try:
        res = requests.get(data['apiurl'])
        path = 'source/' + data['名称'] + '.json'
        saveContent(path, res.content)
        data[test] = '可用'
        data[test + '_status'] = res.status_code
        data[test + '_text'] = res.text
    except Exception as e:
        data[test] = '异常: ' + str(e)
    return [], data
def parse(self, response):
    doc = response.html()
    data = response.request.other_info
    # Creation time / update time
    table = doc.xpath('//div[@class="base-info"]/table')[0]
    for tbody in table:
        tds = tbody.getchildren()
        for index in range(0, len(tds), 2):
            data[tds[index].text.strip()] = tds[index + 1].text
    # API-related fields
    block_path = '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li'
    # For each <li>, the key is the text of the first matched child node
    # and the value is the text of the next one
    block = doc.xpath(block_path)
    for li in block:
        tags = []
        for i in li:
            if i.tag in ('span', 'h3'):
                tags.append(i)
        key = tags[0].xpath('string(.)').strip()
        value = tags[1].xpath('string(.)').strip()
        data[key] = value
    # Test the API
    demo = '请求示例:'
    method = '请求方式:'
    test = 'API测试'
    if demo in data:
        start = time.time()
        try:
            if data[method].upper() == 'GET':
                res = requests.get(data[demo])
            else:
                res = requests.post(data[demo])
            data[test] = '可用'
            data[test + '_status'] = res.status_code
            data[test + '_text'] = res.text
            path = 'source/' + data['数据集name'] + '.' + data['支持格式:']
            saveContent(path, res.content)
        except Exception:
            data[test] = '异常'
        data['响应时长'] = time.time() - start
    else:
        data[test] = '不存在'
    # Return the requests for the next crawl round and the parsed data
    return [], data
# The Beijing API responses only link to their underlying files, so those files
# have to be downloaded in a second pass.
from datacommon import reader
from datacommon.util import saveContent, listFile
import json
import requests
import os


def filterNullList(tar_list):
    # Return the first non-empty element of the list
    for i in tar_list:
        if i:
            return i


files = listFile('source')
for file in files:
    content = json.loads(reader.getTxtData(file).strip())
    url = content['result']['address']
    target = 'apidatasource/' + content['result']['name'] + '.' + url.split('.')[-1]
    # Skip files that have already been downloaded
    if os.path.exists(target):
        continue
    res = requests.get(url)
    print('-----' + content['result']['name'])
    saveContent(target, res.content)
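# `listFile` and `reader.getTxtData` also come from the datacommon package and are
# not shown in this listing. A rough sketch of what they are assumed to do -- list
# every file under a directory and read a text file as a string -- could look like
# this (hypothetical implementations, not the package's actual code):
import os


def listFile(root):
    """Return the paths of all files under `root`, recursively."""
    paths = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            paths.append(os.path.join(dirpath, name))
    return paths


def getTxtData(path, encoding='utf-8'):
    """Read a text file and return its contents as a string."""
    with open(path, encoding=encoding) as f:
        return f.read()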
def parse(self, response):
    doc = response.html()
    data = response.request.other_info
    update_times = []
    # Build the set of dimensions shown on the page
    dimensions = {
        a.text.strip(): a.text.strip()
        for a in doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[1]/a')
    }
    # Detailed dimension: downloadable files
    if DATA_DOWNLOAD in dimensions:
        links = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li//a')
        texts = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li//span[@class="item-ext"]/text()')
        times = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li//h4/text()')
        dimensions[DATA_DOWNLOAD] = []
        for i in range(len(links)):
            item = {
                'url': seed_url + links[i].get('href'),
                'format': texts[i] if i < len(texts) else '',
                'name': links[i].text.strip(),
                'time': times[i] if i < len(times) else ''
            }
            # Collect the timestamps so the last update time can be verified later
            update_times.append(item['time'])
            # Request the link to check whether the file can actually be downloaded
            test_link = item['url']
            res = requests.get(test_link)
            if res.status_code == 200 and res.text:
                item['是否可用'] = True
            else:
                item['是否可用'] = False
            path = 'source/' + data['name'] + '/' + \
                item['name'] + '.' + item['format']
            # Save the file
            util.saveContent(path, res.content)
            item['保存路径'] = path
            # Print each sub-item for debugging
            # print(item)
            dimensions[DATA_DOWNLOAD].append(item)
    # Related API interface
    if DATA_MANAGE in dimensions:
        try:
            inter = doc.xpath(
                '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/a')[0]
            item = {
                "接口名称": inter.xpath('string(.)'),
                "接口地址": seed_url + inter.get('href')
            }
            dimensions[DATA_MANAGE] = item
        except Exception:
            dimensions[DATA_MANAGE] = '异常:数据管理标签存在,但无内容'
    # The other dimensions are handled in the same way
    # Merge the dimension info into the data
    data.update(dimensions)
    # Creation time / update time
    table = doc.xpath('//div[@class="base-info"]/table')[0]
    for tbody in table:
        tds = tbody.getchildren()
        for index in range(0, len(tds), 2):
            data[tds[index].text.strip()] = tds[index + 1].text.strip()
    # Verify that the page's last-modified time matches the collected file timestamps
    try:
        compare_update_times = [
            time.mktime(time.strptime(t, "%Y-%m-%d")) for t in update_times
        ]
        if compare_update_times and data['最后修改时间'] and time.mktime(
                time.strptime(data['最后修改时间'], "%Y-%m-%d")) == compare_update_times[-1]:
            data["验证更新时间"] = True
        else:
            data["验证更新时间"] = False
    except Exception as e:
        data['验证更新时间'] = '日期格式异常:' + str(e)
    # Return the requests for the next crawl round and the parsed data
    return [], data
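# All of the parse methods above share the same contract: the crawler framework
# hands them a response object that exposes `html()`/`jsonp()` and carries the
# originating request (with `other_info` attached), and each method returns a
# tuple of (follow-up requests, parsed data). The framework itself is not shown;
# below is a minimal driver sketch under those assumptions. The Request/Response
# classes and the `crawl` loop are hypothetical stand-ins, not the real datacommon API.
import requests
from lxml import etree


class Request:
    def __init__(self, url, other_info=None):
        self.url = url
        self.other_info = other_info or {}


class Response:
    def __init__(self, request, raw):
        self.request = request
        self._raw = raw

    def html(self):
        # Parse the raw bytes into an lxml element tree for XPath queries
        return etree.HTML(self._raw.content)

    def jsonp(self):
        # In this sketch the JSON endpoints are assumed to return a plain JSON body
        return self._raw.json()


def crawl(spider, seeds):
    """Fetch each request, let the spider parse it, and queue any follow-ups."""
    results = []
    pending = list(seeds)
    while pending:
        req = pending.pop(0)
        resp = Response(req, requests.get(req.url))
        next_requests, data = spider.parse(resp)
        pending.extend(next_requests)
        results.append(data)
    return results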