示例#1
0
    def parse(self, response):
        """Parse an API detail page: probe the demo endpoint, save its
        payload, and merge the JSONP data into the item carried on the
        request. Returns ([], item): no follow-up requests plus the item.
        """
        payload = response.jsonp()['data']
        # Item accumulated by the previous request rides along on it.
        item = response.request.other_info
        try:
            item['接口地址'] = payload['ifsAddr']
            start = time.time()
            probe = requests.get(payload['requestDemo'])
            usable = probe.status_code == 200 and bool(probe.text)
            item['API_AVALIABLE'] = '可用' if usable else '不可用'
            if usable:
                # To download the interface data, keep the save below enabled
                print(item['数据名称'], '可用')
                path = 'source/' + item['数据名称'] + '.' + payload['supportFormat']
                saveContent(path, probe.content)
            item['测试响应时长'] = time.time() - start
            if 'supportFormat' in payload:
                item['支持格式'] = payload['supportFormat']
                # format validation could be added here

            # Merge payload into item; duplicate keys get a '~1' suffix.
            for key in payload:
                target = key + '~1' if key in item else key
                item[target] = payload[key]
        except Exception as e:
            item['API_AVALIABLE'] = '接口详情页面异常' + str(e)
        # Return (requests for the next crawl round, parsed item)
        return [], item
示例#2
0
    def parse(self, response):
        """Parse a dataset detail page into key/value pairs from its table,
        then probe the example API endpoint and store the outcome.
        Returns ([], data): no follow-up requests plus the enriched data.
        """
        doc = response.html()
        data = response.request.other_info

        # Each table row's text is "key\r\n...\r\nvalue"; split and keep ends.
        rows = doc.xpath('/html/body/div[5]/div/div[2]/div/div[1]/table/tr')
        for row in rows:
            pieces = row.xpath('string(.)').strip().split('\r\n')
            if len(pieces) >= 2:
                data[pieces[0].strip()] = pieces[-1].strip()

        # Probe the API
        test = 'API测试'
        try:
            res = requests.get(data['请求示例:'],
                               headers={'Authorization': token})
            fmt = data['支持格式:']
            ext = '.json' if 'json' in fmt else '.' + fmt
            saveContent('source/' + data['名称'] + ext, res.content)
            data[test] = '可用'
            data[test + '_status'] = res.status_code
            data[test + '_text'] = res.text
        except Exception as e:
            data[test] = '异常: ' + str(e)
            # A missing demo url is the expected failure; label it clearly.
            if '请求示例:' not in data:
                data[test] = '异常: 无请求示例'

        return [], data
示例#3
0
    def parse(self, response):
        """Build the concrete API url from the detail page, store its
        description, then probe the endpoint once and save the payload.
        Returns ([], data): no follow-up requests plus the enriched data.
        """
        doc = response.html()
        data = response.request.other_info

        raw_url = doc.xpath('/html/body/div[1]/div[2]/div[2]/p/a')[0].text
        file_id = doc.xpath(
            '/html/body/div[1]/div[2]/div[4]/table/tbody/tr/td[4]')[0].text
        description = doc.xpath(
            '/html/body/div[1]/div[2]/div[6]/p')[0].xpath('string(.)')

        # Substitute the personal-token and file-id placeholders in the url.
        data['apiurl'] = raw_url.replace('个人唯一标识码',
                                         token).replace('文件编号', file_id)
        data['apidoc'] = description.strip()

        # Probe the API
        test = 'API测试'
        try:
            res = requests.get(data['apiurl'])
            saveContent('source/' + data['名称'] + '.json', res.content)
            data[test] = '可用'
            data[test + '_status'] = res.status_code
            data[test + '_text'] = res.text
        except Exception as e:
            data[test] = '异常: ' + str(e)

        return [], data
示例#4
0
    def parse(self, response):
        """Parse a dataset detail page: collect the base-info table fields,
        extract the key/value list from the api block, then probe the demo
        endpoint and time the response.

        Returns ([], data): no follow-up requests plus the enriched item
        carried on response.request.other_info.
        """
        doc = response.html()
        data = response.request.other_info

        # Creation time / update time from the base-info table
        table = doc.xpath('//div[@class="base-info"]/table')[0]
        for tbody in table:
            tds = tbody.getchildren()
            # Cells alternate label / value.
            for index in range(0, len(tds), 2):
                data[tds[index].text.strip()] = tds[index + 1].text

        # api-related block
        block_path = '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li'

        # Key is the text of the first span/h3 child, value the second's.
        block = doc.xpath(block_path)
        for li in block:
            # BUG FIX: the original tested `i.tag == 'span' or 'h3'`, which
            # is always truthy and collected every child; restrict to the
            # intended span/h3 elements.
            tags = [i for i in li if i.tag in ('span', 'h3')]
            if len(tags) < 2:
                continue  # malformed entry: no key/value pair to extract
            key = tags[0].xpath('string(.)').strip()
            value = tags[1].xpath('string(.)').strip()
            data[key] = value

        # Probe the API
        demo = '请求示例:'
        method = '请求方式:'
        test = 'API测试'
        if demo in data:
            start = time.time()
            try:
                if data[method].upper() == 'GET':
                    res = requests.get(data[demo])
                else:
                    res = requests.post(data[demo])
                data[test] = '可用'
                data[test + '_status'] = res.status_code
                data[test + '_text'] = res.text
                path = 'source/' + data['数据集name'] + '.' + data['支持格式:']
                saveContent(path, res.content)
            except Exception:
                # Narrowed from a bare `except:` so SystemExit /
                # KeyboardInterrupt still propagate.
                data[test] = '异常'
            data['响应时长'] = time.time() - start
        else:
            data[test] = '不存在'

        # Return (requests for the next crawl round, parsed data)
        return [], data
# 北京市的api数据请求结果关联到其相应的文件,因此需要再次下载

from datacommon import reader
from datacommon.util import saveContent, listFile
import json
import requests
import os


def filterNullList(tar_list):
    """Return the first truthy element of *tar_list*, or None if there is none."""
    return next((item for item in tar_list if item), None)


files = listFile('source')

for file in files:
    # Each saved api response is a JSON document whose 'result' section
    # points at the real downloadable file.
    content = json.loads(reader.getTxtData(file).strip())
    result = content['result']
    url = result['address']
    # FIX: build the destination path once — the original repeated the
    # same concatenation for the existence check and the save, which
    # could silently diverge. Extension comes from the url suffix.
    path = 'apidatasource/' + result['name'] + '.' + url.split('.')[-1]
    # Skip files already downloaded on a previous run.
    if os.path.exists(path):
        continue
    res = requests.get(url)
    print('-----' + result['name'])
    saveContent(path, res.content)
示例#6
0
    def parse(self, response):
        """Parse a dataset detail page: collect its dimension tabs, download
        every file listed under the download tab, record the related API
        link, fold the base-info table into the item, and cross-check the
        page's last-modified date against the file timestamps.

        Returns ([], data): no follow-up requests plus the enriched item
        carried on response.request.other_info.

        NOTE(review): DATA_DOWNLOAD, DATA_MANAGE, seed_url and util are
        defined elsewhere in the project — presumably tab-label constants,
        the site base url, and a save helper; confirm against that module.
        """
        doc = response.html()
        data = response.request.other_info
        update_times = []
        # Build the set of dimension tabs (tab label -> label, refined below)
        dimensions = {
            a.text.strip(): a.text.strip()
            for a in doc.xpath(
                '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[1]/a')
        }
        # Detailed download dimension: one entry per downloadable file
        if DATA_DOWNLOAD in dimensions:
            links = doc.xpath(
                '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li//a'
            )
            texts = doc.xpath(
                '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li//span[@class="item-ext"]/text()'
            )
            times = doc.xpath(
                '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/ul/li//h4/text()'
            )

            dimensions[DATA_DOWNLOAD] = []

            # links drives the iteration; texts/times may be shorter, so
            # missing entries fall back to ''.
            for i in range(len(links)):
                item = {
                    'url': seed_url + links[i].get('href'),
                    'format': texts[i] if i < len(texts) else '',
                    'name': links[i].text.strip(),
                    'time': times[i] if i < len(times) else ''
                }
                # Collect the timestamps so the last one can be compared
                # with the page's last-modified date below.
                update_times.append(item['time'])
                # Probe the link: downloadable only if 200 with a body
                test_link = item['url']
                res = requests.get(test_link)
                if res.status_code == 200 and res.text:
                    item['是否可用'] = True
                else:
                    item['是否可用'] = False

                path = 'source/'+data['name']+'/' + \
                    item['name']+'.'+item['format']
                # Save the downloaded file (saved even when the probe failed)
                util.saveContent(path, res.content)

                item['保存路径'] = path

                # Per-file debug output, if needed
                # print(item)
                dimensions[DATA_DOWNLOAD].append(item)

        # Related API link under the data-manage tab
        if DATA_MANAGE in dimensions:
            try:
                inter = doc.xpath(
                    '/html/body/div[1]/div[2]/div[3]/div[1]/div[2]/div[2]/a'
                )[0]
                item = {
                    "接口名称": inter.xpath('string(.)'),
                    "接口地址": seed_url + inter.get('href')
                }
                dimensions[DATA_MANAGE] = item
            except:
                dimensions[DATA_MANAGE] = '异常:数据管理标签存在,但无内容'

        # Other dimensions would follow the same pattern

        # Fold the dimension info into the item
        data.update(dimensions)

        # Creation time / update time: base-info cells alternate label/value
        table = doc.xpath('//div[@class="base-info"]/table')[0]
        for tbody in table:
            tds = tbody.getchildren()
            for index in range(0, len(tds), 2):
                data[tds[index].text.strip()] = tds[index + 1].text.strip()

        # Verify the page's last-modified date equals the timestamp of the
        # LAST listed file (page order; not necessarily the newest).
        try:
            compare_update_times = [
                time.mktime(time.strptime(t, "%Y-%m-%d")) for t in update_times
            ]
            if compare_update_times and data['最后修改时间'] and time.mktime(
                    time.strptime(data['最后修改时间'],
                                  "%Y-%m-%d")) == compare_update_times[-1]:
                data["验证更新时间"] = True
            else:
                data["验证更新时间"] = False
        except Exception as e:
            data['验证更新时间'] = '日期格式异常:' + str(e)

        # Return (requests for the next crawl round, parsed data)
        return [], data