Example #1
    def initPool(self):
        # Fetch free proxy lists from 66ip.cn and 89ip.cn, then keep only the
        # proxies that can successfully relay a request to an IP-echo page.
        headers = {
            # 'Cookie': '__jsluid=7047e65c298237d485207bb867f6d903; __jsl_clearance=1556350912.141|0|ObnYAjOyNX3tzLrsd9c%2Btx7qzRk%3D; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1556350916; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1556351251',
            # 'Host': 'www.66ip.cn',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
        }
        reqs = [
            Request(
                'get',
                'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=',
                headers=headers),
            Request(
                'get',
                'http://www.89ip.cn/tqdl.html?api=1&num=100&port=&address=&isp=',
                headers=headers)
        ]

        spider = Spider(
            parser=ipParser(),
            writter=CommonWritter(writeMode=CommonWritter.WritterMode.EXTEND))
        spider.run(reqs)

        # self.__pool = [{'http': i} for i in spider.getItems()]
        # # https://tool.lu/ip/ajax.html
        for i in spider.getItems():
            proxy = {'http': i}
            try:
                timeout = 10
                res = requests.get('http://2019.ip138.com/ic.asp',
                                   proxies=proxy,
                                   timeout=timeout)
                group = re.search(r'<center>.*?((?:\d+\.){3}\d+).*?</center>',
                                  res.text)
                if group:
                    print(group[0], "---------", proxy['http'])
                    self.__pool.append(proxy)
                else:
                    print(group)
                    print('invalid', proxy['http'])
            except Exception:
                print('invalid and exception', proxy['http'])
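A minimal usage sketch, not taken from the example above: it assumes the validated pool built by initPool() is a plain list of proxy dicts, and the pick_proxy helper below is hypothetical.

import random

import requests


def pick_proxy(pool):
    # pool is the list built above, e.g. [{'http': '1.2.3.4:8080'}, ...];
    # return one validated proxy at random, or None if the pool is empty.
    return random.choice(pool) if pool else None


# Example call site, mirroring the validation request in the snippet above:
# proxy = pick_proxy(validated_pool)
# res = requests.get('http://2019.ip138.com/ic.asp', proxies=proxy, timeout=10)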
Example #2
                '数据提供方': item['orgName'],  # data provider
                '数据名称': item['name'],  # dataset name
                # '更新时间': item['updTime'],  # update time (not used)
                'id': item['id'],
                'list': item['list']
            })

        # Return the follow-up requests and the parsed data; no follow-up requests here
        return [], items


# Data writer class; xlsx output is used here
writter = XlsxWritter(writeMode=XlsxWritter.WritterMode.EXTEND)

# Build the spider object; the parser class is CatalogParser
cataSpider = Spider(name="cataSpider", parser=CatalogParser(), writter=writter)
cataSpider.run(reqs)
# Save the data
cataSpider.write(file_key + 'catalog.xlsx', write_header=True)

############################################################################

# Continue crawling the API endpoint info based on the catalog entries and test each API
# Fetch the catalog info
cata = cataSpider.getItems(type='dict')

# Proxy
# ppool = ProxyPool()

url_template = 'http://data.guizhou.gov.cn/dataopen/api/dataset/{}/apidata?callback=jQuery111305121739185053189_1555829556219&_=1555829556226'
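A hedged sketch of the follow-up step described in the comments above: build one API request per catalog entry and run them with the same Request/Spider pattern shown in this example. ApiParser is a hypothetical parser name, and headers are omitted here although the other examples sometimes pass a headers dict.

# One API request per catalog entry; each item carries the dataset 'id'
# collected by CatalogParser above.
api_reqs = [Request('get', url_template.format(item['id'])) for item in cata]

# apiSpider = Spider(name='apiSpider', parser=ApiParser(), writter=writter)
# apiSpider.run(api_reqs)
# apiSpider.write(file_key + 'api.xlsx', write_header=True)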
Example #3
        doc = response.html()
        topicCata = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div/div[1]/ul[1]/li/a')
        data = []
        for item in topicCata:
            data.append({
                'name': item.text,
                'url': item.get('href'),
                # Count inside the trailing parentheses, e.g. 'xxx(12)' -> '12'
                'num': item.text.split('(')[-1][:-1]
            })
        return [], data


topicCataWritter = XlsxWritter(XlsxWritter.WritterMode.EXTEND)
topicCataSpider = Spider(name='topicspider',
                         parser=TopicClassifyParser(),
                         writter=topicCataWritter)


# Classification by city / region
class CityCataParser(HtmlParser):
    def parse(self, response):
        doc = response.html()
        topicCata = doc.xpath(
            '/html/body/div[1]/div[2]/div[3]/div[1]/div/div[1]/ul[3]/li/a')
        data = []
        for item in topicCata:
            data.append({
                'name': item.text,
                'url': item.get('href'),
                'num': item.text.split('(')[-1][:-1]
Example #4
                data["验证更新时间"] = True
            else:
                data["验证更新时间"] = False
        except Exception as e:
            data['验证更新时间'] = '日期格式异常:' + str(e)

        # Return the set of requests for the next crawl round and the parsed data
        return [], data
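Since the start of the update-time check is cut off above, here is a hedged, self-contained reconstruction of the idea: parse the published update time and record whether it is recent enough. The date format and the one-year threshold are assumptions, not taken from the source.

from datetime import datetime


def check_update_time(raw, max_age_days=365):
    # Returns True/False like data["验证更新时间"] above, or the same
    # '日期格式异常:' error marker when the date cannot be parsed.
    try:
        upd = datetime.strptime(raw, '%Y-%m-%d')  # assumed date format
        return (datetime.now() - upd).days <= max_age_days
    except ValueError as e:
        return '日期格式异常:' + str(e)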


# Data output. Because the data is nested several levels deep, writing to a JSON file fits it better; to write xlsx, use the writer below.
dimenInfoWritter = XlsxWritter(writeMode=XlsxWritter.WritterMode.APPEND)

# Create the spider object
dimenSpider = Spider(name='dimenSpider',
                     parser=DimensionParser(),
                     writter=dimenInfoWritter)

if __name__ == '__main__':
    from cataSpider import cataSpider, getCataIndex

    # filekey selects the city or government agency; four values are supported
    filekey = '深圳市'  # Shenzhen

    cataInfo = getCataIndex(key=filekey, classfied='city')

    # How to run this spider:
    # Download and install a Python environment
    # Install the crawler library sspider with: `pip install sspider`

    # type/0: datasets
Example #5
        doc = response.html()
        data = []
        # /html/body/div[5]/table/tbody/tr/td[3]/div[2]
        # Skip the first link and keep the next five
        items = doc.xpath(
            '/html/body/div[5]/table/tbody/tr/td[3]/div/div[1]/a')[1:6]
        for item in items:
            data.append({
                'url': host + item.get('href'),
                '名称': item.text.strip()  # '名称' = name
            })

        return [], data


cataWritter = XlsxWritter(writeMode=XlsxWritter.WritterMode.EXTEND)
cataSpider = Spider(name="cataSpider",
                    parser=cataParser(), writter=cataWritter)


class dimenParser(HtmlParser):
    def parse(self, response):
        doc = response.html()
        # Start from the info attached to the originating request
        data = response.request.other_info

        trs = doc.xpath('/html/body/div[5]/div/div[2]/div/div[1]/table/tr')

        for tr in trs:
            # Each row's text reads "key ... value" with a line break in between
            item = tr.xpath('string(.)').strip()
            temp = item.split('\r\n')
            if len(temp) >= 2:
                data[temp[0].strip()] = temp[-1].strip()
            # split
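To make the row-splitting above concrete, a small illustration with a made-up row string (real rows come from xpath('string(.)') on each table row):

row_text = 'Data format \r\n XLS,XML,JSON'  # invented sample of one row's text
parts = row_text.split('\r\n')
if len(parts) >= 2:
    key, value = parts[0].strip(), parts[-1].strip()
    # key == 'Data format', value == 'XLS,XML,JSON'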