示例#1
0
 def getData():
     """Drain ``queue_town`` and collect the towns listed on each queued page.

     Every URL taken from the queue is fetched with ``getUrl`` and parsed with
     ``etree.HTML``. Each ``<tr class="towntr">`` row contributes a dict with
     the keys ``'code'``, ``'link'`` and ``'name'`` to the ``town`` list that
     is defined outside this function. Nothing is returned.
     """
     # NOTE(review): empty() followed by a blocking get() can hang if another
     # consumer takes the last item in between -- confirm whether several
     # threads share this queue before relying on this loop to terminate.
     while not queue_town.empty():  # leave the loop once all URLs are taken
         url = queue_town.get()
         # Links on the page are relative to the page's own directory. Cutting
         # at the last "/" gives that directory for any file-name length, so
         # the pages whose names are 9 characters long (previously matched by
         # two literal 2016 URLs) no longer need a special case.
         base = url[:url.rfind('/') + 1]
         selector = etree.HTML(getUrl(url))
         for row in selector.xpath('//tr[@class="towntr"]'):
             codes = row.xpath('td[1]/a/text()')
             links = row.xpath('td[1]/a/@href')
             names = row.xpath('td[2]/a/text()')
             # The xpath results are lists; zip() pairs entries by position
             # and stops at the shortest one, so a row that lacks a code or
             # name cell is skipped instead of raising IndexError.
             for code, link, name in zip(codes, links, names):
                 print(str(name) + ":" + str(code))
                 time.sleep(0.1)
                 town.append({
                     'code': code,
                     'link': base + link,
                     'name': name
                 })
示例#2
0
def getProvince(url):
    """Fetch the page at *url* and return the provinces it links to.

    Args:
        url: Address of the page to scrape. Province links found on the page
            are appended to this address's directory (everything up to and
            including its last "/").

    Returns:
        A list of ``{'name': ..., 'link': ...}`` dicts, one per name/link
        pair found in the ``<tr class="provincetr">`` rows, in page order.
    """
    selector = etree.HTML(getUrl(url))
    # Cut at the last "/" rather than removing a fixed 10 characters, so the
    # result does not depend on the length of the page's file name.
    base = url[:url.rfind('/') + 1]
    province = []
    for row in selector.xpath('//tr[@class="provincetr"]'):
        names = row.xpath('td/a/text()')

        print("爬取以下省信息\n "+str(names))

        links = row.xpath('td/a/@href')
        # zip() pairs names and links by position and stops at the shorter
        # list instead of raising IndexError when a name is missing.
        for name, link in zip(names, links):
            province.append({'name': name, 'link': base + link})
            print(name)
    return province
示例#3
0
 def getData():
     """Consume ``queue_village`` and record every village row that is found.

     Each URL taken from the queue is fetched with ``getUrl`` and parsed with
     ``etree.HTML``. For every ``<tr class="villagetr">`` row, a dict with the
     keys ``'code'``, ``'UrbanRuralCode'`` and ``'name'`` is appended to the
     ``village`` list defined outside this function. Nothing is returned.
     """
     while True:
         if queue_village.empty():  # no URLs left, so the worker can finish
             break
         page_url = queue_village.get()  # next URL from the queue
         tree = etree.HTML(getUrl(page_url))
         # Walk the table rows that hold one village each.
         for row in tree.xpath('//tr[@class="villagetr"]'):
             codes = row.xpath('td[1]/text()')
             urban_rural = row.xpath('td[2]/text()')
             names = row.xpath('td[3]/text()')
             # The xpath results are lists; entries sharing an index are
             # stored together in one dict.
             for idx, code in enumerate(codes):
                 time.sleep(0.5)
                 village.append({
                     'code': code,
                     'UrbanRuralCode': urban_rural[idx],
                     'name': names[idx]
                 })
                 print(":".join((str(names[idx]), str(code))))
示例#4
0
 def getData():
     """Drain ``queue_county`` and collect the counties listed on each page.

     Every URL taken from the queue is fetched with ``getUrl`` and parsed with
     ``etree.HTML``. Each ``<tr class="countytr">`` row contributes a dict
     with the keys ``'code'``, ``'link'`` and ``'name'`` to the ``county``
     list that is defined outside this function. Nothing is returned.
     """
     # NOTE(review): empty() followed by a blocking get() can hang if another
     # consumer takes the last item in between -- confirm whether several
     # threads share this queue before relying on this loop to terminate.
     while not queue_county.empty():  # leave the loop once all URLs are taken
         url = queue_county.get()
         # Links on the page are relative to the page's own directory; cut at
         # the last "/" instead of removing a fixed 9 characters so the result
         # does not depend on the length of the page's file name.
         base = url[:url.rfind('/') + 1]
         selector = etree.HTML(getUrl(url))
         for row in selector.xpath('//tr[@class="countytr"]'):
             codes = row.xpath('td[1]/a/text()')
             links = row.xpath('td[1]/a/@href')
             names = row.xpath('td[2]/a/text()')
             # The xpath results are lists; zip() pairs entries by position
             # and stops at the shortest one, so a row that lacks a code or
             # name cell is skipped instead of raising IndexError.
             for code, link, name in zip(codes, links, names):
                 county.append({
                     'code': code,
                     'link': base + link,
                     'name': name
                 })
                 print(str(name) + ":" + str(code))
示例#5
0
def getCity(url_list):
    """Collect the cities listed on every page in *url_list*.

    Args:
        url_list: Iterable of page URLs. Each page is fetched with ``getUrl``
            and parsed with ``etree.HTML``; city links found on a page are
            appended to that page's directory (everything up to and including
            the last "/" of its URL).

    Returns:
        One flat list of ``{'name': ..., 'code': ..., 'link': ...}`` dicts
        covering all pages, in the order the pages and rows were visited.
    """
    city_all = []
    for url in url_list:
        # Cut at the last "/" rather than removing a fixed 7 characters, so
        # the result does not depend on the length of the page's file name.
        base = url[:url.rfind('/') + 1]
        selector = etree.HTML(getUrl(url))
        for row in selector.xpath('//tr[@class="citytr"]'):
            codes = row.xpath('td[1]/a/text()')
            links = row.xpath('td[1]/a/@href')
            names = row.xpath('td[2]/a/text()')
            print("爬取:" + str(names))
            # zip() pairs entries by position and stops at the shortest list,
            # so a row that lacks a code or name cell is skipped instead of
            # raising IndexError.
            for name, code, link in zip(names, codes, links):
                city_all.append({
                    'name': name,
                    'code': code,
                    'link': base + link
                })
                print(str(name) + str(code))
    return city_all