def getData():
    """Worker thread body: scrape town-level (乡镇) codes and links.

    Consumes URLs from the global ``queue_town`` until it is empty (so
    worker threads can exit on their own) and appends one dict per town
    ({'code', 'link', 'name'}) to the global ``town`` list.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative hrefs safely

    while not queue_town.empty():  # lets the thread exit once all URLs are processed
        url = queue_town.get()  # fetch the next URL from the queue
        data = getUrl(url)
        selector = etree.HTML(data)
        townList = selector.xpath('//tr[@class="towntr"]')
        # Each "towntr" row carries the town code/link in td[1] and the name in td[2].
        for i in townList:
            townCode = i.xpath('td[1]/a/text()')
            townLink = i.xpath('td[1]/a/@href')
            townName = i.xpath('td[2]/a/text()')
            # xpath() returns parallel lists; store each entry as a dict.
            for j in range(len(townLink)):
                # urljoin replaces the brittle url[:-9]/url[:-11] slicing that
                # had to special-case Zhongshan (…/44/4419.html) and Dongguan
                # (…/44/4420.html): it always resolves the relative href
                # against the current page's directory, whatever the filename
                # length is.
                townURL = urljoin(url, townLink[j])
                print(str(townName[j]) + ":" + str(townCode[j]))
                time.sleep(0.1)  # throttle requests so the site is not hammered
                town.append({
                    'code': townCode[j],
                    'link': townURL,
                    'name': townName[j]
                })
def getProvince(url):
    """Scrape the province rows from the top-level index page.

    Parameters
    ----------
    url : str
        URL of the index page (presumably .../2016/index.html — the old
        code sliced off exactly 10 trailing characters, i.e. "index.html").

    Returns
    -------
    list[dict]
        One dict per province with keys 'name' and 'link' (absolute URL).
    """
    from urllib.parse import urljoin  # robust replacement for url[:-10] slicing

    province = []
    data = getUrl(url)
    selector = etree.HTML(data)
    provinceList = selector.xpath('//tr[@class="provincetr"]')
    for row in provinceList:
        provinceName = row.xpath('td/a/text()')
        print("爬取以下省信息\n "+str(provinceName))
        provinceLink = row.xpath('td/a/@href')
        # Names and hrefs come back as parallel lists; pair them up.
        # urljoin resolves the relative href against the page directory,
        # removing the assumption that the page is named "index.html".
        for name, link in zip(provinceName, provinceLink):
            province.append({'name': name, 'link': urljoin(url, link)})
            print(name)
    return province
def getData():
    """Worker thread body: scrape village-level records.

    Pulls page URLs off the global ``queue_village`` until it is empty
    (so the thread terminates by itself), parses every ``villagetr`` row,
    and appends {'code', 'UrbanRuralCode', 'name'} dicts to the global
    ``village`` list.
    """
    while not queue_village.empty():  # an empty queue ends the worker thread
        page_url = queue_village.get()  # next URL to scrape
        html = getUrl(page_url)
        root = etree.HTML(html)
        rows = root.xpath('//tr[@class="villagetr"]')
        # td[1] = statistical code, td[2] = urban/rural code, td[3] = name.
        for row in rows:
            codes = row.xpath('td[1]/text()')
            ur_codes = row.xpath('td[2]/text()')
            names = row.xpath('td[3]/text()')
            # The xpath results are parallel lists; record them entry by entry.
            for idx in range(len(codes)):
                time.sleep(0.5)  # polite delay between records
                village.append({
                    'code': codes[idx],
                    'UrbanRuralCode': ur_codes[idx],
                    'name': names[idx]
                })
                print(str(names[idx]) + ":" + str(codes[idx]))
def getData():
    """Worker thread body: scrape county-level codes and links.

    Consumes URLs from the global ``queue_county`` until it is empty (so
    worker threads exit cleanly) and appends {'code', 'link', 'name'}
    dicts to the global ``county`` list.
    """
    from urllib.parse import urljoin  # robust relative-href resolution

    while not queue_county.empty():  # exit the thread once all URLs are consumed
        url = queue_county.get()  # fetch the next URL from the queue
        data = getUrl(url)
        selector = etree.HTML(data)
        countyList = selector.xpath('//tr[@class="countytr"]')
        # td[1] carries the county code + link, td[2] the county name.
        for i in countyList:
            countyCode = i.xpath('td[1]/a/text()')
            countyLink = i.xpath('td[1]/a/@href')
            countyName = i.xpath('td[2]/a/text()')
            # xpath() returns parallel lists; store each entry as a dict.
            for j in range(len(countyLink)):
                # urljoin replaces the brittle url[:-9] slice, which assumed
                # the page filename is exactly 9 characters (e.g. "4401.html").
                countyURL = urljoin(url, countyLink[j])
                county.append({
                    'code': countyCode[j],
                    'link': countyURL,
                    'name': countyName[j]
                })
                print(str(countyName[j]) + ":" + str(countyCode[j]))
def getCity(url_list):
    """Scrape the city-level rows from each province page.

    Parameters
    ----------
    url_list : iterable of str
        Province page URLs (presumably like .../2016/44.html — the old
        code sliced off exactly 7 trailing characters, i.e. "44.html").

    Returns
    -------
    list[dict]
        One dict per city with keys 'name', 'code', 'link' (absolute URL).
    """
    from urllib.parse import urljoin  # robust replacement for url[:-7] slicing

    city_all = []
    for url in url_list:
        data = getUrl(url)
        selector = etree.HTML(data)
        cityList = selector.xpath('//tr[@class="citytr"]')
        # td[1] carries the city code + link, td[2] the city name.
        for i in cityList:
            cityCode = i.xpath('td[1]/a/text()')
            cityLink = i.xpath('td[1]/a/@href')
            cityName = i.xpath('td[2]/a/text()')
            print("爬取:" + str(cityName))
            for j in range(len(cityLink)):
                # urljoin resolves the relative href against the page
                # directory, whatever the page filename length is.
                cityURL = urljoin(url, cityLink[j])
                # Append straight to the result list; the per-province
                # intermediate list in the old code added nothing.
                city_all.append({
                    'name': cityName[j],
                    'code': cityCode[j],
                    'link': cityURL
                })
                print(str(cityName[j]) + str(cityCode[j]))
    return city_all