def parse(self, response):
    """Walk the paginated shop list and yield one geocoded item per shop.

    The page at ``response.url`` is loaded in ``self.driver``. For each of
    up to 31 result pages, table rows 2..11 inside ``#listhtml`` are read
    (name in the first cell, address in the second), the address is
    geocoded through ``baiduAddressGeo.getGeoForAddress`` and a ``KfcItem``
    carrying ``shopName``, ``lat`` and ``lng`` is yielded.

    A shop whose address cannot be geocoded is skipped, so an item never
    carries coordinates that belong to a previously processed shop.
    Iteration stops early when the "next page" element cannot be clicked.
    """
    self.driver.get(response.url)

    # Hard upper bound on the number of result pages visited.
    for _ in range(31):
        # Per the original note, shop rows start at the 2nd <tr> of each
        # page; ten rows (tr[2] .. tr[11]) are read.
        for row_number in range(2, 12):
            # Re-locate the container for every row instead of caching it
            # across the page, mirroring the original per-row lookup.
            listing = self.driver.find_element_by_css_selector('#listhtml')

            # The leading '.' keeps the XPath relative to #listhtml; a bare
            # '//' would search the whole document and ignore the container.
            shopName = listing.find_element_by_xpath(
                './/tr[%d]/td[1]' % row_number).text
            shopAddress = listing.find_element_by_xpath(
                './/tr[%d]/td[2]' % row_number).text

            # Encode the scraped text to UTF-8 before prefixing the city
            # name (presumably the geocoder expects a UTF-8 byte string
            # under Python 2 -- confirm against baiduAddressGeo).
            shop_address_encode = shopAddress.encode('utf-8')
            full_address = '上海市' + shop_address_encode

            try:
                # Expected to come back as [lat, lng].
                shopGeo = baiduAddressGeo.getGeoForAddress(full_address)
            except Exception:
                print('can not get geo for this address')
                # Skip this shop: falling through would reuse the previous
                # shop's coordinates (or hit a NameError on the first one).
                continue

            item = KfcItem()
            item['shopName'] = shopName
            item['lat'] = shopGeo[0]
            item['lng'] = shopGeo[1]
            yield item

        # Move to the next page: click the element that immediately follows
        # the link styled with an underline (presumably the current page
        # marker in the pager).
        try:
            self.driver.find_element_by_css_selector(
                'a[style*="text-decoration:underline"] + *').click()
        except Exception:
            print('finish! no more pages!')
            return
# Geocode every row of ``addrAll`` and append the result to
# ./geo_accident.csv as (district, hurt, time, place, lat, lng).
#
# ``addrAll`` (iterable of rows), ``addrData`` (the open input handle),
# ``csv`` and ``baiduAddressGeo`` are provided earlier in the file.
#
# The output file is opened once and flushed after every row, so rows that
# were already geocoded are on disk even if a later row fails. Both handles
# are closed on every exit path.
try:
    # Binary append mode: the Python 2 csv module expects file objects
    # opened with the 'b' flag so the writer's own line terminator is not
    # translated a second time on platforms where that matters.
    geoData = open('./geo_accident.csv', 'ab')
    try:
        geoWriter = csv.writer(geoData)
        geoWriter.writerow(('district', 'hurt', 'time', 'place', 'lat', 'lng'))
        geoData.flush()

        # Process each row of the source file and fetch its coordinates
        # from the Baidu geocoding helper.
        for row in addrAll:
            district = row[0]
            hurt = row[1]
            time = row[2]
            place = row[3]

            fullAddr = '上海市' + row[3]
            # The decode/encode round trip leaves valid UTF-8 unchanged but
            # raises on malformed input before it is sent to the geocoder.
            addressEncode = fullAddr.decode('utf-8').encode('utf-8')

            addrGeo = baiduAddressGeo.getGeoForAddress(addressEncode)
            lat = addrGeo[0]
            lng = addrGeo[1]

            # Space-separated progress line; spelled as a single-argument
            # call so it parses the same on Python 2 and 3.
            print(' '.join(
                str(value)
                for value in (district, hurt, time, place, lat, lng)))

            # Write the coordinates returned by Baidu to the target file.
            geoWriter.writerow((district, hurt, time, place, lat, lng))
            geoData.flush()
    finally:
        geoData.close()
finally:
    # Always release the input handle, even when geocoding or writing fails.
    addrData.close()