def areaSnatch():
    """
    Crawl the location info of school catchment areas from the district map
    :return:
    """
    datas = []
    # Primary-school catchment areas
    primaryschool_url = 'http://map.28dat.net/inc/ftxx.js'
    data = spider_util.open_url(primaryschool_url).decode()
    start = data.find('return')
    end = data.find('];')
    data = data[start + 6:end + 1]
    # Decode the embedded JS literal to get the coordinate information
    primaryschool_area = demjson.decode(data)
    coordinate_handle(primaryschool_area, '小学')
    # Middle-school catchment areas
    middleschool_url = 'http://map.28dat.net/inc/ftcz.js'
    data = spider_util.open_url(middleschool_url).decode()
    start = data.find('return')
    end = data.find('];')
    data = data[start + 6:end + 1]
    # Decode the embedded JS literal to get the coordinate information
    middleschool_area = demjson.decode(data)
    coordinate_handle(middleschool_area, '初中')
    datas.extend(primaryschool_area)
    datas.extend(middleschool_area)
    return datas
def getInfo(url, datas):
    html = spider_util.open_url(url, 5, 20)  # 20-second timeout
    bsObj = BeautifulSoup(html, "html.parser", from_encoding="UTF-8")
    title = bsObj.find('title').get_text().strip()
    time = bsObj.find('em').get_text().strip()
    p_tags = bsObj.find('div', {'class': 'TRS_Editor'}).find_all('p')
    thecontent = bsObj.get_text().strip()
    # The announcement body is embedded in a JS variable: var content = "..."
    content = re.search('var content = ".+"', thecontent)
    content = content.group()
    content = content[content.find('"') + 1:content.rfind('"')]
    content = content.replace('<br/>', '')
    content = content.replace(' ', '')
    content = content.strip()
    # The company name is the text before the first (full- or half-width) colon
    company = re.search('.+?[::]', content).group().strip()[:-1]
    print(content)
    data = {}
    data['company'] = company
    data['time'] = time
    data['content'] = content
    # Sum every amount such as "1234.56元" mentioned in the content
    money = 0
    arr = re.findall(r'(\d+\.?\d+元)', content)
    for amount in arr:
        money = money + float(amount[:-1])
    data['title'] = title
    data['money'] = money
    data['source'] = '人力资源局'
    data['type'] = '劳动仲裁'
    datas.append(data)
def doRequest(datas=[], stockType=''):
    """
    Request http://s.askci.com/StockInfo/StockList/GetList to crawl listed-company information
    :param datas: list that collects the results
    :param stockType: stock type (A shares: a, HK shares: hk, NEEQ/new third board: xsb)
    :return:
    """
    url_template = 'http://s.askci.com/StockInfo/StockList/GetList?pageNum={pageNum}&stockType={stockType}'
    pageNum = 1
    typestr = stockType
    if stockType is None or stockType == '':
        typestr = 'all types'
    while True:
        print(f'Crawling listed-company info, page {pageNum}, stock type: {typestr}')
        # Format from the template on every iteration; reusing the already-formatted URL
        # would keep requesting the same page
        url = url_template.format(pageNum=pageNum, stockType=stockType)
        result = spider_util.open_url(url)
        json = demjson.decode(result)
        data = json['data']
        if data is None or len(data) == 0:
            break
        if pageNum > json['totalPageNum']:
            break
        for obj in data:
            obj['stockType'] = stockType
        datas.extend(data)
        pageNum += 1
    return datas
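# Illustrative usage sketch (an assumption, not part of the original module): collects A-share,
# HK-share and NEEQ listings into one list by calling doRequest once per stock type.
if __name__ == '__main__':
    all_stocks = []
    for stock_type in ('a', 'hk', 'xsb'):
        doRequest(all_stocks, stock_type)
    print('fetched %d listed companies' % len(all_stocks))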
def get_info_data(url='', person_info=[]):
    """ Fetch talent data for the Peacock Plan (孔雀计划) """
    try:
        html = spider_util.open_url(url, 5, 20)  # 20-second timeout
        bsObj = BeautifulSoup(html, "html.parser", from_encoding="gb18030")
        main_div = bsObj.find('div', {'class': 'conRight_text2'})
        title = main_div.find('h4').get_text()
        # Batch number, e.g. "第十二批" in the title
        issues = re.search('第.*批', title)
        if issues:
            issues = spider_util.chinese2digits(issues.group()[1:-1])
        else:
            issues = ''
        time_text = main_div.find('p', {
            'style': 'text-align:center; line-height:22px; color:#333;background-color: #efefef;'
        }).get_text().strip()
        release_time = re.search(r'\d{3,4}-\d{1,2}-\d{1,2}', time_text).group()
        crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        extra = {'发布时间': release_time, '标题': title, '批次': issues, '爬取时间': crawl_time}
        li_tag = bsObj.find('li')
        href = li_tag.find('a').get('href')
        doc_url = get_xls_url(url, href)
        download2person_info(doc_url, person_info, extra=extra)
    except Exception as e:
        print('Failed to fetch Peacock Plan (孔雀计划) talent data, reason: %s' % e)
def address2location(address='', city='深圳市', ret_coordtype='bd09ll', ak='Exhb17fjBe4YoCCERO0mAkRsnTXDRpzN'):
    """
    Geocode an address into Baidu lng/lat coordinates
    :param address: address to geocode
    :param city: city the address belongs to, default 深圳市 (Shenzhen)
    :param ret_coordtype: coordinate system, default bd09ll (Baidu lng/lat), optionally gcj02ll (GCJ-02)
    :param ak: Baidu API key
    :return:
    """
    time.sleep(0.00625)
    url = 'http://api.map.baidu.com/geocoder/v2/'
    params = {
        'address': address,
        'output': 'json',
        'city': city,
        'ret_coordtype': ret_coordtype,
        'ak': ak
    }
    response = spider_util.open_url(url, data=params)
    data = json.loads(response)
    status = data['status']
    if status != 0:  # compare the status code by value, not identity
        msg = data['msg']
        raise RuntimeError('Geocoding failed\nError code: ' + str(status) + '\nReason: ' + str(msg))
    location = data.get('result').get('location')
    lng = location['lng']
    lat = location['lat']
    return lng, lat
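# Minimal usage sketch (an assumption, not from the original file): geocodes one sample address
# and reports the error status raised by the function when the Baidu API rejects the request.
if __name__ == '__main__':
    try:
        lng, lat = address2location('深南大道1006号', city='深圳市')  # sample address in Shenzhen
        print('bd09ll coordinates:', lng, lat)
    except RuntimeError as err:
        print(err)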
def get_person_info(url, person_info):
    """
    Fetch talent data
    :param url: URL of the talent detail page
    :param person_info: list collecting the talent records; each element is a dict mapping the table headers to one row
    :return:
    """
    html = spider_util.open_url(url, self_rotation=5, timeout=20)  # 20-second timeout
    bsObj = BeautifulSoup(html, "html.parser", from_encoding="gb18030")
    main_div = bsObj.find('div', {'class': 'conRight_text2'})
    title = main_div.find('h4').get_text()
    # issues = issue number (期数), taken from the text in full-width parentheses in the title
    content = re.search('（.*）', title)
    if content is not None:
        issues = re.search(r'\d+', content.group()).group()
    else:
        issues = title[title.find('公告') + 2:]
        issues = spider_util.chinese2digits(issues)
    time_text = main_div.find(
        'p', {
            'style': 'text-align:center; line-height:22px; color:#333;background-color: #efefef;'
        }).get_text().strip()
    table_tag = bsObj.find("table")
    release_time = re.search(r'\d{3,4}-\d{1,2}-\d{1,2}', time_text).group()
    crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    extra = {
        '期数': issues,
        '发布时间': release_time,
        '标题': title,
        '爬取时间': crawl_time
    }
    if table_tag is None:
        # No table tag on the page: try to find the attachment link and download the doc file instead
        text = re.search(re.compile("(附件)+"), main_div.get_text())
        if text is not None:
            href = main_div.find('div', {
                'class': 'nr'
            }).find('li').find('a').get('href')
            doc_url = get_docurl(url, href)
            download_to_data_arr(doc_url, person_info, extra=extra)
        else:
            print('The page contains neither a table nor a doc attachment')
    else:
        header_tds = table_tag.find_all("tr")[0].find_all('td')
        trs = table_tag.find_all("tr")[1:]
        for tr in trs:
            tds = tr.find_all("td")
            data = {}
            for i in range(len(tds)):
                field = header_tds[i].get_text().strip()
                field = re.sub(r'\s+', '', field)  # remove all whitespace from the header text
                text = tds[i].get_text().strip()
                data[field] = text
            for key in extra:  # add the extra metadata fields
                data[key] = extra[key]
            person_info.append(data)
def main():
    interfaces = [
        '经济运行主题库-商事基本信息', '经济运行主题库-私营主体信息', '经济运行主题库-外资企业信息',
        '经济运行主题库-证照基本信息', '经济运行主题库-本市生产总值三次产业信息', '经济运行主题库-三次产业贡献信息',
        '经济运行主题库-深圳市各行业增加信息', '经济运行主题库-各行业增加值构成项目信息',
        '经济运行主题库-深圳市各区生产总值信息', '经济运行主题库-按行业分的社会劳动者人数',
        '经济运行主题库-社会劳动者人数', '经济运行主题库-分经济类型和行业城镇单位从业人员'
    ]
    login()  # log in first to obtain the cookie
    cookie = browser.get_cookie("JSESSIONID")
    page = 1
    rows = 30
    # First request is only used to read the total record count
    url = 'http://10.248.96.106:9517/DataSupport/dataapi/apiList.xhtml?page=1&rows=15'
    data = spider_util.open_url(url, self_rotation=60, timeout=60, header=header)
    jsondata = json.loads(data)
    total = jsondata.get('total')
    while True:
        if (page * rows) > int(total):
            break
        print('Current page: ' + str(page))
        url = ('http://10.248.96.106:9517/DataSupport/dataapi/apiList.xhtml?page='
               + str(page) + '&rows=' + str(rows))
        data = spider_util.open_url(url, self_rotation=60, timeout=60, header=header)
        jsondata = json.loads(data)
        apis = jsondata.get("rows")
        for api in apis:
            if api["name"] in interfaces:
                sql = gene_table_sql(api)
                with open(file=path.join('D:\\011111111111111111111111\\支撑平台建表', api["name"] + '.sql'),
                          mode="w", encoding="utf-8") as file:
                    file.write(sql)
                print('Generated SQL file for ' + api["name"])
        page += 1
    browser.close()
def getTotalPage():
    """
    Get the total number of pages
    :return:
    """
    html = spider_util.open_url('http://www.sz68.com/land/')
    bsObj = BeautifulSoup(html, "html.parser")
    # The third-from-last pagination item holds the last page number
    litag = bsObj.select('#wp_page_numbers li')[-3]
    totalpage = litag.get_text()
    return int(totalpage)
def requset_school_info(areas):
    schoolnames = []
    infourl_prefix = 'http://map.28dat.net/s_ft/school.aspx?no='
    for school in areas:
        print(school)
        schoolnames.append(school['name'])
        result = spider_util.open_url(infourl_prefix + '1' + school['no'])
        bsObj = BeautifulSoup(result, "html.parser", from_encoding="utf-8")
        text = bsObj.select_one('#s_desc').get_text()
        print(text)
    print(schoolnames)
def gene_table_sql(api):
    """
    Generate a CREATE TABLE statement from the api description
    :param api:
    :return:
    """
    table = api['dataTable']
    url = 'http://10.248.96.106:9517/DataSupport/dataapi/columns.xhtml?tableName=' + table
    data = spider_util.open_url(url, header=header)
    fields = json.loads(data)
    return get_sql(api, fields)
def get_land_info(url, table1_dict, time):
    # `time` here is the transaction date string passed in by the caller
    html = spider_util.open_url(url)
    bsObj = BeautifulSoup(html, "html.parser")
    grids = bsObj.find("main").find_all("div", {"class": "ym-g33 ym-gr"})[1].find_all("div", {"class": "ym-grid"})
    # Regex that extracts the land parcel code from the title
    pattern1 = r"[0-9a-zA-Z\-\(\)]+"
    # Column holding the land parcel code
    codeArr = table1_dict.get("土地编号")
    if codeArr is None:
        codeArr = table1_dict["土地编号"] = []
    # Column holding the transaction date
    timeArr = table1_dict.get("交易日期")
    if timeArr is None:
        timeArr = table1_dict["交易日期"] = []
    # trs = bsObj.find("main").find_all("div", {"class": "ym-g66 ym-gl"})[1].find("table").find("table").find_all("tr")
    # time = ""
    # for tr in trs:
    #     if "竞买申请截止" in tr.get_text().strip():
    #         time = tr.find("td").get_text().strip()
    #     if "竞价开始" in tr.get_text().strip():
    #         tempTime = tr.find("td").get_text().strip()
    #         if "" != tempTime and tempTime is not None:
    #             time = tempTime
    #             break
    heads = grids[1:7]
    idx = 8
    infos = []
    # Each parcel occupies 11 consecutive grid blocks
    while idx < len(grids):
        info = grids[idx:idx + 11]
        infos.append(info)
        idx = idx + 11
    for info in infos:
        # First store the bidder-related information
        for head in heads:
            headkey = head.find("div", {"class": "ym-g33 ym-gl"}).get_text().strip()
            headValue = head.find("div", {"class": "ym-g66 ym-gr"}).get_text().strip()
            headArr = table1_dict.get(headkey)
            if headArr is None:
                headArr = table1_dict[headkey] = []
            headArr.append(headValue)
        # Store the land parcel code
        code = re.search(re.compile(pattern1), info[0].get_text().strip())
        codeArr.append(code.group())
        # Store the transaction date
        timeArr.append(time)
        # Then store the parcel details
        for infoField in info[2:]:
            fieldKey = infoField.find("div", {"class": "ym-g33 ym-gl"}).get_text().strip()
            fieldValue = infoField.find("div", {"class": {"ym-g66 ym-gr", "ym-g66 ym-gr get_district"}}).get_text().strip()
            fieldArr = table1_dict.get(fieldKey)
            if fieldArr is None:
                fieldArr = table1_dict[fieldKey] = []
            fieldArr.append(fieldValue)
def request_area_building():
    """
    Get the 0-18-year-old population for every catchment area on the school district map
    :return: DataFrame
    """
    file = 'D:\\pypy\\pythonresult\\教育学位\\学校人口信息.xls'
    if os.path.isfile(file):
        # Reuse the cached result if it is present and non-empty
        area_data = DataFrame(pd.read_excel(file))
        if area_data is not None and not area_data.empty:
            return area_data
    areas = db_util.execute2Dataframe("""
        SELECT
            WWYJFX.T_JY_SCHOOLAREA.SCHOOLNAME,
            WWYJFX.T_JY_SCHOOLAREA.SCHOOL_FULLNAME,
            WWYJFX.T_JY_SCHOOLAREA.SCHOOLTYPE,
            WWYJFX.T_JY_SCHOOLAREA.POLYGON_84
        FROM
            WWYJFX.T_JY_SCHOOLAREA
    """)
    # areas = DataFrame(pd.read_excel('D:\\pypy\\pythonresult\\教育学位\\学区信息.xls'))
    data = {
        'f': 'json',
        'returnGeometry': 'false',
        'spatialRel': 'esriSpatialRelIntersects',
        'geometryType': 'esriGeometryPolygon',
        'inSR': 4490,
        'outFields': 'BLDG_NO,NOWNAME',
        'outSR': 4490
    }
    url_prefix = 'http://10.190.55.55:8080/arcgis/rest/services/FTKSJ/JZWDLM_CGCS2000/MapServer/1/query'
    person_data = DataFrame()
    for index, row in areas.iterrows():
        polygon_84 = row['POLYGON_84']
        schoolname = row['SCHOOLNAME']
        schooltype = row['SCHOOLTYPE']
        if polygon_84 is not None and polygon_84 != '' and polygon_84 is not np.nan:
            geometry = split_point_to_geometry(polygon_84)
            data['geometry'] = geometry
            result = spider_util.open_url(url_prefix, 5, 20, data=data)  # 20-second timeout
            jsondata = demjson.decode(result)
            buildings = get_building(jsondata)
            if buildings is None or len(buildings) == 0:
                print('School ' + schoolname + ' has no building IDs')
                continue
            childinfo = request_area_personcont(schoolname, schooltype, buildings)
            person_data = person_data.append(childinfo)
    df = DataFrame(person_data)
    df.to_excel(file, index=False)
    return df
def change_zb(filename):
    url = 'http://192.168.37.134:8080/ConvertCoord/servlet/local2wgs'
    with open(filename, "r", encoding='utf-8', newline='') as file:
        df = pd.read_csv(file, dtype=str)
    df['84_x'] = None
    df['84_y'] = None
    df['bd_x'] = None
    df['bd_y'] = None
    df['gd_x'] = None
    df['gd_y'] = None
    for x in range(len(df.index)):
        try:
            ABSX = df['LON'].iloc[x]
            ABSY = df['LAT'].iloc[x]
            if ABSX is None or ABSY is None:
                continue
            ABSX = float(ABSX)
            ABSY = float(ABSY)
            if math.isnan(ABSX) or math.isnan(ABSY):  # skip rows that are not numeric
                continue
            html = spider_util.open_url(url, data={'lat': ABSX, 'lon': ABSY})
            bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
            zb = bsObj.get_text().strip()
            zb_arr = zb.split(',')
            lon = float(zb_arr[0])  # WGS-84 longitude
            lat = float(zb_arr[1])
            df.set_value(x, '84_x', lon)
            df.set_value(x, '84_y', lat)
            gcj02 = coordinate_util.wgs84togcj02(lon, lat)  # WGS-84 to GCJ-02 (Mars) coordinates
            df.set_value(x, 'gd_x', gcj02[0])
            df.set_value(x, 'gd_y', gcj02[1])
            bd09 = coordinate_util.gcj02tobd09(gcj02[0], gcj02[1])  # GCJ-02 to Baidu coordinates
            df.set_value(x, 'bd_x', bd09[0])
            df.set_value(x, 'bd_y', bd09[1])
            # print(jd84+'-----'+wd84)
            time.sleep(0.04)
        except Exception as e:
            print('Skipping this row')
    df.to_csv(filename, index=False, sep=',')
def sz2wgs84(lon, lat):
    """
    Convert Shenzhen local coordinates to WGS-84 coordinates
    :param lon:
    :param lat:
    :return:
    """
    url = "http://192.168.37.134:8080/ConvertCoord/servlet/local2wgs"
    html = spider_util.open_url(url, data={'lat': lon, 'lon': lat})
    bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
    zb = bsObj.get_text().strip()
    zb_arr = zb.split(',')
    lon = zb_arr[0]  # longitude
    lat = zb_arr[1]
    time.sleep(0.1)  # sleep 0.1 seconds between requests
    return [lon, lat]
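# Usage sketch (an assumption, not part of the original file): the sample values below are
# hypothetical Shenzhen local-grid coordinates; the function returns [lon, lat] strings in WGS-84
# via the internal ConvertCoord servlet.
if __name__ == '__main__':
    lon84, lat84 = sz2wgs84(100000, 50000)  # placeholder local coordinates
    print(lon84, lat84)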
def get_assessment(url, data_arr):
    """
    Fetch the list of talent-assessment results
    :param url:
    :param data_arr: list that collects the extracted data
    :return:
    """
    html = spider_util.open_url(url, self_rotation=5, timeout=20)  # 20-second timeout
    bsObj = BeautifulSoup(html, "html.parser", from_encoding="gb18030")
    main_div = bsObj.find('div', {'class': 'conRight_text2'})
    title = main_div.find('h4').get_text()
    # issues = issue number (期数), the text in full-width or ASCII parentheses in the title
    content = re.search(r'（.*）|\(.*\)', title)
    if content is not None:
        issues = content.group()[1:-1]  # strip the surrounding parentheses
    else:
        issues = ''
    time_text = main_div.find(
        'p', {
            'style': 'text-align:center; line-height:22px; color:#333;background-color: #efefef;'
        }).get_text().strip()
    release_time = re.search(r'\d{3,4}-\d{1,2}-\d{1,2}', time_text).group()
    crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    extra = {
        '期数': issues,
        '发布时间': release_time,
        '标题': title,
        '爬取时间': crawl_time
    }
    # No table on these pages: grab the attachment download links and fetch the doc files
    text = re.search(re.compile("(附件)+"), main_div.get_text())
    if text is not None:
        li_tags = main_div.find('div', {'class': 'nr'}).find_all('li')
        for li_tag in li_tags:
            a_tag = li_tag.find('a')
            if a_tag is None:
                continue
            href = a_tag.get('href')
            doc_url = get_docurl(url, href)
            download_assessment_to_arr(doc_url, data_arr, extra=extra)
    else:
        print('The page has no document attachment')
def getField_sql(url):
    """
    Crawl the field definitions of a dataset page on the Shenzhen open-data platform
    :param url:
    :return: the generated SQL statement and the page title
    """
    html = spider_util.open_url(url, 5, 20)  # 20-second timeout
    bsObj = BeautifulSoup(html, "html.parser", from_encoding="UTF-8")
    title = bsObj.select_one("div.row.operation-headline").get_text().strip()
    title = title.replace(".", "点")
    table = bsObj.select("#apicontent table")[1]
    tr_tags = table.select("tr")[2:]
    field_arr = []
    for tr_tag in tr_tags:
        td_tags = tr_tag.select("td")
        field = td_tags[0].get_text().strip()
        comment = td_tags[2].get_text().strip()
        field_arr.append({"field": field, "comment": comment})
    sql = get_sql(title, field_arr)
    return sql, title
def change_zb(filename):
    url = 'http://192.168.37.134:8080/ConvertCoord/servlet/wgs2local'
    with open(filename, "r", encoding='utf-8', newline='') as file:
        df = pd.read_csv(file, dtype=str)
    df['lon'] = None
    df['lat'] = None
    for x in range(len(df.index)):
        try:
            ABSX = df['bd_x'].iloc[x]
            ABSY = df['bd_y'].iloc[x]
            if ABSX is None or ABSY is None:
                continue
            ABSX = float(ABSX)
            ABSY = float(ABSY)
            if math.isnan(ABSX) or math.isnan(ABSY):  # skip rows that are not numeric
                continue
            gcj02 = coordinate_util.bd09togcj02(ABSX, ABSY)  # Baidu coordinates to GCJ-02
            wgs84 = coordinate_util.gcj02towgs84(gcj02[0], gcj02[1])  # GCJ-02 to WGS-84
            jd84 = wgs84[0]
            wd84 = wgs84[1]
            # print(jd84+'-----'+wd84)
            html = spider_util.open_url(url, data={'lat': jd84, 'lon': wd84})
            bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
            zb = bsObj.get_text().strip()
            zb_arr = zb.split(',')
            lon = zb_arr[0]  # longitude
            lat = zb_arr[1]
            df.set_value(x, 'lon', lon)
            df.set_value(x, 'lat', lat)
            time.sleep(0.04)
        except Exception as e:
            print('Skipping this row')
    df.to_csv(filename, index=False, sep=',')
def change_zb(filename):
    header = {'Cookie': 'BCE54B84-5407-41FD-9D16-C8A09E5DA2A0=YWRtaW4%3D; YWRtaW4==a2RpZiNzaWM4RGpbY216; JSESSIONID=1BA5932F6535DFDEAA2E63C9AAD3040C'}
    url = 'http://10.169.11.195:7020/tjfxpt/gis/local2wgs.xhtml'
    with open(filename, "r", encoding='utf-8', newline='') as file:
        df = pd.read_csv(file, dtype=str)
    length = len(df.index)
    df['bd_x'] = None
    df['bd_y'] = None
    df['gd_x'] = None
    df['gd_y'] = None
    for x in range(len(df.index)):
        try:
            ABSX = df['LOG'].iloc[x]
            ABSY = df['LAT'].iloc[x]
            if ABSX is None or ABSY is None:
                continue
            ABSX = float(ABSX)
            ABSY = float(ABSY)
            if math.isnan(ABSX) or math.isnan(ABSY):  # skip rows that are not numeric
                continue
            html = spider_util.open_url(url, data={'lng': ABSX, 'lat': ABSY}, header=header)
            bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
            zb = bsObj.get_text().strip()
            zb_arr = zb.split(',')
            lon = float(zb_arr[0])  # Baidu longitude
            lat = float(zb_arr[1])
            df.set_value(x, 'bd_x', lon)
            df.set_value(x, 'bd_y', lat)
            gcj02 = coordinate_util.bd09togcj02(lon, lat)  # Baidu coordinates to GCJ-02 (Mars) coordinates
            df.set_value(x, 'gd_x', gcj02[0])
            df.set_value(x, 'gd_y', gcj02[1])
            spider_util.log_progress(x, length, start_from_zero=True, detailedLog=True)
            # print(jd84+'-----'+wd84)
            # time.sleep(0.04)
        except Exception as e:
            print('Skipping this row')
    df.to_csv(filename, index=False, sep=',')
    print(df)
def get_infourl(url, pattern):
    """
    Get the URLs of the announcements listed on a page
    :param url: URL of the current listing page
    :param pattern: regular expression (string or compiled object) matched against each link title; only matching links are returned
    :return:
    """
    html = spider_util.open_url(url, 5, 20)  # 20-second timeout
    bsObj = BeautifulSoup(html, "html.parser", from_encoding="gb18030")
    liTags = bsObj.find("ul", {"class": "conRight_text_ul1"}).find_all("li")
    url_arr = []
    # Directory part of the current URL
    offset = url.rfind("/")
    url_prefix = url[:offset + 1]
    # Only keep announcements whose titles match the pattern (high-level professional talent accreditation notices)
    for liTag in liTags:
        aTag = liTag.find("a")
        title = aTag.attrs["title"]
        if re.search(pattern, string=title) is not None:
            href = url_prefix + aTag.attrs["href"][2:]
            url_arr.append(href)
    return url_arr
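# Usage sketch (an assumption, not in the original source): the listing URL below is a placeholder,
# and the pattern keeps only announcements whose titles contain "认定公示"; each matching detail page
# is then handed to get_person_info (defined elsewhere in this project) to extract its rows.
if __name__ == '__main__':
    person_info = []
    listing_url = 'http://www.example.gov.cn/tzgg/index.html'  # hypothetical announcement list page
    for info_url in get_infourl(listing_url, re.compile('认定公示')):
        get_person_info(info_url, person_info)
    print('collected %d talent records' % len(person_info))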
        'spatialRel': 'esriSpatialRelIntersects',
        'returnGeometry': 'true',
        'returnTrueCurves': 'false',
        'returnIdsOnly': 'false',
        'returnCountOnly': 'false',
        'returnZ': 'false',
        'returnM': 'false',
        'returnDistinctValues': 'false',
        'f': 'pjson'
    }
    data['where'] = "标准名 like '%" + school['学校名称'] + "%'"
    type = school['学校类型']
    if type == 1 or type == '1':
        url = 'http://10.190.65.123:6080/arcgis/rest/services/FTKSJ/ggss_futian_201803_01/MapServer/102/query'
    elif type == 2 or type == '2':
        url = 'http://10.190.65.123:6080/arcgis/rest/services/FTKSJ/ggss_futian_201803_01/MapServer/101/query'
    elif type == 0 or type == '0':
        url = 'http://10.190.65.123:6080/arcgis/rest/services/FTKSJ/ggss_futian_201803_01/MapServer/103/query'
    # req = request.Request(url=url, data=data)
    data = urllib.parse.urlencode(data).encode('utf-8')
    html = spider_util.open_url(url, data=data)
    result = json.loads(html)
    features = result['features']
    if features is not None and len(features) > 0:
        schoolpt = features[0]
        x = schoolpt.get('geometry').get('x')
        y = schoolpt.get('geometry').get('y')
        school['pointX'] = x
        school['pointY'] = y
DataFrame(datas).to_csv("D:\\项目temp\\school_new_1.csv", index=False, sep=',')
def location2normaladdress(
        lng,
        lat,
        coordtype='bd09ll',
        ret_coordtype='bd09ll',
        ak='Exhb17fjBe4YoCCERO0mAkRsnTXDRpzN',
):
    """
    :param lng: longitude
    :param lat: latitude
    :param coordtype: type of the input coordinates; currently supported:
                      bd09ll (Baidu lng/lat), bd09mc (Baidu metric), gcj02ll (GCJ-02), wgs84ll (GPS lng/lat)
    :param ret_coordtype: type of the returned coordinates: gcj02ll (GCJ-02) or the default bd09ll (Baidu lng/lat)
    :param ak: Baidu API key
    :return: the addressComponent dict, containing:
             country             country
             province            province name
             city                city name
             district            district/county name
             town                town name (for prefecture-level cities, the sub-district name)
             street              street name (the street level of the administrative division)
             street_number       street house number
             adcode              administrative division code
             country_code        country code
             direction           direction relative to the given point, returned when a house number exists
             distance            distance from the given point, returned when a house number exists
             bd_x                Baidu longitude
             bd_y                Baidu latitude
             lon84               WGS-84 longitude
             lat84               WGS-84 latitude
             formatted_address   formatted address string
    """
    time.sleep(0.00625)
    url = 'http://api.map.baidu.com/geocoder/v2/'
    params = {
        'location': str(lat) + ',' + str(lng),
        'output': 'json',
        'ret_coordtype': ret_coordtype,
        'ak': ak,
        'coordtype': coordtype,
        'extensions_town': 'true',
        'latest_admin': '1',
        'extensions_poi': 'null'
    }
    response = spider_util.open_url(url, data=params)
    data = json.loads(response)
    status = data['status']
    if status != 0:  # compare the status code by value, not identity
        msg = data['msg']
        raise RuntimeError('Reverse geocoding failed\nError code: ' + str(status) + '\nReason: ' + str(msg))
    result = data['result']
    formatted_address = result['formatted_address']
    addressComponent = result['addressComponent']
    addressComponent['bd_x'] = lng
    addressComponent['bd_y'] = lat
    lon84, lat84 = coordinate_util.bd09towgs84(lng, lat)
    addressComponent['lon84'] = lon84
    addressComponent['lat84'] = lat84
    addressComponent['formatted_address'] = formatted_address
    return addressComponent
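# Usage sketch (an assumption, not part of the original file): reverse-geocodes one sample Baidu
# (bd09ll) coordinate pair near central Shenzhen and prints a few of the returned fields.
if __name__ == '__main__':
    component = location2normaladdress(114.06, 22.54)  # sample coordinates, for illustration only
    print(component['formatted_address'], component['district'], component['street'])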
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from pandas import DataFrame

from util import spider_util

url = 'http://192.168.37.134:8080/ConvertCoord/servlet/wgs2local'  # converts WGS-84 coordinates to Shenzhen local coordinates
paths = [
    'D:/福田决策文件/消防数据表/消防数据/T_XF_XIAOFANG_BUWEI.csv',
    'D:/福田决策文件/消防数据表/消防数据/T_XF_SCHSELFCHECKRECORD.csv',
    'D:/福田决策文件/消防数据表/消防数据/T_XF_INSRECORD.csv'
]
for path in paths:
    datas, heads = spider_util.readCSV2List(path)
    for data in datas:
        jd84 = data['jd84']
        wd84 = data['wd84']
        # print(jd84+'-----'+wd84)
        html = spider_util.open_url(url, data={'lat': jd84, 'lon': wd84})
        bsObj = BeautifulSoup(html, "html.parser", from_encoding="utf-8")
        zb = bsObj.get_text().strip()
        zb_arr = zb.split(',')
        lon = zb_arr[0]  # longitude
        lat = zb_arr[1]
        data['lon'] = lon
        data['lat'] = lat
    DataFrame(datas).to_csv(path, index=False, sep=',')
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from pandas import DataFrame

from util import spider_util

url = 'http://10.190.62.57/geostar/440304_schooldistrict/wfs?VERSION=1.0.0&SERVICE=WFS&REQUEST=GetFeature&RESULTTYPE=result&OUTPUTFORMAT=XML'
html = spider_util.open_url(url, self_rotation=5, timeout=20)  # 20-second timeout
bsObj = BeautifulSoup(html, "lxml-xml", from_encoding="utf-8")
features = bsObj.find_all('gml:featureMember')
schoolarea = []
for feature in features:
    school = None
    schoolType = 2
    school = feature.find('MIDDLESCHOOL')
    if school is None:
        school = feature.find('PRIMARYSCHOOL')
        schoolType = 1
    schoolObj = {}
    position = school.find('gml:coordinates').get_text()
    schoolname = school.find('NAME').get_text()
    schoolObj['schoolType'] = schoolType
    schoolObj['schoolName'] = schoolname
    schoolObj['position'] = position
    schoolarea.append(schoolObj)
DataFrame(schoolarea).to_csv(
    "D:\\011111111111111111111111\\软件\\school_area_data.csv",
    index=False,
    sep=',')