def get_build_info(self, build_url_list):
    """Crawl each Ningde building detail page and forward house URLs.

    The ``Building`` attributes are assigned regex *rules*, not values:
    ``ProducerListUrl`` applies ``build.to_dict()`` against the fetched
    page and extracts the actual field values.
    """
    for i in build_url_list:
        # BUG FIX: compute the URL before the try block.  Previously it
        # was assigned after Building(co_index) inside the try, so any
        # failure before that line made the except handler itself raise
        # NameError on build_url.
        build_url = 'http://www.ndjsj.gov.cn/House/' + i
        try:
            build = Building(co_index)
            build.co_name = '项目名称:.*?<td.*?>(.*?)<'
            build.bu_num = '幢 号:.*?<td.*?>(.*?)<'
            build.bu_address = '坐落位置:.*?<td.*?>(.*?)<'
            build.co_build_structural = '建筑结构:.*?<td.*?>(.*?)<'
            build.bu_floor = '总 层 数:.*?<td.*?>(.*?)<'
            build.bu_build_size = '总 面 积:.*?<td.*?>(.*?)<'
            # build.bu_type = '设计用途:.*?<td.*?>(.*?)<'
            build.bu_all_house = '批准销售:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(
                page_url=build_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('宁德楼栋错误,url={}'.format(build_url), e)
def build_info(self, build_url_list):
    """Walk Shenzhen pre-sale pages: one record per building branch,
    then hand the branch's house links to ``house_info``.

    Raises nothing itself; network/parse errors propagate to the caller.
    """
    for build_url in build_url_list:
        url = "http://ris.szpl.gov.cn/bol/" + build_url
        res = requests.get(url, headers=self.headers)
        con = etree.HTML(res.text)
        # BUG FIX: the original XPath was missing the closing "']" after
        # the id predicate ("//div[@id='divShowBranch/a/@href"), so it
        # never matched any branch link.
        branch_list = con.xpath("//div[@id='divShowBranch']/a/@href")
        for branch in branch_list:
            branch_url = "http://ris.szpl.gov.cn/bol/" + branch
            response = requests.get(branch_url, headers=self.headers)
            content = etree.HTML(response.text)
            # Fresh record per branch so stale fields cannot leak between
            # rows (the original reused one mutable Building).
            bu = Building(co_index)
            bu.bu_num = content.xpath(
                "//div[@id='curAddress']/a/text()")[2]
            bu.co_name = content.xpath(
                "//div[@id='curAddress']/a/text()")[1]
            co_info = content.xpath("//form/@action")[0]
            bu.bu_id = bu_id = re.search(r'\?id=(\d+)&', co_info).group(1)
            bu.co_id = co_id = re.search(r'presellid=(\d+)&',
                                         co_info).group(1)
            bu.insert_db()
            house_list = content.xpath(
                "//div[@id='updatepanel1']//tr[@class='a1']//a/@href"
            )[2:-1]
            self.house_info(house_list, bu_id, co_id)
def get_build_info(self, build_url_list, co_id):
    """Parse one Nanchang project page per URL: the building table rows
    become Building records; the 查看 links are forwarded as house URLs.
    """
    for i in build_url_list:
        build_url = 'http://gold.ncfdc.com.cn/' + i.replace('amp;', '')
        res = requests.get(build_url)
        co_name = re.search('ctl15_proname">(.*?)<', res.text,
                            re.S | re.M).group(1)
        # Renamed from ``str`` — it shadowed the builtin type.
        table_html = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow',
                               res.text, re.S | re.M).group()
        for info in re.findall('<tr>.*?</tr>', table_html, re.S | re.M):
            # Rows without a link carry no building data.
            if 'href' not in info:
                continue
            try:
                build = Building(co_index)
                build.co_name = co_name
                build.bu_num = re.search('<tr>.*?<td>.*?<a href=.*?>(.*?)<',
                                         info, re.S | re.M).group(1)
                build.bu_pre_sale = re.search(
                    'onclick="BinSHouseInfo.*?>(.*?)<', info,
                    re.S | re.M).group(1)
                build.bu_pre_sale_date = re.search(
                    'onclick="BinSHouseInfo.*?<td>(.*?)<', info,
                    re.S | re.M).group(1)
                build.bu_all_house = re.search('color:#ec5f00;">(.*?)<',
                                               info, re.S | re.M).group(1)
                build.bu_id = re.search("DisplayB_ld&hrefID=(.*?)'",
                                        info, re.S | re.M).group(1)
                build.co_id = co_id
                build.insert_db()
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
        house_url_list = re.findall(
            "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
            res.text, re.S | re.M)
        self.get_house_info(house_url_list)
def comm_info(self, con):  # community plus its buildings
    """Extract one community (Comm) and its building rows from a parsed
    lxml tree *con*; insert all records and return the per-building
    room-page URLs.

    :param con: lxml HTML tree of a community detail page.
    :return: list of room-table hrefs, one per building row.
    """
    comm = Comm(co_index)
    comm.co_name = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()"
    )[0]  # community name
    co_id_str = con.xpath("//form[@id='aspnetForm']/@action")[0]  # community id
    comm.co_id = re.search(r"\d+", co_id_str).group(0)
    comm.co_address = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()")[
            0]  # address
    comm.co_develops = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()")[
            0]  # developer
    comm.co_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()")[0]  # total area
    comm.co_build_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()")[
            0]  # floor area
    comm.co_build_end_time = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()")[
            0]  # completion date
    comm.co_plan_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()")[
            0]  # land-planning permit
    comm.co_work_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()")[0]  # construction permit
    comm.co_green = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"
    )[0]  # green-space percentage
    comm.co_land_use = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()")[0]  # land-use certificate
    comm.insert_db()
    build_table = con.xpath("//tr[@style='color:#000066;']")
    room_list = []
    for build_list in build_table:
        # FIX: create a fresh Building per row.  The original mutated one
        # shared instance across the loop; every field is reassigned each
        # iteration so output is identical, but stale-state bugs are
        # avoided if a row is ever missing a column.
        build = Building(co_index)
        build.co_id = comm.co_id
        build.co_name = comm.co_name
        build_info = build_list.xpath("./td/text()")
        build.bu_id = build_info[0]
        build.bu_num = build_info[1]
        build.bu_all_house = build_info[2]
        build.size = build_info[3]
        build.bu_floor = build_info[4]
        build.bu_pre_sale = build_info[5]
        build.insert_db()
        room_url = build_list.xpath("./td/a/@href")[0]
        room_list.append(room_url)
    return room_list
def get_build_info(self, build_url_list, co_name):
    """Store one Building per ',,'-delimited entry, then crawl its houses.

    Each *entry* is a composite string whose second ',,'-separated field
    is the building number; the whole split list is forwarded to
    ``get_house_info``.
    """
    for entry in build_url_list:
        try:
            fields = entry.split(',,')
            record = Building(co_index)
            record.co_name = co_name
            record.bu_num = fields[1]
            record.insert_db()
            self.get_house_info(fields, co_name)
        except Exception as err:
            print(err)
def get_build_info(self, build_url_list, co_name):
    """For each listing page, follow every building-detail link, scrape
    the labelled spans into a Building record, and crawl its houses.
    """
    # Attribute name -> extraction pattern, applied in this fixed order.
    field_patterns = (
        ('bu_num', 'BuildingInfo1_lblBuildingName">(.*?)<'),
        ('bu_all_house', 'BuildingInfo1_lblZts">(.*?)<'),
        ('bu_floor', 'BuildingInfo1_lblZcs">(.*?)<'),
        ('bu_build_size', 'BuildingInfo1_lblJzmj">(.*?)<'),
        ('bu_live_size', 'BuildingInfo1_lblZzmj">(.*?)<'),
        ('bu_pre_sale', 'BuildingInfo1_lblYsxkzh">(.*?)<'),
        ('bu_pre_sale_date', 'BuildingInfo1_lblYsxkzfzrq">(.*?)<'),
    )
    for listing_path in build_url_list:
        try:
            build = Building(co_index)
            build.co_name = co_name
            listing_url = 'http://www.sxczfdc.com/pubinfo/' + listing_path
            listing_html = requests.get(listing_url, headers=self.headers).text
            detail_paths = re.findall('(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"',
                                      listing_html, re.S | re.M)
            for detail_path in detail_paths:
                try:
                    detail_url = 'http://www.sxczfdc.com/pubinfo/' + detail_path
                    detail_html = requests.get(detail_url,
                                               headers=self.headers).text
                    for attr, pattern in field_patterns:
                        value = re.findall(pattern, detail_html, re.S | re.M)[0]
                        setattr(build, attr, value)
                    build.insert_db()
                    house_url_list = re.findall(
                        "onClick=.getMoreHouseInfo\('(.*?)'\)",
                        detail_html, re.S | re.M)
                    self.get_house_info(house_url_list, co_name, build.bu_num)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def get_comm_info(self, comm_url_list):
    """Parse each Ningbo community page, persist the Comm record, then
    persist one Building per window.open link and crawl its houses.

    Fixes: the building loop no longer reuses the outer loop variable
    ``i``; the try/except around ``re.findall`` was removed because
    findall cannot raise, so the except branch was dead code.
    """
    for i in comm_url_list:
        comm = Comm(co_index)
        comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
        try:
            response = requests.get(comm_url, headers=self.headers)
        except Exception as e:
            print("{}城市无法访问小区{}".format(city, comm_url), e)
            continue
        html = response.text
        con = etree.HTML(html)
        comm.co_id = re.search(r'id=(\d+)', i).group(1)
        comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_pre_sale = re.findall('售证名称:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<', html, re.S | re.M)[0]
        comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<', html, re.S | re.M)[0]
        comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html, re.S | re.M)[0]
        comm.insert_db()
        # Parallel lists: per-building house counts, detail URLs, and the
        # anchor titles holding the building numbers.
        bu_all_house_list = re.findall(
            'window.open.*?center.*?center.*?>(.*?)<', html, re.S | re.M)
        bu_url_list = re.findall(r"window\.open\('(.*?)'", html, re.S | re.M)
        bu_num_list = con.xpath("//a[@href='#']/@title")
        for idx, bu_url in enumerate(bu_url_list):
            build = Building(co_index)
            build.bu_all_house = bu_all_house_list[idx]
            build.co_name = comm.co_name
            build.bu_num = bu_num_list[idx]
            build.bu_id = re.search(r'key=(\d+)&', bu_url).group(1)
            build.co_id = comm.co_id
            build.insert_db()
            self.get_house_info(bu_url, build.bu_id)
def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
    """Persist a Building per URL (paired by index with its pre-sale
    permit), then scrape and persist every house on the building page.

    ``build_url_list`` and ``bu_pre_sale_list`` are parallel lists.
    """
    for i in range(len(build_url_list)):
        try:
            build = Building(co_index)
            build.co_id = co_id
            build.co_name = co_name
            build.bu_pre_sale = bu_pre_sale_list[i]
            build.bu_id = re.search(r'lh=(\d+)', build_url_list[i]).group(1)
            build_url = 'http://221.2.144.162:8090/' + build_url_list[i]
            response = requests.get(build_url, headers=self.headers)
            html = response.content.decode('gbk')  # site serves GBK
            build.bu_num = re.findall('<font color=white.*?><b>(.*?)<',
                                      html, re.S | re.M)[0]
            build.bu_address = re.findall('坐落位置:</b>(.*?)<', html,
                                          re.S | re.M)[0]
            build.insert_db()
            # Parallel lists: house detail hrefs and house names.
            ho_url_list = re.findall('background-.*?href=(.*?) ', html,
                                     re.S | re.M)
            ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<',
                                      html, re.S | re.M)
            # FIX: the inner loop previously reused ``i``, shadowing the
            # outer building index; renamed to ``j`` for clarity.
            for j in range(len(ho_url_list)):
                try:
                    house = House(co_index)
                    house_url = 'http://221.2.144.162:8090/' + ho_url_list[j]
                    result = requests.get(
                        house_url,
                        headers=self.headers).content.decode('gbk')
                    house.bu_id = build.bu_id
                    house.co_id = co_id
                    house.ho_type = re.findall('用 途:.*?<td.*?>(.*?)<',
                                               result, re.S | re.M)[0]
                    house.ho_build_size = re.findall('建筑面积:.*?<td>(.*?)<',
                                                     result, re.S | re.M)[0]
                    house.bu_num = build.bu_num
                    house.co_name = co_name
                    house.ho_name = ho_name_list[j]
                    house.insert_db()
                except Exception as e:
                    print("co_index={},房屋信息错误".format(co_index), e)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
def start_crawler(self):
    """Entry point: enumerate community links, scrape each community via
    regex rules, then scrape each building detail page the same way.

    The Comm/Building attributes are assigned regex *rules*, not values;
    ``ProducerListUrl`` applies ``to_dict()`` against the fetched page.
    """
    response = requests.get(url)
    html = response.text
    tree = etree.HTML(html)
    all_url = tree.xpath('//a[@class="a_name"]/@href')
    for i in all_url:
        if i == '#':
            continue
        comm = Comm(co_index)
        comm_url = 'http://www.lzfc.com.cn:8080' + i
        comm.co_name = "cc0.innerHTML='(.*?)'"
        comm.co_address = "cc1.innerHTML='(.*?)'"
        comm.area = "cc2.innerHTML='(.*?)'"
        comm.co_use = "cc4.innerHTML='(.*?)'"
        comm.co_develops = "cc5.innerHTML='(.*?)'"
        comm.co_open_time = "cc6.innerHTML='(.*?)'"
        comm.co_all_house = "cc9.innerHTML='(.*?)'"
        comm.co_build_size = "cc11.innerHTML='(.*?)'"
        comm.co_id = "BaseCode=(.*?)'"
        p = ProducerListUrl(page_url=comm_url,
                            request_type='get', encode='gbk',
                            analyzer_rules_dict=comm.to_dict(),
                            current_url_rule="queryBuildHerf1.href='(.*?)'",
                            analyzer_type='regex')
        build_url = p.get_details()
        # FIX: the inner loop variable no longer shadows the outer ``i``,
        # and the duplicate comm.co_name assignment was dropped.
        for k in build_url:
            build = Building(co_index)
            build_detail_url = 'http://www.lzfc.com.cn:8080' + k
            build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
            build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
            build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
            build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
            build.co_name = 'fontbg_red">(.*?)<'
            build.bu_id = "onclick=comInfoView\('(.*?)'\)"
            # BUG FIX: the original repeated the community-level call
            # verbatim (page_url=comm_url, analyzer_rules_dict=comm.to_dict()),
            # so build_detail_url and all the building rules above were
            # never used.  Target the building page with the build rules.
            p = ProducerListUrl(page_url=build_detail_url,
                                request_type='get', encode='gbk',
                                analyzer_rules_dict=build.to_dict(),
                                # NOTE(review): rule copied from the
                                # community page — confirm it applies to
                                # building detail pages too.
                                current_url_rule="queryBuildHerf1.href='(.*?)'",
                                analyzer_type='regex')
            p.get_details()
def analyzer_comm_url(self, comm_url_list):
    """Scrape each Shantou community page into a Comm record plus its
    building rows, and return every house-page URL discovered.

    :param comm_url_list: absolute community-detail URLs.
    :return: flat list of absolute house-page URLs.
    """
    collected = []
    for comm_url in comm_url_list:
        try:
            page = requests.get(comm_url).content.decode('gbk')
            c = Comm(self.co_index)
            c.co_name = re.search('项目名称:.*?">.*?<span.*?>(.*?)</span>',
                                  page, re.S | re.M).group(1)  # project name
            c.co_address = re.search('项目地址:.*?">.*?<span.*?>(.*?)</span>',
                                     page, re.S | re.M).group(1)  # address
            c.co_develops = re.search('开发商:.*?">.*?<span.*?>(.*?)</span>',
                                      page, re.S | re.M).group(1)  # developer
            c.co_build_size = re.search('总建筑面积:.*?">.*?<span.*?>(.*?)</span>',
                                        page, re.S | re.M).group(1)  # floor area
            c.co_land_type = re.search('用地依据:.*?">.*?<span.*?>(.*?)</span>',
                                       page, re.S | re.M).group(1)  # land basis
            c.co_all_house = re.search('>总套数:.*?">.*?<span.*?>(.*?)</span>',
                                       page, re.S | re.M).group(1)  # total units
            c.area = re.search('所在区域:.*?">.*?<span.*?>(.*?)</span>',
                               page, re.S | re.M).group(1)  # district
            c.co_work_pro = re.search('施工许可证:.*?">.*?<span.*?>(.*?)</span>',
                                      page, re.S | re.M).group(1)  # construction permit
            c.co_plan_pro = re.search('建设工程规划许可证:.*?">.*?<span.*?>(.*?)</span>',
                                      page, re.S | re.M).group(1)  # planning permit
            c.insert_db()
            page_urls = []
            for row in re.findall('onmouseover.*?</TR>', page, re.S | re.M):
                try:
                    b = Building(self.co_index)
                    cells = re.findall('<TD.*?>(.*?)</TD>', row, re.S | re.M)
                    b.co_name = cells[1]
                    b.bu_num = cells[2]
                    b.bu_type = cells[4]
                    b.insert_db()
                    for href in re.findall('href="(.*?)"', row, re.S | re.M):
                        page_urls.append(
                            'http://www.stfcj.gov.cn/stsite/ProjectList/' + href)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
            collected = collected + page_urls
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
    return collected
def get_build_info(self, build_url_list):
    """Scrape each Nanping building page into a Building record, then
    crawl its HouseInfo links.

    The co_id/bu_id are parsed from the *relative* URL ``i`` itself, not
    from the page body.  (The commented-out ProducerListUrl block was
    dead code and has been removed.)
    """
    for i in build_url_list:
        try:
            build = Building(co_index)
            build_url = 'http://www.fjnpfdc.com/House/' + i
            res = requests.get(build_url, headers=self.headers)
            con = res.content.decode('gbk')  # site serves GBK
            build.co_name = re.search("项目名称:.*?<td.*?>(.*?)<", con,
                                      re.S | re.M).group(1)
            build.bu_num = re.search("幢 号:.*?<td.*?>(.*?)<", con,
                                     re.S | re.M).group(1)
            build.co_use = re.search("设计用途:.*?<td.*?>(.*?)<", con,
                                     re.S | re.M).group(1)
            build.co_build_structural = re.search("建筑结构:.*?<td.*?>(.*?)<",
                                                  con, re.S | re.M).group(1)
            build.bu_floor = re.search("总 层 数:.*?<td.*?>(.*?)<", con,
                                       re.S | re.M).group(1)
            build.bu_build_size = re.search("总 面 积:.*?<td.*?>(.*?)<", con,
                                            re.S | re.M).group(1)
            build.co_build_end_time = re.search("竣工日期:.*?<td.*?>(.*?)<",
                                                con, re.S | re.M).group(1)
            house_url_list = re.findall('<a href="(HouseInfo.*?)"', con)
            build.co_id = re.search('ProjectId=(.*?)&', i).group(1)
            build.bu_id = re.search('BuildingId=(.*?)&P', i).group(1)
            build.insert_db()
            self.get_house_info(house_url_list, build.bu_id, build.co_id)
        except Exception as e:
            print("co_index={},楼栋{}错误".format(co_index, i), e)
def get_build_info(self, build_url_list, co_id):
    """Scrape each Longyan building page into a Building record, then
    crawl every HouseCenterID link found on the page.
    """
    flags = re.S | re.M
    for path in build_url_list:
        build_url = 'http://www.fjlyfdc.com.cn/' + path
        try:
            build = Building(co_index)
            page = requests.get(build_url, headers=self.headers).text
            # bu_id comes from the URL itself, not the page body.
            build.bu_id = re.search('buildingInfoID=(.*?)&',
                                    build_url).group(1)
            build.co_id = co_id
            build.bo_develops = re.search('开发商:.*?<td.*?>(.*?)<',
                                          page, flags).group(1)
            build.co_name = re.search('项目名称:.*?<td.*?>(.*?)<',
                                      page, flags).group(1)
            build.bu_address = re.search('坐落位置:.*?<td.*?>(.*?)<',
                                         page, flags).group(1)
            build.bu_num = re.search('幢号:.*?<td.*?>(.*?)<',
                                     page, flags).group(1)
            build.co_build_structural = re.search('建筑结构:.*?<td.*?>(.*?)<',
                                                  page, flags).group(1)
            build.bu_type = re.search('设计用途:.*?<td.*?>(.*?)<',
                                      page, flags).group(1)
            build.bu_floor = re.search('总 层 数:.*?<td.*?>(.*?)<',
                                       page, flags).group(1)
            build.co_all_size = re.search('总面积:.*?<td.*?>(.*?)<',
                                          page, flags).group(1)
            build.bo_build_start_time = re.search('开工日期:.*?<td.*?>(.*?)<',
                                                  page, flags).group(1)
            build.insert_db()
            house_url_list = re.findall(
                'href="(/House/HouseInfo\?HouseCenterID=.*?)"', page, flags)
            self.get_house_info(house_url_list, build.bu_id, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building's HTML fragment *bu_con* into a Building
    record and insert it.

    :param bu_pre_sale: pre-sale permit number (passed through).
    :param bo_develops: developer name (passed through).
    :param bu_co_name: community/project name (passed through).
    :param bu_con: HTML fragment of the building detail table.
    """
    build = Building(co_index)
    build.bu_id = re.search(r'编号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_num = re.search(r'幢号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_floor = re.search(r'总层数.*?>(\d+)<', bu_con,
                               re.S | re.M).group(1)
    # BUG FIX: the original pattern `(\d+.\d+)` used an unescaped dot,
    # which matches ANY character (so "12a34" passed as an area).
    # `(\d+(?:\.\d+)?)` matches a real decimal and still accepts plain
    # integers, so previously-working inputs keep working.
    build.bu_build_size = re.search(r'预售建筑面积.*?>(\d+(?:\.\d+)?)<',
                                    bu_con, re.S | re.M).group(1)
    build.bu_address = re.search(r'楼房坐落.*?;">(.*?)</span', bu_con,
                                 re.S | re.M).group(1)
    build.bu_live_size = re.search(r'住宅建筑面积.*?>(\d+(?:\.\d+)?)<',
                                   bu_con, re.S | re.M).group(1)
    build.bu_not_live_size = re.search(r'非住宅建筑面积.*?;">(.*?)</span',
                                       bu_con, re.S | re.M).group(1)
    build.bo_build_start_time = re.search(r'开工日期.*?;">(.*?)</span',
                                          bu_con, re.S | re.M).group(1)
    build.bu_all_house = re.search(r'总套数.*?>(\d+)<', bu_con,
                                   re.S | re.M).group(1)
    build.bu_pre_sale = bu_pre_sale
    build.bo_develops = bo_develops
    build.co_name = bu_co_name
    build.insert_db()
def get_comm_info(self, comm_info):
    # Parse one community (小区) table row *comm_info*, persist the Comm
    # record, then walk its buildings.  Buildings whose detail URL carries
    # an "ID=" query are treated as existing-home sale (现售); all others
    # as pre-sale (预售), whose house pages are crawled row by row.
    # NOTE(review): the hard-coded session cookie below presumably expires;
    # confirm how it is refreshed.
    co = Comm(co_index)
    co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
    try:
        # Second "px">…</td cell is the address; absent on some rows.
        co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
    except:
        co.co_address = None
    co.area = re.search('center">(.*?)</td>', comm_info).group(1)
    co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
    co_url = "http://www.qyfgj.cn/newys/" + co_detail_url
    try:
        res = requests.get(co_url, headers=self.headers)
    except Exception as e:
        # NOTE(review): no `continue`/`return` here — a failed request
        # leaves `res` unbound and the next line raises NameError.
        print("co_index={}小区未请求到".format(co_index), e)
    con = res.content.decode('gbk')  # site serves GBK
    try:
        co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con,
                                   re.S | re.M).group(1)
        co.co_all_house = re.search('总套数.*?">(\d+) ', con,
                                    re.S | re.M).group(1)
        co.co_all_size = re.search('总面积.*?">(\d+.\d+) m', con,
                                   re.S | re.M).group(1)
    except:
        # Optional fields — some communities omit the developer block.
        print("小区无开发商等信息")
    co.insert_db()
    try:
        # One "<tr bgcolor=white>" fragment per building row.
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con,
                           re.S | re.M)
    except:
        print("小区没有楼栋信息")
    build_headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
        'Referer': co_url
    }
    for build_info in build:
        # Only rows with a 进入 ("enter") link have a detail page.
        if "进入" in build_info:
            build_url = re.search('href="(.*?)"><font',
                                  build_info).group(1)
            build_url = "http://www.qyfgj.cn/newys/" + build_url
            ho_headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': build_url
            }
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')
            if re.search('ID=(\d+)', build_url):  # existing-home sale (现售)
                bu = Building(co_index)
                bu_id = re.search('ID=(\d+)', build_url).group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers, bu_id=bu_id,
                                    url=build_url)
            else:  # pre-sale (预售)
                # One Building record is deliberately reused: the permit
                # fields below are shared, while bu_id/bu_num/bu_floor are
                # overwritten per table row before each insert_db().
                bu = Building(co_index)
                bu.co_name = co.co_name
                bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con,
                                       re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>',
                                           build_con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>',
                                                build_con,
                                                re.S | re.M).group(1)
                bu.bu_address = re.search('项目座落.*?">(.*?)</td>',
                                          build_con, re.S | re.M).group(1)
                ret = re.findall('<tr onmouseover(.*?)</tr', build_con,
                                 re.S | re.M)
                for i in ret:
                    house_url = re.search('href="(.*?)"', i).group(1)
                    house_url = "http://www.qyfgj.cn/newys/" + house_url
                    bu.bu_id = re.search('dbh=(.*?)&', i).group(1)
                    bu.bu_num = re.search('<td width="89.*?">(.*?)</',
                                          i).group(1)
                    bu.bu_floor = re.search('<td width="84.*?">(\d+)</td',
                                            i).group(1)
                    bu.insert_db()
                    ho_res = requests.get(house_url, headers=ho_headers)
                    ho_con = ho_res.content.decode('gbk')
                    new_headers = {
                        'User-Agent':
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                        'Cookie':
                        'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                        'Referer': house_url
                    }
                    self.get_house_info(ho_con=ho_con, headers=new_headers,
                                        bu_id=bu.bu_id)
        else:
            print("楼栋无链接地址")
def get_build_url_list(self, url_list):
    # Four-level crawl for Yingtan: listing page -> community (Comm) ->
    # building (Building) -> house table (House).  Errors at every level
    # are swallowed with `continue` so one bad record never aborts the run.
    # NOTE(review): `res`, `html` and `url` are deliberately rebound as the
    # crawl descends; the regexes on `k`/`j` below keep referring to the
    # *outer* fragments, which is why the rebinding is safe.
    for i in url_list:
        try:
            res = requests.get(i)
            html = res.content.decode('gbk')  # site serves GBK
            # One "项目名称…</dl>" fragment per community on the listing page.
            for k in re.findall('项目名称.*?</dl>', html, re.S | re.M):
                try:
                    c = Comm(self.co_index)
                    c.co_name = re.search('html">(.*?)</a>', k,
                                          re.S | re.M).group(1)
                    c.co_address = re.search('class="address"(.*?)</dd>', k,
                                             re.S | re.M).group(1)
                    c.area = re.search('"city">(.*?)</dd>', k,
                                       re.S | re.M).group(1)
                    c.co_develops = re.search('"average">(.*?)</dd>', k,
                                              re.S | re.M).group(1)
                    c.insert_db()
                    # Module-level progress counter.
                    global count
                    count += 1
                    print(count)
                    url = re.search('a href="(.*?)">', k,
                                    re.S | re.M).group(1)
                    complete_url = self.url_source + url
                    res = requests.get(complete_url)
                    html = res.content.decode('gbk')
                    # The building table sits between 楼盘表 and the 合 计
                    # (total) row.
                    build_info_str = re.search('楼盘表</td>(.*?)合 计', html,
                                               re.S | re.M).group(1)
                    for j in re.findall('<tr.*?</tr>', build_info_str,
                                        re.S | re.M):
                        try:
                            b = Building(self.co_index)
                            b.co_name = re.search('html">(.*?)</a>', k,
                                                  re.S | re.M).group(1)
                            b.bu_all_house = re.search(
                                'absmiddle" />(.*?)</a>', j,
                                re.S | re.M).group(1)
                            b.bu_num = re.search(
                                '="absmiddle" />(.*?)</a></strong></', j,
                                re.S | re.M).group(1)
                            b.bu_build_size = re.search(
                                'td class="t_c">.*?td class="t_c">(.*?㎡)</td>',
                                j, re.S | re.M).group(1)
                            b.insert_db()
                            url = re.search('a href="(.*?)"', j,
                                            re.S | re.M).group(1)
                            complete_url = self.url_source + url
                            res = requests.get(complete_url)
                            html = res.content.decode('gbk')
                            # Pull the house-table URL out of the page's
                            # iframe, then hit the matching GetData
                            # endpoint for the XML payload.
                            house_url = self.url_source + re.search(
                                '<iframe.*?"(.*?)"', html,
                                re.S | re.M).group(1)
                            logic_house_url = house_url.replace(
                                'Default', 'GetData')
                            logic_house_html = requests.get(
                                url=logic_house_url).content.decode()
                            logic_id = re.search(
                                '<LOGICBUILDING_ID>(.*?)<',
                                logic_house_html, re.S | re.M).group(1)
                            final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                            final_html = requests.get(
                                url=final_url).content.decode('gbk')
                            # One House per ROOM_NUMBER element; the raw
                            # XML is stashed on h.info.
                            for l in re.findall(
                                    '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>',
                                    final_html, re.S | re.M):
                                try:
                                    h = House(self.co_index)
                                    h.info = final_html
                                    h.ho_name = l
                                    h.co_name = re.search(
                                        'html">(.*?)</a>', k,
                                        re.S | re.M).group(1)
                                    h.bu_num = re.search(
                                        '="absmiddle" />(.*?)</a></strong></',
                                        j, re.S | re.M).group(1)
                                    h.insert_db()
                                except Exception as e:
                                    continue
                        except Exception as e:
                            continue
                except Exception as e:
                    continue
        except Exception as e:
            continue