def house_crawler(self, house_url, bu_num, co_id, bu_id):
    """Scrape one building page and hand its rooms to ``House.insert_db()``.

    The page at ``self.url + house_url`` is fetched once and five parallel
    lists are pulled out of it with regular expressions. For every floor
    number found, each room entry from index 1 onwards is copied onto a
    single ``House`` object and inserted.

    Args:
        house_url: Path appended to ``self.url`` to form the page address.
        bu_num: Forwarded to the ``House`` constructor.
        co_id: Forwarded to the ``House`` constructor.
        bu_id: Forwarded to the ``House`` constructor.
    """
    ho = House(co_index, bu_num=bu_num, co_id=co_id, bu_id=bu_id)
    url = self.url + house_url
    tr = requests.get(url, headers=self.headers).text

    ho_name = re.findall('室号:(.*?)户', tr, re.S | re.M)  # room label, e.g. "3单元403"
    ho_floor = re.findall(r'(\d+)层', tr)  # floor numbers
    ho_type = re.findall('房屋属性:(.*?)"', tr, re.S | re.M)  # usage: ordinary residence / garage-storage
    ho_room_type = re.findall('户型:(.*?)所', tr, re.S | re.M)  # layout
    ho_build_size = re.findall('建筑面积:(.*?)房', tr, re.S | re.M)  # gross floor area

    # The lists are indexed in lockstep, so stop at the shortest one instead
    # of running one past the end and relying on the IndexError.
    usable = min(len(ho_name), len(ho_type), len(ho_room_type), len(ho_build_size))

    for floor in ho_floor:
        try:
            ho.ho_floor = floor
            # NOTE(review): index 0 is skipped exactly as before; confirm the
            # first match on the page really is not a room.
            for index in range(1, usable):
                ho.ho_name = ho_name[index]
                ho.ho_type = ho_type[index]
                ho.ho_room_type = ho_room_type[index]
                ho.ho_build_size = ho_build_size[index]
                ho.insert_db()
        except Exception as e:
            # A failure abandons the rest of this floor but is now reported.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def get_house_info(self, house_url_list):
    """Queue one regex-driven scrape per room link.

    Each link must carry ``dongid=...&`` and ``roomid=...&`` query
    parameters; they are used to build the room-info URL. The ``House``
    fields are filled with regex *rules* (not values) and passed to
    ``ProducerListUrl`` through ``house.to_dict()``.

    Args:
        house_url_list: Iterable of link strings containing the two ids.
    """
    for i in house_url_list:
        # Bind the name used by the error message up front, so a link without
        # dongid/roomid is reported as itself instead of raising NameError
        # (first item) or printing the previous item's URL.
        house_url = i
        try:
            dongid = re.search('dongid=(.*?)&', i).group(1)
            roomid = re.search('roomid=(.*?)&', i).group(1)
            house_url = ('http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid='
                         + dongid + '&roomid=' + roomid)

            house = House(co_index)
            # Extraction rules applied to the fetched page by the analyzer.
            house.co_name = 'Labelxqmc">(.*?)<'
            house.area = 'Labelxzq">(.*?)<'
            house.bu_num = 'Labeldongmc">(.*?)<'
            house.ho_type = 'Labelyxyongtu">(.*?)<'
            house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
            house.ho_build_size = 'Labeljzmianji">(.*?)<'
            house.ho_true_size = 'Labeltaonei">(.*?)<'
            house.ho_share_size = 'Labelgongtan">(.*?)<'
            house.ho_room_type = 'Labelhuxing">(.*?)<'
            house.bu_id = 'dongid=(.*?)&'

            p = ProducerListUrl(page_url=house_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, bu_id, co_id):
    """POST the building-table request and insert one ``House`` per room.

    The response HTML is searched for ``clickRoom`` elements; the text of
    each ``title`` attribute is parsed for room number, total area, usage
    and layout.

    Args:
        bu_id: Building id; embedded in the XML payload (must be a string)
            and stored on every record.
        co_id: Stored on every record.
    """
    flags = re.S | re.M
    house_url = "http://www.xyfdc.gov.cn/wsba/Common/Agents/ExeFunCommon.aspx"
    payload = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
        '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
        '<item>' + bu_id + '</item>\r\n'
        '<item>1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>80</item>\r\n'
        '<item>840</item>\r\n'
        '<item>g_oBuildTable</item>\r\n'
        '<item> 1=1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>false</item>\r\n'
        '</param>\r\n'
    )
    html = requests.request(
        "POST", house_url, data=payload, headers={'Content-Type': "text/xml"}
    ).text

    # Attribute name -> pattern applied to one room's title text, in order.
    title_rules = (
        ('ho_name', '房号:(.*?)单元:'),
        ('ho_build_size', '总面积:(.*?)平方米'),
        ('ho_type', '用途:(.*?)户型'),
        ('ho_room_type', '户型:(.*?)状态'),
    )
    room_titles = re.findall(
        "onclick=.g_oBuildTable.clickRoom.*? title='(.*?)'", html, flags)
    for title in room_titles:
        try:
            house = House(co_index)
            for attr, rule in title_rules:
                setattr(house, attr, re.search(rule, title, flags).group(1))
            house.info = title
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print(
                '房号错误,co_index={},url={},data={}'.format(
                    co_index, house_url, payload), e)
def get_house_info(self, house_url_list, co_name, bu_num):
    """Fetch every room page in ``house_url_list`` and insert a ``House``.

    Args:
        house_url_list: Paths appended to the ``pubinfo/`` base address.
        co_name: Stored on every record.
        bu_num: Stored on every record.

    Any exception raised while handling one page is printed and the loop
    moves on to the next page.
    """
    flags = re.S | re.M
    # Attribute name -> pattern; the first match on the page is used.
    page_rules = (
        ('ho_floor', 'HouseInfo1_lblFwlc">(.*?)<'),
        ('ho_name', 'HouseInfo1_lblFwfh">(.*?)<'),
        ('ho_type', 'HouseInfo1_lblFwlx">(.*?)<'),
        ('ho_room_type', 'HouseInfo1_lblFwhx">(.*?)<'),
        ('ho_build_size', 'HouseInfo1_lblycfwjzmj">(.*?)<'),
        ('ho_true_size', 'HouseInfo1_lblycfwtnmj">(.*?)<'),
        ('ho_share_size', 'HouseInfo1_lblycfwftmj">(.*?)<'),
        ('orientation', 'HouseInfo1_lblCx">(.*?)<'),
    )
    for path in house_url_list:
        try:
            house = House(co_index)
            house.co_name = co_name
            house.bu_num = bu_num
            house_url = 'http://www.sxczfdc.com/pubinfo/' + path
            html = requests.get(house_url, headers=self.headers).text
            for attr, rule in page_rules:
                setattr(house, attr, re.findall(rule, html, flags)[0])
            house.insert_db()
        except Exception as e:
            print(e)
def get_house_detail(self, house_url_list, bu_id, co_id):
    """Fetch each ``RoomInfo.aspx`` page by code and insert a ``House``.

    Args:
        house_url_list: Room codes appended to the ``?code=`` query.
        bu_id: Stored on every record.
        co_id: Stored on every record.

    A failure on one page is printed and the remaining pages are still
    processed.
    """
    flags = re.S | re.M
    # Attribute name -> pattern keyed on the element id in the page.
    id_rules = (
        ('ho_name', 'id="ROOM_ROOMNO">(.*?)<'),
        ('ho_room_type', 'id="ROOM_FWHX">(.*?)<'),
        ('ho_type', 'id="ROOM_GHYT">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTMJ">(.*?)<'),
    )
    for code in house_url_list:
        try:
            house = House(co_index)
            house_detail_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/RoomInfo.aspx?code=' + code
            html = requests.get(house_detail_url, headers=self.headers).text
            for attr, rule in id_rules:
                setattr(house, attr, re.search(rule, html, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print(
                '房号错误,co_index={},url={}'.format(co_index, house_detail_url), e)
def ho_parse(self, bid, co_id):
    """POST the building-table request for ``bid`` and insert its rooms.

    The XML payload is URL-quoted before it is sent. Every ``title='...'>``
    attribute in the response is parsed as one room.

    Args:
        bid: Building id; embedded in the payload (must be a string) and
            stored as ``bu_id`` on every record.
        co_id: Stored on every record.

    Returns:
        None. If the request itself fails the error is logged and the
        method returns without parsing anything.
    """
    payload = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>'
        '<param funname="SouthDigital.CMS.CBuildTableEx.GetBuildHTMLEx"><item>'
        + bid +
        '</item><item>1</item><item>1</item><item>100</item><item>1000</item>'
        '<item>g_oBuildTable</item><item> 1=1</item><item>1</item></param>'
    )
    payload = parse.quote(payload)
    try:
        res = requests.post(
            'http://www.hbsfdc.com/Common/Agents/ExeFunCommon.aspx',
            data=payload,
            headers=self.headers)
    except Exception:
        log.error("{}楼栋请求失败".format(bid))
        # Without a response there is nothing to parse; falling through
        # would raise NameError on the unbound ``res``.
        return

    con = res.content.decode()
    ho_list = re.findall("title='(.*?)'>", con, re.S | re.M)
    for ho in ho_list:
        try:
            house = House(co_index)
            house.co_id = co_id
            house.bu_id = bid
            house.ho_name = re.search('房号:(.*)', ho).group(1)
            house.ho_type = re.search('用途:(.*)', ho).group(1)
            house.ho_room_type = re.search('户型:(.*)', ho).group(1)
            house.ho_build_size = re.search('总面积:(.*)', ho).group(1)
            # The price is optional: search once and reuse the match.
            price = re.search('售价:(.*)', ho)
            house.ho_price = price.group(1) if price else None
        except AttributeError:
            # A required field is missing from this title text; skip this
            # entry instead of aborting the rest of the building.
            log.error('房号错误,co_index={},bu_id={}'.format(co_index, bid))
            continue
        house.insert_db()
def house_parse(self, house_url, co_id, bu_id):
    """Parse a building page, follow each room's detail link, and insert.

    Room number, layout, area, price and detail links are extracted from
    the listing page as parallel lists. For each room the detail page is
    fetched (decoded as gb2312) to read the floor.

    Args:
        house_url: Path appended to the site root for the listing page.
        co_id: Stored on every record.
        bu_id: Stored on every record.

    A failure on one room is printed and the next room is tried.
    """
    ho = House(co_index)
    url = "http://spf.tlfdc.cn/" + house_url
    con = requests.get(url, headers=self.headers).text

    ho_name = re.findall('室号:(.*?)套', con, re.S | re.M)
    ho_room_type = re.findall('套型:(.*?)建', con, re.S | re.M)
    ho_build_size = re.findall('建筑面积:(.*?)参', con, re.S | re.M)
    ho_price = re.findall('价格:(.*?)元', con, re.S | re.M)
    ho_detail = re.findall(
        r'href="(show.*?\?id=\d+&id2=\d+&prjid=\d+)"', con, re.S | re.M)

    for index, name in enumerate(ho_name):
        try:
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = name
            ho.ho_room_type = ho_room_type[index]
            ho.ho_build_size = ho_build_size[index]
            ho.ho_price = ho_price[index]

            ho_detail_url = "http://spf.tlfdc.cn/" + ho_detail[index]
            detail_res = requests.get(ho_detail_url, headers=self.headers)
            detail_html = detail_res.content.decode('gb2312')
            ho.ho_floor = re.findall(
                '楼层.*?">(.*?)</td>', detail_html, re.S | re.M)[0].strip()
            ho.insert_db()
        except Exception as e:
            # ``e`` must be bound here: the former bare ``except:`` printed an
            # undefined name and raised NameError from inside the handler.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def get_house_detail(self, house_detail_url_list, co_id, bu_id):
    """Fetch each room page through ``self.s`` and insert a ``House``.

    The method sleeps three seconds before every request.

    Args:
        house_detail_url_list: Paths appended to the site root.
        co_id: Stored on every record.
        bu_id: Stored on every record.

    A failure on one page is printed together with its URL and the loop
    continues.
    """
    flags = re.S | re.M
    # Attribute name -> pattern keyed on the label id fragment in the page.
    label_rules = (
        ('co_name', 'lblxmmc.*?>(.*?)<'),
        ('bu_num', 'lbldh.*?>(.*?)<'),
        ('ho_name', 'lblfh.*?>(.*?)<'),
        ('ho_build_size', 'lbljzmj.*?>(.*?)<'),
        ('ho_true_size', 'lbltnmj.*?>(.*?)<'),
        ('ho_share_size', 'lblftmj.*?>(.*?)<'),
        ('ho_type', 'lblfwxz.*?>(.*?)<'),
        ('ho_room_type', 'lblhuxin.*?>(.*?)<'),
    )
    for path in house_detail_url_list:
        detail_url = 'http://www.yzfdc.cn/' + path
        try:
            house = House(co_index)
            time.sleep(3)
            html = self.s.get(detail_url, headers=self.headers).text
            for attr, rule in label_rules:
                setattr(house, attr, re.search(rule, html, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, detail_url), e)
def ho_info(self, url, co_id, bu_id):
    """Fetch a building page through a random proxy and insert its rooms.

    The request is retried, each time with a freshly chosen proxy, until
    one attempt succeeds. Every ``<td nowrap>`` cell that contains a link
    is parsed as one room; the cell's ``title`` attribute carries the
    areas, layout and price.

    Args:
        url: Path appended to the site root.
        co_id: Stored on every record.
        bu_id: Stored on every record.
    """
    ho_url = 'http://www.aqhouse.net/' + url
    # NOTE(review): the retry loop has no attempt limit or back-off, as before.
    while True:
        try:
            # Choose from whatever proxies are configured rather than
            # assuming the pool holds exactly ten entries.
            proxy = random.choice(self.proxies)
            ho_res = requests.get(ho_url, headers=self.headers, proxies=proxy)
            break
        except Exception as e:
            print(e)

    ho_html = etree.HTML(ho_res.text)
    room_list = ho_html.xpath("//td[@nowrap]/a/..")
    for room in room_list:
        try:
            room_info = room.xpath("./@title")[0]
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = room.xpath("./a/text()")[0]
            ho.ho_build_size = re.search('建筑面积:(.*?)平方米', room_info).group(1)
            ho.ho_true_size = re.search('套内面积:(.*?)平方米', room_info).group(1)
            ho.ho_share_size = re.search('分摊面积:(.*?)平方米', room_info).group(1)
            ho.ho_room_type = re.search('套型:(.*)', room_info).group(1)
            ho.ho_price = re.search('价格.*?:(.*?)元/平方米', room_info).group(1)
            ho.insert_db()
        except Exception as e:
            # Report the cause; a bare ``except`` would also trap
            # KeyboardInterrupt and hide what went wrong.
            print('房屋解析失败', e)
def get_house_info(self, house_id_list, bu_id, co_id):
    """Fetch each ``RoomInfo.aspx`` page by room code and insert a ``House``.

    Args:
        house_id_list: Room codes appended to the ``?code=`` query.
        bu_id: Stored on every record.
        co_id: Stored on every record.

    A failure on one page is printed together with its URL and the loop
    continues.
    """
    flags = re.S | re.M
    # Attribute name -> pattern keyed on the element id in the page.
    id_rules = (
        ('ho_name', 'id="ROOM_HH">(.*?)<'),
        ('ho_floor', 'id="ROOM_MYC">(.*?)<'),
        ('ho_type', 'id="ROOM_FWYT">(.*?)<'),
        ('ho_room_type', 'id="ROOM_HX">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNJZMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTJZMJ">(.*?)<'),
    )
    for code in house_id_list:
        house_url = 'http://www.hbczfdc.com:4993/HPMS/RoomInfo.aspx?code=' + code
        try:
            house = House(co_index)
            html = requests.get(house_url, headers=self.headers).text
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, rule in id_rules:
                setattr(house, attr, re.search(rule, html, flags).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def room_crawler(self, room):
    """Walk the floor table of one building page and insert every room.

    Rows with ``bgcolor='#fbf3e6'`` (excluding the first and last) are
    treated as floors. For each room cell, the data comes either from the
    link's ``wf`` attribute (when the href is a single character) or from
    a separate detail page.

    Args:
        room: Path appended to the site root; also searched for two
            numeric ids that become ``co_id`` and ``bu_id``.
    """
    house_url = "http://www.hzszjj.gov.cn" + room
    res = requests.get(house_url, )
    con = etree.HTML(res.text)
    ho_table = con.xpath("//tr[@bgcolor='#fbf3e6']")

    # The ids depend only on ``room``, so search once. ``.group`` is still
    # called per room, so a non-matching path fails only when a room exists.
    id_num = re.search(r"(\d+)&\w+=(\d+)", room)

    for ho_list in ho_table[1:-1]:
        ho_floor = ho_list.xpath("./td[@align='center']/text()")[0]
        honum_list = ho_list.xpath(".//tr/td[@height='40']")
        for house in honum_list:
            # Fresh record per room: with one shared object, fields set by
            # one branch (share/true size, layout) leaked into later rooms
            # handled by the other branch.
            ho = House(co_index)
            ho.ho_floor = ho_floor  # floor
            ho.co_id = id_num.group(1)  # community id
            ho.bu_id = id_num.group(2)  # building id

            ho_url = house.xpath("./a/@href")[0]
            if len(ho_url) == 1:
                # Single-character href: details are embedded in ``wf``.
                ho_info = house.xpath("./a/@wf")[0]
                ho.ho_name = house.xpath("./a/text()")[0]
                info = re.search(
                    r":(.*?)<br>.*?:(.*?)<br>(.*?)<br><hr>.*?:(.*?)m.*?<br>.*?:(.*?)<br>.*?:(.*?)m",
                    ho_info)
                ho.ho_type = info.group(5)
                ho.ho_build_size = info.group(4)
                ho.ho_room_type = info.group(2)
            else:
                detail_url = "http://www.hzszjj.gov.cn/ts_web_dremis/web_house_dir/" + ho_url
                detail = etree.HTML(requests.get(detail_url).text)
                ho.ho_name = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_name']/text()"
                )[0]
                ho.ho_type = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_type']/text()"
                )[0]
                ho.ho_build_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_build_area']/text()"
                )[0]
                ho.ho_share_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_share_area']/text()"
                )[0]
                ho.ho_true_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_inside_area']/text()"
                )[0]
            ho.insert_db()
def get_house_info(self, bu_url, bu_id):
    """Fetch a building's room table and insert one ``House`` per room.

    The ``qrykey`` query parameter of ``bu_url`` selects the room table;
    every ``select_room('<id>'`` handler in it yields a room id whose
    detail page is then fetched and parsed.

    Args:
        bu_url: Building URL; must contain ``qrykey=...&``.
        bu_id: Stored on every record.

    A room whose page cannot be fetched or parsed is reported and skipped.
    The method sleeps two seconds after every successful detail request.
    """
    qrykey = re.search('qrykey=(.*?)&', bu_url).group(1)
    house_url = 'http://old.newhouse.cnnbfdc.com/GetHouseTable.aspx?qrykey=' + qrykey
    html = requests.get(house_url, headers=self.headers).text
    # Raw string: ``\(`` is an invalid escape in an ordinary literal.
    house_code_list = re.findall(r"onclick=select_room\('(.*?)'", html, re.S | re.M)

    for room_id in house_code_list:
        house_detail_url = 'http://old.newhouse.cnnbfdc.com/openRoomData.aspx?roomId=' + room_id
        try:
            res = requests.get(house_detail_url, headers=self.headers)
        except Exception as e:
            print("{}城市无法访问房屋页面{}".format(city, house_detail_url), e)
            continue
        time.sleep(2)

        content = res.text
        ho = House(co_index)
        ho.bu_id = bu_id
        try:
            ho.ho_name = re.search('室号.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.ho_floor = re.search('楼层.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.ho_room_type = re.search('户型.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.ho_type = re.search('用途.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.ho_build_size = re.search('预测建筑面积.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.ho_true_size = re.search('预测套内面积.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.ho_share_size = re.search('预测分摊面积.*?">(.*?)</td>', content, re.S | re.M).group(1)
            ho.insert_db()
        except Exception as e:
            print("{}房号错误,请求频繁,当前页面{}未提取".format(city, house_detail_url), e)
            continue
def get_house_info(self, con, co_id, build_id):
    """Parse the embedded house table in ``con`` and insert each room.

    Args:
        con: Page text; must contain the span from ``houseTableData`` to
            ``特别申明`` (otherwise the initial search raises).
        co_id: Stored on every record.
        build_id: Stored as ``bu_id`` on every record.

    A ``<div style ...>`` block that cannot be parsed is reported with a
    fixed message and skipped.
    """
    flags = re.S | re.M

    def field(key, text):
        # Value of the quoted ``'KEY':'value',`` pair inside one block.
        return re.search("'" + key + "':'(.*?)',", text, flags).group(1)

    html_str = re.search('houseTableData.*?特别申明', con, flags).group()
    blocks = re.findall('<div style.*?</div>', html_str, flags)
    for block in blocks:
        try:
            ho = House(co_index)
            ho.ho_name = field('HC_HOUSENUMB', block)
            ho.ho_room_type = field('HC_HOUSETYPE', block)
            ho.ho_build_size = field('HC_STCTAREA', block)
            ho.bu_id = build_id
            ho.co_id = co_id
            ho.insert_db()
        except Exception as e:
            print('house error, co_index={}'.format(co_index))
def get_house_info(self, house_url, bu_id, co_id):
    """Parse a gbk-encoded building page and insert one ``House`` per cell.

    Each ``title ... </td>`` cell supplies an area and a room label. When
    the label contains a link, the linked detail page is fetched and its
    values replace or extend the ones taken from the cell.

    Args:
        house_url: Path appended to the site root.
        bu_id: Stored on every record.
        co_id: Stored on every record.

    A failure on one cell is printed with the building URL and the next
    cell is processed.
    """
    flags = re.S | re.M
    site = 'http://www.fangdi.com.cn/'
    ho_url = site + house_url
    html = requests.get(ho_url, headers=self.headers).content.decode('gbk')
    house_html = re.search('室号 <.*?</table>.*?</table>', html, flags).group()

    # Attribute name -> pattern applied to the detail page, in order.
    detail_rules = (
        ('ho_floor', '实际层.*?<TD.*?>(.*?)<'),
        ('ho_name', '室号.*?<TD.*?>(.*?)<'),
        ('ho_type', '房屋类型.*?<TD.*?>(.*?)<'),
        ('ho_room_type', '房型.*?<TD.*?>(.*?)<'),
        ('ho_build_size', '实测建筑面积.*?<TD.*?>(.*?)<'),
        ('ho_true_size', '实测套内面积.*?<TD.*?>(.*?)<'),
        ('ho_share_size', '实测分摊面积.*?<TD.*?>(.*?)<'),
    )

    for cell in re.findall('title.*?</td>', house_html, flags):
        try:
            house = House(co_index)
            house.ho_build_size = re.search('实测面积:(.*?)>', cell, flags).group(1)
            house.ho_name = re.search('实测面积.*?>(.*?)<br>', cell, flags).group(1).strip()
            house.bu_id = bu_id
            house.co_id = co_id
            if '<a' in house.ho_name:
                # The label is an anchor: follow it for the full record.
                link = re.search('href="(.*?)"', house.ho_name, flags).group(1)
                detail_page = requests.get(site + link, headers=self.headers)
                html_str = detail_page.content.decode('gbk')
                for attr, rule in detail_rules:
                    setattr(house, attr, re.search(rule, html_str, flags).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, ho_url), e)
def house_info(self, house_list, bu_id, co_id):
    """Fetch every room page in ``house_list`` and insert a ``House``.

    Args:
        house_list: Paths appended to the ``bol/`` base; each must contain
            ``id=<digits>``.
        bu_id: Stored on every record.
        co_id: Stored on every record.

    A page on which an expected field is not found is reported and
    skipped; request and insert errors propagate as before.
    """
    for house_url in house_list:
        url = "http://ris.szpl.gov.cn/bol/" + house_url
        res = requests.get(url, headers=self.headers)
        con = res.text
        # One record per page, matching the sibling crawlers.
        ho = House(co_index)
        try:
            ho.ho_num = re.search(r'id=(\d+)', house_url).group(1)
            ho.bu_num = re.search('情况.*?">(.*?)&', con).group(1)
            ho.bu_id = bu_id
            ho.co_id = co_id
            ho.ho_floor = re.search(r'楼层.*?">(\d+)&', con).group(1)
            # NOTE(review): this overwrites the id taken from the URL above;
            # confirm which value ``ho_num`` is meant to hold.
            ho.ho_num = re.search(r'房号.*?">(\d+)&', con).group(1)
            ho.ho_type = re.search(r'用途.*?">(\d+)&', con).group(1)
            ho.ho_room_type = re.search(r'户型.*?">(\d+)&', con).group(1)
            # NOTE(review): the ``.`` between the digit runs is unescaped and
            # matches any character; left unchanged to keep matching as is.
            ho.ho_build_size = re.search(r'建筑面积<.*?">(\d+.\d+)平方米', con).group(1)
            ho.ho_true_size = re.search(r'户内面积<.*?">(\d+.\d+)平方米', con).group(1)
        except AttributeError as e:
            # A pattern did not match (``re.search`` returned None): skip
            # this page instead of abandoning the remaining ones.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
        ho.insert_db()
def get_house_detail(self, house_url_list, bu_id, co_id):
    """Fetch each ``housedetail.aspx`` page by id and insert a ``House``.

    Args:
        house_url_list: House ids appended to the ``?houseID=`` query.
        bu_id: Stored on every record.
        co_id: Stored on every record.

    A failure on one page is printed together with its URL and the loop
    continues.
    """
    flags = re.S | re.M

    def label(number, page):
        # Text of the element whose id is ``Label<number>``.
        return re.search('id="Label' + number + '">(.*?)<', page, flags).group(1)

    for house_id in house_url_list:
        try:
            house = House(co_index)
            house_detail_url = 'http://222.184.103.50:7700/WW/housedetail.aspx?houseID=' + house_id
            html = requests.get(house_detail_url, headers=self.headers).text
            house.ho_name = label('1', html)
            house.ho_room_type = label('2', html)
            house.ho_build_size = label('3', html)
            house.co_id = co_id
            house.bu_id = bu_id
            house.insert_db()
        except Exception as e:
            print(
                '请求错误,co_index={},url={}'.format(co_index, house_detail_url), e)
def get_house_info(self, bu_id, co_id):
    """Fetch a building's room list via ``self.request_proxy`` and insert.

    The gbk-decoded page is narrowed to the section between
    ``rpt_ewlpblc_fjlistdiv_0`` and ``erp_con_2``; every ``<span>`` in it
    is parsed as one room, with layout and area taken from its ``title``
    attribute.

    Args:
        bu_id: Used in the query string (must be a string) and stored on
            every record.
        co_id: Used in the query string (must be a string) and stored on
            every record.

    A span that cannot be parsed is reported and skipped.
    """
    flags = re.S | re.M
    house_url = ('http://b.fang99.com/buildinglistselect.aspx?buildingid='
                 + co_id + '&xmbh=&lzbh=' + bu_id)
    page = self.request_proxy(house_url, headers=self.headers)
    html = page.content.decode('gbk')
    house_html = re.search('rpt_ewlpblc_fjlistdiv_0.*?erp_con_2', html, flags).group()

    for span in re.findall('<span.*?</span>', house_html, flags):
        try:
            house = House(co_index)
            house.ho_room_type = re.search('title="(.*?),', span, flags).group(1)
            house.ho_build_size = re.search('title=".*?,(.*?)"', span, flags).group(1)
            # The room label sits inside a link when one exists, otherwise
            # directly inside the span.
            name_rule = '<a.*?>(.*?)<' if '<a' in span else '<span.*?>(.*?)<'
            house.ho_name = re.search(name_rule, span, flags).group(1)
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def ho_info(self, bu_url_list, co_id):
    """Fetch every building page and insert one ``House`` per room item.

    Rooms are the ``<li class="tjCor4">`` elements; their ``title``
    attribute carries number, layout, area, unit price and usage. The
    building id is the ``dbh=<digits>`` parameter of the page URL.

    Args:
        bu_url_list: Full building page URLs.
        co_id: Stored on every record.

    Any error while handling one building is logged and the next
    building is processed.
    """
    for bu_url in bu_url_list:
        try:
            res = requests.get(bu_url, headers=self.headers)
            html = etree.HTML(res.text)
            # Same for every room on the page, so search the URL once.
            bu_match = re.search(r'dbh=(\d+)', bu_url)
            for house_info in html.xpath("//li[@class='tjCor4']"):
                house = house_info.xpath("./@title")[0]
                ho = House(co_index)
                ho.co_id = co_id
                ho.bu_id = bu_match.group(1)
                ho.ho_name = re.search('房号:(.*?)<br', house).group(1)
                ho.ho_room_type = re.search('户型:(.*?)<br', house).group(1)
                ho.ho_build_size = re.search('建筑面积:(.*?)平方米', house).group(1)
                ho.ho_price = re.search('单价:(.*?)元', house).group(1)
                ho.ho_type = re.search('用途:(.*?)<br', house).group(1)
                ho.insert_db()
        except Exception as e:
            # Put the error into the message itself: passing it as an extra
            # positional argument without a placeholder fails to format
            # under a stdlib-style logger (presumably what ``log`` is).
            log.error('房号信息错误: {}'.format(e))
def get_house_info(self, house_url_list):
    """Fetch each building page and insert one ``House`` per table row.

    Rows are the ``<tr align="center">`` elements; field values are taken
    by position from the row's cell texts. The building id is the
    ``ID=<digits>`` parameter of the page URL.

    Args:
        house_url_list: Full page URLs.

    A row that cannot be parsed (for example one with too few cells) is
    reported and skipped.
    """
    for url in house_url_list:
        response = requests.get(url)
        html = etree.HTML(response.text)
        # Same for every row of this page, so search the URL once.
        id_match = re.search(r'ID=(\d+)', url)
        for row in html.xpath("//tr[@align='center']"):
            try:
                house = House(co_index)
                # Evaluate the XPath once per row instead of once per field.
                cells = row.xpath("./td/text()")
                house.ho_name = cells[1]
                house.ho_floor = cells[0]
                house.ho_build_size = cells[3]
                house.ho_true_size = cells[4]
                house.ho_share_size = cells[5]
                house.ho_room_type = cells[2]
                house.ho_price = cells[-1]
                house.orientation = cells[-2]
                house.bu_id = id_match.group(1)
                house.insert_db()
            except Exception as e:
                print('房号错误,co_index={},url={}'.format(co_index, url), e)
def get_house_info(self, code, co_name):
    """POST the building-table request and insert one ``House`` per room.

    Every ``title='...'`` attribute in the response is parsed as one room;
    its fields are terminated by CR LF inside the attribute text.

    Args:
        code: Indexable pair; ``code[0]`` is embedded in the XML payload
            (must be a string) and ``code[1]`` is stored as ``bu_num``.
        co_name: Stored on every record.

    A title that cannot be parsed is printed as an error and skipped.
    """
    house_url = 'http://house.bffdc.gov.cn/Common/Agents/ExeFunCommon.aspx?'
    payload = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
        '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
        '<item>' + code[0] + '</item>\r\n'
        '<item>1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>55</item>\r\n'
        '<item>840</item>\r\n'
        '<item>g_oBuildTable</item>\r\n'
        '<item>false</item>\r\n'
        '<item> 1=1</item>\r\n'
        '</param>\r\n'
    )
    headers = {'Content-Type': "text/xml"}
    html = requests.post(url=house_url, data=payload, headers=headers).text

    # Attribute name -> pattern applied to one title text, in order.
    title_rules = (
        ('ho_name', '房号:(.*?)\r\n'),
        ('ho_type', '用途:(.*?)\r\n'),
        ('ho_room_type', '户型:(.*?)\r\n'),
        ('ho_build_size', '总面积:(.*?)\r\n'),
    )
    for title in re.findall("title='(.*?)'", html, re.S | re.M):
        try:
            house = House(co_index)
            house.bu_num = code[1]
            for attr, rule in title_rules:
                setattr(house, attr, re.search(rule, title).group(1))
            house.co_name = co_name
            house.insert_db()
        except Exception as e:
            print(e)
def get_house_info(self, house_url_list):
    """Queue one regex-driven scrape per room path.

    The ``House`` fields are filled with regex *rules* (not values); the
    rules are passed to ``ProducerListUrl`` through ``house.to_dict()``
    together with the page URL.

    Args:
        house_url_list: Paths appended to the ``House/`` base address.

    A failure on one path is printed with its URL and the loop continues.
    """
    # Attribute name -> extraction rule, assigned in this order.
    rules = (
        ('bu_num', '幢 号:.*?<td.*?>(.*?)<'),
        ('ho_name', '房 号:.*?<td.*?>(.*?)<'),
        ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td.*?>(.*?)<'),
        ('ho_type', '房屋用途:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td.*?>(.*?)<'),
        ('ho_room_type', '房屋户型:.*?<td.*?>(.*?)<'),
    )
    for path in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.ndjsj.gov.cn/House/' + path
            for attr, rule in rules:
                setattr(house, attr, rule)
            producer = ProducerListUrl(
                page_url=house_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=house.to_dict(),
                analyzer_type='regex',
                headers=self.headers,
            )
            producer.get_details()
        except Exception as e:
            print('宁德房号错误,url={}'.format(house_url), e)
def get_house_info(self, co_id, bu_id, house_detail_list):
    """Fetch every room detail page and insert a ``House`` for each.

    Args:
        co_id: Stored on every record.
        bu_id: Stored on every record.
        house_detail_list: Paths appended to ``self.url``.

    A page that cannot be fetched, does not answer with HTTP 200, or
    lacks an expected field is reported and skipped.
    """
    flags = re.S | re.M
    for house_detail in house_detail_list:
        house_url = self.url + house_detail
        try:
            house_res = requests.get(house_url, headers=self.headers)
        except Exception as e:
            print("co_index={},房屋信息错误".format(co_index), e)
            continue
        # The status was previously compared and the result discarded, so
        # error pages were parsed as if they were room pages.
        if house_res.status_code != 200:
            print("co_index={},房屋信息错误".format(co_index), house_res.status_code)
            continue

        house_con = house_res.text
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        try:
            ho.ho_name = re.search('房号.*?fh">(.*?)</span', house_con, flags).group(1)
            ho.orientation = re.search('朝向.*?Cx">(.*?)</span', house_con, flags).group(1)
            ho.ho_floor = re.search('层.*?lc">(.*?)</span', house_con, flags).group(1)
            ho.ho_room_type = re.search('房型.*?hx">(.*?)</span', house_con, flags).group(1)
            ho.ho_build_size = re.search('建筑面积.*?jzmj">(.*?)</span', house_con, flags).group(1)
            ho.ho_share_size = re.search('分摊面积.*?ftmj">(.*?)</span', house_con, flags).group(1)
            ho.ho_true_size = re.search('套内面积.*?tnmj">(.*?)</span', house_con, flags).group(1)
            ho.ho_type = re.search('用途.*?lx">(.*?)</span', house_con, flags).group(1)
        except AttributeError as e:
            # A pattern did not match (``re.search`` returned None): skip
            # this page instead of abandoning the rest of the list.
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
            continue
        ho.insert_db()