def ho_info(self, url, co_id, bu_id):
    """Fetch one aqhouse.net building page and record every room found on it.

    Args:
        url: Path appended to ``http://www.aqhouse.net/``.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
    """
    ho_url = 'http://www.aqhouse.net/' + url
    # Keep retrying through a randomly picked proxy until one request succeeds.
    # NOTE(review): there is no retry limit or back-off here -- confirm intended.
    while True:
        try:
            proxy = self.proxies[random.randint(0, 9)]
            ho_res = requests.get(ho_url, headers=self.headers, proxies=proxy)
            break
        except Exception as e:
            print(e)
    ho_html = etree.HTML(ho_res.text)
    # Select the parent <td nowrap> of every room link; its title holds the details.
    room_list = ho_html.xpath("//td[@nowrap]/a/..")
    for room in room_list:
        try:
            room_info = room.xpath("./@title")[0]
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = room.xpath("./a/text()")[0]
            ho.ho_build_size = re.search('建筑面积:(.*?)平方米', room_info).group(1)
            ho.ho_true_size = re.search('套内面积:(.*?)平方米', room_info).group(1)
            ho.ho_share_size = re.search('分摊面积:(.*?)平方米', room_info).group(1)
            ho.ho_room_type = re.search('套型:(.*)', room_info).group(1)
            ho.ho_price = re.search('价格.*?:(.*?)元/平方米', room_info).group(1)
            ho.insert_db()
        except Exception as e:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate, and report the cause of the failure.
            print('房屋解析失败', e)
def ho_parse(self, bid, co_id):
    """Post a building-table query to hbsfdc.com and record each room returned.

    Args:
        bid: Building id; embedded in the XML payload and used as ``bu_id``.
        co_id: Community id copied onto each ``House`` record.

    Returns early (after logging) when the HTTP request itself fails.
    """
    payload = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>'
        '<param funname="SouthDigital.CMS.CBuildTableEx.GetBuildHTMLEx"><item>'
        + bid
        + '</item><item>1</item><item>1</item><item>100</item><item>1000</item>'
        '<item>g_oBuildTable</item><item> 1=1</item><item>1</item></param>'
    )
    payload = parse.quote(payload)
    try:
        res = requests.post(
            'http://www.hbsfdc.com/Common/Agents/ExeFunCommon.aspx',
            data=payload,
            headers=self.headers)
    except Exception:
        log.error("{}楼栋请求失败".format(bid))
        # Without a response there is nothing to parse; previously execution
        # fell through and crashed with a NameError on ``res``.
        return
    con = res.content.decode()
    # Each room's details live in the title attribute of its cell.
    ho_list = re.findall("title='(.*?)'>", con, re.S | re.M)
    for ho in ho_list:
        house = House(co_index)
        house.co_id = co_id
        house.bu_id = bid
        house.ho_name = re.search('房号:(.*)', ho).group(1)
        house.ho_type = re.search('用途:(.*)', ho).group(1)
        house.ho_room_type = re.search('户型:(.*)', ho).group(1)
        house.ho_build_size = re.search('总面积:(.*)', ho).group(1)
        # The price line is optional; run the regex once and reuse the match.
        price_match = re.search('售价:(.*)', ho)
        house.ho_price = price_match.group(1) if price_match else None
        house.insert_db()
def get_house_detail(self, house_url_list, bu_id, co_id):
    """Download each lhfdc.gov.cn room page and record the parsed fields.

    Args:
        house_url_list: Iterable of room codes appended to the RoomInfo URL.
        bu_id: Building id copied onto each ``House`` record.
        co_id: Community id copied onto each ``House`` record.
    """
    flags = re.S | re.M
    # (House attribute, regex whose first group holds the value), in parse order.
    field_patterns = (
        ('ho_name', 'id="ROOM_ROOMNO">(.*?)<'),
        ('ho_room_type', 'id="ROOM_FWHX">(.*?)<'),
        ('ho_type', 'id="ROOM_GHYT">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTMJ">(.*?)<'),
    )
    for code in house_url_list:
        try:
            house = House(co_index)
            house_detail_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/RoomInfo.aspx?code=' + code
            page = requests.get(house_detail_url, headers=self.headers).text
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, page, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            # Any request or parse failure skips this room only.
            print(
                '房号错误,co_index={},url={}'.format(co_index, house_detail_url), e)
def get_house_info(self, co_id, bu_id, id):
    """Fetch the yyfdcw.com room list for a building and record every room.

    Args:
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
        id: Value for the ``id`` query parameter of the list page (name kept
            for caller compatibility although it shadows the builtin).
    """
    house_list_url = "http://xx.yyfdcw.com/hetong/fdc_xxdxx.asp?id=" + str(id)
    res = requests.get(house_list_url, headers=self.headers)
    con = res.content.decode('gbk')
    # Raw string: "\(" is an invalid escape in a normal string literal.
    house_list = re.findall(r"onClick=.*?open\('(.*?)',", con, re.S | re.M)
    flags = re.S | re.M
    for house_ in house_list:
        # The guard covers the request and parsing -- the steps that can
        # actually fail -- so one bad room no longer aborts the rest.
        try:
            house_url = "http://xx.yyfdcw.com/hetong/" + house_
            ho_res = requests.get(house_url, headers=self.headers)
            ho_con = ho_res.content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('室号.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_floor = re.search('实际层.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_build_size = re.search('建筑面积.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_true_size = re.search('套内面积.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_share_size = re.search('分摊面积.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_price = re.search('价格.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_type = re.search('用途.*?fafa>(.*?)</TD', ho_con, flags).group(1)
        except Exception as e:
            print("co_index={},房屋信息错误".format(co_index), e)
            continue
        # Outside the guard, as before, so insert errors still propagate.
        ho.insert_db()
def get_house_info(self, bu_id, co_id):
    """Resolve a fzfgj.cn building to its logic-building id and record its rooms.

    Args:
        bu_id: Building id; appended to the ``Building_ID`` query and copied
            onto each ``House`` record.
        co_id: Community id copied onto each ``House`` record.
    """
    url = 'http://www.fzfgj.cn/website/presale/home/HouseTableControl/GetData.aspx?Building_ID=' + bu_id
    try:
        response = requests.get(url=url, headers=self.headers)
        tree = etree.XML(response.text)
        # The first response only yields the id needed for the second query.
        logo = tree.xpath('//LOGICBUILDING_ID/text()')[0]
        url_2 = 'http://www.fzfgj.cn/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logo
        result = requests.get(url_2, headers=self.headers)
        tree_2 = etree.XML(result.text)
        for node in tree_2.xpath('T_HOUSE'):
            try:
                # NOTE(review): the index is hard-coded to 11 while the error
                # messages report ``co_index`` -- confirm they agree.
                house = House(11)
                ho_name = node.xpath('ROOM_NUMBER/text()')[0]
                ho_build_size = node.xpath('BUILD_AREA/text()')[0]
                ho_true_size = node.xpath('BUILD_AREA_INSIDE/text()')[0]
                ho_share_size = node.xpath('BUILD_AREA_SHARE/text()')[0]
                ho_floor = node.xpath('FLOOR_REALRIGHT/text()')[0]
                ho_type = node.xpath('USE_FACT/text()')[0]
                house.co_id = co_id
                house.bu_id = bu_id
                house.ho_build_size = ho_build_size
                house.ho_true_size = ho_true_size
                house.ho_share_size = ho_share_size
                house.ho_floor = ho_floor
                house.ho_name = ho_name
                house.ho_type = ho_type
                house.insert_db()
            except Exception as e:
                print('房号错误,co_index={},url={}'.format(co_index, url_2), e)
    except Exception as e:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print('房号错误,co_index={},url={}'.format(co_index, url), e)
def get_house_detail(self, house_detail_url_list, co_id, bu_id):
    """Visit each yzfdc.cn room page through ``self.s`` and record its fields.

    Args:
        house_detail_url_list: Iterable of paths appended to ``http://www.yzfdc.cn/``.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
    """
    for path in house_detail_url_list:
        detail_url = 'http://www.yzfdc.cn/' + path
        try:
            house = House(co_index)
            # Pause three seconds before every request.
            time.sleep(3)
            page = self.s.get(detail_url, headers=self.headers).text

            def grab(pattern):
                """Return group 1 of ``pattern`` searched in the current page."""
                return re.search(pattern, page, re.S | re.M).group(1)

            house.co_name = grab('lblxmmc.*?>(.*?)<')
            house.bu_num = grab('lbldh.*?>(.*?)<')
            house.ho_name = grab('lblfh.*?>(.*?)<')
            house.ho_build_size = grab('lbljzmj.*?>(.*?)<')
            house.ho_true_size = grab('lbltnmj.*?>(.*?)<')
            house.ho_share_size = grab('lblftmj.*?>(.*?)<')
            house.ho_type = grab('lblfwxz.*?>(.*?)<')
            house.ho_room_type = grab('lblhuxin.*?>(.*?)<')
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, detail_url), e)
def get_house_detail(self, house_detail_url, co_id, bu_id):
    """Fetch one yfci.gov.cn room page and record the parsed fields.

    Args:
        house_detail_url: Path appended to the HousePresell base URL.
        co_id: Community id copied onto the ``House`` record.
        bu_id: Building id copied onto the ``House`` record.

    Returns ``None`` without inserting when the page contains '找不到记录'.
    """
    # Built before the try so the error message below can always reference it.
    house_detail_url_ = 'http://www.yfci.gov.cn:8080/HousePresell/' + house_detail_url
    try:
        house = House(co_index)
        response = requests.get(house_detail_url_, headers=self.headers)
        html = response.text
        if '找不到记录' in html:
            return
        flags = re.S | re.M
        house.ho_name = re.search('id="HouseNO".*?>(.*?)<', html, flags).group(1)
        house.ho_true_size = re.search('id="HouseArea".*?>(.*?)<', html, flags).group(1)
        house.ho_build_size = re.search('id="SumBuildArea1".*?>(.*?)<', html, flags).group(1)
        house.ho_type = re.search('id="HouseUse".*?>(.*?)<', html, flags).group(1)
        # The CHX element feeds ``orientation`` only. It used to be assigned to
        # ``ho_type`` as well, clobbering the HouseUse value parsed above.
        house.orientation = re.search('id="CHX".*?>(.*?)<', html, flags).group(1)
        house.co_id = co_id
        house.bu_id = bu_id
        house.insert_db()
    except Exception as e:
        print(
            '房号错误,co_index={},url={}'.format(co_index, house_detail_url_), e)
def house_parse(self, bu_id, co_id, sid, propertyid):
    """Page through the qzfdcgl.com price search and record every room row.

    Args:
        bu_id: Building id; sent as ``buildingid`` and copied onto each record.
        co_id: Community id copied onto each ``House`` record.
        sid: Value sent as the ``sid`` form field.
        propertyid: Value sent as the ``propertyid`` form field.

    Raises:
        AttributeError: If the first response has no '页数…/N' page counter.
    """
    search_url = 'http://tmsf.qzfdcgl.com/newhouse/property_pricesearch.htm'
    data = {
        'propertyid': propertyid,
        'sid': sid,
        'buildingid': bu_id,
        'tid': 'price',
        'page': 1
    }
    first_res = requests.post(search_url, data=data, headers=self.headers)
    # Raw string: "\d" is an invalid escape in a normal string literal.
    page_count = int(re.search(r'页数.*?/(\d+)', first_res.text).group(1))
    for page_no in range(1, page_count + 1):
        if page_no == 1:
            # The counter request already fetched page 1; reuse it rather than
            # posting the identical form again.
            con = first_res.text
        else:
            data['page'] = page_no
            con = requests.post(search_url, data=data, headers=self.headers).text
        ho_html = etree.HTML(con)
        for house in ho_html.xpath("//tr[@onmouseout]"):
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = house.xpath("./td[3]/a/div/text()")[0]
            ho.unit = house.xpath("./td[2]/a/div/text()")[0]
            # The numeric cells hold <span> elements; their class attributes
            # are handed to self.number_replace for conversion.
            buildsize = house.xpath("./td[4]/a/div/span/@class")
            truesize = house.xpath("./td[5]/a/div/span/@class")
            price = house.xpath("./td[9]/a/div/span/@class")
            ho.ho_build_size = self.number_replace(buildsize)
            ho.ho_true_size = self.number_replace(truesize)
            ho.ho_price = self.number_replace(price)
            ho.insert_db()
def house_info(self, co_id, bu_id, house_url_list):
    """Fetch each njhouse.com.cn room page via ``Proxy_contact`` and record it.

    Args:
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
        house_url_list: Iterable of paths appended to the ``/2016/spf/`` base URL.
    """
    flags = re.S | re.M
    for suffix in house_url_list:
        house_url = "http://www.njhouse.com.cn/2016/spf/" + suffix
        try:
            fetcher = Proxy_contact(app_name="nanjing", method='get', url=house_url, headers=self.headers)
            raw_page = fetcher.contact()
            page = raw_page.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('房号.*?;">(.*?)</td', page, flags).group(1)
            ho.ho_price = re.search('价格.*?<td>(.*?)元', page, flags).group(1)
            ho.ho_floor = re.search('楼层.*?;">(.*?)</td', page, flags).group(1)
            ho.ho_build_size = re.search('建筑面积.*?<td>(.*?)m', page, flags).group(1)
            ho.ho_true_size = re.search('套内面积.*?<td>(.*?)m', page, flags).group(1)
            ho.ho_share_size = re.search('分摊面积.*?<td>(.*?)m', page, flags).group(1)
            ho.ho_type = re.search('房屋类型.*?<td>(.*?)</td', page, flags).group(1)
        except Exception as e:
            log.error("房屋详情页错误{}".format(e))
        else:
            # Only insert when fetching and parsing both succeeded.
            ho.insert_db()
def get_build_info(self, url, co_id):
    """Parse one building page: record the building, then every room in its table.

    Args:
        url: Full building page URL; its trailing ``?<digits>`` becomes ``bu_id``.
        co_id: Community id copied onto the building and room records.
    """
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)
        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        bu_floor = tree.xpath('//*[@id="cell3-1"]/text()')
        bu_floor = self.is_none(bu_floor)  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]  # building area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]  # residential area
        bu_price = tree.xpath('//*[@id="lb_buildavg"]/text()')
        bu_price = self.is_none(bu_price)  # residential price
        # Raw string: "\?" and "\d" are invalid escapes in a normal literal.
        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id
        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()
        flags = re.S | re.M
        # Everything from the "row3" table row to the end of the document.
        house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
            # Only cells that contain <br>-separated lines are parsed.
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, flags)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, flags)[0]
            # Distinct names for the inner loop: it used to reuse ``i`` and so
            # rebound the outer loop variable.
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    if 'font' in raw_name:
                        ho_name = re.sub('<font.*?>', '', raw_name)
                    else:
                        ho_name = raw_name
                    # NOTE(review): index hard-coded to 8 while the building
                    # uses ``co_index`` -- confirm they agree.
                    house = House(8)
                    house.ho_name = ho_name
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print(e)
def house_parse(self, ho_url, co_id, bu_id):
    """Follow every room link on a 61.143.241.154 building page and record it.

    Args:
        ho_url: Path of the building page, appended to the site root.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
    """
    listing = requests.get("http://61.143.241.154/" + ho_url, headers=headers)
    listing_tree = etree.HTML(listing.content.decode('gbk'))
    flags = re.S | re.M
    # (House attribute, regex whose first group holds the value), in parse order.
    field_patterns = (
        ('ho_name', '房屋号.*?">(.*?)</td'),
        ('ho_true_size', '套内面积.*?">(.*?)</td'),
        ('ho_build_size', '建筑面积.*?">(.*?)</td'),
        ('orientation', '房屋朝向.*?">(.*?)</td'),
        ('ho_type', '用途.*?">(.*?)</td'),
        ('ho_price', '申报总价.*?">(.*?)</td'),
    )
    for detail in listing_tree.xpath("//td[@height='80']/a/@href"):
        try:
            detail_url = 'http://61.143.241.154/' + detail
            page = requests.get(detail_url, headers=headers).content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in field_patterns:
                setattr(ho, attr, re.search(pattern, page, flags).group(1))
            ho.insert_db()
        except Exception as e:
            log.error("{}房屋请求解析失败{}".format(detail, e))
def house_parse(self, bu_id, co_id):
    """Post to the tyfdc.gov.cn building view and record every room listed in it.

    Args:
        bu_id: Building id; sent as ``nid`` and copied onto each record.
        co_id: Community id; sent as ``projectid`` and copied onto each record.

    Returns early (after printing) when the HTTP request fails.

    Raises:
        IndexError: If a per-field match list is shorter than the room-id list.
    """
    house_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/probld/NBView.do?"
    formdata = {"nid": bu_id, "projectid": co_id}
    try:
        res = requests.post(house_url, data=formdata, headers=self.headers)
    except Exception as e:
        print("co_index={},房屋详情页无法访问".format(co_index), e)
        # Nothing to parse; previously execution fell through and crashed
        # with a NameError on ``res``.
        return
    con = res.text
    flags = re.S | re.M
    # Each field is gathered as a parallel list; rows are matched by position.
    # Raw strings replace literals that contained invalid "\)" / "\(" escapes.
    ho_name = re.findall(r"""'\);">(.*?) """, con, flags)
    ho_build_size = re.findall('<span.*?建筑面积:(.*?)㎡', con, flags)
    ho_true_size = re.findall('<span.*?套内面积:(.*?)分', con, flags)
    ho_share_size = re.findall('<span.*?分摊面积:(.*?)㎡', con, flags)
    ho_type = re.findall('<span.*?用途:(.*?)房', con, flags)
    ho_price = re.findall('<span.*?单价:(.*?)"', con, flags)
    ho_id = re.findall(r"getHouseBaseInfo\('(.*?)'\)", con, flags)
    for index in range(len(ho_id)):
        # Fresh record per room, matching the other parsers in this file.
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = ho_name[index]
        ho.ho_build_size = ho_build_size[index]
        ho.ho_type = ho_type[index]
        ho.ho_share_size = ho_share_size[index]
        ho.ho_price = ho_price[index]
        ho.ho_true_size = ho_true_size[index]
        ho.ho_num = ho_id[index]
        ho.insert_db()
def get_house_info(self, house_url_list, bu_id, co_id):
    """Download each room URL and record the fields parsed from its table.

    Args:
        house_url_list: Iterable of complete room-page URLs.
        bu_id: Building id copied onto each ``House`` record.
        co_id: Community id copied onto each ``House`` record.
    """
    flags = re.S | re.M
    # (House attribute, page label) in parse order; the value is the text of
    # the first <td> following the label.
    labelled_fields = (
        ('ho_name', '门牌号:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所在层:.*?<td.*?>(.*?)<'),
        ('ho_type', '房屋性质:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '预测建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '预测套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '预测分摊面积:.*?<td.*?>(.*?)<'),
        ('co_address', '房屋坐落:.*?<td.*?>(.*?)<'),
    )
    for room_url in house_url_list:
        try:
            house = House(co_index)
            page = requests.get(room_url, headers=self.headers).text
            for attr, pattern in labelled_fields:
                setattr(house, attr, re.search(pattern, page, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, room_url), e)
def get_house_info(self, house_url_list, bu_id, co_id):
    """Download each fjnpfdc.com room page (GBK) and record the parsed fields.

    Args:
        house_url_list: Iterable of paths appended to ``http://www.fjnpfdc.com/House/``.
        bu_id: Building id copied onto each ``House`` record.
        co_id: Community id copied onto each ``House`` record.
    """
    for path in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.fjnpfdc.com/House/' + path
            page = requests.get(house_url, headers=self.headers).content.decode('gbk')

            def cell_after(pattern):
                """Return group 1 of ``pattern`` searched in the current page."""
                return re.search(pattern, page, re.S | re.M).group(1)

            house.bu_id = bu_id
            house.co_id = co_id
            house.bu_num = cell_after('幢 号:.*?<td>(.*?)<')
            house.ho_name = cell_after('房 号:.*?<td>(.*?)<')
            house.co_name = cell_after('项目名称:.*?<td>(.*?)<')
            house.ho_build_size = cell_after('建筑面积:.*?<td>(.*?)<')
            house.ho_true_size = cell_after('套内面积:.*?<td>(.*?)<')
            house.ho_share_size = cell_after('分摊面积:.*?<td>(.*?)<')
            house.ho_floor = cell_after('所 在 层:.*?<td>(.*?)<')
            house.insert_db()
        except Exception as e:
            print("co_index={},房屋{}错误".format(co_index, path), e)
def get_house_info(self, bu_id, co_id):
    """Post a building-table XML query to xyfdc.gov.cn and record each room.

    Args:
        bu_id: Building id (string); embedded in the payload and copied onto
            each ``House`` record.
        co_id: Community id copied onto each ``House`` record.
    """
    house_url = "http://www.xyfdc.gov.cn/wsba/Common/Agents/ExeFunCommon.aspx"
    payload = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
        '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
        '<item>' + bu_id + '</item>\r\n'
        '<item>1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>80</item>\r\n'
        '<item>840</item>\r\n'
        '<item>g_oBuildTable</item>\r\n'
        '<item> 1=1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>false</item>\r\n'
        '</param>\r\n'
    )
    headers = {'Content-Type': "text/xml"}
    html = requests.request("POST", house_url, data=payload, headers=headers).text
    flags = re.S | re.M
    # One title attribute per clickable room cell.
    room_titles = re.findall(
        "onclick=.g_oBuildTable.clickRoom.*? title='(.*?)'", html, flags)
    for title in room_titles:
        try:
            house = House(co_index)
            house.ho_name = re.search('房号:(.*?)单元:', title, flags).group(1)
            house.ho_build_size = re.search('总面积:(.*?)平方米', title, flags).group(1)
            house.ho_type = re.search('用途:(.*?)户型', title, flags).group(1)
            house.ho_room_type = re.search('户型:(.*?)状态', title, flags).group(1)
            # The unparsed title text is stored alongside the parsed fields.
            house.info = title
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print(
                '房号错误,co_index={},url={},data={}'.format(
                    co_index, house_url, payload), e)
def get_house_info(self, co_id, bu_id):
    """Post to the LeadingEstate building table and record each room object.

    Args:
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id (string); sent as ``BuidID`` and copied onto each record.

    Returns early (after printing) when the HTTP request fails.
    """
    house_url = "http://202.103.219.149:7000/LeadingEstate/buildingtable/ShowNewBuildingTable.aspx"
    payload = "IsShowHouse=1&BuidID=" + bu_id
    headers = {'Content-Type': "application/x-www-form-urlencoded"}
    try:
        response = requests.request("POST", house_url, data=payload, headers=headers)
        html = response.text
    except Exception as e:
        print('请求错误,url={},data={}'.format(house_url, payload), e)
        return
    flags = re.S | re.M
    # Raw string: "\}" is an invalid escape in a normal string literal.
    for chunk in re.findall(r'HouseID.*?\}', html, flags):
        # Rows are guarded individually so one malformed row no longer aborts
        # the rest or gets reported as a request error.
        try:
            house = House(co_index)
            house.bu_id = bu_id
            house.co_id = co_id
            house.ho_name = re.search('"YCHouseNo":"(.*?)"', chunk, flags).group(1)
            house.ho_floor = re.search('"ActFLoor":"(.*?)"', chunk, flags).group(1)
            house.ho_build_size = re.search('"YCJZArea":"(.*?)"', chunk, flags).group(1)
            house.ho_true_size = re.search('"YCTNJZArea":"(.*?)"', chunk, flags).group(1)
            house.ho_share_size = re.search('"YCFTJZArea":"(.*?)"', chunk, flags).group(1)
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_id_list, bu_id, co_id):
    """Download each hbczfdc.com RoomInfo page and record the parsed fields.

    Args:
        house_id_list: Iterable of room codes appended to the RoomInfo URL.
        bu_id: Building id copied onto each ``House`` record.
        co_id: Community id copied onto each ``House`` record.
    """
    flags = re.S | re.M
    # (House attribute, element id on the page), in parse order.
    element_ids = (
        ('ho_name', 'id="ROOM_HH">(.*?)<'),
        ('ho_floor', 'id="ROOM_MYC">(.*?)<'),
        ('ho_type', 'id="ROOM_FWYT">(.*?)<'),
        ('ho_room_type', 'id="ROOM_HX">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNJZMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTJZMJ">(.*?)<'),
    )
    for code in house_id_list:
        house_url = 'http://www.hbczfdc.com:4993/HPMS/RoomInfo.aspx?code=' + code
        try:
            house = House(co_index)
            page = requests.get(house_url, headers=self.headers).text
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in element_ids:
                setattr(house, attr, re.search(pattern, page, flags).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_build_info(self, url, response, co_id, bu_id):
    """Record every room in a JSON room list, enriched from its detail page.

    Args:
        url: Unused here; kept for caller compatibility.
        response: HTTP response whose ``text`` is a JSON array of room objects.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.

    Rooms whose detail page cannot be fetched or parsed are skipped.
    """
    for room in json.loads(response.text):
        # Fresh record per room, matching the other parsers in this file,
        # instead of mutating one shared instance.
        house = House(co_index)
        ho_name = room['roomno']  # room number
        ho_type = room['ghyt']  # usage
        ho_true_size = room['tnmj']  # predicted inside area
        ho_floor = room['floorindex']  # floor
        ho_build_size = room['jzmj']  # building area
        house.co_id = co_id
        house.bu_id = bu_id
        house_code = room["fwcode"]
        house.ho_name = ho_name
        house.ho_type = ho_type
        house.ho_true_size = ho_true_size
        house.ho_floor = ho_floor
        house.ho_build_size = ho_build_size
        house_detail_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomview.jhtml?id=" + str(house_code)
        try:
            detail_html = requests.get(house_detail_url, headers=self.headers).text
            house.ho_share_size = re.search('实测分摊面积.*?<td>(.*?)</td>', detail_html, re.S | re.M).group(1)
            house.ho_price = re.search('总价.*?<td>(.*?)</td>', detail_html, re.S | re.M).group(1)
        except Exception as e:
            print("co_index={},房屋详情页{}请求失败!".format(co_index, house_detail_url))
            print(e)
            continue
        house.insert_db()
def house_info(self, ho_url, co_id, bu_id):
    """Follow every room link on a building page (GBK) and record each room.

    Args:
        ho_url: Path of the building page, appended to ``http://222.77.178.63:7002/``.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
    """
    url = "http://222.77.178.63:7002/" + ho_url
    # NOTE(review): there used to be a bare ``url.rstrip('=')`` here whose
    # result was discarded (strings are immutable), so it never changed the
    # URL. The dead statement is removed and the request is unchanged; if
    # stripping a trailing '=' was intended, assign the result -- confirm
    # against the site first.
    res = requests.get(url, headers=self.headers)
    res.encoding = 'gbk'
    html = etree.HTML(res.text)
    house_detail_list = html.xpath("//td/a[@target]/@href")
    flags = re.S | re.M
    for house_detail in house_detail_list:
        try:
            detail_url = "http://222.77.178.63:7002/" + house_detail
            detail_res = requests.get(detail_url, headers=self.headers)
            detail_res.encoding = 'gbk'
            con = detail_res.text
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('室号.*?">(.*?)<', con, flags).group(1)
            ho.ho_floor = re.search('实际层.*?">(.*?)<', con, flags).group(1)
            ho.ho_type = re.search('房屋类型.*?">(.*?)<', con, flags).group(1)
            ho.ho_build_size = re.search('预测建筑面积.*?">(.*?)<', con, flags).group(1)
            ho.ho_true_size = re.search('预测套内面积.*?">(.*?)<', con, flags).group(1)
            ho.ho_share_size = re.search('预测分摊面积.*?">(.*?)<', con, flags).group(1)
            ho.ho_price = re.search('总价.*?">(.*?)<', con, flags).group(1)
            ho.insert_db()
        except Exception as e:
            print('房屋信息错误{}'.format(e))
def house_parse(self, house_url, co_id, bu_id):
    """Parse a tlfdc.cn building page and record each room with its floor.

    Args:
        house_url: Path of the building page, appended to ``http://spf.tlfdc.cn/``.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.

    Rooms whose fields or detail page cannot be resolved are skipped.
    """
    url = "http://spf.tlfdc.cn/" + house_url
    res = requests.get(url, headers=self.headers)
    con = res.text
    flags = re.S | re.M
    # Fields are gathered as parallel lists and matched by position.
    ho_name = re.findall('室号:(.*?)套', con, flags)
    ho_room_type = re.findall('套型:(.*?)建', con, flags)
    ho_build_size = re.findall('建筑面积:(.*?)参', con, flags)
    ho_price = re.findall('价格:(.*?)元', con, flags)
    # Raw string: "\?" and "\d" are invalid escapes in a normal literal.
    ho_detail = re.findall(r'href="(show.*?\?id=\d+&id2=\d+&prjid=\d+)"', con, flags)
    for index, name in enumerate(ho_name):
        try:
            # Fresh record per room, matching the other parsers in this file.
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = name
            ho.ho_room_type = ho_room_type[index]
            ho.ho_build_size = ho_build_size[index]
            ho.ho_price = ho_price[index]
            ho_detail_url = "http://spf.tlfdc.cn/" + ho_detail[index]
            detail_res = requests.get(ho_detail_url, headers=self.headers)
            detail_con = detail_res.content.decode('gb2312')
            ho.ho_floor = re.findall('楼层.*?">(.*?)</td>', detail_con, flags)[0].strip()
            ho.insert_db()
        except Exception as e:
            # The old bare ``except:`` never bound ``e``, so this print raised
            # a NameError and aborted the whole loop.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def house_parse(self, co_id, bu_id, bu_con):
    """Record one ``House`` per room link found in a building page's HTML.

    Args:
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
        bu_con: HTML text of the building page.
    """
    # Raw string: "\)" is an invalid escape in a normal string literal.
    # The captured group is the link text, used as the room name.
    name_list = re.findall(r'<a style.*?\)>(.*?)</a', bu_con)
    for name in name_list:
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = name
        ho.insert_db()
def get_house_info(self, house_url, ho_name, bu_id, co_id):
    """Build one ``House`` record for a bjjs.gov.cn room and insert it.

    Args:
        house_url: Path appended to ``http://www.bjjs.gov.cn``.
        ho_name: Room name stored on the record.
        bu_id: Building id stored on the record.
        co_id: Community id stored on the record.
    """
    blank = House(co_index)
    url = 'http://www.bjjs.gov.cn' + house_url
    # URLs containing '#' skip the detail lookup and keep the blank record;
    # otherwise self.get_house_detail supplies the record to use.
    record = blank if '#' in url else self.get_house_detail(url, blank)
    for attr, value in (('ho_name', ho_name), ('bu_id', bu_id), ('co_id', co_id)):
        setattr(record, attr, value)
    record.insert_db()
def house(self, house_url, bu_id, co_id):
    """Parse a syfc.com.cn building page and record every room cell in it.

    Cells with a link are enriched from the linked room page. Cells without a
    link, or whose room page cannot be fetched or parsed, are recorded with
    whatever was collected, named by the cell's own text when it has any.

    Args:
        house_url: Path of the building page, appended to ``http://www.syfc.com.cn``.
        bu_id: Building id copied onto each ``House`` record.
        co_id: Community id copied onto each ``House`` record.

    Returns early (after printing) when the building page or its iframe
    cannot be fetched.
    """
    ho_url = "http://www.syfc.com.cn" + house_url
    try:
        res = requests.get(ho_url, headers=self.headers)
        con = etree.HTML(res.text)
        # The room table lives in an iframe; follow its src.
        ho_detail_url = con.xpath("//iframe/@src")[0]
        response = requests.get(ho_detail_url, headers=self.headers)
    except Exception as e:
        print("co_index={},楼栋详情页无法访问".format(co_index), e)
        # Previously execution fell through and crashed with a NameError on
        # ``response``.
        return
    html = etree.HTML(response.text)
    flags = re.S | re.M
    for td in html.xpath("//td[@width='70']"):
        ho = House(co_index)
        ho.bu_id = bu_id
        ho.co_id = co_id
        link_name = None
        try:
            room_url = "http://www.syfc.com.cn" + td.xpath("./a/@href")[0]
            link_name = td.xpath("./a/text()")[0]
            ho.ho_name = link_name
            try:
                room_con = requests.get(room_url, headers=self.headers).text
            except Exception as e:
                print("co_idnex={},房屋详情页无法访问".format(co_index), e)
                # Re-raise into the fallback below rather than parsing a stale
                # page left over from an earlier request.
                raise
            ho.ho_build_size = re.search('建筑面积.*?">(.*?)<', room_con, flags).group(1)
            ho.ho_share_size = re.search('分摊面积.*?">(.*?)<', room_con, flags).group(1)
            ho.ho_true_size = re.search('套内面积.*?">(.*?)<', room_con, flags).group(1)
            ho.ho_type = re.search('类型.*?">(.*?)<', room_con, flags).group(1)
        except Exception:
            # Fallback: prefer the cell's own text as the name; keep the link
            # text when the cell has none; skip cells that have neither.
            plain_text = td.xpath("./text()")
            if plain_text:
                ho.ho_name = plain_text[0]
            elif link_name is None:
                continue
        ho.insert_db()
def ho_info(self, house_url, co_id, bu_id):
    """Record one ``House`` per ``<td unitname=...>`` cell on a building page.

    Args:
        house_url: Full URL of the building page.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
    """
    res = requests.get(house_url, headers=self.headers)
    html = etree.HTML(res.text)
    ho_info_list = html.xpath("//tbody//td[@unitname]")
    for ho_info in ho_info_list:
        try:
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = ho_info.xpath("./text()")[0]
            ho.insert_db()
        except Exception as e:
            # Format the exception into the message, as the other log.error
            # calls in this file do. Passing it as a second positional
            # argument with no placeholder is a formatting error for a
            # stdlib-style logger.
            log.error("小区房屋信息提取失败{}".format(e))
def room_crawler(self, room):
    """Parse a hzszjj.gov.cn building table and record every room in it.

    Rooms whose link is a single character are parsed from the link's ``wf``
    attribute; all other rooms are parsed from their linked detail page.

    Args:
        room: Path of the building page, appended to ``http://www.hzszjj.gov.cn``.
            The community and building ids are taken from its query string.
    """
    house_url = "http://www.hzszjj.gov.cn" + room
    res = requests.get(house_url)
    con = etree.HTML(res.text)
    ho_table = con.xpath("//tr[@bgcolor='#fbf3e6']")
    # Loop-invariant, so searched once. ``.group`` is still called per room so
    # a non-matching ``room`` only fails when there is a room to record.
    id_num = re.search(r"(\d+)&\w+=(\d+)", room)
    # The first and last matching rows are skipped.
    for ho_list in ho_table[1:-1]:
        ho_floor = ho_list.xpath("./td[@align='center']/text()")[0]
        honum_list = ho_list.xpath(".//tr/td[@height='40']")
        for house in honum_list:
            # Fresh record per room. With one shared instance, the share and
            # inside areas from an earlier detail-page room leaked into later
            # rooms parsed from the ``wf`` attribute.
            ho = House(co_index)
            ho.ho_floor = ho_floor  # floor
            ho.co_id = id_num.group(1)  # community id
            ho.bu_id = id_num.group(2)  # building id
            ho_url = house.xpath("./a/@href")[0]
            if len(ho_url) == 1:
                ho_info = house.xpath("./a/@wf")[0]
                ho.ho_name = house.xpath("./a/text()")[0]
                info = re.search(
                    r":(.*?)<br>.*?:(.*?)<br>(.*?)<br><hr>.*?:(.*?)m.*?<br>.*?:(.*?)<br>.*?:(.*?)m",
                    ho_info)
                ho.ho_type = info.group(5)
                ho.ho_build_size = info.group(4)
                ho.ho_room_type = info.group(2)
            else:
                detail_url = "http://www.hzszjj.gov.cn/ts_web_dremis/web_house_dir/" + ho_url
                detail_res = requests.get(detail_url)
                detail = etree.HTML(detail_res.text)
                ho.ho_name = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_name']/text()"
                )[0]
                ho.ho_type = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_type']/text()"
                )[0]
                ho.ho_build_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_build_area']/text()"
                )[0]
                ho.ho_share_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_share_area']/text()"
                )[0]
                ho.ho_true_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_inside_area']/text()"
                )[0]
            ho.insert_db()
def ho_info(self, ho_list, co_id, bu_id):
    """Record one ``House`` per element, parsing the areas from its ``title``.

    Args:
        ho_list: Iterable of parsed HTML elements, one per room.
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
    """
    for node in ho_list:
        try:
            record = House(co_index)
            record.co_id = co_id
            record.bu_id = bu_id
            record.ho_name = node.xpath("./text()")[0]
            title = node.xpath("./@title")[0]
            # Building and inside areas end at a newline; the shared area
            # runs to the end of its line.
            building_area = re.search('建筑面积:(.*?)\n', title)
            record.ho_build_size = building_area.group(1)
            shared_area = re.search('分摊面积:(.*)', title)
            record.ho_share_size = shared_area.group(1)
            inside_area = re.search('套内面积:(.*?)\n', title)
            record.ho_true_size = inside_area.group(1)
            record.insert_db()
        except Exception as e:
            print("房屋信息错误{}".format(e))
def get_house_info(self, house_url, bu_id, co_id):
    """POST to one room page (GBK), parse its table and record the room.

    Args:
        house_url: Full URL of the room page; also shown in the error message.
        bu_id: Building id stored on the record.
        co_id: Community id stored on the record.
    """
    flags = re.M | re.S
    # (House attribute, regex whose first group holds the value), in parse order.
    field_patterns = (
        ('ho_floor', '所在楼层:.*?<td>(.*?)<'),
        ('ho_name', '房号:.*?<td>(.*?)<'),
        ('ho_build_size', '预测总面积:.*?<td>(.*?)<'),
        ('ho_true_size', '预测套内面积.*?<td>(.*?)<'),
        ('ho_share_size', '预测公摊面积.*?<td>(.*?)<'),
    )
    try:
        record = House(co_index)
        record.bu_id = bu_id
        record.co_id = co_id
        page = requests.post(house_url, headers=self.headers).content.decode('gbk')
        for attr, pattern in field_patterns:
            setattr(record, attr, re.search(pattern, page, flags).group(1))
        record.insert_db()
    except Exception as e:
        print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, zu_house_url, bu_num, co_id):
    """Collect the ``__doPostBack`` arguments on a page and pass them on.

    Fetches the page through ``self.s``, stores the ``ItemName`` text on a
    new ``House`` and delegates to ``self.get_house_detail``.

    Args:
        zu_house_url: Full URL of the page listing the rooms.
        bu_num: Building number stored on the ``House`` template.
        co_id: Community id stored on the ``House`` template.
    """
    try:
        house = House(co_index)
        house.bu_num = bu_num
        house.co_id = co_id
        result = self.s.get(zu_house_url, headers=self.headers).text
        house.info = re.search('ItemName.*?>(.*?)<', result).group(1).strip()
        # Raw strings: "\(" and "\)" are invalid escapes in a normal literal.
        # The first list holds the second __doPostBack argument, the second
        # list holds the first argument.
        ho_code_list = re.findall(r"OnClick=.__doPostBack\(.*?,'(.*?)'\)", result, re.S | re.M)
        ho_msg_list = re.findall(r"OnClick=.__doPostBack\('(.*?)'", result, re.S | re.M)
        self.get_house_detail(zu_house_url, ho_msg_list, ho_code_list, house)
    except Exception as e:
        print(e)
def house_info(self, co_id, bu_id, bu_url):
    """Record every room link carrying a ``wf`` attribute on a bdfdc.net page.

    Args:
        co_id: Community id copied onto each ``House`` record.
        bu_id: Building id copied onto each ``House`` record.
        bu_url: Path of the building page, appended to ``http://www.bdfdc.net``.

    Raises:
        AttributeError: If a ``wf`` attribute lacks either expected field.
        IndexError: If a link has no text.
    """
    page = requests.get('http://www.bdfdc.net' + bu_url, headers=self.headers)
    # Five-second pause after each page request.
    time.sleep(5)
    tree = etree.HTML(page.text)
    for link in tree.xpath("//a[@wf]"):
        record = House(co_index)
        wf_text = link.xpath("./@wf")[0]
        record.ho_name = link.xpath("./text()")[0]
        record.bu_id = bu_id
        record.co_id = co_id
        area_match = re.search('建筑面积:(.*?)m', wf_text)
        record.ho_build_size = area_match.group(1)
        usage_match = re.search('用途:(.*?)<br', wf_text)
        record.ho_type = usage_match.group(1)
        record.insert_db()
def get_house_info(self, con, co_id, build_id):
    """Record the rooms embedded in a page's ``houseTableData`` section.

    Args:
        con: Page text that contains the house table.
        co_id: Community id copied onto each ``House`` record.
        build_id: Building id stored as ``bu_id`` on each record.

    Raises:
        AttributeError: If the page has no 'houseTableData…特别申明' section.
    """
    flags = re.S | re.M
    table_section = re.search('houseTableData.*?特别申明', con, flags).group()
    # (House attribute, key inside the div's inline data), in parse order.
    keyed_fields = (
        ('ho_name', "'HC_HOUSENUMB':'(.*?)',"),
        ('ho_room_type', "'HC_HOUSETYPE':'(.*?)',"),
        ('ho_build_size', "'HC_STCTAREA':'(.*?)',"),
    )
    for block in re.findall('<div style.*?</div>', table_section, flags):
        try:
            record = House(co_index)
            for attr, pattern in keyed_fields:
                setattr(record, attr, re.search(pattern, block, flags).group(1))
            record.bu_id = build_id
            record.co_id = co_id
            record.insert_db()
        except Exception:
            print('house error, co_index={}'.format(co_index))