def get_comm_info(self, comm_list, all_page_url):
    """Parse 中安房 deal-list fragments and store each one as a ``Comm``.

    Every fragment is handled independently: if a fragment cannot be parsed
    or stored, the error is logged together with ``all_page_url`` and the
    loop moves on to the next fragment.

    Args:
        comm_list: Iterable of HTML fragments (``str``), one per deal entry.
        all_page_url: URL of the listing page; only used in the error log.

    Returns:
        None. Records are written through ``Comm.insert_db()``.
    """
    flags = re.S | re.M
    for fragment in comm_list:
        try:
            comm = Comm('中安房')
            comm.city = '合肥'
            comm.district_name = re.search(
                r'zaf-nowrap.*?>(.*?)<', fragment, flags).group(1).strip()

            trade_date = re.search(
                r'zaf-fblue">(.*?)<', fragment, flags).group(1).strip()
            if trade_date:
                comm.trade_date = datetime.datetime.strptime(
                    trade_date, "%Y-%m-%d")

            total_price = re.search(
                r'list-right-data.*?<span.*?>(.*?)<', fragment, flags
            ).group(1).strip()
            # The scraped number is scaled by 10000 before it is stored
            # (presumably the page lists prices in units of 10,000 - verify).
            comm.total_price = int(
                re.search(r'(\d+)', total_price).group(1)) * 10000

            # First span of the details area, split on single spaces:
            # [0] area, [1] room layout (optional), [2] unit price (optional).
            info_parts = re.search(
                r'list-details-area.*?<span.*?>(.*?)<', fragment, flags
            ).group(1).strip().split(' ')
            area = info_parts[0].replace('㎡', '')
            if area:
                comm.area = round(float(area), 2)

            layout = info_parts[1] if len(info_parts) > 1 else ''
            room_match = re.search(r'(\d)室', layout)
            hall_match = re.search(r'(\d)厅', layout)
            toilet_match = re.search(r'(\d)卫', layout)
            # A missing room count falls back to 0, while hall/toilet fall
            # back to None; this asymmetry is kept from the original code.
            comm.room = int(room_match.group(1)) if room_match else 0
            comm.hall = int(hall_match.group(1)) if hall_match else None
            comm.toilet = int(toilet_match.group(1)) if toilet_match else None

            avg_match = (re.search(r'(\d+)', info_parts[2])
                         if len(info_parts) > 2 else None)
            comm.avg_price = int(avg_match.group(1)) if avg_match else None

            # Second span of the details area: [0] direction, [1] fitment.
            facing_parts = re.search(
                r'list-details-area.*?<span.*?<span>(.*?)<', fragment, flags
            ).group(1).strip().split(' ')
            comm.direction = facing_parts[0]
            comm.fitment = facing_parts[1] if len(facing_parts) > 1 else None

            address = re.search(
                r'list-details-address1.*?<span>(.*?)<', fragment, flags
            ).group(1).strip()
            comm.region = address.split(' ')[0].strip()

            comm.insert_db()
        except Exception as e:
            log.error('解析错误,source={},url="{}",e="{}"'.format(
                '中安房', all_page_url, e))
def into_mongo(coll):
    """Copy every document of a 澜斯 source collection into ``Comm`` records.

    A fresh ``Comm`` is built for each document so that a field a document
    fails to supply cannot silently keep the previous document's value.
    The cursor is opened with ``no_cursor_timeout=True`` and is therefore
    closed explicitly once iteration ends, including on error.

    Args:
        coll: Collection-like object whose ``find(no_cursor_timeout=True)``
            returns an iterable of dict-like documents that has ``close()``.

    Returns:
        None. Each document is written through ``Comm.insert_db()``.

    Raises:
        KeyError: If a document lacks ``updatedate``, ``newdiskdecoration``,
            ``flevel`` or ``houseaddress``; these are read unguarded.
    """
    results = coll.find(no_cursor_timeout=True)
    try:
        for result in results:
            com = Comm('澜斯')

            # Some documents were found to have no fj_city during testing.
            try:
                com.city = result['fj_city']  # city
                com.region = result['fj_region']  # region
            except Exception:
                log.error('城市或者区域没有')

            com.m_date = result['updatedate']  # last update date
            com.create_date = datetime.datetime.now()  # creation time
            com.fitment = result['newdiskdecoration']  # decoration
            com.floor = result['flevel']  # floor of the unit

            # Original author's note: the record is not inserted when these
            # fields are invalid, so a failure here is only logged.
            try:
                com.district_name = result['fj_name']  # community name
                com.avg_price = result['unitprice']  # unit price
                com.total_price = result['usd']  # total price
                com.area = result['acreage']  # area (building area)
                com.trade_date = datetime.datetime.strptime(
                    result['signingdate'].split('T')[0], "%Y-%m-%d")
            except Exception as e:
                log.error(e)

            # Best-effort extraction of unit number and room number from the
            # address text ("<unit>号<room>").
            houseaddress = result['houseaddress']
            try:
                res = re.search(r'(\d+)号(\d+)', houseaddress)
                com.unit_num = res.group(1)  # unit number
                com.room_num = res.group(2)  # room number
            except Exception:
                print('无法匹配大盘单元号和室号,houseaddress={}'.format(
                    houseaddress))

            # Per the original author, the source has no data for direction,
            # room, hall, toilet, height or house_num; they are left unset.
            com.insert_db()
    finally:
        results.close()
def room(self, co_list, city_name):
    """Crawl the deal pages of each community and store every listed deal.

    For each community node the detail page is fetched, the link to its deal
    list is taken from the last tab of the toolbar, and every result page is
    parsed into ``Comm`` records.

    Args:
        co_list: Iterable of parsed community nodes exposing ``xpath()``.
        city_name: City name stored on every record.

    Returns:
        None. Records are written through ``Comm.insert_db()``.

    Notes:
        * A community whose basic data or pages cannot be fetched is logged
          and skipped.
        * A result page is requested at most ``max_attempts`` times; after
          that it is logged and skipped instead of retrying forever.
        * A deal row that fails to parse is logged and skipped.
    """
    flags = re.S | re.M
    max_attempts = 10
    for co in co_list:
        try:
            co_name = co.xpath("./div[1]/a/text()")[0]
            co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
            region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
            # Not stored, but the lookup is kept: a community without an
            # address title is skipped exactly as before.
            addr = co.xpath("./div[3]/span[3]/@title")[0]
            detail = requests.get(co_url, headers=self.headers)
            html = etree.HTML(detail.text)
            room_url = "http:" + html.xpath(
                "//div[@class='tab-toolbar pr']//li/a/@href")[-1]
            page_index = requests.get(room_url, headers=self.headers)
        except Exception as e:
            log.error('小区信息提取失败{}'.format(e))
            continue

        page_match = re.search(r'共(\d+)页', page_index.text)
        if not page_match:
            log.info('小区无相关数据')
            continue

        # Everything from '#' onward is replaced by 'n'; the page number is
        # appended to that base for each result page.
        page_base = re.sub(r'#.*', 'n', room_url)
        for page in range(1, int(page_match.group(1)) + 1):
            url = page_base + str(page)

            res = None
            last_error = None
            for _ in range(max_attempts):
                try:
                    res = requests.get(url, headers=self.headers)
                    break
                except Exception as e:
                    last_error = e
            if res is None:
                log.error('请求错误,url="{}",e="{}"'.format(url, last_error))
                continue

            room_html = etree.HTML(res.text)
            for node in room_html.xpath("//div[@class='right-information']"):
                try:
                    # NOTE(review): `source` is not defined in this function;
                    # presumably a module-level constant - verify.
                    record = Comm(source)
                    record.district_name = co_name
                    record.city = city_name
                    record.region = region

                    room_type = node.xpath("./h3/span[2]/text()")[0]
                    room_match = re.search(r'(\d)室', room_type, flags)
                    hall_match = re.search(r'(\d)厅', room_type, flags)
                    record.room = (
                        int(room_match.group(1)) if room_match else None)
                    record.hall = (
                        int(hall_match.group(1)) if hall_match else None)

                    size = node.xpath("./h3/span[3]/text()")[0]
                    area = size.replace('平米', '')
                    if area:
                        record.area = round(float(area), 2)

                    total_price = node.xpath(
                        ".//div[@class='price fs14 ']/em/text()")[0]
                    record.total_price = int(
                        re.search(r'(\d+)', total_price).group(1)) * 10000
                    avg_price = node.xpath(
                        ".//div[@class='size fs14']/text()")[0]
                    record.avg_price = int(
                        re.search(r'(\d+)', avg_price).group(1))

                    # Both fields are cleared together if either is missing.
                    try:
                        record.fitment = node.xpath(
                            ".//div[@class='t1 fs14']/text()[3]")[0]
                        record.direction = node.xpath(
                            ".//div[@class='t1 fs14']/text()[2]")[0]
                    except Exception:
                        record.fitment = None
                        record.direction = None

                    # Text before '/' carries the floor; the number after
                    # '/' followed by 层 is stored as the height.
                    floor_info = node.xpath(
                        ".//div[@class='fs14']/text()[1]")[0]
                    floor_match = re.search(r'(.*?)/', floor_info)
                    floor_digits = (re.search(r'\d+', floor_match.group(1))
                                    if floor_match else None)
                    record.floor = (
                        int(floor_digits.group(0)) if floor_digits else None)
                    height_match = re.search(r'.*?/(\d+)层', floor_info)
                    record.height = (
                        int(height_match.group(1)) if height_match else None)

                    trade_date = node.xpath(".//div[@class='date']/text()")[0]
                    if trade_date:
                        record.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")

                    record.insert_db()
                except Exception as e:
                    log.error('房屋信息提取失败{}'.format(e))
def get_comm_detail(self, comm_url, region, city):
    """Fetch one 购房网 community page and store every deal row it lists.

    A fresh ``Comm`` is built for each row so that a row with an empty date
    or area cannot inherit the previous row's value. A row that cannot be
    parsed or stored is logged and skipped; the remaining rows are still
    processed.

    Args:
        comm_url: URL of the community detail page.
        region: Region name; surrounding whitespace is stripped.
        city: City name, stored unchanged.

    Returns:
        None. Returns early (after logging) when the request fails or when
        the page contains no deal rows.

    Raises:
        AttributeError: If the community title cannot be found in the page.
    """
    source = '购房网'
    flags = re.S | re.M
    region = region.strip()

    def nth_span(skipped, tail='<span>(.*?)</span>'):
        # Build a pattern that lazily steps over `skipped` "<span>" openings
        # and then applies `tail` to the next one.
        return '<span>.*?' * skipped + tail

    try:
        # NOTE(review): `p` is not defined in this function; presumably a
        # module-level iterator of proxy settings - verify.
        response = requests.get(
            url=comm_url, headers=self.headers, proxies=next(p))
    except Exception as e:
        log.error('请求错误,source="{}",url="{}",e="{}"'.format(
            source, comm_url, e))
        return

    html = response.text
    district_name = re.search(
        r'title fl.*?<h1>(.*?)</h1>', html, flags).group(1).strip()

    # A page without the deal list is treated the same as an empty list.
    list_match = re.search(r'<ul class="lscjlist">.*?</ul>', html, flags)
    rows = (re.findall(r'<li>(.*?)</li>', list_match.group(), flags)
            if list_match else [])
    if not rows:
        log.info('source={}, 此小区没有数据,url="{}"'.format(source, comm_url))
        return

    for row in rows:
        try:
            comm = Comm(source)
            comm.url = comm_url
            comm.region = region
            comm.city = city
            comm.district_name = district_name

            trade_date = re.search(
                nth_span(0), row, flags).group(1).strip()
            if trade_date:
                comm.trade_date = datetime.datetime.strptime(
                    trade_date, "%Y-%m-%d")

            room_type = re.search(
                nth_span(1), row, flags).group(1).strip()
            room_match = re.search(r'(\d)室', room_type)
            hall_match = re.search(r'(\d)厅', room_type)
            # Room and hall are stored only when both are present.
            if room_match and hall_match:
                comm.room = int(room_match.group(1))
                comm.hall = int(hall_match.group(1))
            else:
                comm.room = None
                comm.hall = None

            area = re.search(nth_span(2), row, flags).group(1).strip()
            area = area.replace('㎡', '').replace('平', '')
            if area:
                comm.area = round(float(area), 2)

            # Fourth span: the number after '/' is stored as the height.
            height_match = re.search(
                nth_span(3, '<span>.*?/(.*?)</span>'), row, flags)
            height_digits = (
                re.search(r'(\d+)', height_match.group(1).strip())
                if height_match else None)
            comm.height = (
                int(height_digits.group(1)) if height_digits else None)

            comm.fitment = re.search(
                nth_span(4), row, flags).group(1).strip()
            comm.direction = re.search(
                nth_span(5), row, flags).group(1).strip()

            avg_price = re.search(nth_span(6), row, flags).group(1)
            comm.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

            total_price = re.search(
                nth_span(7, '<span.*?>(.*?)</span>'), row, flags).group(1)
            comm.total_price = int(
                re.search(r'(\d+)', total_price).group(1)) * 10000

            comm.insert_db()
        except Exception as e:
            log.error('解析错误,source="{}",url="{}",e="{}"'.format(
                source, comm_url, e))
def get_city_info(self, city_dict):
    """Crawl 链家在线 deal listings for every city and district and store them.

    For each city the deal index page is fetched, the district links are
    extracted from it, and pages 1-100 of each district are requested and
    parsed into ``Comm`` records.

    Args:
        city_dict: Mapping of city name to the city's base URL; the string
            ``'chengjiao/'`` is appended to build the deal index URL.

    Returns:
        None. Records are written through ``Comm.insert_db()``.

    Notes:
        * A failure while loading or parsing a city index is logged and the
          next city is processed.
        * A failure on a result page is logged with that page's URL and the
          next page is processed; listings after the failing one on the same
          page are not stored.
        * Listings whose total price contains ``*`` are skipped.
    """
    flags = re.S | re.M
    for city in city_dict:
        city_url = city_dict[city] + 'chengjiao/'
        try:
            response = requests.get(city_url, headers=self.headers)
            html = response.text
            area_html = re.search(
                r'data-role="ershoufang".*?地铁', html, flags).group()
            for area_link in re.findall(r'<a.*?</a>', area_html, flags):
                if 'ershoufang' in area_link:
                    continue
                area_url = re.search(
                    r'href="(.*?)"', area_link, flags).group(1)
                area = re.search(
                    r'<a.*?>(.*?)<', area_link, flags).group(1)
                base_url = city_url.replace('/chengjiao/', '') + area_url

                for page in range(1, 101):
                    page_url = base_url + 'pg' + str(page)
                    try:
                        content = requests.get(
                            page_url, headers=self.headers).text
                        listings = re.findall(
                            r'class="info".*?</div></div></li>',
                            content, flags)
                        for item in listings:
                            comm = Comm('链家在线')
                            comm.region = area.strip()
                            comm.city = city.strip()
                            comm.district_name = re.search(
                                r'target="_blank">(.*?)<', item, flags
                            ).group(1).strip()
                            comm.direction = re.search(
                                r'class="houseIcon"></span>(.*?) \|',
                                item, flags).group(1).strip()

                            fitment_match = re.search(
                                r'class="houseIcon"></span>.*? \|(.*?)\| ',
                                item, flags)
                            comm.fitment = (
                                fitment_match.group(1).strip()
                                if fitment_match else None)

                            height_match = re.search(
                                r'class="positionIcon"></span>.*?\((.*?)\)',
                                item, flags)
                            height_digits = (
                                re.search(r'(\d+)', height_match.group(1))
                                if height_match else None)
                            comm.height = (
                                int(height_digits.group(1))
                                if height_digits else None)

                            total_price = re.search(
                                r"class='number'>(.*?)<", item, flags
                            ).group(1).strip()
                            if "*" in total_price:
                                continue
                            total_value = int(re.search(
                                r'(\d+)', total_price).group(1)) * 10000
                            comm.total_price = total_value

                            room_type = re.search(
                                r'arget="_blank">.*? (.*?) ', item, flags
                            ).group(1).strip()
                            room_match = re.search(r'(\d)室', room_type)
                            hall_match = re.search(r'(\d)厅', room_type)
                            # Missing room count falls back to 0, missing
                            # hall count to None (kept from the original).
                            comm.room = (
                                int(room_match.group(1)) if room_match else 0)
                            comm.hall = (
                                int(hall_match.group(1))
                                if hall_match else None)

                            area_value = None
                            area_text = re.search(
                                r'target="_blank">.*? .*? (.*?平米)',
                                item, flags).group(1).strip()
                            if area_text:
                                area_text = area_text.replace(
                                    '㎡', '').replace('平米', '')
                                try:
                                    area_value = round(float(area_text), 2)
                                except ValueError:
                                    area_value = None
                                comm.area = area_value

                            trade_date = re.search(
                                r'dealDate">(.*?)<', item, flags
                            ).group(1).strip()
                            if trade_date:
                                comm.trade_date = datetime.datetime.strptime(
                                    trade_date, "%Y.%m.%d")

                            # Unit price is derived from the parsed values;
                            # it stays None when the area is missing or zero.
                            comm.avg_price = (
                                int(total_value / area_value)
                                if area_value else None)

                            comm.insert_db()
                    except Exception as e:
                        log.error(
                            '解析错误,source="{}",url="{}",e="{}"'.format(
                                '链家在线', page_url, e))
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                '链家在线', city_url, e))