def parse(self, room_url, co_name, region, city_name):
    """Crawl every result page of one community and store each listing.

    Requests ``room_url`` to read the total page count, then requests each
    result page, builds one ``Base`` record per ``right-information`` block
    and calls ``insert_db()`` on it.

    Args:
        room_url: URL of the community's listing page; everything from the
            first ``#`` on is replaced by ``n<page number>`` to build the
            per-page URLs.
        co_name: Value stored as the record's ``district_name``.
        region: Value stored as the record's ``region``.
        city_name: Value stored as the record's ``city``.

    Returns:
        None. Returns early when the first request fails or when the page
        carries no page-count marker.

    Raises:
        IndexError: A listing lacks one of the required nodes (layout,
            size, average price, floor or date).
        AttributeError: The average-price text contains no digits.
        ValueError: The size text is not numeric or the trade date does
            not match ``%Y-%m-%d``.
    """
    max_attempts = 5       # upper bound on retries for a single result page
    request_timeout = 60   # seconds; same value final_parse uses

    try:
        page_index = requests.get(url=room_url, headers=self.headers,
                                  proxies=self.proxies,
                                  timeout=request_timeout)
    except Exception as e:
        log.error('请求错误, source="{}",url="{}",e="{}"'.format(
            '新浪乐居', room_url, e))
        return

    page_match = re.search(r'共(\d+)页', page_index.text)
    if not page_match:
        log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
        return

    page_url_prefix = re.sub('#.*', 'n', room_url)
    for i in range(1, int(page_match.group(1)) + 1):
        url = page_url_prefix + str(i)

        # Retry a bounded number of times; an unbounded loop would spin
        # forever on a URL that can never be fetched.
        res = None
        for _attempt in range(max_attempts):
            try:
                res = requests.get(url=url, headers=self.headers,
                                   proxies=self.proxies,
                                   timeout=request_timeout)
                break
            except Exception as e:
                log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                    '新浪乐居', url, e))
        if res is None:
            # Every attempt failed: skip this page, keep crawling the rest.
            continue

        room_html = etree.HTML(res.text)
        room_list = room_html.xpath("//div[@class='right-information']")
        for item in room_list:
            # NOTE(review): `source` is not defined inside this method;
            # presumably a module-level name - confirm.
            room = Base(source)
            room.url = url
            room.district_name = co_name   # community name
            room.city = city_name
            room.region = region

            # Layout text: number of rooms and number of halls.
            room_type = item.xpath("./h3/span[2]/text()")[0]
            try:
                room.room = int(
                    re.search(r'(\d)室', room_type, re.S | re.M).group(1))
            except (AttributeError, TypeError, ValueError):
                room.room = None
            try:
                room.hall = int(
                    re.search(r'(\d)厅', room_type, re.S | re.M).group(1))
            except (AttributeError, TypeError, ValueError):
                room.hall = None

            # Floor area; left untouched when the text is empty.
            size = item.xpath("./h3/span[3]/text()")[0]
            area = size.replace('平米', '')
            if area:
                room.area = round(float(area), 2)

            # Average price.
            avg_price = item.xpath(".//div[@class='size fs14']/text()")[0]
            room.avg_price = int(
                re.search(r'(\d+)', avg_price, re.S | re.M).group(1))

            # The total price is not scraped; it is derived from the
            # average price and the area.
            try:
                room.total_price = int(
                    int(room.avg_price) * float(room.area))
            except (AttributeError, TypeError, ValueError):
                room.total_price = None

            # Orientation / decoration, '|'-separated in one text node.
            try:
                info_node = item.xpath(".//div[@class='t1 fs14']")[0]
            except IndexError:
                room.fitment = None
                room.direction = None
            else:
                info_parts = info_node.xpath('string(.)').split('|')
                if len(info_parts) == 2:
                    room.fitment = info_parts[1]
                    room.direction = info_parts[0]
                elif len(info_parts) == 3:
                    room.fitment = info_parts[2]
                    room.direction = info_parts[1]

            # Floor text: the part before '/' holds the floor, the part
            # after it the building height.
            floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
            try:
                floor = re.search(r'(.*?)/', floor_info).group(1)
                room.floor = int(re.search(r'\d+', floor).group(0))
            except (AttributeError, TypeError, ValueError):
                room.floor = None
            try:
                room.height = int(
                    re.search(r'.*?/(\d+)层', floor_info).group(1))
            except (AttributeError, TypeError, ValueError):
                room.height = None

            # Trade date, converted through the record's local2utc().
            trade_date = item.xpath(".//div[@class='date']/text()")[0]
            if trade_date:
                t = time.strptime(trade_date, "%Y-%m-%d")
                room.trade_date = room.local2utc(
                    datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

            room.insert_db()
def final_parse(self, data):
    """Fetch one transaction list page and store every listing on it.

    Args:
        data: Mapping with the keys ``'link'`` (page URL), ``'city'`` and
            ``'region'``; the latter two are stripped and copied onto
            every record.

    Returns:
        None. Returns early (after logging) when the request fails.

    Raises:
        IndexError: A listing has no title text node.
    """
    final_url = data['link']
    city = data['city']
    region = data['region']
    try:
        r = requests.get(url=final_url, headers=self.headers,
                         proxies=self.proxies, timeout=60)
    except Exception as e:
        log.error('请求失败, source={}, 没有更多小区成交 url={}, e={}'.format(
            '链家在线', final_url, e))
        return

    def _search_int(pattern, text):
        """Return the first capture group of *pattern* in *text* as int, else None."""
        try:
            return int(re.search(pattern, text, re.S | re.M).group(1))
        except (AttributeError, TypeError, ValueError):
            # No match, text is None, or the capture is not an integer.
            return None

    tree = etree.HTML(r.text)
    for info in tree.xpath("//ul[@class='listContent']/li"):
        comm = Base('链家在线')
        comm.url = final_url
        comm.region = region.strip()
        comm.city = city.strip()

        # The title is space-separated: community name, layout, area.
        title = info.xpath("./div/div[@class='title']/a/text()")[0]
        title_parts = title.split(' ')
        comm.district_name = title_parts[0]

        # Layout: number of rooms and halls (None when the part is absent).
        room_hall = title_parts[1] if len(title_parts) > 1 else None
        comm.room = _search_int(r'(\d+)室', room_hall)
        comm.hall = _search_int(r'(\d+)厅', room_hall)

        # Floor area.
        try:
            area = re.search(
                r"(.*?)平米", title_parts[2], re.S | re.M).group(1)
            comm.area = round(float(area), 2)
        except (IndexError, AttributeError, TypeError, ValueError):
            comm.area = None

        # Orientation and decoration, '|'-separated; both are reset when
        # either one is missing.
        try:
            direction_fitment = info.xpath(
                "./div/div[@class='address']/div[1]/text()")[0].split('|')
            comm.direction = direction_fitment[0]
            comm.fitment = direction_fitment[1]
        except IndexError:
            comm.direction = None
            comm.fitment = None

        # Total number of floors in the building.
        try:
            height_text = info.xpath(
                "./div/div[@class='flood']/div[1]/text()")[0]
        except IndexError:
            height_text = None
        comm.height = _search_int(r"共(\d+)层", height_text)

        # Trade date. `local2utc` is an opaque helper, so any ordinary
        # exception from it is treated as "no date".
        try:
            trade_date = info.xpath(
                "./div/div[@class='address']/div[2]/text()")[0]
            t = time.strptime(trade_date, "%Y.%m.%d")
            comm.trade_date = comm.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
        except Exception:
            comm.trade_date = None

        # Average price.
        try:
            avg_price = info.xpath(
                "./div/div[@class='flood']/div[3]/span/text()")[0]
            comm.avg_price = int(avg_price)
        except (IndexError, TypeError, ValueError):
            comm.avg_price = None

        # The total price is not scraped; it is derived from the average
        # price and the area, and is None when either is missing.
        try:
            comm.total_price = int(
                int(comm.avg_price) * float(comm.area))
        except (TypeError, ValueError):
            comm.total_price = None

        comm.insert_db()