def parse(self, response, region):
    """Parse one JSON response of deal records and store each record.

    The response body is decoded with ``response.json()``; every entry of
    its ``'data'`` list is copied field by field onto a fresh ``Base``
    record, and ``insert_db()`` is called on that record.

    Args:
        response: Object exposing a ``json()`` method that returns a mapping
            with a ``'data'`` list of per-deal dicts.
        region: Region value stored unchanged on every record.

    Returns:
        None. If the body cannot be decoded, the error is logged and the
        method returns without processing anything.

    Raises:
        KeyError, ValueError, TypeError, AttributeError: A record with a
            missing or malformed mandatory field is not skipped; the error
            propagates to the caller, as in the original implementation.
    """
    source = '太屋网'
    city = '上海'
    try:
        result_json = response.json()
    except Exception as e:
        log.error('无法序列化,source="{}",e="{}"'.format(source, e))
        return

    for j in result_json['data']:
        c = Base(source)
        c.city = city
        c.region = region
        # Number of rooms / halls.
        c.room = int(j['RoomCount'])
        c.hall = int(j['HollCount'])
        # Residential community name.
        c.district_name = j['BuildingName']
        # Floor area, rounded to two decimals.
        c.area = round(float(j['BldArea']), 2)
        # Orientation.
        c.direction = j['Directed']
        # Floor of the unit and total floors of the building.
        c.floor = int(j['Floor'])
        c.height = int(j['FloorCount'])

        # Trade date: the first run of digits in 'ExDate' is treated as a
        # millisecond timestamp. Integer division keeps the value exact
        # instead of passing through a float.
        trade_millis = int(re.search(r'(\d+)', j['ExDate']).group(1))
        t = time.localtime(trade_millis // 1000)
        # Only the calendar date (local time) is kept before conversion.
        c.trade_date = c.local2utc(
            datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

        # Total price is taken from the payload; the average price is
        # derived from it.
        c.total_price = int(j['ExPrice'])
        try:
            c.avg_price = int(round(c.total_price / c.area, 2))
        except Exception:
            # Best effort: e.g. an area of 0 leaves the average undefined.
            c.avg_price = None

        c.insert_db()
def parse(self, room_url, co_name, region, city_name):
    """Fetch every result page of one community and store its deal records.

    The first request to ``room_url`` is only used to read the page count
    (text matching ``共N页``). Each page is then requested, every
    ``right-information`` block on it is turned into a ``Base`` record, and
    ``insert_db()`` is called on that record.

    Args:
        room_url: URL of the community's deal listing. Everything from the
            first ``#`` onward is replaced by ``'n'`` and the page number is
            appended to build the per-page URL.
        co_name: Community name stored on every record.
        region: Region value stored on every record.
        city_name: City value stored on every record.

    Returns:
        None. Returns early (after logging) if the first request fails or
        the page count cannot be found.

    Raises:
        IndexError, AttributeError, ValueError: Layout/size, average price,
            floor text and trade date are read without a fallback, so a
            listing lacking them aborts the run, as in the original code.
    """
    try:
        page_index = requests.get(url=room_url, headers=self.headers,
                                  proxies=self.proxies)
    except Exception as e:
        log.error('请求错误, source="{}",url="{}",e="{}"'.format(
            '新浪乐居', room_url, e))
        return

    # Search once and reuse the match instead of running the regex twice.
    page_match = re.search(r'共(\d+)页', page_index.text)
    if not page_match:
        log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
        return

    page_count = int(page_match.group(1))
    # Loop-invariant part of the per-page URL.
    page_url_prefix = re.sub('#.*', 'n', room_url)

    for i in range(1, page_count + 1):
        url = page_url_prefix + str(i)
        # NOTE(review): this retries forever with no delay or attempt limit
        # when the request keeps failing — consider bounding it.
        while True:
            try:
                res = requests.get(url=url, headers=self.headers,
                                   proxies=self.proxies)
                break
            except Exception as e:
                log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                    '新浪乐居', url, e))
                continue

        room_html = etree.HTML(res.text)
        room_list = room_html.xpath("//div[@class='right-information']")
        # The element is named `item` so that the month value computed
        # further down cannot overwrite the loop variable.
        for item in room_list:
            # NOTE(review): `source` is not defined inside this method;
            # presumably a module-level name — confirm.
            room = Base(source)
            room.url = url
            room.district_name = co_name
            room.city = city_name
            room.region = region

            # Layout text holding the room / hall counts.
            room_type = item.xpath("./h3/span[2]/text()")[0]
            try:
                room.room = int(re.search(r'(\d)室', room_type).group(1))
            except Exception:
                room.room = None
            try:
                room.hall = int(re.search(r'(\d)厅', room_type).group(1))
            except Exception:
                room.hall = None

            # Floor area; left unset when the text holds only the unit.
            size = item.xpath("./h3/span[3]/text()")[0]
            area = size.replace('平米', '')
            if area:
                room.area = round(float(area), 2)

            # Average price is scraped; the total price is derived from it
            # rather than read from the page.
            avg_price = item.xpath(".//div[@class='size fs14']/text()")[0]
            room.avg_price = int(re.search(r'(\d+)', avg_price).group(1))
            try:
                room.total_price = int(
                    int(room.avg_price) * float(room.area))
            except Exception:
                room.total_price = None

            # Orientation / decoration come from a '|'-separated string
            # whose field positions depend on how many parts it has.
            try:
                detail_node = item.xpath(".//div[@class='t1 fs14']")[0]
                detail_parts = detail_node.xpath('string(.)').split('|')
                if len(detail_parts) == 2:
                    room.fitment = detail_parts[1]
                    room.direction = detail_parts[0]
                elif len(detail_parts) == 3:
                    room.fitment = detail_parts[2]
                    room.direction = detail_parts[1]
            except Exception:
                room.fitment = None
                room.direction = None

            # Floor text of the form "<floor>/<total>层".
            floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
            try:
                floor = re.search('(.*?)/', floor_info).group(1)
                room.floor = int(re.search(r'\d+', floor).group(0))
            except Exception:
                room.floor = None
            try:
                room.height = int(
                    re.search(r'.*?/(\d+)层', floor_info).group(1))
            except Exception:
                room.height = None

            # Trade date, given as YYYY-MM-DD.
            trade_date = item.xpath(".//div[@class='date']/text()")[0]
            if trade_date:
                t = time.strptime(trade_date, "%Y-%m-%d")
                room.trade_date = room.local2utc(
                    datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

            room.insert_db()
def parse(self, url, city):
    """Fetch one listing page and store a record for every deal row on it.

    Every ``dl`` under ``div.houseList`` is turned into a ``Base`` record
    and ``insert_db()`` is called on it. Each optional field is parsed on a
    best-effort basis and set to ``None`` when it cannot be read.

    Args:
        url: Page URL to request; also stored on every record.
        city: City value stored on every record.

    Returns:
        None. Returns early (after logging) if the request fails. Stops
        processing the page as soon as a community name contains the
        replacement character checked below.

    Raises:
        IndexError: If a row has no title text; that lookup has no fallback.
    """
    try:
        response = requests.get(url=url, headers=self.headers,
                                proxies=self.proxies)
    except Exception as e:
        log.error('请求失败,source="{}", url="{}",e="{}"'.format(
            '房天下', url, e))
        return

    tree = etree.HTML(response.text)
    info_list = tree.xpath("//div[@class='houseList']/dl")
    for info in info_list:
        # A fresh record per row, as the sibling parsers do, so that no
        # state is shared between inserted rows.
        comm = Base('房天下')
        comm.url = url
        comm.city = city

        # Title text: community name followed by layout and area.
        district_name_info = info.xpath("./dd/p/a/text()")[0]
        comm.district_name = district_name_info.split(' ')[0]
        if '�' in comm.district_name:
            # The page text was not decoded cleanly; give up on this page.
            log.error('source={}, 网页出现繁体字, url={}'.format('房天下', url))
            break

        # Room count.
        try:
            comm.room = int(
                re.search(r'(\d+)室', district_name_info).group(1))
        except Exception:
            comm.room = None

        # Hall count.
        try:
            comm.hall = int(
                re.search(r'(\d+)厅', district_name_info).group(1))
        except Exception:
            comm.hall = None

        # Floor area: integer or decimal number directly before "平米".
        try:
            comm.area = float(
                re.search(r'(\d+(?:\.\d+)?)平米',
                          district_name_info).group(1))
        except Exception:
            comm.area = None

        # Region: the part before the first '-'.
        try:
            region_info = info.xpath("./dd/p[2]/text()")[0]
            comm.region = region_info.split('-')[0]
        except Exception:
            comm.region = None

        # Orientation and total floors share one text block; if either
        # cannot be read, both are cleared.
        try:
            direction_node = info.xpath("./dd/p[3]")[0]
            direction_info = direction_node.xpath('string(.)')
            comm.direction = direction_info.split('|')[0]
            comm.height = int(
                re.search(r'\(共(.*?)层\)', direction_info,
                          re.S | re.M).group(1))
        except Exception:
            comm.direction = None
            comm.height = None

        # Trade date, given as YYYY-MM-DD.
        # NOTE(review): unlike the other parsers in this file, the date is
        # stored without calling local2utc — confirm whether intended.
        try:
            trade_date = info.xpath("./dd/div[2]/p[1]/text()")[0]
            t = time.strptime(trade_date, "%Y-%m-%d")
            comm.trade_date = datetime.datetime(
                t.tm_year, t.tm_mon, t.tm_mday)
        except Exception:
            comm.trade_date = None

        # Total price: the page value is multiplied by 10000.
        try:
            total_price = info.xpath("./dd/div[3]/p[1]/span[1]/text()")[0]
            comm.total_price = int(total_price) * 10000
        except Exception:
            comm.total_price = None

        # Average price: the digits directly before "元".
        try:
            avg_price_info = info.xpath("./dd/div[3]/p[2]/b[1]/text()")[0]
            comm.avg_price = int(
                re.search(r"(\d+)元", avg_price_info).group(1))
        except Exception:
            comm.avg_price = None

        comm.insert_db()
def final_parse(self, data):
    """Fetch one deal-list page and store a record for every listing on it.

    Every ``li`` under ``ul.listContent`` is turned into a ``Base`` record
    and ``insert_db()`` is called on it. Optional fields are parsed on a
    best-effort basis and set to ``None`` when they cannot be read.

    Args:
        data: Mapping with the keys ``'link'`` (page URL), ``'city'`` and
            ``'region'``; city and region are stripped before being stored.

    Returns:
        None. Returns early (after logging) if the request fails.

    Raises:
        KeyError: If ``data`` lacks one of the three keys.
        IndexError: If a listing has no title text; that lookup has no
            fallback.
    """
    final_url = data['link']
    city = data['city']
    region = data['region']
    try:
        r = requests.get(url=final_url, headers=self.headers,
                         proxies=self.proxies, timeout=60)
    except Exception as e:
        log.error('请求失败, source={}, 没有更多小区成交 url={}, e={}'.format(
            '链家在线', final_url, e))
        return

    tree = etree.HTML(r.text)
    url_list = tree.xpath("//ul[@class='listContent']/li")
    for info in url_list:
        comm = Base('链家在线')
        comm.url = final_url
        comm.region = region.strip()
        comm.city = city.strip()

        # Title text: "<community> <layout> <area>", space separated.
        # Split once and reuse the parts.
        district_name_room_area = info.xpath(
            "./div/div[@class='title']/a/text()")[0]
        title_parts = district_name_room_area.split(' ')
        comm.district_name = title_parts[0]
        # An empty layout simply makes both regex lookups below miss.
        room_hall = title_parts[1] if len(title_parts) > 1 else ''

        # Room count.
        try:
            comm.room = int(re.search(r'(\d+)室', room_hall).group(1))
        except Exception:
            comm.room = None

        # Hall count.
        try:
            comm.hall = int(re.search(r'(\d+)厅', room_hall).group(1))
        except Exception:
            comm.hall = None

        # Floor area: text before "平米" in the third title part.
        try:
            area = re.search("(.*?)平米", title_parts[2],
                             re.S | re.M).group(1)
            comm.area = round(float(area), 2)
        except Exception:
            comm.area = None

        # Orientation and decoration are '|'-separated; if either is
        # missing, both are cleared.
        try:
            direction_fitment = info.xpath(
                "./div/div[@class='address']/div[1]/text()")[0].split('|')
            comm.direction = direction_fitment[0]
            comm.fitment = direction_fitment[1]
        except Exception:
            comm.direction = None
            comm.fitment = None

        # Total floors of the building.
        try:
            height = info.xpath(
                "./div/div[@class='flood']/div[1]/text()")[0]
            comm.height = int(re.search(r"共(\d+)层", height).group(1))
        except Exception:
            comm.height = None

        # Trade date, given as YYYY.MM.DD.
        try:
            trade_date = info.xpath(
                "./div/div[@class='address']/div[2]/text()")[0]
            t = time.strptime(trade_date, "%Y.%m.%d")
            comm.trade_date = comm.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
        except Exception:
            comm.trade_date = None

        # Average price is scraped; the total price is derived from it
        # and the area rather than read from the page.
        try:
            avg_price = info.xpath(
                "./div/div[@class='flood']/div[3]/span/text()")[0]
            comm.avg_price = int(avg_price)
        except Exception:
            comm.avg_price = None
        try:
            comm.total_price = int(
                int(comm.avg_price) * float(comm.area))
        except Exception:
            # Either input is None when it could not be parsed above.
            comm.total_price = None

        comm.insert_db()
def get_detail(self, response, city, region, url):
    """Parse one fetched listing page and store a record per deal row.

    Every ``li`` under ``div.house-detail/ul`` is turned into a ``Base``
    record and ``insert_db()`` is called on it. Optional fields are parsed
    on a best-effort basis and set to ``None`` when they cannot be read.

    Args:
        response: Object whose ``text`` attribute holds the page HTML.
        city: City value; stripped before being stored.
        region: Region value; stripped before being stored.
        url: Page URL stored on every record.

    Returns:
        None.

    Raises:
        IndexError: If a row has no title text; that lookup has no fallback.
    """
    tree = etree.HTML(response.text)
    info_list = tree.xpath("//div[@class='house-detail']/ul/li")
    for info in info_list:
        comm = Base('Q房网')
        comm.url = url
        comm.city = city.strip()
        comm.region = region.strip()

        # Title text: community name followed by layout and area.
        district_name_room_area = info.xpath(
            "./div[1]/p[1]/a[1]/text()")[0]
        comm.district_name = district_name_room_area.split(' ')[0]

        # Room count.
        try:
            comm.room = int(
                re.search(r"(\d+)室", district_name_room_area).group(1))
        except Exception:
            comm.room = None

        # Hall count.
        try:
            comm.hall = int(
                re.search(r"(\d+)厅", district_name_room_area).group(1))
        except Exception:
            comm.hall = None

        # Floor area: integer or decimal number directly before "平米".
        # The decimal point is escaped so the capture can never span a
        # non-numeric character (e.g. "2厅89"), which would make float()
        # fail and silently drop the area.
        try:
            area = re.search(r"(\d+(?:\.\d+)?)平米",
                             district_name_room_area).group(1)
            comm.area = round(float(area), 2)
        except Exception:
            comm.area = None

        # Orientation and total floors. When the fourth span already
        # contains "层" it is the floor text and no orientation is set;
        # otherwise the floor text is read from the sixth span.
        try:
            direction = info.xpath("./div[1]/p[2]/span[4]/text()")[0]
            if '层' not in direction:
                comm.direction = direction
                height = info.xpath("./div[1]/p[2]/span[6]/text()")[0]
                comm.height = int(re.search(r"(\d+)层", height).group(1))
            else:
                comm.direction = None
                comm.height = int(
                    re.search(r"(\d+)层", direction).group(1))
        except Exception:
            # Any failure clears both fields, including an orientation
            # that was already read.
            comm.direction = None
            comm.height = None

        # Average price: first run of digits in the price text.
        try:
            avg_price = info.xpath("./div[2]/p[1]/text()")[0]
            comm.avg_price = int(re.search(r"\d+", avg_price).group(0))
        except Exception:
            comm.avg_price = None

        # Total price is derived from the average price and the area
        # rather than read from the page.
        try:
            comm.total_price = int(
                int(comm.avg_price) * float(comm.area))
        except Exception:
            # Either input is None when it could not be parsed above.
            comm.total_price = None

        # Trade date, given as YYYY.MM.DD.
        try:
            trade_date = info.xpath("./div[3]/span[1]/text()")[0]
            t = time.strptime(trade_date, "%Y.%m.%d")
            comm.trade_date = comm.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
        except Exception:
            comm.trade_date = None

        comm.insert_db()