def info_parse(self, ro_html, co_name, region, city_name):
    """Turn every listing on a community page into a stored record.

    Each node matched by ``//ul[@class='pList zu']/li`` yields one ``Base``
    object (built from the ``source`` name resolved in the enclosing scope)
    that is populated and then passed to its own ``insert_db()``.

    Args:
        ro_html: Parsed HTML tree exposing ``xpath()``.
        co_name: Community name, stored as ``district_name``.
        region: Region name stored on every record.
        city_name: City name stored on every record.

    Raises:
        IndexError, AttributeError, ValueError: When a listing lacks the
            layout, unit-price, area, orientation or trade-date text. Those
            lookups are unguarded, so the error reaches the caller.
    """

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except (AttributeError, TypeError, ValueError):
            # No match, non-string input, or a non-numeric capture.
            return None

    for listing in ro_html.xpath("//ul[@class='pList zu']/li"):
        ro = Base(source)
        ro.city = city_name
        ro.district_name = co_name
        ro.region = region

        # Room / hall / toilet counts are the digits right before 室 / 厅 / 卫.
        # ``\d+`` (not ``\d``) so a count such as "10室" is not cut to 0.
        room_type = listing.xpath(".//p[@class='sTit']/strong/text()")[0]
        ro.room = first_int(r'(\d+)室', room_type)
        ro.hall = first_int(r'(\d+)厅', room_type)
        ro.toilet = first_int(r'(\d+)卫', room_type)

        # Unit price: first run of digits in the price paragraph.
        price_text = listing.xpath(".//div[@class='jiage']/p/text()")[0]
        ro.avg_price = int(re.search(r'(\d+)', price_text).group(1))

        # Area is the text between "·" and "平米" in the second paragraph.
        info = listing.xpath(".//div/p[2]/text()")[0]
        ro.area = round(float(re.search('·(.*?)平米', info).group(1)), 2)

        # Total price is derived from unit price and area, not scraped.
        try:
            ro.total_price = int(int(ro.avg_price) * float(ro.area))
        except (AttributeError, TypeError, ValueError, OverflowError):
            ro.total_price = None

        # Orientation: whatever follows "平米 · " in the same paragraph.
        ro.direction = re.search('平米 · (.*)', info).group(1).strip()

        # Trade date, e.g. "成交日期:2018-01-31"; only the date part is kept.
        date_text = listing.xpath(".//div/p[3]/text()")[0].strip()
        ro.trade_date = ro.local2utc(
            datetime.datetime.strptime(date_text, "成交日期:%Y-%m-%d"))

        ro.insert_db()
def parse(self, response, region):
    """Store one record per entry of the JSON ``data`` list in ``response``.

    Args:
        response: Object exposing ``json()``; the decoded payload must contain
            a ``data`` list of dicts with the keys read below.
        region: Region name stored on every record.

    Returns:
        None. If the body cannot be decoded, the error is logged and nothing
        is stored.

    Raises:
        KeyError, ValueError, TypeError, AttributeError: When an entry lacks a
            key or holds a value that cannot be converted; these lookups are
            unguarded.
    """
    source = '太屋网'
    city = '上海'
    try:
        result_json = response.json()
    except Exception as e:
        log.error('无法序列化,source="{}",e="{}"'.format(source, e))
        return

    for item in result_json['data']:
        c = Base(source)
        c.city = city
        c.region = region
        c.room = int(item['RoomCount'])
        # 'HollCount' (sic) is the key name used by the payload.
        c.hall = int(item['HollCount'])
        c.district_name = item['BuildingName']
        c.area = round(float(item['BldArea']), 2)
        c.direction = item['Directed']
        # Floor of the unit, and floor count of the building.
        c.floor = int(item['Floor'])
        c.height = int(item['FloorCount'])

        # The first digit run of ExDate is used as a millisecond timestamp;
        # only the local calendar date is kept before converting to UTC.
        millis = int(re.search(r'(\d+)', item['ExDate']).group(1))
        local = time.localtime(millis // 1000)
        c.trade_date = c.local2utc(
            datetime.datetime(local.tm_year, local.tm_mon, local.tm_mday))

        c.total_price = int(item['ExPrice'])
        # Unit price is derived; an area of zero leaves it unknown.
        try:
            c.avg_price = int(round(c.total_price / c.area, 2))
        except (ZeroDivisionError, TypeError, ValueError, OverflowError):
            c.avg_price = None

        c.insert_db()
def parse(self, room_url, co_name, region, city_name):
    """Crawl all result pages of one community and store each listing.

    The first response is searched for ``共N页`` to learn the page count.
    Page URLs are built by replacing the ``#...`` fragment of ``room_url``
    with ``n`` followed by the page number.

    Args:
        room_url: URL of the community's listing page.
        co_name: Community name, stored as ``district_name``.
        region: Region name stored on every record.
        city_name: City name stored on every record.

    Returns:
        None. A failed first request or a page without a page counter is
        logged and ends the call.

    Raises:
        IndexError, AttributeError, ValueError: When a listing lacks the
            layout, size, unit-price, floor or date node; those lookups are
            unguarded.
    """
    # A page is attempted at most this many times and then skipped, so a dead
    # proxy or host cannot make the crawler spin forever.
    max_attempts = 5
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except parse_errors:
            return None

    try:
        page_index = requests.get(url=room_url, headers=self.headers,
                                  proxies=self.proxies)
    except Exception as e:
        log.error('请求错误, source="{}",url="{}",e="{}"'.format(
            '新浪乐居', room_url, e))
        return

    page_match = re.search(r'共(\d+)页', page_index.text)
    if not page_match:
        log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
        return

    url_prefix = re.sub('#.*', 'n', room_url)
    for page_no in range(1, int(page_match.group(1)) + 1):
        url = url_prefix + str(page_no)

        res = None
        for _ in range(max_attempts):
            try:
                res = requests.get(url=url, headers=self.headers,
                                   proxies=self.proxies)
                break
            except Exception as e:
                log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                    '新浪乐居', url, e))
        if res is None:
            # Every attempt failed (each one is already logged): next page.
            continue

        room_html = etree.HTML(res.text)
        for item in room_html.xpath("//div[@class='right-information']"):
            room = Base(source)
            room.url = url
            room.district_name = co_name
            room.city = city_name
            room.region = region

            # Room / hall counts are the digits right before 室 / 厅.
            room_type = item.xpath("./h3/span[2]/text()")[0]
            room.room = first_int(r'(\d+)室', room_type)
            room.hall = first_int(r'(\d+)厅', room_type)

            # Area; left untouched on the record when the text is empty.
            area_text = item.xpath("./h3/span[3]/text()")[0].replace('平米', '')
            if area_text:
                room.area = round(float(area_text), 2)

            # Unit price: first run of digits.
            price_text = item.xpath(".//div[@class='size fs14']/text()")[0]
            room.avg_price = int(re.search(r'(\d+)', price_text).group(1))

            # Total price is derived from unit price and area, not scraped.
            try:
                room.total_price = int(
                    int(room.avg_price) * float(room.area))
            except parse_errors:
                room.total_price = None

            # "orientation|fitment" or "<other>|orientation|fitment"; any
            # other number of parts leaves both attributes untouched.
            try:
                detail_node = item.xpath(".//div[@class='t1 fs14']")[0]
                parts = detail_node.xpath('string(.)').split('|')
            except parse_errors:
                room.fitment = None
                room.direction = None
            else:
                if len(parts) == 2:
                    room.direction, room.fitment = parts[0], parts[1]
                elif len(parts) == 3:
                    room.direction, room.fitment = parts[1], parts[2]

            # "<floor>/<N>层": digits before the slash give the floor, the
            # number before 层 the building height.
            floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
            try:
                before_slash = re.search('(.*?)/', floor_info).group(1)
                room.floor = int(re.search(r'\d+', before_slash).group(0))
            except parse_errors:
                room.floor = None
            try:
                room.height = int(
                    re.search(r'.*?/(\d+)层', floor_info).group(1))
            except parse_errors:
                room.height = None

            trade_date = item.xpath(".//div[@class='date']/text()")[0]
            if trade_date:
                room.trade_date = room.local2utc(
                    datetime.datetime.strptime(trade_date, "%Y-%m-%d"))

            room.insert_db()
def comm_detail(self, comm_url_list, city):
    """Fetch the deal records of every community and store one record each.

    For each entry of ``comm_url_list`` after the first, two requests are
    made: the community page (for name, city and region) and the
    ``GetDealRecord`` JSON endpoint (for the deals). A failure in either
    request, in JSON decoding, or in reading the page header is logged and
    the community is skipped.

    Args:
        comm_url_list: Community URL fragments containing ``xq-<code>``; the
            first element is ignored.
        city: URL in which ``/xiaoqu/`` is replaced by each fragment to build
            the community page address.

    Raises:
        AttributeError: When a fragment has no ``xq-`` part.
        KeyError, ValueError, TypeError: When a deal entry lacks a key read
            without a guard, or holds a malformed area or date.
    """
    parse_errors = (AttributeError, IndexError, KeyError, TypeError,
                    ValueError)

    def strip_layout(text):
        """Drop newlines, tabs and spaces from ``text``."""
        return text.replace('\n', '').replace('\t', '').replace(' ', '')

    for comm_url in comm_url_list[1:]:
        com_url = city.replace('/xiaoqu/', comm_url)
        statecode = re.search('xq-(.*)', comm_url).group(1)
        # The endpoint has two post types: R (rent) and S (sale). S is used.
        comm_detail_url = (
            'http://sh.centanet.com/apipost/GetDealRecord?estateCode='
            + statecode + '&posttype=S&pageindex=1&pagesize=10000')

        try:
            com_res = requests.get(url=com_url, headers=self.headers,
                                   proxies=self.proxies)
        except Exception as e:
            log.error('source={}, 请求失败 url={} e={}'.format(
                '中原地产', com_url, e))
            continue
        try:
            res = requests.get(url=comm_detail_url, headers=self.headers,
                               proxies=self.proxies)
        except Exception as e:
            log.error('source={}, 请求失败 url={} e={}'.format(
                '中原地产', comm_detail_url, e))
            continue

        html = etree.HTML(com_res.text)
        try:
            data_dict = json.loads(res.text)
        except Exception as e:
            log.error('source={}, 序列化失败 e={}'.format('中原地产', e))
            continue

        try:
            district_name = html.xpath("//div/h3/text()")[0]
            city_name = strip_layout(
                html.xpath("//div[@class='idx-city']/text()")[0])
            region = strip_layout(
                html.xpath("//a[@class='f000']/text()")[0])
        except Exception as e:
            log.error('source={}, 区域解析失败 e={}'.format('中原地产', e))
            continue

        for data in data_dict["result"]:
            co = Base(source)
            co.district_name = district_name.strip()
            co.region = region
            co.city = city_name

            # Room / hall counts are the digits right before 室 / 厅.
            try:
                co.room = int(
                    re.search(r'(\d+)室', data["houseType"]).group(1))
            except parse_errors as e:
                co.room = None
                log.error('source={}, room为空 e={}'.format('中原地产', e))
            try:
                co.hall = int(
                    re.search(r'(\d+)厅', data["houseType"]).group(1))
            except parse_errors as e:
                co.hall = None
                log.error('source={}, hall e={}'.format('中原地产', e))

            # Area; left untouched on the record when the text is empty.
            area_text = data['areaSize'].replace('平', '')
            if area_text:
                co.area = round(float(area_text), 2)

            co.direction = data['direction']

            # dealTime carries a two-digit year, hence the '20' prefix. The
            # emptiness test must look at the raw value: once prefixed, the
            # string is always truthy.
            deal_time = data['dealTime']
            if deal_time:
                co.trade_date = co.local2utc(
                    datetime.datetime.strptime('20' + deal_time, "%Y-%m-%d"))

            # Unit price is multiplied by 10000 before storing. The fraction
            # is optional so a one-digit value parses, and round() guards
            # against float error such as 43499.999....
            try:
                unit = re.search(r'(\d+(?:\.\d+)?)', data['unitPrice'],
                                 re.S | re.M).group(1)
                co.avg_price = int(round(float(unit) * 10000))
            except parse_errors:
                co.avg_price = None

            # Total price is derived from unit price and area, not scraped.
            try:
                co.total_price = int(int(co.avg_price) * float(co.area))
            except parse_errors:
                co.total_price = None

            co.url = comm_detail_url
            co.insert_db()
def parse(self, res, com_url):
    """Store one record per deal row found on a community page.

    Community name, city and region are read once from the page header; every
    ``tablerecond-item`` row then fills in the per-deal fields. A per-deal
    field that cannot be parsed is stored as ``None``.

    NOTE(review): a single ``Base`` instance is reused for every row and
    ``insert_db()`` is called on it repeatedly - confirm that ``insert_db()``
    tolerates this.

    Args:
        res: Response object whose ``text`` holds the page HTML.
        com_url: URL of the page, stored as ``url``.

    Raises:
        IndexError: When the community name, city or region node is missing
            from the page header.
    """
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except parse_errors:
            return None

    co = Base(source)
    co.url = com_url
    tree = etree.HTML(res.text)

    co.district_name = tree.xpath(
        "//dl[@class='fl roominfor']/dd/h2/text()")[0].replace(' ', '')
    # First breadcrumb link, with the "中原地产" part removed.
    co.city = tree.xpath(
        "/html/body/div[3]/div/a[1]/text()")[0].replace('中原地产', '')
    # Third breadcrumb link, with the "小区" part removed.
    co.region = tree.xpath(
        "/html/body/div[3]/div/a[3]/text()")[0].replace('小区', '')

    rows = tree.xpath(
        "//div[@class='tablerecord-list']/div[@class='tablerecond-item']")
    for info in rows:
        # Room / hall counts are the digits right before 室 / 厅.
        try:
            room_type = info.xpath("./a/span[1]/text()")[0]
        except IndexError:
            room_type = None
        co.room = first_int(r'(\d+)室', room_type)
        co.hall = first_int(r'(\d+)厅', room_type)

        # Orientation, with spaces, newlines and tabs removed.
        try:
            co.direction = info.xpath("./a/span[2]/text()")[0].replace(
                ' ', '').replace('\n', '').replace('\t', '')
        except parse_errors:
            co.direction = None

        try:
            area_text = info.xpath('./a/span[3]/text()')[0].replace('平', '')
            co.area = round(float(area_text), 2)
        except parse_errors:
            co.area = None

        try:
            date_text = info.xpath('./a/span[4]/text()')[0]
            co.trade_date = co.local2utc(
                datetime.datetime.strptime(date_text, "%Y/%m/%d"))
        except parse_errors:
            co.trade_date = None

        # Unit price: the text with "元/平" removed.
        try:
            price_text = info.xpath("./a/span[6]/text()")[0]
            co.avg_price = int(price_text.replace('元/平', ''))
        except parse_errors:
            co.avg_price = None

        # Total price is derived from unit price and area, not scraped.
        try:
            co.total_price = int(int(co.avg_price) * float(co.area))
        except parse_errors:
            co.total_price = None

        co.insert_db()
def final_parse(self, data):
    """Fetch one listing page and store a record for every deal on it.

    Any optional field that cannot be parsed is stored as ``None``.

    Args:
        data: Mapping with the keys ``link`` (page URL), ``city`` and
            ``region``.

    Returns:
        None. A failed request is logged and ends the call.

    Raises:
        IndexError: When a list item has no title text.
    """
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except parse_errors:
            return None

    final_url = data['link']
    city = data['city']
    region = data['region']
    try:
        r = requests.get(url=final_url, headers=self.headers,
                         proxies=self.proxies, timeout=60)
    except Exception as e:
        log.error('请求失败, source={}, 没有更多小区成交 url={}, e={}'.format(
            '链家在线', final_url, e))
        return

    tree = etree.HTML(r.text)
    for info in tree.xpath("//ul[@class='listContent']/li"):
        comm = Base('链家在线')
        comm.url = final_url
        comm.region = region.strip()
        comm.city = city.strip()

        # The title is split on single spaces: community name, layout
        # ("N室N厅"), area ("...平米"). Only the name is mandatory.
        title_parts = info.xpath(
            "./div/div[@class='title']/a/text()")[0].split(' ')
        comm.district_name = title_parts[0]

        room_hall = title_parts[1] if len(title_parts) > 1 else None
        comm.room = first_int(r'(\d+)室', room_hall)
        comm.hall = first_int(r'(\d+)厅', room_hall)

        try:
            area_text = re.search("(.*?)平米", title_parts[2],
                                  re.S | re.M).group(1)
            comm.area = round(float(area_text), 2)
        except parse_errors:
            comm.area = None

        # "orientation|fitment"; both are dropped unless both are present.
        try:
            direction, fitment = info.xpath(
                "./div/div[@class='address']/div[1]/text()")[0].split('|')[:2]
        except parse_errors:
            direction = fitment = None
        comm.direction = direction
        comm.fitment = fitment

        # Building height from "共N层".
        try:
            height_text = info.xpath(
                "./div/div[@class='flood']/div[1]/text()")[0]
            comm.height = int(re.search(r"共(\d+)层", height_text).group(1))
        except parse_errors:
            comm.height = None

        try:
            date_text = info.xpath(
                "./div/div[@class='address']/div[2]/text()")[0]
            comm.trade_date = comm.local2utc(
                datetime.datetime.strptime(date_text, "%Y.%m.%d"))
        except parse_errors:
            comm.trade_date = None

        try:
            comm.avg_price = int(info.xpath(
                "./div/div[@class='flood']/div[3]/span/text()")[0])
        except parse_errors:
            comm.avg_price = None

        # Total price is derived from unit price and area, not scraped.
        try:
            comm.total_price = int(int(comm.avg_price) * float(comm.area))
        except parse_errors:
            comm.total_price = None

        comm.insert_db()
def parse(self, url, city):
    """Fetch one listing page and store a record for every ``<dl>`` on it.

    Any optional field that cannot be parsed is stored as ``None``. Parsing
    stops at the first row whose community name contains U+FFFD (the
    replacement character left by a failed decode).

    NOTE(review): a single ``Base`` instance is reused for every row and
    ``insert_db()`` is called on it repeatedly - confirm that ``insert_db()``
    tolerates this.

    Args:
        url: Page URL; also stored on every record.
        city: City name stored on every record.

    Returns:
        None. A failed request is logged and ends the call.

    Raises:
        IndexError: When a row has no title text.
    """
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except parse_errors:
            return None

    try:
        response = requests.get(url=url, headers=self.headers,
                                proxies=self.proxies)
    except Exception as e:
        log.error('请求失败,source="{}", url="{}",e="{}"'.format(
            '房天下', url, e))
        return

    tree = etree.HTML(response.text)
    comm = Base('房天下')
    comm.url = url
    comm.city = city

    for info in tree.xpath("//div[@class='houseList']/dl"):
        # The title holds the community name, the layout and the area.
        title = info.xpath("./dd/p/a/text()")[0]
        comm.district_name = title.split(' ')[0]
        # Written as an escape so the check survives re-encoding of this file.
        if '\ufffd' in comm.district_name:
            log.error('source={}, 网页出现繁体字, url={}'.format('房天下', url))
            break

        comm.room = first_int(r'(\d+)室', title)
        comm.hall = first_int(r'(\d+)厅', title)

        # The fraction is optional so a one-digit area such as "9平米" parses.
        try:
            comm.area = float(
                re.search(r'(\d+(?:\.\d+)?)平米', title).group(1))
        except parse_errors:
            comm.area = None

        # Region: the text before the first "-".
        try:
            comm.region = info.xpath("./dd/p[2]/text()")[0].split('-')[0]
        except parse_errors:
            comm.region = None

        # Orientation comes before the first "|"; building height is inside
        # "(共N层)". Both are dropped unless both are present.
        try:
            direction_info = info.xpath("./dd/p[3]")[0].xpath('string(.)')
            direction = direction_info.split('|')[0]
            height = int(re.search(r'\(共(.*?)层\)', direction_info,
                                   re.S | re.M).group(1))
        except parse_errors:
            direction = height = None
        comm.direction = direction
        comm.height = height

        # Converted with local2utc like every other parser in this file;
        # storing the naive local datetime made this source inconsistent.
        try:
            date_text = info.xpath("./dd/div[2]/p[1]/text()")[0]
            comm.trade_date = comm.local2utc(
                datetime.datetime.strptime(date_text, "%Y-%m-%d"))
        except parse_errors:
            comm.trade_date = None

        # The scraped total price is multiplied by 10000 before storing.
        try:
            comm.total_price = int(
                info.xpath("./dd/div[3]/p[1]/span[1]/text()")[0]) * 10000
        except parse_errors:
            comm.total_price = None

        # Unit price: the digits right before "元".
        try:
            price_text = info.xpath("./dd/div[3]/p[2]/b[1]/text()")[0]
            comm.avg_price = int(
                re.search(r"(\d+)元", price_text).group(1))
        except parse_errors:
            comm.avg_price = None

        comm.insert_db()
def crawler(self, city_url, city):
    """Walk all list pages of a city and store every deal of every community.

    The page count is read from the second ``a.down_page`` link of the first
    response. Each list page yields community links (``//h1/a/@href``); each
    community page yields one record per table row.

    NOTE(review): one ``Base`` instance is created per community and reused
    for all of its rows - confirm that ``insert_db()`` tolerates repeated
    calls on the same object.

    Args:
        city_url: URL of the city's list page; ``/PG<n>`` is appended to it.
        city: Prefix put in front of every community href.

    Returns:
        None. Request failures and unreadable headers are logged; the
        affected page or community is skipped.

    Raises:
        IndexError, AttributeError, ValueError: When a table row lacks a
            mandatory cell (area, unit price, date, layout, floor text);
            those lookups are unguarded.
    """
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except parse_errors:
            return None

    print(city_url)
    try:
        res = requests.get(url=city_url, headers=self.headers,
                           proxies=self.proxies)
    except Exception as e:
        log.error('请求错误,source="{}",url="{}",e="{}"'.format(
            '麦田', city_url, e))
        return

    con = etree.HTML(res.text)
    try:
        last_page = con.xpath("//a[@class='down_page']/@href")[1]
        page_num = re.search(r'\d+', last_page).group(0)
    except Exception as e:
        log.error('获取页码失败,source="{}",url="{}",e="{}"'.format(
            '麦田', city_url, e))
        return

    for page_no in range(1, int(page_num) + 1):
        page_url = city_url + "/PG" + str(page_no)
        try:
            page_res = requests.get(url=page_url, headers=self.headers,
                                    proxies=self.proxies)
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                '麦田', page_url, e))
            continue

        page_con = etree.HTML(page_res.text)
        for href in page_con.xpath("//h1/a/@href"):
            com = Base(source)
            comm_url = city + href
            com.url = comm_url
            try:
                co_res = requests.get(url=comm_url, headers=self.headers,
                                      proxies=self.proxies)
            except Exception as e:
                log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                    '麦田', comm_url, e))
                continue

            co_con = etree.HTML(co_res.text)
            try:
                com.city = co_con.xpath("//div/a[@class='show']/text()")[0]
                # Region is the text inside [...] of the last matching link.
                region_text = co_con.xpath(
                    "//section[@class='fl home_main']/p[3]/a/text()")[-1]
                com.region = re.search(
                    r"\[(.*)\]", region_text, re.S | re.M).group(1)
                com.district_name = co_con.xpath("//cite/span/text()")[0]
                rows = co_con.xpath("//table/tbody/tr")
            except Exception as e:
                log.error(
                    '获取城市区域小区名失败, source="{}",url="{}",e="{}"'.format(
                        '麦田', comm_url, e))
                continue

            for tag in rows:
                area_text = tag.xpath("./td[2]/text()")[0].replace('㎡', '')
                com.area = round(float(area_text), 2)

                # Unit price: first run of digits in the third cell.
                price_text = tag.xpath("./td[3]/text()")[0]
                com.avg_price = int(
                    re.search(r'(\d+)', price_text).group(1))

                # Total price is derived from unit price and area.
                try:
                    com.total_price = int(
                        int(com.avg_price) * float(com.area))
                except parse_errors:
                    com.total_price = None

                # Trade date: second-to-last text node among the cells.
                trade_date = tag.xpath("./td/text()")[-2]
                if trade_date:
                    com.trade_date = com.local2utc(
                        datetime.datetime.strptime(trade_date, "%Y-%m-%d"))

                # Room / hall counts are the digits right before 室 / 厅.
                room_type = tag.xpath("./td//p/a/text()")[0]
                com.room = first_int(r'(\d+)室', room_type)
                com.hall = first_int(r'(\d+)厅', room_type)

                # NOTE(review): the original comment called this "total
                # floors", yet it is stored in ``floor`` where other parsers
                # use ``height`` for the building total - confirm which
                # field is intended.
                floor_text = tag.xpath("./td//p/span/text()")[0]
                com.floor = int(re.search(r'(\d+)层', floor_text).group(1))
                # Orientation: second space-separated token of the same text.
                com.direction = floor_text.split(' ')[1]

                com.insert_db()
def get_detail(self, response, city, region, url):
    """Store one record per ``<li>`` of the ``house-detail`` list.

    Any optional field that cannot be parsed is stored as ``None``.

    Args:
        response: Response object whose ``text`` holds the page HTML.
        city: City name; stripped and stored on every record.
        region: Region name; stripped and stored on every record.
        url: Page URL stored on every record.

    Raises:
        IndexError: When a list item has no title text.
    """
    parse_errors = (AttributeError, IndexError, TypeError, ValueError)

    def first_int(pattern, text):
        """Return group 1 of the first ``pattern`` match as int, else None."""
        try:
            return int(re.search(pattern, text).group(1))
        except parse_errors:
            return None

    tree = etree.HTML(response.text)
    for info in tree.xpath("//div[@class='house-detail']/ul/li"):
        comm = Base('Q房网')
        comm.url = url
        comm.city = city.strip()
        comm.region = region.strip()

        # The title holds the community name, the layout and the area.
        title = info.xpath("./div[1]/p[1]/a[1]/text()")[0]
        comm.district_name = title.split(' ')[0]
        comm.room = first_int(r"(\d+)室", title)
        comm.hall = first_int(r"(\d+)厅", title)

        # The decimal point is escaped and the fraction optional: the old
        # pattern let "." match any character (so it could capture text such
        # as "2厅89") and rejected one-digit areas.
        try:
            area_text = re.search(r"(\d+(?:\.\d+)?)平米", title).group(1)
            comm.area = round(float(area_text), 2)
        except parse_errors:
            comm.area = None

        # span[4] normally holds the orientation and span[6] the floors. When
        # span[4] already contains "层", the height is read from span[4] and
        # the orientation is stored as None.
        try:
            direction = info.xpath("./div[1]/p[2]/span[4]/text()")[0]
            if '层' in direction:
                height_text, direction = direction, None
            else:
                height_text = info.xpath("./div[1]/p[2]/span[6]/text()")[0]
            height = int(re.search(r"(\d+)层", height_text).group(1))
        except parse_errors:
            direction = height = None
        comm.direction = direction
        comm.height = height

        # Unit price: first run of digits.
        try:
            price_text = info.xpath("./div[2]/p[1]/text()")[0]
            comm.avg_price = int(re.search(r"\d+", price_text).group(0))
        except parse_errors:
            comm.avg_price = None

        # Total price is derived from unit price and area, not scraped.
        try:
            comm.total_price = int(int(comm.avg_price) * float(comm.area))
        except parse_errors:
            comm.total_price = None

        try:
            date_text = info.xpath("./div[3]/span[1]/text()")[0]
            comm.trade_date = comm.local2utc(
                datetime.datetime.strptime(date_text, "%Y.%m.%d"))
        except parse_errors:
            comm.trade_date = None

        comm.insert_db()