def get_sold_house_detail(self, response):
    """Build a ``HlHouseItem`` from the detail page of a sold listing.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Returns:
        The populated item, or ``None`` when the page carries no status
        tag (nothing is scraped in that case).

    Raises:
        TypeError, ValueError, IndexError: When a mandatory price or label
            is missing from the page or is not numeric. Only the
            "deal time span" label is treated as optional.
    """
    house_id = response.url.split('/')[-1].split('.')[0]

    # Mandatory fields.
    item = HlHouseItem()
    item['url'] = response.url
    item['city'] = 'nj'
    item['house_id'] = house_id

    status_tag = response.xpath(
        '//div/div[@class = "wrapper"]/span/text()').extract_first()
    if status_tag is None:
        return None
    self.logger.info('house[{0}] status:[{1}]'.format(house_id, status_tag))

    if '成交' in status_tag:
        # Deal closed (status=4 according to the original note).
        item['status'] = config.HOUSE_STATUS['DEAL']
    else:
        # Any other situation (status=5 according to the original note).
        item['status'] = config.HOUSE_STATUS['DEAL_OTHER']

    # Fields that are mandatory for a sold listing.
    # The first space-separated token of the status tag is a dotted date
    # (e.g. "2018.01.02"); it is rewritten with dashes.
    item['deal_date'] = status_tag.split(' ')[0].replace('.', '-')
    item['deal_total_price'] = float(response.xpath(
        '//div/span[@class = "dealTotalPrice"]/i/text()').extract_first())
    item['deal_unit_price'] = float(response.xpath(
        '//div[@class = "price"]/b/text()').extract_first())

    # The "msg" labels are read positionally: [0] listing price,
    # [1] days on market, [2] number of price changes. Query them once.
    msg_labels = response.xpath(
        '//div[@class = "msg"]/span/label/text()').extract()
    item['list_total_price'] = float(msg_labels[0])
    try:
        item['deal_time_span'] = int(msg_labels[1])
    except (IndexError, ValueError) as e:
        # Best effort: the label may be absent or hold non-numeric text.
        self.logger.warning('convert deal time span exception: [{0}]'.format(e))
        item['deal_time_span'] = None
    item['price_change_times'] = int(msg_labels[2])

    # Additional fields available on a sold listing.
    # The wrapper text is space-separated; the second-to-last token is the
    # room layout and everything before it is the community name.
    info = response.xpath(
        '//div/div[@class = "wrapper"]/text()').extract_first().split(' ')
    community = ''.join(info[0:-2])
    item['community'] = community.replace('▪', '·')
    item['room_info'] = info[-2]

    # First agent link text is used as the district, last as the location.
    agent_names = response.xpath(
        '//div[@class = "myAgent"]/div[@class = "name"]/a/text()').extract()
    item['district'] = agent_names[0]
    item['location'] = agent_names[-1]

    item['total_price'] = item['deal_total_price']
    item['unit_price'] = item['deal_unit_price']
    # Size is derived from the two prices; the 10000 factor presumably
    # reflects a total price quoted in units of 10,000 - TODO confirm.
    item['house_size'] = round(
        item['total_price'] / item['unit_price'] * 10000, 2)

    if item['deal_time_span'] is not None:
        # Listing date = deal date minus the number of days on the market.
        deal_day = datetime.datetime.strptime(item['deal_date'], "%Y-%m-%d")
        list_date = deal_day - datetime.timedelta(days=item['deal_time_span'])
        item['list_date'] = list_date.strftime('%Y-%m-%d')
    else:
        item['list_date'] = None

    item['list_unit_price'] = round(
        item['list_total_price'] / item['house_size'] * 10000, 2)
    return item
def get_selling_house_detail(self, response):
    """Build a ``HlHouseItem`` from the detail page of a listing on sale.

    Args:
        response: Page response exposing ``url`` and ``xpath()``.

    Returns:
        The populated item, or ``None`` (after logging a warning) when the
        "basic info" list has a length this parser does not recognise.

    Raises:
        TypeError, ValueError: When the total or unit price is missing or
            not numeric.
        AttributeError: When the community name is missing.
        IndexError: When fewer than three transaction entries are present.
    """
    def first_text(xpath):
        # First matching text node, or None when nothing matches.
        return response.xpath(xpath).extract_first()

    house_id = response.url.split('/')[-1].split('.')[0]

    # Mandatory fields.
    item = HlHouseItem()
    item['url'] = response.url
    item['city'] = 'nj'
    item['house_id'] = house_id

    status_tag = first_text('//h1[@class = "main"]/span/text()')
    self.logger.info('house[{0}] status:[{1}]'.format(house_id, status_tag))
    if status_tag is None:
        # No tag: normal listing (status=1 according to the original note).
        item['status'] = config.HOUSE_STATUS['ON_SALE']
    elif status_tag == '已下架':
        # Taken off the market (status=2 according to the original note).
        item['status'] = config.HOUSE_STATUS['OFF_SALE']
    else:
        # Any other tag (status=3 according to the original note).
        item['status'] = config.HOUSE_STATUS['ON_SALE_OTHER']

    item['total_price'] = float(first_text('//div/span[@class = "total"]/text()'))
    item['unit_price'] = float(first_text('//span[@class = "unitPriceValue"]/text()'))

    # All houseInfo cells share one path shape: section div, then cell div.
    house_info_xpath = (
        '//div[@class = "houseInfo"]/div[@class = "{0}"]'
        '/div[@class = "{1}"]/text()')
    item['room_info'] = first_text(house_info_xpath.format('room', 'mainInfo'))
    item['floor_info'] = first_text(house_info_xpath.format('room', 'subInfo'))
    item['orientation'] = first_text(house_info_xpath.format('type', 'mainInfo'))
    item['decoration'] = first_text(house_info_xpath.format('type', 'subInfo'))
    # Size is derived from the two prices rather than scraped from the
    # "area" cell; the 10000 factor presumably reflects a total price
    # quoted in units of 10,000 - TODO confirm.
    item['house_size'] = round(
        item['total_price'] / item['unit_price'] * 10000, 2)
    item['house_type'] = first_text(house_info_xpath.format('area', 'subInfo'))

    # Strip spaces from the community name and normalise the bullet glyph.
    community = ''.join(first_text(
        '//div[@class = "aroundInfo"]/div[@class = "communityName"]'
        '//a[contains(@class, "info")]/text()').split(' '))
    item['community'] = community.replace('▪', '·')

    # Area links are read positionally: district, location, subway info.
    location_info = response.xpath(
        '//div[@class = "aroundInfo"]/div[@class = "areaName"]//a/text()').extract()
    if len(location_info) == 3:
        item['district'], item['location'], item['subway_info'] = location_info
    elif len(location_info) == 2:
        item['district'] = location_info[0]
        item['location'] = location_info[1]
        item['subway_info'] = None
    else:
        # Any other count: keep only the first entry. An empty list yields
        # None instead of raising IndexError.
        item['district'] = location_info[0] if location_info else None
        item['location'] = None
        item['subway_info'] = None

    # Basic info: the list is read positionally and the positions depend
    # on how many entries the page provides.
    basic_info = response.xpath(
        '//div[@class = "introContent"]/div[@class = "base"]'
        '/div[@class = "content"]/ul/li/text()').extract()
    entry_count = len(basic_info)
    if entry_count == 12 or entry_count >= 14:
        item['room_structure'] = basic_info[3]
        item['room_size'] = basic_info[4]
        item['building_structure'] = basic_info[7]
        item['elevator_household_ratio'] = basic_info[9]
        item['elevator_included'] = basic_info[10]
        item['property_right_deadline'] = basic_info[11]
    elif entry_count == 9:
        # Short layout: no room structure or elevator entries.
        item['room_structure'] = None
        item['room_size'] = basic_info[3]
        item['building_structure'] = basic_info[5]
        item['elevator_household_ratio'] = None
        item['elevator_included'] = None
        item['property_right_deadline'] = basic_info[8]
    else:
        self.logger.warning('not xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx nan jing house info: [{0}]'.format(response.url))
        return None

    # Transaction info: entry 0 is the listing date, entry 2 the last
    # trading date.
    transaction_info = response.xpath(
        '//div[@class = "introContent"]/div[@class = "transaction"]'
        '/div[@class = "content"]/ul/li/span[not(@class)]/text()').extract()
    item['list_date'] = transaction_info[0]
    item['last_trading_date'] = transaction_info[2]
    return item