def parseOne(self, one):
    """Parse one listing node into a HouseItem.

    Args:
        one: selector positioned on a single listing element (xpath/extract
            interface, scrapy-style).

    Returns:
        items.HouseItem with whatever fields could be extracted; parse
        failures inside the try block are logged and the partial item is
        returned.
    """
    oneOut = items.HouseItem()
    oneOut['src'] = self.src
    oneOut['title'] = ''.join(
        one.xpath('./div[2]/div[1]/a/text()').extract()).strip()
    href = ''.join(one.xpath('./div[2]/div[1]/a/@href').extract()).strip()
    if len(href) > 0:
        # The listing id is the last path segment of the detail URL with its
        # trailing ".html" (5 chars) removed; '-1' marks an unparsable href.
        listingId = '-1'
        try:
            listingId = href.split('?')[0].split('/')[-1][:-5]
        except Exception as e:
            logging.warning("parseOne Exception %s" % (str(e)))
        oneOut['_id'] = listingId
    try:
        # The location cell appears to be "<community> <district>-<subDistrict>"
        # (split on ' ' then '-') — TODO confirm against a live page.
        tmp = ''.join(
            one.xpath('./div[2]/div[3]/span/text()').extract()).strip()
        tmp = tmp.split(' ')
        if len(tmp) > 0:
            oneOut['community'] = tmp[0]
        if len(tmp) > 1:
            # BUG FIX: the original called tmp.split('-') on the *list*
            # produced above, which always raised AttributeError here, and
            # the broad except below then silently dropped district,
            # subDistrict, prices, square, level and structure. Split the
            # second token instead.
            tmp2 = tmp[1].split('-')
            if len(tmp2) > 0:
                oneOut['district'] = tmp2[0]
            if len(tmp2) > 1:
                oneOut['subDistrict'] = tmp2[1]
        totalPrice = String2Number(''.join(
            one.xpath('./div[3]/span[1]/strong/text()').extract()).strip())
        oneOut['totalPrice'] = totalPrice
        oneOut['unitPrice'] = String2Number(''.join(
            one.xpath('./div[3]/span[2]/text()').extract()).strip())
        oneOut['houseType'] = ''.join(
            one.xpath('./div[2]/div[2]/span[1]/text()').extract()).strip()
        oneOut['square'] = String2Number(''.join(
            one.xpath('./div[2]/div[2]/span[2]/text()').extract()).strip())
        oneOut['level'] = ''.join(
            one.xpath('./div[2]/div[2]/span[3]/text()').extract()).strip()
        oneOut['structure'] = ''.join(
            one.xpath('./div[2]/div[2]/span[4]/text()').extract()).strip()
        oneOut['crawlDate'] = today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut
def parseOne(self, one):
    """Parse one listing node into a HouseItem.

    district/subDistrict/title come from anchors on the node itself; price
    and size fields are numeric (NaN when the text cannot be parsed).
    Failures inside the try block are logged and the partial item returned.
    """

    def text_at(path):
        # Concatenate every text node at *path* and trim the ends.
        return ''.join(one.xpath(path).extract()).strip()

    item = items.HouseItem()
    item['src'] = self.src
    item['district'] = text_at('./div[1]/p[3]/a[1]/text()')
    item['subDistrict'] = text_at('./div[1]/p[3]/a[2]/text()')
    item['title'] = text_at('./div[1]/h4/a/text()')

    href = text_at('./div[1]/h4/a/@href')
    if href:
        # Listing id = last URL segment minus the ".html" suffix (5 chars);
        # '-1' is the sentinel kept when the href cannot be parsed.
        house_id = '-1'
        try:
            house_id = href.split('/')[-1][:-5]
        except Exception as e:
            logging.warning("parseOne Exception %s" % (str(e)))
        item['_id'] = house_id

    try:
        item['totalPrice'] = util.String2Number(text_at('./div[2]/p[1]/span/text()'))
        item['unitPrice'] = util.String2Number(text_at('./div[2]/p[2]/text()'))
        # Deliberately not stripped (matches the original behavior).
        item['community'] = ''.join(one.xpath('./div[1]/p[1]/a/text()').extract())
        item['houseType'] = text_at('./div[1]/p[1]/span[2]/text()')
        if item['houseType'] == '|':
            # Some layouts shift the info fields by one span; fall through.
            item['houseType'] = text_at('./div[1]/p[1]/span[3]/text()')
        item['square'] = util.String2Number(text_at('./div[1]/p[1]/span[4]/text()'))
        if np.isnan(item['square']):
            # Same layout shift for the square field.
            item['square'] = util.String2Number(text_at('./div[1]/p[1]/span[5]/text()'))
        item['level'] = text_at('./div[1]/p[2]/span[1]/text()')
        item['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return item
def parseOne(self, one, district, subDistrict):
    """Parse one listing node into a HouseItem.

    district and subDistrict are supplied by the caller (they come from the
    page being crawled, not from the node). Example result:
        {'_id': '', 'area': '', 'attention': '', 'community': '',
         'crawlDate': datetime.datetime(2019, 8, 24, 0, 0),
         'district': '朝阳', 'level': ')', 'src': 'lj',
         'subDistrict': '通州北苑', 'title': '',
         'totalPrice': nan, 'unitPrice': nan}
    """

    def text_at(path):
        # Concatenate every text node at *path* and trim the ends.
        return ''.join(one.xpath(path).extract()).strip()

    item = items.HouseItem()
    item['src'] = self.src
    item['district'] = district
    item['subDistrict'] = subDistrict
    item['title'] = text_at('.//div[1]/div[1]/a/text()')
    item['_id'] = text_at('.//div[1]/div[1]/a/@data-housecode')
    try:
        unit = util.String2Number(text_at('.//div[1]/div[6]/div[2]/span/text()'))
        if np.isnan(unit):
            # Alternate page layout shifts the price block by one div, e.g.
            # https://sh.lianjia.com/ershoufang/changning/pg96/
            item['unitPrice'] = util.String2Number(
                text_at('.//div[1]/div[7]/div[2]/span/text()'))
            item['totalPrice'] = util.String2Number(
                text_at('.//div[1]/div[7]/div[1]/span/text()'))
        else:
            item['unitPrice'] = unit
            item['totalPrice'] = util.String2Number(
                text_at('.//div[1]/div[6]/div[1]/span/text()'))

        # Deliberately not stripped (matches the original behavior).
        item['community'] = ''.join(
            one.xpath('.//div[1]/div[2]/div/a/text()').extract())

        # '|'-separated house info: field 1 → houseType, field 2 → square.
        info = ''.join(
            one.xpath('.//div[1]/div[2]/div/text()').extract()).split('|')
        if len(info) > 1:
            item['houseType'] = info[1].strip()
        if len(info) > 2:
            item['square'] = util.String2Number(info[2].strip())

        # was: '/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[3]/div/a'
        item['area'] = util.ExtractString(one, './/div[1]/div[3]/div/a/text()')

        # Position text split on ')': the ')' consumed by the split is
        # re-appended to the level part.
        pos = ''.join(
            one.xpath('.//div[1]/div[3]/div/text()').extract()).split(')')
        if len(pos) > 0:
            item['level'] = pos[0].strip() + ')'
        if len(pos) > 1:
            item['structure'] = pos[1].strip()

        # '/'-separated follow info: attention / follow / release.
        follow = ''.join(
            one.xpath('.//div[1]/div[4]/text()').extract()).split('/')
        if len(follow) > 0:
            item['attention'] = follow[0].strip()
        if len(follow) > 1:
            item['follow'] = follow[1].strip()
        if len(follow) > 2:
            item['release'] = follow[2].strip()

        item['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return item