示例#1
0
    def parseOne(self, one):
        """Build a ``HouseItem`` from a single listing node.

        Args:
            one: Selector for one listing entry; every XPath below is
                evaluated relative to it.

        Returns:
            The populated ``items.HouseItem``. Extraction of the detail
            fields is best-effort: if anything in that section raises, the
            error is printed and logged and the item is returned with
            whatever fields were set up to that point.
        """
        oneOut = items.HouseItem()
        oneOut['src'] = self.src

        oneOut['title'] = ''.join(
            one.xpath('./div[2]/div[1]/a/text()').extract()).strip()
        href = ''.join(one.xpath('./div[2]/div[1]/a/@href').extract()).strip()
        if href:
            house_id = '-1'
            try:
                # Last path segment without the query string, minus its final
                # five characters (presumably a ".html" suffix -- TODO confirm).
                house_id = href.split('?')[0].split('/')[-1][:-5]
            except Exception as e:
                logging.warning("parseOne Exception %s" % (str(e)))
            oneOut['_id'] = house_id

        try:
            location = ''.join(
                one.xpath('./div[2]/div[3]/span/text()').extract()).strip()
            # NOTE(review): assumes the text looks like
            # "<community> <district>-<subDistrict>" -- confirm against the site.
            tokens = location.split(' ')
            # str.split with an explicit separator always yields at least one
            # element, so the first token can be read unconditionally.
            oneOut['community'] = tokens[0]
            if len(tokens) > 1:
                # Split the second token, not the token list itself: a list
                # has no .split(), so doing so raised AttributeError and the
                # except below skipped every remaining field.
                region = tokens[1].split('-')
                oneOut['district'] = region[0]
                if len(region) > 1:
                    oneOut['subDistrict'] = region[1]

            totalPrice = String2Number(''.join(
                one.xpath('./div[3]/span[1]/strong/text()').extract()).strip())
            oneOut['totalPrice'] = totalPrice
            oneOut['unitPrice'] = String2Number(''.join(
                one.xpath('./div[3]/span[2]/text()').extract()).strip())

            oneOut['houseType'] = ''.join(
                one.xpath('./div[2]/div[2]/span[1]/text()').extract()).strip()
            oneOut['square'] = String2Number(''.join(
                one.xpath('./div[2]/div[2]/span[2]/text()').extract()).strip())
            oneOut['level'] = ''.join(
                one.xpath('./div[2]/div[2]/span[3]/text()').extract()).strip()
            oneOut['structure'] = ''.join(
                one.xpath('./div[2]/div[2]/span[4]/text()').extract()).strip()

            oneOut['crawlDate'] = today()

        except Exception as e:
            # Deliberately best-effort: report the problem but still return
            # the partially filled item.
            print(e)
            logging.warning("parseOne Exception %s" % (str(e)))
        return oneOut
示例#2
0
    def parseOne(self, one):
        """Convert one listing node into a ``HouseItem``.

        Args:
            one: Selector for a single listing entry; all XPaths are relative
                to it.

        Returns:
            The ``items.HouseItem`` built from the node. Price, community,
            house type, square, level and crawl date are filled on a
            best-effort basis: any exception while reading them is printed
            and logged, and the item is returned as it stands.
        """
        def joined(path):
            # Concatenate every fragment matched by the relative XPath.
            return ''.join(one.xpath(path).extract())

        def cleaned(path):
            # Same as joined(), with surrounding whitespace removed.
            return joined(path).strip()

        def number(path):
            # Stripped text run through the project's numeric parser.
            return util.String2Number(cleaned(path))

        record = items.HouseItem()
        record['src'] = self.src
        record['district'] = cleaned('./div[1]/p[3]/a[1]/text()')
        record['subDistrict'] = cleaned('./div[1]/p[3]/a[2]/text()')
        record['title'] = cleaned('./div[1]/h4/a/text()')

        link = cleaned('./div[1]/h4/a/@href')
        if link:
            listing_id = '-1'
            try:
                # Last URL segment minus its final five characters.
                listing_id = link.split('/')[-1][:-5]
            except Exception as e:
                logging.warning("parseOne Exception %s" % (str(e)))
            record['_id'] = listing_id

        try:
            record['totalPrice'] = number('./div[2]/p[1]/span/text()')
            record['unitPrice'] = number('./div[2]/p[2]/text()')

            # Community text is stored without stripping.
            record['community'] = joined('./div[1]/p[1]/a/text()')

            # When the second span holds only the "|" separator, the house
            # type is read from the third span instead.
            record['houseType'] = cleaned('./div[1]/p[1]/span[2]/text()')
            if record['houseType'] == '|':
                record['houseType'] = cleaned('./div[1]/p[1]/span[3]/text()')

            # Same fallback idea for the area: retry with the fifth span
            # when the fourth does not parse to a number.
            record['square'] = number('./div[1]/p[1]/span[4]/text()')
            if np.isnan(record['square']):
                record['square'] = number('./div[1]/p[1]/span[5]/text()')

            record['level'] = cleaned('./div[1]/p[2]/span[1]/text()')

            record['crawlDate'] = util.today()

        except Exception as e:
            print(e)
            logging.warning("parseOne Exception %s" % (str(e)))
        return record
示例#3
0
    def parseOne(self, one, district, subDistrict):
        """Build a ``HouseItem`` from one listing node.

        Args:
            one: Selector for a single listing entry; all XPaths are relative
                to it.
            district: District name stored on the item as-is.
            subDistrict: Sub-district name stored on the item as-is.

        Returns:
            The populated ``items.HouseItem``. Everything after ``_id`` is
            extracted on a best-effort basis: an exception in that section is
            printed and logged, and the item is returned with the fields set
            so far.
        """
        oneOut = items.HouseItem()
        oneOut['src'] = self.src
        oneOut['district'] = district
        oneOut['subDistrict'] = subDistrict
        oneOut['title'] = ''.join(
            one.xpath('.//div[1]/div[1]/a/text()').extract()).strip()
        oneOut['_id'] = ''.join(
            one.xpath('.//div[1]/div[1]/a/@data-housecode').extract()).strip()
        try:
            unitPrice = util.String2Number(''.join(
                one.xpath(
                    './/div[1]/div[6]/div[2]/span/text()').extract()).strip())
            if not np.isnan(unitPrice):
                oneOut['unitPrice'] = unitPrice
                oneOut['totalPrice'] = util.String2Number(''.join(
                    one.xpath('.//div[1]/div[6]/div[1]/span/text()').extract()
                ).strip())
            else:
                # Some pages carry the price block one div further down, e.g.
                # https://sh.lianjia.com/ershoufang/changning/pg96/
                oneOut['unitPrice'] = util.String2Number(''.join(
                    one.xpath('.//div[1]/div[7]/div[2]/span/text()').extract()
                ).strip())
                oneOut['totalPrice'] = util.String2Number(''.join(
                    one.xpath('.//div[1]/div[7]/div[1]/span/text()').extract()
                ).strip())

            oneOut['community'] = ''.join(
                one.xpath('.//div[1]/div[2]/div/a/text()').extract())
            # "|"-separated house info: field 1 is stored as the house type,
            # field 2 is parsed as the area.
            houseInfo = ''.join(
                one.xpath('.//div[1]/div[2]/div/text()').extract()).split('|')
            if len(houseInfo) > 1:
                oneOut['houseType'] = houseInfo[1].strip()
                if len(houseInfo) > 2:
                    oneOut['square'] = util.String2Number(houseInfo[2].strip())

            # Absolute path seen in the browser:
            # '/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[3]/div/a'
            oneOut['area'] = util.ExtractString(
                one, './/div[1]/div[3]/div/a/text()')
            positionParts = ''.join(
                one.xpath('.//div[1]/div[3]/div/text()').extract()).split(')')
            if len(positionParts) > 1:
                # The split consumed a ")" that belongs to the level text,
                # so put it back.
                oneOut['level'] = positionParts[0].strip() + ')'
                oneOut['structure'] = positionParts[1].strip()
            else:
                # No ")" in the text: store it unchanged. Appending ")"
                # unconditionally turned an empty match into a bare ")".
                oneOut['level'] = positionParts[0].strip()

            # "/"-separated follow info: attention / follow / release.
            # split() always yields at least one element, so the first
            # field is always stored (possibly empty).
            followParts = ''.join(
                one.xpath('.//div[1]/div[4]/text()').extract()).split('/')
            oneOut['attention'] = followParts[0].strip()
            if len(followParts) > 1:
                oneOut['follow'] = followParts[1].strip()
                if len(followParts) > 2:
                    oneOut['release'] = followParts[2].strip()

            oneOut['crawlDate'] = util.today()

        except Exception as e:
            # Deliberately best-effort: report the problem but still return
            # the partially filled item.
            print(e)
            logging.warning("parseOne Exception %s" % (str(e)))
        return oneOut