예제 #1
0
 def parseBlock(self, response):
   """Parse a housing-block (community) summary page into a BlockItem.

   Extracts name, block, price, sellCounter, traded and lookCounter via
   the xpaths configured on self.xpath. Extraction failures are handled
   best-effort: the partially filled item is still returned, and
   crawlDate is always set.
   """
   oneOut = items.BlockItem()
   try:
     oneOut['name'] = util.ExtractString(response, self.xpath['name'])
     oneOut['block'] = util.ExtractString(response, self.xpath['block'])
     oneOut['price'] = util.ExtractNumber(response, self.xpath['price'])
     oneOut['sellCounter'] = util.ExtractNumber(response, self.xpath['sellCounter'])
     oneOut['traded'] = util.ExtractNumber(response, self.xpath['traded'])
     oneOut['lookCounter'] = util.ExtractNumber(response, self.xpath['lookCounter'])
   except Exception as e:
     # Keep the original best-effort print, but also log like every other
     # parse method in this project so failures show up in crawl logs.
     print(e)
     logging.warning("parseBlock Exception %s" % (str(e)))
   oneOut['crawlDate'] = util.today()
   return oneOut
예제 #2
0
    def parse(self, response):
      """Dispatch callback for block-summary / list / detail pages.

      Routing is driven by response.meta:
        - no 'block' key: treat this as a block (community) page — parse
          its summary, then yield one step-2 Request per sibling list
          page, carrying the block name in meta;
        - 'block' present: reuse the block name from meta.
      Every listing row on the current page yields a step-3 detail
      Request with its housecode in meta['hc']. When this response itself
      is a step-3 detail page, parse it and yield the item only if an
      _id was extracted.
      """
      self.received.add(response.url)

      # districts = self.parseDistricts(response)
      # realOut = set(districts) - self.received
      # for one in realOut:
      #   yield Request(one, meta={'step': 0})
      #
      # subDistricts = self.parseSubDistricts(response)
      # realOut = set(subDistricts) - self.received
      # for one in realOut:
      #   yield Request(one, meta={'step': 1, 'url': one})

      blockName = None
      if 'block' not in response.meta:
        # Summary info for this housing block (community)
        block = self.parseBlock(response)
        blockName = block['name']
        # yield block
        # All sibling list pages of this block
        nextPage = self.nextPage(response, self.head, None)
        realOut = set(nextPage) - self.received
        for one in realOut:
          # one Request per page of the paginated listing
          yield Request(one, meta={'step': 2, 'block': block['name']})
      else:
        blockName = response.meta['block']

      ones = response.xpath(self.xpath['lists'])
      for one in ones:
        # one Request per listing row on this page
        try:
          url = util.ExtractString(one, './/div[1]/div[1]/a/@href')
          housecode = util.ExtractString(one, './/div[1]/div[1]/a/@data-housecode')
          yield Request(url, meta={'step': 3, 'block': blockName, 'hc': housecode})
        except Exception as e:
          print(e)


      if 'step' in response.meta and response.meta['step'] ==3:
        oneOut = self.parseOne(response, blockName, response.meta['hc'])
        if len(oneOut['_id']):
          yield oneOut
예제 #3
0
    def parseOne(self, one, index):
        """Parse one ranked-list row into a TopListItem.

        Args:
            one: selector for a single row (<li>-level node).
            index: rank position, recorded verbatim in the item.

        Returns:
            TopListItem. Fields that fail to extract are left unset and
            the error is printed and logged. NOTE(review): '_id' is only
            set when the href contains an 'id=' parameter — callers that
            read oneOut['_id'] unconditionally should verify this.
        """
        oneOut = items.TopListItem()

        try:
            oneOut['index'] = index
            oneOut['name'] = util.ExtractString(one, './/ul/li[3]/a/@title')
            oneOut['href'] = util.ExtractString(one, './/ul/li[3]/a/@href')
            if len(oneOut['href']):
                # Use a dedicated local for the search position; the
                # original reused (shadowed) the 'index' parameter here,
                # which was misleading even though the parameter had
                # already been consumed.
                idPos = oneOut['href'].find('id=')
                if idPos != -1:
                    # _id is the value after 'id=' plus a '_zb' suffix.
                    oneOut['_id'] = oneOut['href'][idPos + 3:] + '_zb'
            oneOut['kind'] = util.ExtractString(one, './/ul/li[4]/text()')
            oneOut['glgzhs'] = util.ExtractNumber(one, './/ul/li[5]/text()')
            oneOut['gshydl'] = util.ExtractString(one, './/ul/li[6]/text()')
            oneOut['czzs'] = util.ExtractNumber(one, './/ul/li[7]/text()')
            oneOut['aldzs'] = util.ExtractNumber(one,
                                                 './/ul/li[8]/span/text()')

        except Exception as e:
            print(e)
            logging.warning("parseOne Exception %s" % (str(e)))
        return oneOut
예제 #4
0
    def nextPagePlusOne(self, response, url):
      """Build the list of follow-up page URLs relative to *url*.

      When the pager text is the "next page" label, each extracted href
      fragment is appended to *url*. Otherwise every entry of the full
      page list is used, percent-quoted, with an explicit trailing '/'
      appended (the original url carries an extra '/' which otherwise
      breaks matching).
      """
      pagerLabel = ''.join(
          response.xpath(self.xpath['nextPageText']).extract()).strip()
      if pagerLabel == '下一页':
        return [url + tail
                for tail in response.xpath(self.xpath['nextPage']).extract()]
      # The framework already de-duplicates urls, so no de-dup here.
      return [
          url + urllib.parse.quote(util.ExtractString(node, './/@href')) + '/'
          for node in response.xpath(self.xpath['allPage'])
      ]
예제 #5
0
    def parse(self, response):
        """Parse a detail page into a DetailItem and yield it.

        _id is the value of the url's 'id=' query parameter; logo and
        the detail text come from fixed xpaths; imageList collects the
        src of every image under the detail gallery list.
        """
        self.received.add(response.url)
        print("receive data...  " + response.url)

        one = items.DetailItem()
        index = response.url.find('id=')
        if index != -1:
            index2 = response.url.find('&', index + 3)
            # BUGFIX: the original re-tested `index` (always != -1 on
            # this path) instead of `index2`; when 'id=' was the last
            # query parameter, index2 == -1 and the slice
            # [index + 3:-1] silently dropped the id's final character.
            if index2 != -1:
                one['_id'] = response.url[index + 3:index2]
            else:
                one['_id'] = response.url[index + 3:]

        one['logo'] = util.ExtractString(response, '//*[@id="logo"]/@src')
        one['detail'] = util.ExtractString(
            response,
            '//*[@id="detail_page"]/div/div[2]/div[2]/div[5]/div[4]/text()'
        ).strip()
        array = response.xpath(
            '//*[@id="detail_page"]/div/div[2]/div[2]/div[5]/div[2]/ul/li')
        tmpArray = []
        for tmp in array:
            t = util.ExtractString(tmp, './/img/@src')
            tmpArray.append(t)
        one['imageList'] = tmpArray
        yield one
예제 #6
0
    def parseOne(self, one, district, subDistrict):
        """Parse one listing row (<li> selector) into a HouseItem.

        district/subDistrict are carried through from the caller. Prices
        are read from the div[6] column; when the unit price parses as
        NaN the alternate div[7] layout is used instead. Extraction
        errors are printed and logged; a partially filled item is still
        returned (crawlDate is only set when no exception fired first).
        """
        # Example of a (partially empty) item this method produces:
        # {'_id': '',
        #  'area': '',
        #  'attention': '',
        #  'community': '',
        #  'crawlDate': datetime.datetime(2019, 8, 24, 0, 0),
        #  'district': '朝阳',
        #  'level': ')',
        #  'src': 'lj',
        #  'subDistrict': '通州北苑',
        #  'title': '',
        #  'totalPrice': nan,
        #  'unitPrice': nan}
        oneOut = items.HouseItem()
        oneOut['src'] = self.src
        oneOut['district'] = district
        oneOut['subDistrict'] = subDistrict
        oneOut['title'] = ''.join(
            one.xpath('.//div[1]/div[1]/a/text()').extract()).strip()
        oneOut['_id'] = ''.join(
            one.xpath('.//div[1]/div[1]/a/@data-housecode').extract()).strip()
        try:
            unitPrice = util.String2Number(''.join(
                one.xpath(
                    './/div[1]/div[6]/div[2]/span/text()').extract()).strip())
            if not np.isnan(unitPrice):
                oneOut['unitPrice'] = unitPrice
                oneOut['totalPrice'] = util.String2Number(''.join(
                    one.xpath('.//div[1]/div[6]/div[1]/span/text()').extract()
                ).strip())
            else:
                # Fallback for the alternate page layout where prices
                # live in div[7] instead of div[6], e.g.:
                # https://sh.lianjia.com/ershoufang/changning/pg96/
                oneOut['unitPrice'] = util.String2Number(''.join(
                    one.xpath('.//div[1]/div[7]/div[2]/span/text()').extract()
                ).strip())
                oneOut['totalPrice'] = util.String2Number(''.join(
                    one.xpath('.//div[1]/div[7]/div[1]/span/text()').extract()
                ).strip())

            oneOut['community'] = ''.join(
                one.xpath('.//div[1]/div[2]/div/a/text()').extract())
            # houseInfo is '|'-separated; field names below suggest type
            # at [1] and square at [2] — assumed from the site layout.
            houseInfo = ''.join(
                one.xpath('.//div[1]/div[2]/div/text()').extract())
            houseInfo = houseInfo.split('|')
            if len(houseInfo) > 1:
                oneOut['houseType'] = houseInfo[1].strip()
                if len(houseInfo) > 2:
                    oneOut['square'] = util.String2Number(houseInfo[2].strip())

            # full xpath of the area link for reference:
            # '/html/body/div[4]/div[1]/ul/li[1]/div[1]/div[3]/div/a'
            oneOut['area'] = util.ExtractString(
                one, './/div[1]/div[3]/div/a/text()')
            # positionInfo is split on ')'; the ')' is re-appended so the
            # stored level keeps its original "...)" text.
            positionInfo = ''.join(
                one.xpath('.//div[1]/div[3]/div/text()').extract())
            positionInfo = positionInfo.split(')')
            if len(positionInfo) > 0:
                oneOut['level'] = positionInfo[0].strip() + ')'
                if len(positionInfo) > 1:
                    oneOut['structure'] = positionInfo[1].strip()

            # followInfo is '/'-separated: attention / follow / release.
            followInfo = ''.join(
                one.xpath('.//div[1]/div[4]/text()').extract())
            followInfo = followInfo.split('/')
            if len(followInfo) > 0:
                oneOut['attention'] = followInfo[0].strip()
                if len(followInfo) > 1:
                    oneOut['follow'] = followInfo[1].strip()
                    if len(followInfo) > 2:
                        oneOut['release'] = followInfo[2].strip()

            oneOut['crawlDate'] = util.today()

        except Exception as e:
            print(e)
            logging.warning("parseOne Exception %s" % (str(e)))
        return oneOut
예제 #7
0
    def parseOne(self, one, block, housecode):
      """Parse a house detail page into a HouseItem2.

      Args:
          one: response/selector for the detail page.
          block: community name from the caller; when empty, the name is
              read from the page itself.
          housecode: Lianjia house code; stored as houseID, and with a
              crawl-date suffix as the item _id.

      Returns:
          HouseItem2. Extraction failures are printed and logged and a
          partially filled item is returned.
      """
      item = items.HouseItem2()
      item['src'] = self.src

      try:
        # /html/body/div[3]/div/div/div[1]/h1
        item['title'] = util.ExtractString(one, '/html/body/div[3]/div/div/div[1]/h1/text()')

        # _id starts as the raw Lianjia code ...
        item['_id'] = housecode
        # ... houseID keeps that raw code ...
        item['houseID'] = item['_id']
        # ... and _id then gets the crawl-date suffix, so re-crawling the
        # same house on another day yields a distinct record.
        item['_id'] += '_' + util.todayString()

        item['unitPrice'] = util.ExtractNumber(one, '/html/body/div[5]/div[2]/div[4]/div[1]/div[1]/span')
        item['totalPrice'] = util.ExtractNumber(one, '/html/body/div[5]/div[2]/div[4]/span[1]')

        # Prefer the community name handed in by the caller; otherwise
        # fall back to the on-page link text.
        item['community'] = block if len(block) else util.ExtractString(one, '/html/body/div[5]/div[2]/div[6]/div[1]/a[1]/text()')

        item['houseType'] = util.ExtractString(one, '/html/body/div[5]/div[2]/div[5]/div[1]/div[1]/text()')
        item['square'] = util.ExtractNumber(one, '/html/body/div[5]/div[2]/div[5]/div[3]/div[1]')

        item['level'] = util.ExtractString(one, '/html/body/div[5]/div[2]/div[5]/div[1]/div[2]/text()')
        item['structure'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[1]/text()')

        item['thb'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[10]/text()')
        item['lx'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[6]/text()')

        item['heating'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[11]/text()')
        item['property'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[1]/div[2]/ul/li[13]/text()')

        item['attention'] = util.ExtractNumber(one, '//*[@id="favCount"]')
        item['follow'] = util.ExtractNumber(one, '//*[@id="cartCount"]')
        item['release'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[1]/span[2]/text()')
        item['lastTrade'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[3]/span[2]/text()')
        item['years'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[5]/span[2]/text()')
        item['mortgage'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[7]/span[2]/text()').strip()

        # /html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[6]/span[2]
        item['ownership'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[2]/span[2]/text()')
        item['use'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[4]/span[2]/text()')
        item['propertyRight'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[6]/span[2]/text()')
        item['book'] = util.ExtractString(one, '/html/body/div[7]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li[8]/span[2]/text()')

        item['crawlDate'] = util.today()

      except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
      return item