def getLatLon(house):
    """Geocode a listing from its city plus district and campus strings.

    Args:
        house: Listing object; its ``city``, ``district``, ``campus`` and
            ``url`` attributes are read.

    Returns:
        Whatever ``Utils.geoSearchWithAddress`` returns on success. On any
        failure the error is logged and the fallback tuple ``(0, 0, '')``
        is returned instead.
    """
    try:
        return Utils.geoSearchWithAddress(house.city,
                                          house.district + house.campus)
    except Exception:
        # Best-effort lookup: a failed geocode (including a missing
        # district/campus that breaks the concatenation) must not abort the
        # crawl. ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        Utils.write_error('%s 赶集网解析经纬度错误 %s' %
                          (Utils.getCurrentTime(), house.url))
        return 0, 0, ''
示例#2
0
    def parseHouse(self, url):
        """Download one 58 listing page, parse it and store it if still fresh.

        Args:
            url: Address of the listing detail page to download.

        Returns:
            ``False`` when the response URL contains ``firewall`` (in that
            case ``self.runSpider`` is also set to ``False``); ``True`` in
            every other case, including download failure and missing pages,
            so that the spider keeps running.
        """
        response = self.downloader.download_html_response(url)
        if response is None:
            return True

        # Best-effort probe of the page heading: a page without an <h1>
        # (IndexError) or one that cannot be queried simply skips the check.
        try:
            heading = response.xpath('//h1/text()').extract()[0].strip()
        except Exception:
            heading = ''
        if re.search(r'不在这个星球上', heading):
            print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('58网页不存在 %s' % url)
            return True

        # A response URL mentioning "firewall" stops the whole spider.
        try:
            blocked = re.search(r'firewall', response.url) is not None
        except Exception:
            blocked = False
        if blocked:
            self.runSpider = False
            return False

        if response.status == 404:
            print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('58网页不存在 %s' % url)
            # Return True like the other "page missing" path above; a bare
            # ``return`` would hand the caller a falsy None.
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # Room count is derived from the house type parsed just above.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # Needs the partially filled listing, so it runs after the fields above.
        house.lat, house.lon = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        # Listings whose date is more than 7 days old are only reported.
        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 7:
            print('%s 58过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True  # keep this spider running
 def getCampus(response):
     """Return the first text node matched by the campus XPath.

     Args:
         response: Page object offering ``xpath(...)`` and ``url``.

     Returns:
         The first extracted string, or ``None`` when extraction fails
         (the failure is printed and written to the error log).
     """
     try:
         return response.xpath(
             '//ul[@class="er-list-two f-clear"]/li/span[@class="content"]/a/text()'
         ).extract()[0]
     except Exception:
         # Typically IndexError when nothing matches. ``Exception`` rather
         # than a bare ``except`` so KeyboardInterrupt / SystemExit propagate.
         print('Error! getCampus()')
         Utils.write_error('%s 赶集网解析小区错误 %s' %
                           (Utils.getCurrentTime(), response.url))
         return None
    def parseHouse(self):
        """Fetch ``self.url``, parse the listing and hand it to ``DB_Manager``.

        The method returns ``None`` on every path. It stops early when the
        page is judged invalid, when a firewall page is met (the spider is
        flagged to stop and the URL is passed to ``RedisUtils.add_to_redis``),
        when the status is 404, or when the listing date is more than
        10 days old.
        """
        # Pick the downloader entry point matching the scheme flag.
        fetch = (self.downloader.download_https_response
                 if self.https else
                 self.downloader.download_html_response)
        response = fetch(self.url)

        if self.isInvalidPage(response):
            return

        if self.isMeetFireWall(response):
            self.runSpider = False
            RedisUtils().add_to_redis(self.name, self.url)
            return

        if response.status == 404:
            missing_fmt = '%s>%s网页不存在 %s'
            print(missing_fmt %
                  (Utils.getCurrentTime(), self.source, self.url))
            Utils.write_error(
                missing_fmt % (Utils.getCurrentTime(), self.source, self.url))
            return

        listing = House()
        listing.url = self.getMobileUrl(response)
        listing.title = self.getTitle(response)
        listing.image_url = self.getImageUrl(response)
        listing.city = self.getCity(response)
        listing.district = self.getDistrict(response)
        listing.rental = self.getRental(response)
        listing.campus = self.getCampus(response)
        listing.date = self.getDate(response)
        listing.address = self.getAddress(response)
        listing.source = self.source
        listing.house_type = self.getHouseType(response)
        # Room count is computed from the house type string set just above.
        listing.rooms = self.getRooms(listing.house_type)
        listing.area = self.getArea(response)
        listing.floor = self.getFloor(response)
        listing.contact = self.getContact(response)
        listing.phone = self.getPhone(response)
        listing.rent_type = self.getRentType(response)
        # Receives the listing itself, so it runs after the fields above.
        listing.lat, listing.lon = self.getLatLon(listing)
        listing.md5 = Utils.generateMD5(listing.url)
        listing.time = Utils.getCurrentTime()

        age_days = (datetime.datetime.now() -
                    datetime.datetime.strptime(listing.date, '%Y-%m-%d')).days
        if age_days > 10:
            print('%s 58过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), listing.date, listing.title,
                   listing.url))
            return

        DB_Manager(listing)
示例#5
0
    def parseHouse(self, url):
        """Download one Fang listing page, parse it and store it if fresh.

        Args:
            url: Address of the listing detail page to download.

        Returns:
            Always ``True``, whether the page was stored, skipped as
            expired or invalid, missing, or could not be downloaded.
        """
        response = self.downloader.download_html_response(url)
        if response is None:
            print('%s 房天下下载网页失败%s' % (Utils.getCurrentTime(), url))
            return True

        # A 404 status and a missing title are reported identically; the
        # title probe only runs when the status is not 404.
        if response.status == 404 or self.titleNotExist(response):
            Utils.write_error('房天下网页不存在 %s' % url)
            print('%s 房天下网页不存在 %s' % (Utils.getCurrentTime(), url))
            return True

        listing = House()
        listing.url = self.getMobileUrl(response)
        listing.title = self.getTitle(response)
        listing.image_url = self.getImageUrl(response)
        listing.city = self.getCity(response)
        listing.district = self.getDistrict(response)
        listing.rental = self.getRental(response)
        listing.campus = self.getCampus(response)
        listing.date = self.getDate(response)
        listing.address = self.getAddress(response)
        listing.source = self.source
        listing.house_type = self.getHouseType(response)
        # Room count is computed from the house type string set just above.
        listing.rooms = self.getRooms(listing.house_type)
        listing.area = self.getArea(response)
        listing.floor = self.getFloor(response)
        listing.contact = self.getContact(response)
        listing.phone = self.getPhone(response)
        listing.rent_type = self.getRentType(response)
        # Receives the listing itself, so it runs after the fields above.
        listing.lat, listing.lon = self.getLatLon(listing)
        listing.md5 = Utils.generateMD5(listing.url)
        listing.time = Utils.getCurrentTime()

        age_days = (datetime.datetime.now() -
                    datetime.datetime.strptime(listing.date, '%Y-%m-%d')).days
        if age_days > 7:
            print('%s 房天下过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), listing.date, listing.title,
                   listing.url))
        elif listing.isValidHouse():
            DB_Manager(listing)
        return True
    def parseHouse(self, url):
        """Download one Anjuke listing page, parse it and store it if fresh.

        Args:
            url: Address of the listing detail page to download.

        Returns:
            ``False`` when the page body contains the access-verification
            marker; in that case ``self.runSpider`` is set to ``False`` and
            the response URL is passed to ``self.redis.add_to_redis``.
            ``True`` in every other case.
        """
        response = self.downloader.download_https_response(url)
        if response is None:
            return True

        # Decode leniently: one invalid byte in the page must not raise
        # UnicodeDecodeError and kill the spider before the check runs.
        page_text = response.body.decode('utf-8', errors='replace')
        if re.search(r'访问验证', page_text):
            self.runSpider = False
            self.redis.add_to_redis("SpiderAnjuke", response.url)
            return False

        if response.status == 404:
            print('%s 安居客网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('安居客网页不存在 %s' % url)
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # Room count is derived from the house type parsed just above.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # getLatLon yields three values here; the third overwrites the
        # address parsed from the page a few lines earlier.
        house.lat, house.lon, house.address = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        # Listings whose date is more than 7 days old are only reported.
        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 7:
            print('%s 安居客过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True