def getLatLon(house):
    """Geocode a listing from its city, district and campus.

    The lookup is delegated to ``Utils.geoSearchWithAddress``, called with
    ``house.city`` and the concatenation ``house.district + house.campus``.

    Args:
        house: listing object exposing ``city``, ``district``, ``campus``
            and ``url`` attributes.

    Returns:
        Whatever ``Utils.geoSearchWithAddress`` returns on success
        (presumably a ``(lat, lon, address)`` triple matching the fallback
        below -- confirm against ``Utils``). On any error the failure is
        logged through ``Utils.write_error`` and ``(0, 0, '')`` is returned.
    """
    try:
        return Utils.geoSearchWithAddress(house.city,
                                          house.district + house.campus)
    except Exception:
        # Best effort: a failed lookup (including a None district/campus
        # that cannot be concatenated) must not abort the crawl, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        Utils.write_error('%s 赶集网解析经纬度错误 %s' %
                          (Utils.getCurrentTime(), house.url))
        return 0, 0, ''
def parseHouse(self, url):
    """Download one 58 listing page, parse it and hand it to ``DB_Manager``.

    Args:
        url: address of the listing page to download.

    Returns:
        ``True`` when the caller should keep this spider running (download
        failed, page missing, listing expired/invalid, or listing handled),
        ``False`` when the response URL points at the firewall page, in
        which case ``self.runSpider`` is also set to ``False``.
    """
    response = self.downloader.download_html_response(url)
    if response is None:
        return True

    # Best-effort detection of the "page no longer exists" headline; a page
    # without an <h1> simply has nothing to match.
    try:
        headline = response.xpath('//h1/text()').extract()[0].strip()
        page_gone = re.search(r'不在这个星球上', headline) is not None
    except Exception:
        page_gone = False
    if page_gone:
        print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('58网页不存在 %s' % url)
        return True

    # Being redirected to a firewall URL means the spider must stop.
    try:
        blocked = re.search(r'firewall', response.url) is not None
    except Exception:
        blocked = False
    if blocked:
        self.runSpider = False
        return False

    if response.status == 404:
        print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('58网页不存在 %s' % url)
        # A missing page is skipped like every other non-fatal case; only
        # the firewall branch reports False.
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    house.lat, house.lon = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    # house.date is expected in YYYY-MM-DD form; listings older than a week
    # are only reported, never stored.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 7:
        print('%s 58过期房源:%s;%s;%s' % (Utils.getCurrentTime(), house.date,
                                         house.title, house.url))
    elif house.isValidHouse():
        DB_Manager(house)
    return True  # keep this spider running
def getCampus(response):
    """Extract the residential-community (小区) name from a listing page.

    Args:
        response: page response exposing ``xpath(...)`` and ``url``.

    Returns:
        The first text node matched by the XPath below, or ``None`` when the
        extraction fails (the failure is printed and logged through
        ``Utils.write_error``).
    """
    try:
        return response.xpath(
            '//ul[@class="er-list-two f-clear"]/li/span[@class="content"]/a/text()'
        ).extract()[0]
    except Exception:
        # Typically an IndexError when the node is absent; process-level
        # signals such as KeyboardInterrupt are allowed to propagate.
        print('Error! getCampus()')
        Utils.write_error('%s 赶集网解析小区错误 %s' %
                          (Utils.getCurrentTime(), response.url))
        return None
def parseHouse(self):
    """Download ``self.url``, parse the listing and hand it to ``DB_Manager``.

    The HTTPS downloader is used when ``self.https`` is truthy, the plain
    HTML downloader otherwise. Parsing stops early (returning ``None``) when
    the page is invalid, when a firewall page is met (``self.runSpider`` is
    cleared and the URL is pushed to redis under ``self.name``), when the
    response status is 404, or when the listing is more than 10 days old.

    Returns:
        Always ``None``.
    """
    fetch = (self.downloader.download_https_response if self.https
             else self.downloader.download_html_response)
    response = fetch(self.url)

    if self.isInvalidPage(response):
        return None

    if self.isMeetFireWall(response):
        # Stop the spider and queue the URL in redis under this spider's name.
        self.runSpider = False
        RedisUtils().add_to_redis(self.name, self.url)
        return None

    if response.status == 404:
        print('%s>%s网页不存在 %s' % (Utils.getCurrentTime(), self.source,
                                    self.url))
        Utils.write_error('%s>%s网页不存在 %s' % (Utils.getCurrentTime(),
                                                self.source, self.url))
        return None

    listing = House()
    # Fields read straight from the page, in the order they are extracted.
    for attr, extract in (
        ('url', self.getMobileUrl),
        ('title', self.getTitle),
        ('image_url', self.getImageUrl),
        ('city', self.getCity),
        ('district', self.getDistrict),
        ('rental', self.getRental),
        ('campus', self.getCampus),
        ('date', self.getDate),
        ('address', self.getAddress),
    ):
        setattr(listing, attr, extract(response))
    listing.source = self.source
    listing.house_type = self.getHouseType(response)
    # The room count is derived from the already parsed house type.
    listing.rooms = self.getRooms(listing.house_type)
    for attr, extract in (
        ('area', self.getArea),
        ('floor', self.getFloor),
        ('contact', self.getContact),
        ('phone', self.getPhone),
        ('rent_type', self.getRentType),
    ):
        setattr(listing, attr, extract(response))
    listing.lat, listing.lon = self.getLatLon(listing)
    listing.md5 = Utils.generateMD5(listing.url)
    listing.time = Utils.getCurrentTime()

    # listing.date is parsed as YYYY-MM-DD.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        listing.date, '%Y-%m-%d')
    if age.days > 10:
        print('%s 58过期房源:%s;%s;%s' % (Utils.getCurrentTime(), listing.date,
                                         listing.title, listing.url))
        return None
    DB_Manager(listing)
def parseHouse(self, url):
    """Download one 房天下 listing page, parse it and hand it to ``DB_Manager``.

    Args:
        url: address of the listing page to download.

    Returns:
        Always ``True``. Pages that fail to download, answer 404 or have no
        title are reported and skipped; listings dated more than 7 days ago
        are only printed; the rest go to ``DB_Manager`` when
        ``isValidHouse()`` accepts them.
    """
    response = self.downloader.download_html_response(url)
    if response is None:
        print('%s 房天下下载网页失败%s' % (Utils.getCurrentTime(), url))
        return True

    # Both "missing page" signals are reported identically; the title probe
    # only runs when the status is not 404.
    if response.status == 404 or self.titleNotExist(response):
        Utils.write_error('房天下网页不存在 %s' % url)
        print('%s 房天下网页不存在 %s' % (Utils.getCurrentTime(), url))
        return True

    listing = House()
    listing.url = self.getMobileUrl(response)
    listing.title = self.getTitle(response)
    listing.image_url = self.getImageUrl(response)
    listing.city = self.getCity(response)
    listing.district = self.getDistrict(response)
    listing.rental = self.getRental(response)
    listing.campus = self.getCampus(response)
    listing.date = self.getDate(response)
    listing.address = self.getAddress(response)
    listing.source = self.source
    listing.house_type = self.getHouseType(response)
    # The room count is derived from the already parsed house type.
    listing.rooms = self.getRooms(listing.house_type)
    listing.area = self.getArea(response)
    listing.floor = self.getFloor(response)
    listing.contact = self.getContact(response)
    listing.phone = self.getPhone(response)
    listing.rent_type = self.getRentType(response)
    listing.lat, listing.lon = self.getLatLon(listing)
    listing.md5 = Utils.generateMD5(listing.url)
    listing.time = Utils.getCurrentTime()

    # listing.date is parsed as YYYY-MM-DD.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        listing.date, '%Y-%m-%d')
    if age.days <= 7:
        if listing.isValidHouse():
            DB_Manager(listing)
    else:
        print('%s 房天下过期房源:%s;%s;%s' % (Utils.getCurrentTime(),
                                           listing.date, listing.title,
                                           listing.url))
    return True
def parseHouse(self, url):
    """Download one 安居客 listing page, parse it and hand it to ``DB_Manager``.

    Args:
        url: address of the listing page to download over HTTPS.

    Returns:
        ``False`` when the page body contains the access-verification
        marker (``self.runSpider`` is cleared and the response URL is pushed
        to redis under ``"SpiderAnjuke"``); ``True`` in every other case.
    """
    response = self.downloader.download_https_response(url)
    if response is None:
        return True

    page_text = response.body.decode('utf-8')
    if re.search(r'访问验证', page_text):
        self.runSpider = False
        self.redis.add_to_redis("SpiderAnjuke", response.url)
        return False

    if response.status == 404:
        print('%s 安居客网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('安居客网页不存在 %s' % url)
        return True

    listing = House()
    # Fields read straight from the page, in the order they are extracted.
    for attr, extract in (
        ('url', self.getMobileUrl),
        ('title', self.getTitle),
        ('image_url', self.getImageUrl),
        ('city', self.getCity),
        ('district', self.getDistrict),
        ('rental', self.getRental),
        ('campus', self.getCampus),
        ('date', self.getDate),
        ('address', self.getAddress),
    ):
        setattr(listing, attr, extract(response))
    listing.source = self.source
    listing.house_type = self.getHouseType(response)
    # The room count is derived from the already parsed house type.
    listing.rooms = self.getRooms(listing.house_type)
    for attr, extract in (
        ('area', self.getArea),
        ('floor', self.getFloor),
        ('contact', self.getContact),
        ('phone', self.getPhone),
        ('rent_type', self.getRentType),
    ):
        setattr(listing, attr, extract(response))
    # The geocoder result replaces the address parsed from the page.
    listing.lat, listing.lon, listing.address = self.getLatLon(listing)
    listing.md5 = Utils.generateMD5(listing.url)
    listing.time = Utils.getCurrentTime()

    # listing.date is parsed as YYYY-MM-DD.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        listing.date, '%Y-%m-%d')
    if age.days <= 7:
        if listing.isValidHouse():
            DB_Manager(listing)
    else:
        print('%s 安居客过期房源:%s;%s;%s' % (Utils.getCurrentTime(),
                                           listing.date, listing.title,
                                           listing.url))
    return True