def getLatLon(house):
    """Geocode a house from its city plus district and campus names.

    Returns whatever ``Utils.geoSearchWithAddress`` returns. If the lookup
    raises, the failure is recorded through ``Utils.write_error`` and the
    placeholder ``(0, 0, '')`` is returned instead.
    """
    try:
        return Utils.geoSearchWithAddress(house.city,
                                          house.district + house.campus)
    except Exception:
        # Best effort: one bad geocode must not abort the crawl, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        Utils.write_error('%s 赶集网解析经纬度错误 %s' %
                          (Utils.getCurrentTime(), house.url))
        return 0, 0, ''
def getCampus(response):
    """Extract the campus (residential compound) name from a listing page.

    Returns the first text node matched by the XPath, or ``None`` when
    nothing matches or the extraction fails; the failure is printed and
    recorded through ``Utils.write_error``.
    """
    try:
        return response.xpath(
            '//ul[@class="er-list-two f-clear"]/li/span[@class="content"]/a/text()'
        ).extract()[0]
    except Exception:
        # Typically an IndexError from an empty match; kept broad so that a
        # malformed page never aborts the crawl.
        print('Error! getCampus()')
        Utils.write_error('%s 赶集网解析小区错误 %s' %
                          (Utils.getCurrentTime(), response.url))
        return None
def update_house(self, house):
    """Refresh date, rental and time of an already stored house.

    The row is located by ``house['url']`` in the table returned by
    ``self.get_city_pinyin_table`` for ``house['city']``. The change is
    committed immediately.
    """
    url = house['url']
    rental = house['rental']
    date = house['date']
    city = house['city']
    # Named ``now`` so the ``time`` module is not shadowed inside this method.
    now = Utils.getCurrentTime()
    table = self.get_city_pinyin_table(city)
    # Table names cannot be bound as parameters; all values are.
    sql = "update " + table + " set date= %s, rental = %s, time = %s where url = %s"
    self.cursor.execute(sql, (date, rental, now, url))
    self.conn.commit()
    print(Utils.getCurrentTime(), '在云端数据库中更新房源:' + city, url)
def parseHouse(self, url):
    """Download one 58 listing page, parse it into a House and store it.

    Returns ``True`` when the spider should keep running (including when
    the page could not be downloaded or no longer exists) and ``False``
    when the request ended up on a firewall URL, in which case
    ``self.runSpider`` is also cleared.
    """
    response = self.downloader.download_html_response(url)
    if response is None:
        return True

    # Removed listings are detected by the headline of the placeholder page.
    try:
        headline = response.xpath('//h1/text()').extract()[0].strip()
    except Exception:
        # No <h1> (or an unparsable page): treat as "not the placeholder".
        headline = ''
    if re.search(r'不在这个星球上', headline):
        print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('58网页不存在 %s' % url)
        return True

    # Best-effort firewall detection based on the final URL.
    try:
        blocked = re.search(r'firewall', response.url) is not None
    except Exception:
        blocked = False
    if blocked:
        self.runSpider = False
        return False

    if response.status == 404:
        print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('58网页不存在 %s' % url)
        # A missing page is not a reason to stop: return True like every
        # other "keep going" path instead of a falsy None.
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    house.lat, house.lon = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    # Listings published more than 7 days ago are reported and not stored.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 7:
        print('%s 58过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
    elif house.isValidHouse():
        DB_Manager(house)
    return True  # keep this spider running
def crawlSpider(self):
    """Parse queued URLs one by one until the queue is empty or blocked.

    After each page the spider sleeps a random 10-30 seconds. When
    ``self.runSpider`` has been cleared, the firewall hit is printed and
    recorded through ``Utils.write_firewall_error`` and the loop stops.
    """
    url = self.readUrls()
    while url is not None:
        self.parseHouse(url)
        time.sleep(random.randint(10, 30))
        if not self.runSpider:
            print('%s 遇到赶集网防火墙' % Utils.getCurrentTime())
            Utils.write_firewall_error('%s 遇到赶集网防火墙' %
                                       Utils.getCurrentTime())
            return
        url = self.readUrls()
def crawl(self):
    """Loop forever, starting one parsing thread per URL every INTERVAL seconds.

    While ``self.runSpider`` is false only a captcha notice is printed;
    when no URL is available a notice naming this spider class is printed.
    """
    while True:
        if not self.runSpider:
            print('%s 赶集网遇到验证码' % Utils.getCurrentTime())
        else:
            url = self.readUrls()
            if url is None:
                print('%s Redis服务器中已经没有未爬取的%s链接' %
                      (Utils.getCurrentTime(), self.__class__.__name__))
            else:
                worker = threading.Thread(target=self.parseHouse, args=(url, ))
                worker.start()
        # The pause applies to every iteration, whichever branch ran.
        time.sleep(INTERVAL)
def parseHouse(self, url):
    """Download one Fang listing page, parse it into a House and store it.

    Always returns ``True``. Pages that fail to download, answer 404 or
    have no title (per ``self.titleNotExist``) are reported and skipped.
    """
    response = self.downloader.download_html_response(url)
    if response is None:
        print('%s 房天下下载网页失败%s' % (Utils.getCurrentTime(), url))
        return True

    # Both "missing page" situations are reported identically.
    if response.status == 404 or self.titleNotExist(response):
        Utils.write_error('房天下网页不存在 %s' % url)
        print('%s 房天下网页不存在 %s' % (Utils.getCurrentTime(), url))
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    house.lat, house.lon = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    # Listings published more than 7 days ago are reported and not stored.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 7:
        print('%s 房天下过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
    elif house.isValidHouse():
        DB_Manager(house)
    return True
def getDate(response):
    """Extract the publication date from a listing page.

    The first ``N年N月N日`` date found in the title block is passed to
    ``Utils.transformDate`` and its result returned. Returns ``None``
    (after printing a notice) when the date cannot be extracted.
    """
    try:
        date_ = response.xpath(
            '//div[@class="mod-title bottomed"]/div/text()').extract()[0]
        date = re.findall(r'\d+年\d+月\d+日', date_)[0]
        return Utils.transformDate(date)
    except Exception:
        # Typically an IndexError from an empty match; kept broad so that a
        # malformed page never aborts the crawl.
        print('Error!getDate()')
        return None
def get_expired_house_count(self):
    """Count rows dated before ``Utils.getDeltaDate(days=6)`` across all city tables.

    Runs one ``SELECT count(*)`` per entry of ``self.city_list`` and
    returns the sum of the results.
    """
    cutoff = Utils.getDeltaDate(days=6)
    total = 0
    for table in self.city_list:
        query = 'SELECT count(*) FROM %s WHERE date < "%s"' % (table, cutoff)
        self.cursor.execute(query)
        self.conn.commit()
        row = self.cursor.fetchone()
        total += row['count(*)']
    return total
def requestUrl(self, spider):
    """Fetch one URL for ``spider`` from the scheduling server and crawl it.

    The crawl runs in a single-process pool through ``self.runSpider``.
    When that reports a falsy result, the spider is marked as blocked in
    ``self.status`` and the status is then replaced by the result of
    ``self.waitFireWallExpire(60 * 60)``. Nothing happens when the server
    returns no URL.
    """
    # Open the channel to the scheduling server and release it once the
    # reply has been received.
    with grpc.insecure_channel('39.108.51.140:16305') as channel:
        stub = message_pb2_grpc.SpiderServerStub(channel)
        response = stub.req(
            Request(host=Utils.getIp(),
                    timestamp=Utils.getTime(),
                    spider=spider))

    if response.url is None or response.url == '':
        return

    # The pool is a context manager so its worker process is always reclaimed.
    with multiprocessing.Pool(1) as pool:
        # pool.map passes a single positional argument per call, so every
        # further argument has to be bound up front with partial.
        result = pool.map(partial(self.runSpider, url=response.url),
                          (spider, ))
        if not result[0]:
            self.status[spider] = False
            print("%s is blocked by firewall!" % (spider, ))
            self.status[spider] = pool.map(self.waitFireWallExpire,
                                           (60 * 60, ))[0]
def backup_history_record(self):
    """Archive and then delete rows dated before ``Utils.getDeltaDate(days=7)``.

    For every table in ``self.city_list`` the matching rows are handed to
    ``self.write_history_houses`` and afterwards removed from the table.
    """
    cutoff = Utils.getDeltaDate(days=7)
    for table in self.city_list:
        select_condition = ' WHERE date < "%s"' % (cutoff)
        self.cursor.execute('SELECT * FROM ' + table + select_condition)
        self.conn.commit()
        expired_rows = self.cursor.fetchall()
        self.write_history_houses(expired_rows)

        delete_condition = ' WHERE date< "%s"' % (cutoff)
        self.cursor.execute('DELETE FROM ' + table + delete_condition)
        self.conn.commit()
def write_house(self, house):
    """Insert a house into its city table, or refresh it if already stored.

    Houses whose ``date`` is more than ``self.expire_days`` days in the
    past are skipped. On a duplicate key only ``time``, ``date`` and
    ``rental`` are updated. The change is committed immediately.
    """
    import datetime

    # Item access throughout, matching every other field read below.
    title = house['title']
    campus = house['campus']
    url = house['url']
    image_url = house['image_url']
    rental = house['rental']
    if isinstance(rental, float):
        rental = int(rental)
    area = house['area']
    house_type = house['house_type']
    source = house['source']
    date = house['date']
    rent_type = house['rent_type']
    floor = house['floor']
    address = house['address']
    district = self.check_guangzhou_district(house['district'])
    city = house['city']
    lat = house['lat']
    lon = house['lon']
    md5 = Utils.generateMD5(url)
    try:
        rooms = house['rooms']
    except Exception:
        # Records without a stored room count: derive it from the house type.
        rooms = Utils.get_rooms(house_type)
    # Named ``now`` so the ``time`` module is not shadowed inside this method.
    now = Utils.getCurrentTime()

    age = datetime.datetime.now() - datetime.datetime.strptime(
        date, '%Y-%m-%d')
    if age.days > self.expire_days:
        return

    table = self.get_city_pinyin_table(city)
    # Table names cannot be bound as parameters; all values are.
    sql = ("insert into " + table +
           "(rental, title, campus, house_type, date, rent_type, area, floor, "
           "district, address, url,image_url, source, city, lat, lon, time, "
           "rooms, md5) values( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
           "%s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update "
           "time = %s, date = %s, rental = %s")
    self.cursor.execute(
        sql, (rental, title, campus, house_type, date, rent_type, area, floor,
              district, address, url, image_url, source, city, lat, lon, now,
              rooms, md5, now, date, rental))
    self.conn.commit()
    print(Utils.getCurrentTime(), '在云端数据库中写入新房源:' + city, campus, url)
def parseHouse(self):
    """Download ``self.url``, parse it into a House and store it.

    The HTTPS downloader is used when ``self.https`` is truthy. Invalid
    pages and 404s are skipped; on a firewall hit ``self.runSpider`` is
    cleared and the URL is handed back to Redis under ``self.name``.
    Listings older than 10 days are reported and not stored.
    """
    fetch = (self.downloader.download_https_response
             if self.https else self.downloader.download_html_response)
    response = fetch(self.url)

    if self.isInvalidPage(response):
        return

    if self.isMeetFireWall(response):
        self.runSpider = False
        RedisUtils().add_to_redis(self.name, self.url)
        return

    if response.status == 404:
        missing = '%s>%s网页不存在 %s'
        print(missing % (Utils.getCurrentTime(), self.source, self.url))
        Utils.write_error(missing %
                          (Utils.getCurrentTime(), self.source, self.url))
        return

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    house.lat, house.lon = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 10:
        print('%s 58过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
        return
    DB_Manager(house)
def parseHouse(self, url):
    """Download one Anjuke listing page, parse it into a House and store it.

    Returns ``False`` when the page body contains the access-verification
    marker; in that case ``self.runSpider`` is cleared and the URL is put
    back into Redis. Returns ``True`` in every other case.
    """
    response = self.downloader.download_https_response(url)
    if response is None:
        return True

    page_text = response.body.decode('utf-8')
    if re.search(r'访问验证', page_text):
        self.runSpider = False
        self.redis.add_to_redis("SpiderAnjuke", response.url)
        return False

    if response.status == 404:
        print('%s 安居客网页不存在 %s' % (Utils.getCurrentTime(), url))
        Utils.write_error('安居客网页不存在 %s' % url)
        return True

    house = House()
    house.url = self.getMobileUrl(response)
    house.title = self.getTitle(response)
    house.image_url = self.getImageUrl(response)
    house.city = self.getCity(response)
    house.district = self.getDistrict(response)
    house.rental = self.getRental(response)
    house.campus = self.getCampus(response)
    house.date = self.getDate(response)
    house.address = self.getAddress(response)
    house.source = self.source
    house.house_type = self.getHouseType(response)
    house.rooms = self.getRooms(house.house_type)
    house.area = self.getArea(response)
    house.floor = self.getFloor(response)
    house.contact = self.getContact(response)
    house.phone = self.getPhone(response)
    house.rent_type = self.getRentType(response)
    # The geocoder result also overwrites the scraped address.
    house.lat, house.lon, house.address = self.getLatLon(house)
    house.md5 = Utils.generateMD5(house.url)
    house.time = Utils.getCurrentTime()

    # Listings published more than 7 days ago are reported and not stored.
    age = datetime.datetime.now() - datetime.datetime.strptime(
        house.date, '%Y-%m-%d')
    if age.days > 7:
        print('%s 安居客过期房源:%s;%s;%s' %
              (Utils.getCurrentTime(), house.date, house.title, house.url))
    elif house.isValidHouse():
        DB_Manager(house)
    return True
def getLatLon(house):
    """Geocode a house, preferring its district when one is known.

    With a non-empty district the lookup is ``Utils.geoSearch`` on
    district + campus; otherwise it is ``Utils.geoSearchWithAddress`` on
    address + campus.
    """
    if house.district != "":
        return Utils.geoSearch(house.city, house.district + house.campus)
    return Utils.geoSearchWithAddress(house.city,
                                      house.address + house.campus)
def getDate(response):
    """Return the listing date converted by ``Utils.transformDate``.

    The raw text is the first node matched under the ``house-title`` block,
    with surrounding whitespace stripped. Raises ``IndexError`` when the
    XPath matches nothing.
    """
    matches = response.xpath('//div[@class="house-title"]/p/text()').extract()
    raw_date = matches[0].strip()
    return Utils.transformDate(raw_date)
def sadd(self, name, url):
    """Add the MD5 of ``url`` to the filter set of spider ``name``.

    Returns the result of ``self.conn.sadd`` for the key
    ``HouseMasterSpider:<name>_filter``.
    """
    return self.conn.sadd('HouseMasterSpider:%s_filter' % name,
                          Utils.generateMD5(url))
def getLatLon(house):
    """Geocode a house via ``Utils.geoSearch`` on its city and district + campus."""
    place = house.district + house.campus
    return Utils.geoSearch(house.city, place)
def getRooms(houseType):
    """Return ``Utils.transformHouseType(houseType)`` as the room value."""
    rooms = Utils.transformHouseType(houseType)
    return rooms
def getDate(response):
    """Return the listing date converted by ``Utils.transformDate``.

    The raw text is the first node matched under the ``card-status`` block.
    Raises ``IndexError`` when the XPath matches nothing.
    """
    matches = response.xpath(
        '//div[@class="card-status f-clear"]/ul/li/text()').extract()
    return Utils.transformDate(matches[0])
def getDate(response):
    """Return the listing date converted by ``Utils.transformDate``.

    The first ``N-N-N`` date found in the matched text is used. Raises
    ``IndexError`` when the XPath or the date pattern matches nothing.
    """
    raw_text = response.xpath(
        '//p[@class="gray9 fybh-zf"]/span[2]/text()').extract()[0]
    found = re.findall(r'\d+-\d+-\d+', raw_text)
    return Utils.transformDate(found[0])