def getLatLon(house):
     """Geocode a listing from its city, district and campus.

     Args:
         house: Listing object; ``city``, ``district``, ``campus`` and ``url``
             are read from it.

     Returns:
         The result of ``Utils.geoSearchWithAddress`` on success, or the
         fallback tuple ``(0, 0, '')`` when the lookup raises.
     """
     try:
         return Utils.geoSearchWithAddress(house.city,
                                           house.district + house.campus)
     except Exception:
         # Exception (not a bare except) so KeyboardInterrupt and SystemExit
         # can still stop the crawler.
         Utils.write_error('%s 赶集网解析经纬度错误 %s' %
                           (Utils.getCurrentTime(), house.url))
         return 0, 0, ''
 def getCampus(response):
     """Extract the campus (residential compound) name from a detail page.

     Args:
         response: Page object providing ``xpath()`` and ``url``.

     Returns:
         The first text node matched by the XPath, or ``None`` when nothing
         matches or extraction fails (the failure is printed and logged).
     """
     try:
         return response.xpath(
             '//ul[@class="er-list-two f-clear"]/li/span[@class="content"]/a/text()'
         ).extract()[0]
     except Exception:
         # Exception (not a bare except) so KeyboardInterrupt and SystemExit
         # are not swallowed by the parser.
         print('Error! getCampus()')
         Utils.write_error('%s 赶集网解析小区错误 %s' %
                           (Utils.getCurrentTime(), response.url))
         return None
示例#3
0
 def update_house(self, house):
     """Refresh the date, rental and timestamp of an already stored listing.

     Args:
         house: Mapping-like listing; only ``url``, ``rental``, ``date`` and
             ``city`` are read, so a partially populated record can still be
             updated.
     """
     url = house['url']
     rental = house['rental']
     date = house['date']
     city = house['city']
     now = Utils.getCurrentTime()
     table = self.get_city_pinyin_table(city)
     # The table name is concatenated into the statement; the values are
     # handed to execute() separately as parameters.
     sql = "update " + table + " set date= %s, rental = %s, time = %s where url = %s"
     self.cursor.execute(sql, (date, rental, now, url))
     self.conn.commit()
     print(Utils.getCurrentTime(), '在云端数据库中更新房源:' + city, url)
示例#4
0
    def parseHouse(self, url):
        """Download one 58 listing page, parse it into a House and store it.

        Args:
            url: Address of the listing detail page.

        Returns:
            ``False`` when the response URL contains ``firewall`` (the spider
            is also flagged to stop through ``self.runSpider``); ``True`` on
            every other exit, meaning the crawl may continue.
        """
        response = self.downloader.download_html_response(url)
        if response is None:
            return True

        # Best-effort probe of the <h1> text: a page without one is not an
        # error, it simply cannot be the "page does not exist" notice.
        try:
            heading = response.xpath('//h1/text()').extract()[0].strip()
        except Exception:
            heading = ''
        if re.search(r'不在这个星球上', heading):
            print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('58网页不存在 %s' % url)
            return True

        try:
            blocked = re.search(r'firewall', response.url) is not None
        except Exception:
            # response.url may be absent or not a string; treat as not blocked.
            blocked = False
        if blocked:
            self.runSpider = False
            return False

        if response.status == 404:
            print('%s 58网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('58网页不存在 %s' % url)
            # Return True like the other non-firewall exits; a bare return
            # yields None, which is falsy just like the firewall signal.
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # rooms is derived from the house type parsed just above.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # Geocoding reads fields already set on the house, so it runs last.
        house.lat, house.lon = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        timedelta = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        days = timedelta.days
        if days > 7:
            # Listings older than a week are reported and not stored.
            print('%s 58过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True  # keep this spider running
 def crawlSpider(self):
     """Crawl queued URLs one at a time until none remain or the spider stops.

     Each URL from ``self.readUrls()`` is passed to ``self.parseHouse``,
     followed by a random 10-30 second pause. The loop ends when ``readUrls``
     returns ``None`` or when ``self.runSpider`` has become falsy, in which
     case the firewall incident is printed and logged.
     """
     url = self.readUrls()
     while url is not None:
         self.parseHouse(url)
         time.sleep(random.randint(10, 30))
         if not self.runSpider:
             print('%s 遇到赶集网防火墙' % Utils.getCurrentTime())
             Utils.write_firewall_error('%s 遇到赶集网防火墙' %
                                        Utils.getCurrentTime())
             return
         url = self.readUrls()
 def crawl(self):
     """Run forever, starting one parser thread per URL every INTERVAL seconds.

     While ``self.runSpider`` is truthy a URL is read each cycle and handed to
     ``self.parseHouse`` on a new thread; an empty queue or a stopped spider
     is only reported with a printed message.
     """
     while True:
         if not self.runSpider:
             print('%s 赶集网遇到验证码' % Utils.getCurrentTime())
         else:
             url = self.readUrls()
             if url is None:
                 print('%s Redis服务器中已经没有未爬取的%s链接' %
                       (Utils.getCurrentTime(), self.__class__.__name__))
             else:
                 worker = threading.Thread(target=self.parseHouse,
                                           args=(url, ))
                 worker.start()
         time.sleep(INTERVAL)
示例#7
0
    def parseHouse(self, url):
        """Download one 房天下 listing page, parse it and store it.

        Args:
            url: Address of the listing detail page.

        Returns:
            Always ``True``, whether the page was missing, expired or stored.
        """
        response = self.downloader.download_html_response(url)
        if response is None:
            print('%s 房天下下载网页失败%s' % (Utils.getCurrentTime(), url))
            return True

        # A 404 status and a missing title are both treated as "page gone".
        if response.status == 404 or self.titleNotExist(response):
            Utils.write_error('房天下网页不存在 %s' % url)
            print('%s 房天下网页不存在 %s' % (Utils.getCurrentTime(), url))
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # rooms is derived from the house type parsed just above.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # Geocoding reads fields already set on the house, so it runs last.
        house.lat, house.lon = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 7:
            # Listings older than a week are reported and not stored.
            print('%s 房天下过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True
 def getDate(response):
     """Extract the listing date from a detail page.

     Args:
         response: Page object providing ``xpath()``.

     Returns:
         The value of ``Utils.transformDate`` for the first ``N年N月N日`` date
         found in the title block, or ``None`` when extraction fails (an
         error line is printed in that case).
     """
     try:
         raw = response.xpath(
             '//div[@class="mod-title bottomed"]/div/text()').extract()[0]
         date = re.findall(r'\d+年\d+月\d+日', raw)[0]
         return Utils.transformDate(date)
     except Exception:
         # Exception (not a bare except) so KeyboardInterrupt and SystemExit
         # are not swallowed by the parser.
         print('Error!getDate()')
         return None
示例#9
0
 def get_expired_house_count(self):
     """Count rows older than the cutoff across every table in city_list.

     Returns:
         The sum of ``count(*)`` over all tables for rows whose ``date`` is
         before ``Utils.getDeltaDate(days=6)``.
     """
     cutoff = Utils.getDeltaDate(days=6)
     expired_total = 0
     for table in self.city_list:
         query = 'SELECT count(*) FROM %s WHERE date < "%s"' % (table, cutoff)
         self.cursor.execute(query)
         # NOTE(review): commit after a SELECT - presumably ends the read
         # transaction so later queries see fresh rows; confirm.
         self.conn.commit()
         row = self.cursor.fetchone()
         expired_total += row['count(*)']
     return expired_total
示例#10
0
 def requestUrl(self, spider):
     """Ask the scheduling server for a URL and crawl it with ``spider``.

     Args:
         spider: Spider identifier; sent to the server, passed to
             ``self.runSpider`` and used as the key into ``self.status``.

     When the crawl result is falsy the spider is marked as blocked, and
     ``self.status[spider]`` is then replaced by the result of
     ``self.waitFireWallExpire(60 * 60)``.
     """
     # Open a channel to the server.
     channel = grpc.insecure_channel('39.108.51.140:16305')
     # Build the client stub on that channel.
     stub = message_pb2_grpc.SpiderServerStub(channel)
     # Fetch the server's reply.
     response = stub.req(
         Request(host=Utils.getIp(),
                 timestamp=Utils.getTime(),
                 spider=spider))
     if response.url is None or response.url == '':
         return
     # The with-block releases the worker process once the work is done,
     # instead of leaking one pool per call.
     with multiprocessing.Pool(1) as pool:
         # pool.map passes a single argument per item, so the extra `url`
         # argument is bound with partial.
         result = pool.map(partial(self.runSpider, url=response.url),
                           (spider, ))
         if not result[0]:
             self.status[spider] = False
             print("%s is blocked by firewall!" % spider)
             self.status[spider] = pool.map(self.waitFireWallExpire,
                                            (60 * 60, ))[0]
示例#11
0
 def backup_history_record(self):
     """Archive and then delete rows older than the cutoff in every table.

     For each table in ``self.city_list`` the rows whose ``date`` is before
     ``Utils.getDeltaDate(days=7)`` are fetched, passed to
     ``self.write_history_houses`` and then deleted.
     """
     cutoff = Utils.getDeltaDate(days=7)
     select_where = ' WHERE date < "%s"' % (cutoff)
     delete_where = ' WHERE date< "%s"' % (cutoff)
     for table in self.city_list:
         self.cursor.execute('SELECT * FROM ' + table + select_where)
         self.conn.commit()
         expired_rows = self.cursor.fetchall()
         self.write_history_houses(expired_rows)
         self.cursor.execute('DELETE FROM ' + table + delete_where)
         self.conn.commit()
示例#12
0
 def write_house(self, house):
     """Insert a listing into its city table, or refresh it if it exists.

     Args:
         house: Mapping-like listing record; every field is read with item
             access (``house['title']`` and so on).

     Listings whose ``date`` is more than ``self.expire_days`` days old are
     skipped. On a duplicate key only ``time``, ``date`` and ``rental`` are
     updated.
     """
     # Item access, the same as every other field read below.
     title = house['title']
     campus = house['campus']
     url = house['url']
     image_url = house['image_url']
     rental = house['rental']
     if isinstance(rental, float):
         rental = int(rental)
     area = house['area']
     house_type = house['house_type']
     source = house['source']
     date = house['date']
     rent_type = house['rent_type']
     floor = house['floor']
     address = house['address']
     district = house['district']
     district = self.check_guangzhou_district(district)
     city = house['city']
     lat = house['lat']
     lon = house['lon']
     md5 = Utils.generateMD5(url)
     try:
         rooms = house['rooms']
     except Exception:
         # No stored room count: derive it from the house type instead.
         rooms = Utils.get_rooms(house_type)
     now = Utils.getCurrentTime()
     import datetime
     age = datetime.datetime.now() - datetime.datetime.strptime(
         date, '%Y-%m-%d')
     if age.days > self.expire_days:
         return
     table = self.get_city_pinyin_table(city)
     # The table name is concatenated into the statement; the values are
     # handed to execute() separately as parameters.
     sql = "insert into " + table + "(rental, title, campus, house_type, date, rent_type, area, floor, district, address, url,image_url, source, city, lat, lon, time, rooms, md5) values( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) on duplicate key update time = %s, date = %s, rental = %s"
     self.cursor.execute(
         sql, (rental, title, campus, house_type, date, rent_type, area,
               floor, district, address, url, image_url, source, city, lat,
               lon, now, rooms, md5, now, date, rental))
     self.conn.commit()
     print(Utils.getCurrentTime(), '在云端数据库中写入新房源:' + city, campus, url)
示例#13
0
    def parseHouse(self):
        """Download ``self.url``, parse the listing and store it.

        The HTTPS downloader is used when ``self.https`` is truthy, the plain
        HTML downloader otherwise. Nothing is stored for invalid pages,
        firewall pages (the URL is re-queued and the spider flagged to stop),
        404 responses, or listings more than 10 days old.
        """
        fetch = (self.downloader.download_https_response
                 if self.https else self.downloader.download_html_response)
        response = fetch(self.url)
        if self.isInvalidPage(response):
            return

        if self.isMeetFireWall(response):
            self.runSpider = False
            # Put the URL back so it can be retried later.
            RedisUtils().add_to_redis(self.name, self.url)
            return

        if response.status == 404:
            not_found = '%s>%s网页不存在 %s'
            print(not_found %
                  (Utils.getCurrentTime(), self.source, self.url))
            Utils.write_error(not_found %
                              (Utils.getCurrentTime(), self.source, self.url))
            return

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # rooms is derived from the house type parsed just above.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # Geocoding reads fields already set on the house, so it runs last.
        house.lat, house.lon = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 10:
            # NOTE(review): the message hard-codes "58" while the 404 message
            # uses self.source - confirm whether that is intended.
            print('%s 58过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        else:
            DB_Manager(house)
    def parseHouse(self, url):
        """Download one 安居客 listing page over HTTPS, parse it and store it.

        Args:
            url: Address of the listing detail page.

        Returns:
            ``False`` when the page body contains the access-verification
            marker (the spider is flagged to stop and the response URL is
            re-queued); ``True`` on every other exit.
        """
        response = self.downloader.download_https_response(url)
        if response is None:
            return True

        page_text = response.body.decode('utf-8')
        if re.search(r'访问验证', page_text):
            self.runSpider = False
            self.redis.add_to_redis("SpiderAnjuke", response.url)
            return False

        if response.status == 404:
            print('%s 安居客网页不存在 %s' % (Utils.getCurrentTime(), url))
            Utils.write_error('安居客网页不存在 %s' % url)
            return True

        house = House()
        house.url = self.getMobileUrl(response)
        house.title = self.getTitle(response)
        house.image_url = self.getImageUrl(response)
        house.city = self.getCity(response)
        house.district = self.getDistrict(response)
        house.rental = self.getRental(response)
        house.campus = self.getCampus(response)
        house.date = self.getDate(response)
        house.address = self.getAddress(response)
        house.source = self.source
        house.house_type = self.getHouseType(response)
        # rooms is derived from the house type parsed just above.
        house.rooms = self.getRooms(house.house_type)
        house.area = self.getArea(response)
        house.floor = self.getFloor(response)
        house.contact = self.getContact(response)
        house.phone = self.getPhone(response)
        house.rent_type = self.getRentType(response)
        # The geocoder result also replaces the address parsed from the page.
        house.lat, house.lon, house.address = self.getLatLon(house)
        house.md5 = Utils.generateMD5(house.url)
        house.time = Utils.getCurrentTime()

        age = datetime.datetime.now() - datetime.datetime.strptime(
            house.date, '%Y-%m-%d')
        if age.days > 7:
            # Listings older than a week are reported and not stored.
            print('%s 安居客过期房源:%s;%s;%s' %
                  (Utils.getCurrentTime(), house.date, house.title, house.url))
        elif house.isValidHouse():
            DB_Manager(house)
        return True
示例#15
0
 def getLatLon(house):
     """Geocode a listing, choosing the lookup by whether a district is set.

     With a non-empty ``house.district`` the search uses district + campus
     through ``Utils.geoSearch``; otherwise it uses address + campus through
     ``Utils.geoSearchWithAddress``.
     """
     if house.district != "":
         return Utils.geoSearch(house.city, house.district + house.campus)
     return Utils.geoSearchWithAddress(house.city,
                                       house.address + house.campus)
示例#16
0
 def getDate(response):
     """Return the transformed date taken from the page's house-title block.

     The first text node under ``div.house-title > p`` is stripped and passed
     to ``Utils.transformDate``. An IndexError propagates when nothing matches.
     """
     nodes = response.xpath(
         '//div[@class="house-title"]/p/text()').extract()
     headline = nodes[0].strip()
     return Utils.transformDate(headline)
 def sadd(self, name, url):
     """Add the MD5 of ``url`` to the filter set kept for ``name``.

     Returns:
         Whatever ``self.conn.sadd`` returns for the key
         ``HouseMasterSpider:<name>_filter``.
     """
     filter_key = 'HouseMasterSpider:%s_filter' % name
     digest = Utils.generateMD5(url)
     return self.conn.sadd(filter_key, digest)
示例#18
0
 def getLatLon(house):
     """Geocode a listing with ``Utils.geoSearch`` from its city and the
     concatenated district and campus."""
     city = house.city
     place = house.district + house.campus
     return Utils.geoSearch(city, place)
示例#19
0
 def getRooms(houseType):
     """Return ``Utils.transformHouseType(houseType)`` unchanged."""
     rooms = Utils.transformHouseType(houseType)
     return rooms
 def getDate(response):
     """Return the transformed date taken from the page's card-status list.

     The first text node under ``div.card-status.f-clear > ul > li`` is passed
     to ``Utils.transformDate``. An IndexError propagates when nothing matches.
     """
     texts = response.xpath(
         '//div[@class="card-status f-clear"]/ul/li/text()').extract()
     first = texts[0]
     return Utils.transformDate(first)
示例#21
0
 def getDate(response):
     """Return the transformed ``Y-M-D`` date found in the page's info line.

     The text of the second ``span`` under ``p.gray9.fybh-zf`` is searched for
     a ``digits-digits-digits`` date, which is passed to
     ``Utils.transformDate``. An IndexError propagates when the node or the
     date pattern is missing.
     """
     raw = response.xpath(
         '//p[@class="gray9 fybh-zf"]/span[2]/text()').extract()[0]
     found = re.findall(r'\d+-\d+-\d+', raw)
     day = found[0]
     return Utils.transformDate(day)