예제 #1
0
 def __init__(self):
     """Open the message-queue channel for ``targetQueue`` and pick a run id."""
     # Build the queue connection wrapper first; the channel is obtained from it.
     connection = rabbitmqConnection()
     self.mqObj = connection
     self.pushChannel = connection.getChannel(targetQueue)
     # Identifier produced by getUUID() for this instance's run.
     self.run_id = getUUID()
예제 #2
0
 def __init__(self):
     """Open the message-queue channel for ``processQueue`` and pick a run id."""
     # Build the queue connection wrapper first; the channel is obtained from it.
     connection = rabbitmqConnection()
     self.mqObj = connection
     self.readChannel = connection.getChannel(processQueue)
     # Identifier produced by getUUID() for this instance's run.
     self.run_id = getUUID()
예제 #3
0
    def __init__(self):
        """Open the queue channel and the MySQL session, then pick a run id."""
        # Build the queue connection wrapper and take the channel for queueName.
        connection = rabbitmqConnection()
        self.mqObj = connection
        self.channel = connection.getChannel(queueName)

        # Session object returned by mysqlConnection(), used for persistence.
        self.mysqlSession = mysqlConnection()
        # Identifier produced by getUUID() for this instance's run.
        self.run_id = getUUID()
예제 #4
0
    def parseErrorLog(self, message):
        """Persist one error report and log a short preview of it.

        Args:
            message: JSON text of an object. The keys ``run_id``,
                ``file_name``, ``error_message`` and ``soup`` are read; a key
                that is absent is stored as ``None``.
        """
        payload = json.loads(message)
        error_message = payload.get('error_message')
        create_time = formatDateTime(0)
        error_id = getUUID()

        # Store the error so it can be counted in the statistics of its run.
        self.mysqlSession.addOne(
            ErrorMessageEntity(id=error_id,
                               run_id=payload.get('run_id'),
                               file_name=payload.get('file_name'),
                               message=error_message,
                               create_time=create_time,
                               soup=payload.get('soup')))

        # The message may be missing (None). Slicing None would raise after
        # the row has already been stored, so log an empty preview instead.
        preview = error_message[0:100] if error_message is not None else ''
        logger.info('已入库,异常id:%s,异常信息:%s' % (error_id, preview))
예제 #5
0
    def parse(self, message):
        """Store one city and queue a message for each district found on its page.

        The city's listing page (``cityOnsaleUrl``) is requested without
        following redirects. The city is recorded through ``regionService``;
        when the response status is 200, every district link on the page is
        recorded as well and a payload describing it is handed to
        ``self.pushResult``.

        Args:
            message: JSON text of an object with the keys ``province_id``,
                ``province_name``, ``city_name``, ``cityUrl`` and
                ``cityOnsaleUrl``.
        """
        payload = json.loads(message)

        province_id = payload.get('province_id')
        province_name = payload.get('province_name')
        city_name = payload.get('city_name')
        cityUrl = payload.get('cityUrl')
        cityOnsaleUrl = payload.get('cityOnsaleUrl')

        logger.info('处理[%s]市,在售Url:[%s]' % (city_name, cityOnsaleUrl))
        # Redirects stay disabled: a city without such a page answers with a
        # 302, and following it would defeat the status check further down.
        status_code, text, _ = myGetRequest(cityOnsaleUrl,
                                            Config.PROXY_FLAG,
                                            allowRedirects=False)
        soup = BeautifulSoup(text, "html.parser")

        # Record the city; addOrInsert hands back a different id when the
        # city was already stored.
        new_city_id = getUUID()
        city_id = regionService.addOrInsert(
            RegionEntity(id=new_city_id,
                         pid=province_id,
                         name=city_name,
                         level=Config.CITY_LEVEL))
        if city_id != new_city_id:
            logger.warning('城市id:[%s],name:[%s],已存在' % (city_id, city_name))
        else:
            logger.info('城市id:[%s],name:[%s],入库成功' % (city_id, city_name))

        if status_code != 200:
            logger.info('[%s]市没有在售数据,url:%s' % (city_name, cityOnsaleUrl))
            return

        # Total number of listings shown on the page.
        totalOnsale = soup.find_all(
            'h2',
            class_='total fl')[0].find_all('span')[0].getText().strip()
        if totalOnsale == '0':
            # The page exists but lists nothing for sale.
            # NOTE(review): districts are still collected below in this case;
            # confirm whether processing should stop here instead.
            logger.warning('[%s]市没有在售数据,url:%s' %
                           (city_name, cityOnsaleUrl))

        # Collect the district links and forward one message per district.
        for area in soup.find_all('a', attrs={'data-click-evtid': '12339'}):
            areaUri = area['href']
            area_name = area.getText().strip()
            areaUrl = cityUrl + areaUri

            # A district is stored under its city and with its own name.
            area_id = regionService.addOrInsert(
                RegionEntity(id=getUUID(),
                             pid=city_id,
                             name=area_name,
                             level=Config.AREA_LEVEL))

            # Payload pushed on for district-level processing.
            self.pushResult({
                'province_id': province_id,
                'province_name': province_name,
                'city_id': city_id,
                'city_name': city_name,
                'cityUrl': cityUrl,
                'area_id': area_id,
                'area_name': area_name,
                'areaUrl': areaUrl
            })
예제 #6
0
    def parse(self, message):
        """Store the towns of one district and queue every listing page of each.

        The district page (``areaUrl``) is fetched and searched for the
        element marked ``data-role="ershoufang"``. When that element holds a
        single ``div`` the district has no town list and is itself treated as
        the town; otherwise each link in the second ``div`` is one town. Every
        town is recorded through ``regionService`` and one payload per result
        page is handed to ``self.pushResult``.

        Args:
            message: JSON text of an object with the keys ``province_id``,
                ``province_name``, ``city_id``, ``city_name``, ``cityUrl``,
                ``area_id``, ``area_name`` and ``areaUrl``.
        """
        payload = json.loads(message)

        province_id = payload.get('province_id')
        province_name = payload.get('province_name')
        city_id = payload.get('city_id')
        city_name = payload.get('city_name')
        cityUrl = payload.get('cityUrl')
        area_id = payload.get('area_id')
        area_name = payload.get('area_name')
        areaUrl = payload.get('areaUrl')

        logger.info('处理[%s]区,url:[%s] ' % (area_name, areaUrl))
        # Open the district page to read its town list.
        _, text, _ = myGetRequest(areaUrl, Config.PROXY_FLAG)

        soup = BeautifulSoup(text, "html.parser")
        containerDivs = soup.find_all(
            attrs={"data-role": "ershoufang"})[0].find_all('div')

        # Fields every pushed payload shares, in the order they are emitted.
        common = {
            'province_id': province_id,
            'province_name': province_name,
            'city_id': city_id,
            'city_name': city_name,
            'cityUrl': cityUrl,
            'area_id': area_id,
            'area_name': area_name
        }

        if len(containerDivs) == 1:
            # No town list on the page: use the district as its only town.
            town_name = area_name
            townUrl = areaUrl
            town_id = self._storeTown(area_id, town_name)
            logger.info('处理[%s]商圈,url:%s ' % (town_name, townUrl))
            self._pushTownPages(common, town_id, town_name, townUrl,
                                self._totalPages(soup))
            return

        # NOTE(review): the listing total is read from the district page, not
        # from each town's own page, so every town gets the same page count;
        # confirm whether the town page should be fetched instead.
        # It is computed once, on the first town, because it cannot change.
        totalPageNum = None
        for link in containerDivs[1].find_all('a'):
            town_name = link.getText()
            townUrl = cityUrl + link['href']
            town_id = self._storeTown(area_id, town_name)

            if totalPageNum is None:
                totalPageNum = self._totalPages(soup)

            logger.info('处理[%s]商圈,url:%s,获取到[%d]页数据 ' % (town_name, townUrl, totalPageNum))
            self._pushTownPages(common, town_id, town_name, townUrl,
                                totalPageNum)

    def _storeTown(self, area_id, town_name):
        """Record a town under its district and return the id addOrInsert gives back."""
        return regionService.addOrInsert(RegionEntity(id=getUUID(),
                                                      pid=area_id,
                                                      name=town_name,
                                                      level=Config.TOWN_LEVEL))

    def _totalPages(self, soup):
        """Return the page count for the listing total displayed in ``soup``."""
        totalOnsaleNum = int(
            soup.find_all('div', class_='total fl')[0].find_all('span')[0].getText().strip())
        return self.getCurrentPages(totalOnsaleNum)

    def _pushTownPages(self, common, town_id, town_name, townUrl, totalPageNum):
        """Push one payload per result page (1..totalPageNum) of a town."""
        # Join with exactly one slash whether or not townUrl ends with one.
        pagePrefix = townUrl.rstrip('/') + '/pg'
        for pageNo in range(1, totalPageNum + 1):
            item = dict(common)
            item.update({
                'town_id': town_id,
                'town_name': town_name,
                'townUrlByPage': pagePrefix + str(pageNo)
            })
            self.pushResult(item)