def __init__(self):
    # Initialize the message queue connection; targetQueue is assumed to be a
    # module-level constant naming the queue this producer pushes to
    self.mqObj = rabbitmqConnection()
    self.pushChannel = self.mqObj.getChannel(targetQueue)
    # Generate an id for this crawl run
    self.run_id = getUUID()
def __init__(self):
    # Initialize the message queue connection; processQueue is assumed to be a
    # module-level constant naming the queue this consumer reads from
    self.mqObj = rabbitmqConnection()
    self.readChannel = self.mqObj.getChannel(processQueue)
    # Generate an id for this crawl run
    self.run_id = getUUID()
def __init__(self):
    # Initialize the message queue connection
    self.mqObj = rabbitmqConnection()
    self.channel = self.mqObj.getChannel(queueName)
    # Initialize the MySQL session
    self.mysqlSession = mysqlConnection()
    # Generate an id for this crawl run
    self.run_id = getUUID()
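# The constructors above rely on helpers defined elsewhere in the repo
# (rabbitmqConnection, getUUID). A minimal sketch of what they are assumed to
# look like, with pika standing in for the real implementation; the host and
# durability settings are assumptions, not taken from the source.
import uuid

import pika


def getUUID():
    # Hex uuid4 string, assumed to match the ids the entities store
    return uuid.uuid4().hex


class rabbitmqConnection:
    def __init__(self, host='localhost'):
        self.connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=host))

    def getChannel(self, queueName):
        # Declare the queue so the first publish/consume on it cannot fail
        channel = self.connection.channel()
        channel.queue_declare(queue=queueName, durable=True)
        return channel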
def parseErrorLog(self, message):
    payload = json.loads(message)  # renamed from dict to avoid shadowing the builtin
    run_id = payload.get('run_id')
    file_name = payload.get('file_name')
    error_message = payload.get('error_message')
    create_time = formatDateTime(0)
    soup = payload.get('soup')
    error_id = getUUID()
    # Persist the error so it can be counted against this crawl run
    self.mysqlSession.addOne(
        ErrorMessageEntity(id=error_id, run_id=run_id, file_name=file_name,
                           message=error_message, create_time=create_time,
                           soup=soup))
    logger.info('Error persisted, id: %s, message: %s' % (error_id, error_message[0:100]))
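# For reference, a message parseErrorLog can consume; the keys mirror what it
# reads, and every value below is illustrative rather than taken from the source.
import json

sample_error = {
    'run_id': 'e3b0c44298fc1c149afbf4c8996fb924',  # hypothetical run id
    'file_name': 'cityConsumer.py',                # hypothetical source file
    'error_message': 'list index out of range',
    'soup': '<html>...</html>',                    # raw page kept for debugging
}
message = json.dumps(sample_error, ensure_ascii=False)
# consumer.parseErrorLog(message)  # would insert one ErrorMessageEntity row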
def parse(self, message):
    dictObj = json.loads(message)
    province_id = dictObj.get('province_id')
    province_name = dictObj.get('province_name')
    city_name = dictObj.get('city_name')
    cityUrl = dictObj.get('cityUrl')
    cityOnsaleUrl = dictObj.get('cityOnsaleUrl')
    # Xiong'an New Area (雄安新区) and Jiangyin (江阴) are known to have no
    # second-hand listings; the totalOnsale check below covers such cities
    logger.info('Processing city [%s], on-sale url: [%s]' % (city_name, cityOnsaleUrl))
    # Special case: requesting a non-existent listing url such as
    # https://diqing.fang.ke.com/loupan/ answers with a 302 redirect, so
    # redirects are disabled here; otherwise the status check below is wrong.
    # cityOnsaleUrl = 'https://xan.ke.com/ershoufang/'  # debug override, left disabled
    status_code, text, content = myGetRequest(cityOnsaleUrl, Config.PROXY_FLAG,
                                              allowRedirects=False)
    soup = BeautifulSoup(text, "html.parser")
    new_id = getUUID()
    # Insert the city into the database
    city_id = regionService.addOrInsert(
        RegionEntity(id=new_id, pid=province_id, name=city_name, level=Config.CITY_LEVEL))
    if city_id != new_id:
        logger.warning('City id: [%s], name: [%s] already exists' % (city_id, city_name))
    else:
        logger.info('City id: [%s], name: [%s] inserted' % (city_id, city_name))
    if status_code == 200:
        # Total number of on-sale listings reported by the page
        totalOnsale = soup.find_all(
            'h2', class_='total fl')[0].find_all('span')[0].getText().strip()
        if totalOnsale == '0':
            # The page exists but has no on-sale listings: nothing to enqueue
            logger.warning('City [%s] has no on-sale listings, url: %s' % (city_name, cityOnsaleUrl))
            return
        # On-sale listings exist: scrape the district links
        areaSoup = soup.find_all('a', attrs={'data-click-evtid': '12339'})
        for area in areaSoup:
            areaUri = area['href']
            area_name = area.getText().strip()
            areaUrl = cityUrl + areaUri
            new_id = getUUID()
            # Insert the district, with the city as its parent
            area_id = regionService.addOrInsert(
                RegionEntity(id=new_id, pid=city_id, name=area_name, level=Config.AREA_LEVEL))
            # Assemble the payload and push it to the district queue
            payload = {
                'province_id': province_id,
                'province_name': province_name,
                'city_id': city_id,
                'city_name': city_name,
                'cityUrl': cityUrl,
                'area_id': area_id,
                'area_name': area_name,
                'areaUrl': areaUrl
            }
            self.pushResult(payload)
    else:
        logger.warning('Request for city [%s] failed with status [%s], url: %s'
                       % (city_name, status_code, cityOnsaleUrl))
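# pushResult is used by both parsers but not shown in this section. A minimal
# sketch, assuming it JSON-encodes the payload and publishes it on the channel
# opened in the constructor; targetQueue is the same assumed module constant.
import json


def pushResult(self, payload):
    self.pushChannel.basic_publish(
        exchange='',
        routing_key=targetQueue,
        body=json.dumps(payload, ensure_ascii=False))
    logger.info('Pushed message to [%s]' % targetQueue)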
def parse(self, message):
    dictObj = json.loads(message)
    province_id = dictObj.get('province_id')
    province_name = dictObj.get('province_name')
    city_id = dictObj.get('city_id')
    city_name = dictObj.get('city_name')
    cityUrl = dictObj.get('cityUrl')
    area_id = dictObj.get('area_id')
    area_name = dictObj.get('area_name')
    areaUrl = dictObj.get('areaUrl')
    logger.info('Processing district [%s], url: [%s]' % (area_name, areaUrl))
    # Open the district url and scrape its sub-district (商圈) links
    status_code, text, content = myGetRequest(areaUrl, Config.PROXY_FLAG)
    soup = BeautifulSoup(text, "html.parser")
    townSoupTemp = soup.find_all(attrs={"data-role": "ershoufang"})
    if len(townSoupTemp[0].find_all('div')) == 1:
        # No sub-districts under this district: treat the district itself as one
        town_name = area_name
        new_id = getUUID()
        # Insert the sub-district, with the district as its parent
        town_id = regionService.addOrInsert(
            RegionEntity(id=new_id, pid=area_id, name=town_name, level=Config.TOWN_LEVEL))
        townUrl = areaUrl
        logger.info('Processing sub-district [%s], url: %s' % (town_name, townUrl))
        # Listing total on the page, used to compute the page count
        totalOnsaleNum = int(
            soup.find_all('div', class_='total fl')[0].find_all('span')[0].getText().strip())
        totalPageNum = self.getCurrentPages(totalOnsaleNum)
        for x in range(1, totalPageNum + 1):
            townUrlByPage = townUrl.rstrip('/') + '/pg' + str(x)
            # Assemble the payload and push it to the sub-district queue
            payload = {
                'province_id': province_id,
                'province_name': province_name,
                'city_id': city_id,
                'city_name': city_name,
                'cityUrl': cityUrl,
                'area_id': area_id,
                'area_name': area_name,
                'town_id': town_id,
                'town_name': town_name,
                'townUrlByPage': townUrlByPage
            }
            self.pushResult(payload)
    else:
        for t in townSoupTemp[0].find_all('div')[1].find_all('a'):
            town_name = t.getText()
            new_id = getUUID()
            townUri = t['href']
            townUrl = cityUrl + townUri
            # Insert the sub-district, with the district as its parent
            town_id = regionService.addOrInsert(
                RegionEntity(id=new_id, pid=area_id, name=town_name, level=Config.TOWN_LEVEL))
            # Fetch the sub-district page for its own listing total; reusing the
            # district page total here would overcount every sub-district
            townStatus, townText, townContent = myGetRequest(townUrl, Config.PROXY_FLAG)
            townSoup = BeautifulSoup(townText, "html.parser")
            totalOnsaleNum = int(
                townSoup.find_all('div', class_='total fl')[0].find_all('span')[0].getText().strip())
            totalPageNum = self.getCurrentPages(totalOnsaleNum)
            logger.info('Processing sub-district [%s], url: %s, [%d] pages found'
                        % (town_name, townUrl, totalPageNum))
            for x in range(1, totalPageNum + 1):
                townUrlByPage = townUrl.rstrip('/') + '/pg' + str(x)
                # Assemble the payload and push it to the sub-district queue
                payload = {
                    'province_id': province_id,
                    'province_name': province_name,
                    'city_id': city_id,
                    'city_name': city_name,
                    'cityUrl': cityUrl,
                    'area_id': area_id,
                    'area_name': area_name,
                    'town_id': town_id,
                    'town_name': town_name,
                    'townUrlByPage': townUrlByPage
                }
                self.pushResult(payload)
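# getCurrentPages is referenced above but not shown. ke.com's ershoufang lists
# appear to show 30 listings per page, so a ceiling division is assumed here;
# the page size is an assumption, not taken from the source.
import math


def getCurrentPages(self, totalOnsaleNum, pageSize=30):
    # e.g. 92 listings -> 4 pages (pg1..pg4)
    return math.ceil(totalOnsaleNum / pageSize)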