def parse(self, message):
    """Parse one paginated 成交 (deal) list page and push one record per listing.

    ``message`` is a JSON string carrying the region hierarchy plus the
    paginated town URL (``townUrlByPage``).  Each ``div.info`` card on the
    page yields one payload dict pushed via ``self.pushResult``.
    """
    msg = json.loads(message)
    province_id = msg.get('province_id')
    province_name = msg.get('province_name')
    city_id = msg.get('city_id')
    city_name = msg.get('city_name')
    area_id = msg.get('area_id')
    area_name = msg.get('area_name')
    town_id = msg.get('town_id')
    town_name = msg.get('town_name')
    townUrlByPage = msg.get('townUrlByPage')
    logger.info('处理[%s]市[%s]区,分页url:[%s] ' % (city_name, area_name, townUrlByPage))
    # 打开分页url,抓取信息 (open the paginated url and scrape the listing cards)
    status_code, text, content = myGetRequest(townUrlByPage, Config.PROXY_FLAG)
    soup = BeautifulSoup(text, "html.parser")
    for info in soup.find_all('div', class_='info'):
        # Look the title anchor up once and reuse it for both the text and
        # the detail-page href (the original ran the same selector twice).
        title_anchor = info.find_all('div', class_='title')[0].find_all(
            'a', attrs={'data-click-evtid': '15432'})[0]
        title = title_anchor.getText().strip()
        # Listing titles look like "<community> <rooms> <area>"; the first
        # space-separated token is the community name.
        community_name = title.split(' ')[0]
        unit_price = info.find_all('div', class_='unitPrice')[0].find_all(
            'span', class_='number')[0].getText().strip()
        deal_date = info.find_all('div', class_='dealDate')[0].getText().strip()
        url = title_anchor['href']
        # 组装数据推送到队列 — renamed from ``dict`` (shadowed the builtin)
        payload = {
            'province_id': province_id,
            'province_name': province_name,
            'city_id': city_id,
            'city_name': city_name,
            'area_id': area_id,
            'area_name': area_name,
            'town_id': town_id,
            'town_name': town_name,
            'title': title,
            'deal_date': deal_date,
            'community_name': community_name,
            'unit_price': unit_price,
            'url': url
        }
        # 推送本次结果 (push one record per listing)
        self.pushResult(payload)
def getBDLocation(self, address, output='json'):
    """Geocode *address* via the Baidu Maps geocoding v3 API.

    Returns ``(lng, lat)`` as :class:`~decimal.Decimal` values quantized to
    six decimal places.

    Raises ``ValueError`` when the API reports a non-zero status.  The
    original logged "告警,退出" but never actually exited, so execution fell
    through to an uninformative ``KeyError`` on ``resultJson['result']``.
    """
    # NOTE(review): API key is hard-coded in source — move to configuration.
    ak = 'BQ5ngn47h1qhvnV9lAWh1NNPZaS4vTrW'
    url = '''http://api.map.baidu.com/geocoding/v3/?address=%s&output=%s&ak=%s''' % (address, output, ak)
    status_code, text, content = myGetRequest(url, 0)
    resultJson = json.loads(text)
    status = resultJson['status']
    if status != 0:
        # 告警,退出 — abort explicitly instead of falling through
        logger.error('获取经纬度出现异常,异常代码:[%s]' % str(status))
        raise ValueError('baidu geocoding failed, status=%s' % status)
    lng = resultJson['result']['location']['lng']
    lat = resultJson['result']['location']['lat']
    # 手工截取小数点后六位 (quantize to six decimal places)
    lng = Decimal(lng).quantize(Decimal('0.000000'))
    lat = Decimal(lat).quantize(Decimal('0.000000'))
    return lng, lat
def _li_to_dict(self, liSoup):
    """Turn ``<li><span>key</span>value</li>`` nodes into ``{key: value}``.

    The span holds the label; removing it leaves only the value text.
    (This logic was duplicated verbatim for the 基本属性 and 交易属性 tables.)
    """
    mapping = {}
    for li in liSoup:
        key = li.find_all('span')[0].getText()
        # 去除span标签 (drop the label spans so only the value remains)
        for s in li("span"):
            s.extract()
        mapping[key] = li.getText().strip()
    return mapping

def parse(self, message):
    """Parse one 成交 (deal) detail page and persist an OnsaleEntity record.

    ``message`` is a JSON string carrying the region hierarchy plus the
    listing summary fields scraped from the list page.  Raises IndexError
    when the page layout does not match the expected attribute counts.
    """
    msg = json.loads(message)
    province_id = msg.get('province_id')
    province_name = msg.get('province_name')
    city_id = msg.get('city_id')
    city_name = msg.get('city_name')
    area_id = msg.get('area_id')
    area_name = msg.get('area_name')
    town_id = msg.get('town_id')
    town_name = msg.get('town_name')
    title = msg.get('title')
    deal_date = msg.get('deal_date')
    community_name = msg.get('community_name')
    unit_price = msg.get('unit_price')
    url = msg.get('url')
    status_code, text, content = myGetRequest(url, Config.PROXY_FLAG)
    logger.info('开始处理[%s],url:%s' % (community_name, url))
    soup = BeautifulSoup(text, "html.parser")
    infoSoup = soup.find_all('div', class_='info fr')[0]
    deal_price = infoSoup.find_all(
        'span', class_='dealTotalPrice')[0].getText().strip()
    # The msg block holds four labels: 挂牌价 / 成交周期 / 调价 / 带看
    infoMsgSoup = infoSoup.find_all('div', class_='msg')[0].find_all('label')
    list_price = infoMsgSoup[0].getText().strip()
    cost_days = infoMsgSoup[1].getText().strip()
    reprice = infoMsgSoup[2].getText().strip()
    visit_num = infoMsgSoup[3].getText().strip()
    # 基本属性 (basic attributes) — the layout must expose exactly 14 rows
    baseSoup = soup.find_all('div', class_='base')[0].find_all('li')
    if len(baseSoup) != 14:
        raise IndexError('expected 14 base attributes, got %d' % len(baseSoup))
    baseDict = self._li_to_dict(baseSoup)
    rooms = baseDict['房屋户型']
    floor_info = baseDict['所在楼层']
    floor_area = baseDict['建筑面积']
    house_structure = baseDict['户型结构']
    actual_area = baseDict['套内面积']
    building_type = baseDict['建筑类型']
    north = baseDict['房屋朝向']
    build_year = baseDict['建成年代']
    decorate_type = baseDict['装修情况']
    building_structure = baseDict['建筑结构']
    hot = baseDict['供暖方式']
    elevator_rate = baseDict['梯户比例']
    property_limit = baseDict['产权年限']
    backup_elevator = baseDict['配备电梯']
    # 交易属性 (transaction attributes) — expect exactly 6 rows
    transSoup = soup.find_all('div', class_='transaction')[0].find_all('li')
    if len(transSoup) != 6:
        raise IndexError('expected 6 transaction attributes, got %d' % len(transSoup))
    transDict = self._li_to_dict(transSoup)
    bk_id = transDict['链家编号']
    deal_belong = transDict['交易权属']
    list_date = transDict['挂牌时间']
    house_usage = transDict['房屋用途']
    house_age = transDict['房屋年限']
    house_belong = transDict['房权所属']
    create_time = formatDateTime(0)
    # 经纬度 (geocode via Baidu using city + area + community as the query)
    lng, lat = locationTool.getBDLocation(city_name + area_name + community_name)
    # 入库 — renamed from ``id`` (shadowed the builtin); addOrInsert returns
    # the pre-existing row's id on conflict, which we compare against below
    record_id = getUUID()
    chengjiaoId = chengjiaoService.addOrInsert(
        OnsaleEntity(id=record_id, bk_id=bk_id, province_id=province_id,
                     province_name=province_name, city_id=city_id,
                     city_name=city_name, area_id=area_id,
                     area_name=area_name, town_id=town_id,
                     town_name=town_name, title=title,
                     community_name=community_name, create_time=create_time,
                     unit_price=unit_price, list_price=list_price,
                     deal_price=deal_price, reprice=reprice,
                     cost_days=cost_days, visit_num=visit_num,
                     deal_date=deal_date, list_date=list_date, rooms=rooms,
                     floor_info=floor_info, floor_area=floor_area,
                     actual_area=actual_area, house_structure=house_structure,
                     building_type=building_type,
                     building_structure=building_structure,
                     build_year=build_year, decorate_type=decorate_type,
                     north=north, hot=hot, elevator_rate=elevator_rate,
                     property_limit=property_limit,
                     backup_elevator=backup_elevator, house_age=house_age,
                     deal_belong=deal_belong, house_usage=house_usage,
                     house_belong=house_belong, lng=lng, lat=lat, url=url))
    if chengjiaoId != record_id:
        logger.debug('[%s]省,[%s]市,[%s]区,[%s]商圈,[%s]楼盘,在售id:[%s],已存在' % (
            province_name, city_name, area_name, town_name, community_name, chengjiaoId))
    else:
        logger.info('[%s]省,[%s]市,[%s]区,[%s]商圈,[%s]楼盘,在售id:[%s],入库成功' % (
            province_name, city_name, area_name, town_name, community_name, chengjiaoId))
def run(self):
    """Crawl the all-cities index page and push one message per city.

    Iterates the province blocks on ``allCityPageUrl``; for each city a
    payload with the province id/name and the city's 成交 url is pushed to
    the downstream queue.  Deliberately stops after the first province
    (``exit()``) because the full dataset is large.  Any exception is
    serialized and pushed to the error queue.
    """
    try:
        status_code, text, content = myGetRequest(allCityPageUrl, Config.PROXY_FLAG)
        soup = BeautifulSoup(text, "html.parser")
        for p in soup.find_all('div', class_="city_province"):
            province_name = p.find_all(
                'div', class_="city_list_tit c_b")[0].getText().strip()
            province_id = regionService.getIdByNameLevel(
                province_name, Config.PROVINCE_LEVEL)
            # The listing ends with foreign cities; '美国' marks the point
            # where domestic provinces are exhausted.
            if province_name == '美国':
                break
            logger.info('处理[%s]省' % (province_name))
            # 处理城市数据 (push one message per city in this province)
            for c in p.find_all('li', class_="CLICKDATA"):
                city_name = c.getText().strip()
                cityUrl = 'https:' + c.find_all('a')[0]['href']
                logger.info('处理[%s]市,cityUrl:[%s]' % (city_name, cityUrl))
                cityOnsaleUrl = cityUrl + chengjiaoUri
                # 组装数据,推送到城市队列 — renamed from ``dict`` (builtin)
                payload = {
                    'province_id': province_id,
                    'province_name': province_name,
                    'city_name': city_name,
                    'cityUrl': cityUrl,
                    'cityOnsaleUrl': cityOnsaleUrl
                }
                self.pushResult(payload)
            # 由于在售数据量比较大,先只获取各省会城市的数据
            # (SystemExit is intentionally NOT caught by the handler below)
            exit()
    except Exception:
        # 打印异常 then 推送异常 to the error queue for later inspection
        traceback.print_exc()
        error_payload = {
            'run_id': self.run_id,
            'file_name': currentFileName,
            'error_message': traceback.format_exc(),
            'soup': ''
        }
        self.mqObj.pushMessageToMq(Config.ERROR_QUEUE, json.dumps(error_payload))
def parse(self, message):
    """Handle one city message: persist the city, then push one message per 区.

    ``message`` is a JSON string with the province info and the city's
    listing url.  A missing city 302-redirects on Beike, so redirects are
    disabled and ``status_code`` is checked instead.
    """
    msg = json.loads(message)
    province_id = msg.get('province_id')
    province_name = msg.get('province_name')
    city_name = msg.get('city_name')
    cityUrl = msg.get('cityUrl')
    cityOnsaleUrl = msg.get('cityOnsaleUrl')
    logger.info('处理[%s]市,在售Url:[%s]' % (city_name, cityOnsaleUrl))
    # 如果城市不存在会走302跳转,这里不让他跳转,否则后续判断不对.
    # FIX: the original hard-coded 'https://xan.ke.com/ershoufang/' here — a
    # debug leftover that forced EVERY city to Xi'an; the incoming url is
    # now used as intended.
    status_code, text, content = myGetRequest(
        cityOnsaleUrl, Config.PROXY_FLAG, allowRedirects=False)
    soup = BeautifulSoup(text, "html.parser")
    new_id = getUUID()
    # 插入市到数据库 (addOrInsert returns the existing row's id on conflict)
    city_id = regionService.addOrInsert(
        RegionEntity(id=new_id, pid=province_id, name=city_name,
                     level=Config.CITY_LEVEL))
    if city_id != new_id:
        logger.warning('城市id:[%s],name:[%s],已存在' % (city_id, city_name))
    else:
        logger.info('城市id:[%s],name:[%s],入库成功' % (city_id, city_name))
    if status_code != 200:
        # The city page redirected away — no listings for this city.
        logger.info('[%s]市没有在售数据,url:%s' % (city_name, cityOnsaleUrl))
        return
    # 判断当前页面的总套数 (total listing count shown on the page)
    totalOnsale = soup.find_all(
        'h2', class_='total fl')[0].find_all('span')[0].getText().strip()
    if totalOnsale == '0':
        # 有页面但没有在售数据 — FIX: the original logged this and then fell
        # through to scrape the (empty) area list anyway; bail out instead.
        logger.warning('[%s]市没有在售数据,url:%s' % (city_name, cityOnsaleUrl))
        return
    # 有在售 — 抓区县信息 (scrape the 区/county links and fan out)
    for area in soup.find_all('a', attrs={'data-click-evtid': '12339'}):
        area_name = area.getText().strip()
        areaUrl = cityUrl + area['href']
        area_uuid = getUUID()
        # 插入区域数据库 — FIX copy-paste bug: the area row previously stored
        # name=city_name with pid=province_id; it now stores the area name
        # with the city as its parent.  TODO confirm against the schema.
        area_id = regionService.addOrInsert(
            RegionEntity(id=area_uuid, pid=city_id, name=area_name,
                         level=Config.AREA_LEVEL))
        # 组装数据推送到区域队列
        payload = {
            'province_id': province_id,
            'province_name': province_name,
            'city_id': city_id,
            'city_name': city_name,
            'cityUrl': cityUrl,
            'area_id': area_id,
            'area_name': area_name,
            'areaUrl': areaUrl
        }
        self.pushResult(payload)
def _pushTown(self, province_id, province_name, city_id, city_name, cityUrl,
              area_id, area_name, town_name, townUrl, totalSoup):
    """Persist one 商圈 (town) and push one queue message per result page."""
    town_uuid = getUUID()
    # 插入商圈到数据库 — FIX copy-paste bug: the town row previously stored
    # name=city_name with pid=province_id; it now stores the town name with
    # the area as its parent.  TODO confirm against the schema.
    town_id = regionService.addOrInsert(
        RegionEntity(id=town_uuid, pid=area_id, name=town_name,
                     level=Config.TOWN_LEVEL))
    # 抓取商圈页面的小区总数-用于计算总页数
    totalOnsaleNum = int(totalSoup.find_all(
        'div', class_='total fl')[0].find_all('span')[0].getText().strip())
    # 计算总页数
    totalPageNum = self.getCurrentPages(totalOnsaleNum)
    logger.info('处理[%s]商圈,url:%s,获取到[%d]页数据 ' % (town_name, townUrl, totalPageNum))
    for page in range(1, totalPageNum + 1):
        # FIX: one branch appended '/pg' and the other 'pg', so urls ending
        # in '/' produced '//pg'; normalize before appending.
        townUrlByPage = townUrl.rstrip('/') + '/pg' + str(page)
        # 组装数据推送到队列
        payload = {
            'province_id': province_id,
            'province_name': province_name,
            'city_id': city_id,
            'city_name': city_name,
            'cityUrl': cityUrl,
            'area_id': area_id,
            'area_name': area_name,
            'town_id': town_id,
            'town_name': town_name,
            'townUrlByPage': townUrlByPage
        }
        self.pushResult(payload)

def parse(self, message):
    """Handle one 区/county message: push paginated messages per 商圈 (town).

    When the area page exposes no town links, the area itself is treated
    as a single town (using the area name and url).
    """
    msg = json.loads(message)
    province_id = msg.get('province_id')
    province_name = msg.get('province_name')
    city_id = msg.get('city_id')
    city_name = msg.get('city_name')
    cityUrl = msg.get('cityUrl')
    area_id = msg.get('area_id')
    area_name = msg.get('area_name')
    areaUrl = msg.get('areaUrl')
    logger.info('处理[%s]区,url:[%s] ' % (area_name, areaUrl))
    # 打开区县url,抓取商圈信息 (open the area page and scrape town links)
    status_code, text, content = myGetRequest(areaUrl, Config.PROXY_FLAG)
    soup = BeautifulSoup(text, "html.parser")
    townContainer = soup.find_all(attrs={"data-role": "ershoufang"})
    if len(townContainer[0].find_all('div')) == 1:
        # 该区县下没有商圈,采用区县名称作为商圈名称
        self._pushTown(province_id, province_name, city_id, city_name,
                       cityUrl, area_id, area_name, area_name, areaUrl, soup)
    else:
        for t in townContainer[0].find_all('div')[1].find_all('a'):
            town_name = t.getText()
            townUrl = cityUrl + t['href']
            # NOTE(review): as in the original, the page count is computed
            # from the *area* page soup, so every town inherits the area's
            # total — confirm whether townUrl should be fetched here instead.
            self._pushTown(province_id, province_name, city_id, city_name,
                           cityUrl, area_id, area_name, town_name, townUrl,
                           soup)